summaryrefslogtreecommitdiffstats
path: root/net/base/net_util.cc
diff options
context:
space:
mode:
Diffstat (limited to 'net/base/net_util.cc')
-rw-r--r--net/base/net_util.cc993
1 files changed, 993 insertions, 0 deletions
diff --git a/net/base/net_util.cc b/net/base/net_util.cc
new file mode 100644
index 0000000..416252c
--- /dev/null
+++ b/net/base/net_util.cc
@@ -0,0 +1,993 @@
+// Copyright 2008, Google Inc.
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+// * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+// * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#include <algorithm>
+#include <unicode/ucnv.h>
+#include <unicode/uidna.h>
+#include <unicode/ulocdata.h>
+#include <unicode/uniset.h>
+#include <unicode/uscript.h>
+#include <unicode/uset.h>
+#include <windows.h>
+#include <wininet.h>
+
+#include "net/base/net_util.h"
+
+#include "base/basictypes.h"
+#include "base/file_util.h"
+#include "base/logging.h"
+#include "base/path_service.h"
+#include "base/scoped_ptr.h"
+#include "base/string_tokenizer.h"
+#include "base/string_util.h"
+#include "base/time.h"
+#include "base/string_escape.h"
+#include "googleurl/src/gurl.h"
+#include "googleurl/src/url_canon.h"
+#include "googleurl/src/url_parse.h"
+#include "net/base/escape.h"
+#include "net/base/net_module.h"
+#include "net/base/net_resources.h"
+#include "net/base/base64.h"
+#include "unicode/datefmt.h"
+
+namespace {
+
+// Prefix ("file:" scheme followed by an empty host) that FilePathToFileURL()
+// prepends to a file path to get a file URL.
+static const wchar_t kFileURLPrefix[] = L"file:///";
+
+// The general list of blocked ports, consulted by IsPortAllowedByDefault().
+// A port listed here is blocked unless a specific protocol overrides it.
+// (Ex: FTP may use ports 21 and 22, see kAllowedFtpPorts.)
+static const int kRestrictedPorts[] = {
+ 1, // tcpmux
+ 7, // echo
+ 9, // discard
+ 11, // systat
+ 13, // daytime
+ 15, // netstat
+ 17, // qotd
+ 19, // chargen
+ 20, // ftp data
+ 21, // ftp access
+ 22, // ssh
+ 23, // telnet
+ 25, // smtp
+ 37, // time
+ 42, // name
+ 43, // nicname
+ 53, // domain
+ 77, // priv-rjs
+ 79, // finger
+ 87, // ttylink
+ 95, // supdup
+ 101, // hostname
+ 102, // iso-tsap
+ 103, // gppitnp
+ 104, // acr-nema
+ 109, // pop2
+ 110, // pop3
+ 111, // sunrpc
+ 113, // auth
+ 115, // sftp
+ 117, // uucp-path
+ 119, // nntp
+ 123, // NTP
+ 135, // loc-srv /epmap
+ 139, // netbios
+ 143, // imap2
+ 179, // BGP
+ 389, // ldap
+ 465, // smtp+ssl
+ 512, // print / exec
+ 513, // login
+ 514, // shell
+ 515, // printer
+ 526, // tempo
+ 530, // courier
+ 531, // chat
+ 532, // netnews
+ 540, // uucp
+ 556, // remotefs
+ 563, // nntp+ssl
+ 587, // smtp (mail submission)
+ 601, // ?? (presumably syslog-conn per the IANA registry -- TODO confirm)
+ 636, // ldap+ssl
+ 993, // imap+ssl
+ 995, // pop3+ssl
+ 2049, // nfs
+ 4045, // lockd
+ 6000, // X11
+};
+
+// FTP overrides the following restricted ports: IsPortAllowedByFtp() accepts
+// them even though they also appear in kRestrictedPorts.
+static const int kAllowedFtpPorts[] = {
+ 21, // ftp access
+ 22, // ssh
+};
+
+// Returns the whitespace-trimmed value of the first header called |name| in
+// |headers|, or an empty string when |headers| is empty or has no such header.
+//
+// We want to grab the Value from the "Key: Value" pairs in the headers,
+// which should look like this (no leading spaces, \n-separated) (we format
+// them this way in url_request_inet.cc):
+//   HTTP/1.1 200 OK\n
+//   ETag: "6d0b8-947-24f35ec0"\n
+//   Content-Length: 2375\n
+//   Content-Type: text/html; charset=UTF-8\n
+//   Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n
+//
+// The lookup matches "\n<name>:" (ASCII case-insensitively), so a key on the
+// very first line -- the status line in the layout above -- is never matched.
+template<typename STR>
+STR GetSpecificHeaderT(const STR& headers, const STR& name) {
+  if (headers.empty())
+    return STR();
+
+  STR match;
+  match.push_back('\n');
+  match.append(name);
+  match.push_back(':');
+
+  // const_iterator and value_type are dependent names, so standard C++
+  // requires the |typename| keyword here.
+  typename STR::const_iterator begin =
+      std::search(headers.begin(), headers.end(), match.begin(), match.end(),
+                  CaseInsensitiveCompareASCII<typename STR::value_type>());
+
+  if (begin == headers.end())
+    return STR();
+
+  // Skip "\n<name>:"; the value runs up to the next newline (or the end).
+  begin += match.length();
+  typename STR::const_iterator end = std::find(begin, headers.end(), '\n');
+
+  STR ret;
+  TrimWhitespace(STR(begin, end), TRIM_ALL, &ret);
+  return ret;
+}
+
+// TODO(jungshik): We have almost identical hex-decoding code else where.
+// Consider refactoring and moving it somewhere(base?). Bug 1224311
+//
+// Returns true when |c| is an ASCII hexadecimal digit: 0-9, A-F or a-f.
+inline bool IsHexDigit(unsigned char c) {
+  if (c >= '0' && c <= '9')
+    return true;
+  if (c >= 'A' && c <= 'F')
+    return true;
+  return c >= 'a' && c <= 'f';
+}
+
+// Converts one ASCII hex digit to its numeric value (0-15).
+// |c| must satisfy IsHexDigit(); this is DCHECKed. Should the DCHECK not stop
+// execution, any other character yields 0 instead of indexing outside a
+// lookup table.
+inline unsigned char HexToInt(unsigned char c) {
+  DCHECK(IsHexDigit(c));
+  if (c >= '0' && c <= '9')
+    return static_cast<unsigned char>(c - '0');
+  if (c >= 'A' && c <= 'F')
+    return static_cast<unsigned char>(c - 'A' + 10);
+  if (c >= 'a' && c <= 'f')
+    return static_cast<unsigned char>(c - 'a' + 10);
+  return 0;
+}
+
+// Similar to Base64Decode. Decodes a Q-encoded string to a sequence
+// of bytes: '_' becomes a space, "=XX" (two hex digits) becomes the byte
+// 0xXX, and any other printable ASCII character stands for itself.
+// Returns false on invalid input, in which case |output| is left untouched.
+bool QPDecode(const std::string& input, std::string* output) {
+  std::string decoded;
+  decoded.reserve(input.size());
+
+  const size_t input_len = input.size();
+  for (size_t pos = 0; pos < input_len; ++pos) {
+    const char c = input[pos];
+
+    if (c == '_') {
+      decoded.push_back(' ');
+      continue;
+    }
+
+    if (c == '=') {
+      // An escape needs the '=' plus two hex digits.
+      if (input_len - pos < 3)
+        return false;
+      const unsigned char high = static_cast<unsigned char>(input[pos + 1]);
+      const unsigned char low = static_cast<unsigned char>(input[pos + 2]);
+      if (!IsHexDigit(high) || !IsHexDigit(low))
+        return false;
+      const unsigned char byte = HexToInt(high) * 16 + HexToInt(low);
+      decoded.push_back(static_cast<char>(byte));
+      pos += 2;  // The loop increment steps over the second hex digit.
+      continue;
+    }
+
+    // In a Q-encoded word, only printable ASCII characters
+    // represent themselves.
+    if (!(0x20 < c && c < 0x7F))
+      return false;
+
+    // Besides, space, '=', '_' and '?' are not allowed, but they're
+    // already filtered out.
+    DCHECK(c != 0x3D && c != 0x5F && c != 0x3F);
+    decoded.push_back(c);
+  }
+
+  output->swap(decoded);
+  return true;
+}
+
+// The two transfer encodings handled by DecodeBQEncoding().
+enum RFC2047EncodingType {Q_ENCODING, B_ENCODING};
+
+// Decodes |part| -- the payload of an RFC 2047 encoded-word -- according to
+// |enc_type| (Base64 for B_ENCODING, QPDecode otherwise) and converts the
+// resulting bytes from |charset| to UTF-8, storing them in |output|.
+// Returns false if the payload is malformed, |charset| cannot be opened by
+// ICU, or the conversion fails.
+bool DecodeBQEncoding(const std::string& part, RFC2047EncodingType enc_type,
+                      const std::string& charset, std::string* output) {
+  std::string decoded;
+  if (enc_type == B_ENCODING) {
+    if (!Base64Decode(part, &decoded)) {
+      return false;
+    }
+  } else {
+    if (!QPDecode(part, &decoded)) {
+      return false;
+    }
+  }
+
+  UErrorCode err = U_ZERO_ERROR;
+  UConverter* converter(ucnv_open(charset.c_str(), &err));
+  if (U_FAILURE(err)) {
+    return false;
+  }
+
+  // Nothing to convert. Return early so that neither WriteInto() nor ICU is
+  // asked to work with a zero-sized buffer.
+  if (decoded.empty()) {
+    ucnv_close(converter);
+    output->clear();
+    return true;
+  }
+
+  // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
+  // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
+  // in UTF-8. Therefore, the expansion ratio is 3 at most.
+  int length = static_cast<int>(decoded.length());
+  const int capacity = length * 3;
+  // Reserve one extra slot for the terminator so that the |capacity| handed
+  // to ICU never exceeds the writable part of the buffer.
+  char* buf = WriteInto(output, capacity + 1);
+  length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, capacity,
+                              decoded.data(), length, &err);
+  ucnv_close(converter);
+  if (U_FAILURE(err)) {
+    return false;
+  }
+  // Shrink to the number of UTF-8 bytes ICU reported.
+  output->resize(length);
+  return true;
+}
+
+// Decodes one token of a header parameter value into |output|.
+// |*is_rfc2047| is set to tell the caller whether the token was handled as
+// an RFC 2047 encoded-word. Returns false when the token cannot be decoded.
+//
+// Three forms are tried, in this order:
+// 1. Non-ASCII input: passed through if it is UTF-8, otherwise converted
+// from the native multibyte encoding.
+// 2. An RFC 2047 encoded-word, "=?charset?<E>?<encoded string>?=".
+// 3. Anything else is URL-unescaped and accepted only if the result is
+// UTF-8.
+bool DecodeWord(const std::string& encoded_word,
+ bool *is_rfc2047,
+ std::string* output) {
+ // TODO(jungshik) : Revisit this later. Do we want to pass through non-ASCII
+ // strings which can be mojibake? WinHTTP converts a raw 8bit string
+ // UTF-16 assuming it's in the OS default encoding.
+ if (!IsStringASCII(encoded_word)) {
+ // Try falling back to the NativeMB encoding if the raw input is not UTF-8.
+ if (IsStringUTF8(encoded_word.c_str())) {
+ *output = encoded_word;
+ } else {
+ *output = WideToUTF8(NativeMBToWide(encoded_word));
+ }
+ *is_rfc2047 = false;
+ return true;
+ }
+
+ // RFC 2047 : one of encoding methods supported by Firefox and relatively
+ // widely used by web servers.
+ // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
+ // We don't care about the length restriction (72 bytes) because
+ // many web servers generate encoded words longer than the limit.
+ std::string tmp;
+ *is_rfc2047 = true;
+ // Index of the '?'-separated part expected next:
+ // 0: leading "=", 1: charset, 2: encoding letter, 3: payload, 4: trailing "=".
+ int part_index = 0;
+ std::string charset;
+ StringTokenizer t(encoded_word, "?");
+ RFC2047EncodingType enc_type = Q_ENCODING;
+ while (*is_rfc2047 && t.GetNext()) {
+ std::string part = t.token();
+ switch (part_index) {
+ case 0:
+ if (part != "=") {
+ *is_rfc2047 = false;
+ break;
+ }
+ ++part_index;
+ break;
+ case 1:
+ // Do we need charset validity check here?
+ charset = part;
+ ++part_index;
+ break;
+ case 2:
+ // Exactly one of 'b', 'B', 'q', 'Q' is expected here.
+ if (part.size() > 1 ||
+ part.find_first_of("bBqQ") == std::string::npos) {
+ *is_rfc2047 = false;
+ break;
+ }
+ if (part[0] == 'b' || part[0] == 'B') {
+ enc_type = B_ENCODING;
+ }
+ ++part_index;
+ break;
+ case 3:
+ *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
+ if (!*is_rfc2047) {
+ // Last minute failure. Invalid B/Q encoding. Rather than
+ // passing it through, return now.
+ return false;
+ }
+ ++part_index;
+ break;
+ case 4:
+ if (part != "=") {
+ // Another last minute failure !
+ // Likely to be a case of two encoded-words in a row or
+ // an encoded word followed by a non-encoded word. We can be
+ // generous, but it does not help much in terms of compatibility,
+ // I believe. Return immediately.
+ *is_rfc2047 = false;
+ return false;
+ }
+ ++part_index;
+ break;
+ default:
+ // More parts than an encoded-word has.
+ *is_rfc2047 = false;
+ return false;
+ }
+ }
+
+ // NOTE(review): |part_index| is not checked here, so an input that stops
+ // early but still ends in '=' is accepted. For example, a lone "=" passes
+ // case 0, leaves |tmp| empty and is reported as a decoded (empty)
+ // encoded-word. This also dereferences end() - 1, so it relies on
+ // |encoded_word| being non-empty.
+ if (*is_rfc2047) {
+ if (*(encoded_word.end() - 1) == '=') {
+ output->swap(tmp);
+ return true;
+ }
+ // encoded_word ending prematurely with '?' or extra '?'
+ *is_rfc2047 = false;
+ return false;
+ }
+
+ // We're not handling 'especial' characters quoted with '\', but
+ // it should be Ok because we're not an email client but a
+ // web browser.
+
+ // What IE6/7 does: %-escaped UTF-8. We could extend this to
+ // support a rudimentary form of RFC 2231 with charset label, but
+ // it'd gain us little in terms of compatibility.
+ tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
+ if (IsStringUTF8(tmp.c_str())) {
+ output->swap(tmp);
+ return true;
+ // We can try either the OS default charset or 'origin charset' here,
+ // As far as I can tell, IE does not support it. However, I've seen
+ // web servers emit %-escaped string in a legacy encoding (usually
+ // origin charset).
+ // TODO(jungshik) : Test IE further and consider adding a fallback here.
+ }
+ return false;
+}
+
+// Decodes a whole header parameter value: |input| is split on whitespace,
+// each word is run through DecodeWord() and the pieces are concatenated into
+// |output|. A whitespace delimiter is kept (as a single space) only when the
+// word before it was not RFC 2047-encoded. Returns false, leaving |output|
+// untouched, as soon as one word fails to decode.
+bool DecodeParamValue(const std::string& input, std::string* output) {
+  std::string assembled;
+  StringTokenizer words(input, " \t\n\r");
+  words.set_options(StringTokenizer::RETURN_DELIMS);
+
+  // Starts out true so that leading whitespace is dropped.
+  bool previous_word_was_rfc2047 = true;
+  while (words.GetNext()) {
+    if (!words.token_is_delim()) {
+      // We don't support a single multibyte character split into
+      // adjacent encoded words. Some broken mail clients emit headers
+      // with that problem, but most web servers usually encode a filename
+      // in a single encoded-word. Firefox/Thunderbird do not support
+      // it, either.
+      std::string piece;
+      if (!DecodeWord(words.token(), &previous_word_was_rfc2047, &piece))
+        return false;
+      assembled.append(piece);
+    } else if (!previous_word_was_rfc2047) {
+      // Whitespace after a plain word is preserved as a space; whitespace
+      // after an encoded-word is skipped.
+      assembled.push_back(' ');
+    }
+  }
+
+  output->swap(assembled);
+  return true;
+}
+
+// TODO(mpcomplete): This is a quick and dirty implementation for now. I'm
+// sure this doesn't properly handle all (most?) cases.
+//
+// Returns the value of parameter |param_name| in |header|, or an empty string
+// if the parameter is absent or malformed. This assumes args are formatted
+// exactly like "bla; arg1=value; arg2=value". Only the first
+// (ASCII case-insensitive) occurrence of |param_name| anywhere in |header| is
+// examined. A value starting with '"' runs to the closing quote (quotes are
+// not returned); any other value runs to the next ';' or the end of |header|.
+template<typename STR>
+STR GetHeaderParamValueT(const STR& header, const STR& param_name) {
+  // const_iterator, value_type and size_type are dependent names, so
+  // standard C++ requires |typename| in front of them.
+  typename STR::const_iterator param_begin =
+      std::search(header.begin(), header.end(),
+                  param_name.begin(), param_name.end(),
+                  CaseInsensitiveCompareASCII<typename STR::value_type>());
+
+  if (param_begin == header.end())
+    return STR();
+  param_begin += param_name.length();
+
+  // Only spaces and tabs may sit between the name and the '='.
+  STR whitespace;
+  whitespace.push_back(' ');
+  whitespace.push_back('\t');
+  const typename STR::size_type equals_offset =
+      header.find_first_not_of(whitespace, param_begin - header.begin());
+  if (equals_offset == STR::npos || header.at(equals_offset) != '=')
+    return STR();
+
+  param_begin = header.begin() + equals_offset + 1;
+  if (param_begin == header.end())
+    return STR();
+
+  typename STR::const_iterator param_end;
+  if (*param_begin == '"') {
+    ++param_begin;  // skip past the quote.
+    param_end = std::find(param_begin, header.end(), '"');
+    if (param_end == header.end())
+      return STR();  // poorly formatted param?
+  } else {
+    // Start the scan at |param_begin| itself so that an empty value
+    // ("arg=;next=1") yields "" rather than swallowing the separator.
+    param_end = std::find(param_begin, header.end(), ';');
+  }
+
+  return STR(param_begin, param_end);
+}
+
+// Folds scripts that we allow to appear together into one representative
+// script: Katakana, Hiragana, Katakana-or-Hiragana and Hangul (this one is
+// arguable) are all reported as Han. Every other script is returned as is.
+// TODO(brettw) bug 880223: we should allow some other languages to be
+// combined such as Chinese and Latin. We will probably need a more
+// complicated system of language pairs to have more fine-grained control.
+UScriptCode NormalizeScript(UScriptCode code) {
+  const bool folds_into_han = code == USCRIPT_KATAKANA ||
+                              code == USCRIPT_HIRAGANA ||
+                              code == USCRIPT_KATAKANA_OR_HIRAGANA ||
+                              code == USCRIPT_HANGUL;
+  return folds_into_han ? USCRIPT_HAN : code;
+}
+
+// Returns true if every code point of the UTF-16 string |str| (|str_len|
+// code units) belongs, after NormalizeScript(), to one single script.
+// Code points in USCRIPT_COMMON are ignored. Any ICU status other than
+// U_ZERO_ERROR is reported as "mixed" (false).
+bool IsIDNComponentInSingleScript(const wchar_t* str, int str_len) {
+  // Only meaningful once |have_reference| is true.
+  UScriptCode reference_script = USCRIPT_COMMON;
+  bool have_reference = false;
+
+  for (int pos = 0; pos < str_len; ) {
+    unsigned code_point;
+    U16_NEXT(str, pos, str_len, code_point);  // Advances |pos|.
+
+    UErrorCode err = U_ZERO_ERROR;
+    const UScriptCode raw_script = uscript_getScript(code_point, &err);
+    if (err != U_ZERO_ERROR)
+      return false;  // Report mixed on error.
+    const UScriptCode script = NormalizeScript(raw_script);
+
+    // TODO(brettw) We may have to check for USCRIPT_INHERENT as well.
+    if (script == USCRIPT_COMMON)
+      continue;
+
+    if (!have_reference) {
+      reference_script = script;
+      have_reference = true;
+    } else if (script != reference_script) {
+      return false;
+    }
+  }
+  return true;
+}
+
+// Tells whether the script of language |lang| can be 'safely' mixed with
+// Latin letters in the ASCII range. The decision looks only at the first two
+// characters of |lang|.
+bool IsCompatibleWithASCIILetters(const std::string& lang) {
+  // For now, just list Chinese, Japanese and Korean (positive list).
+  // An alternative is negative-listing (languages using Greek and
+  // Cyrillic letters), but it can be more dangerous.
+  const std::string primary = lang.substr(0, 2);
+  if (primary == "zh")
+    return true;
+  if (primary == "ja")
+    return true;
+  return primary == "ko";
+}
+
+// Returns true if the given Unicode host component is safe to display to the
+// user.
+// |str| / |str_len| is the component and |languages| a comma-separated list
+// of language codes. The component is rejected if it contains any character
+// of the hard-coded dangerous set. With an empty |languages| it must then be
+// in a single script; otherwise its characters (minus a small common set)
+// must all be contained in the exemplar set of at least one listed language.
+bool IsIDNComponentSafe(const wchar_t* str,
+ int str_len,
+ const std::wstring& languages) {
+ // Most common cases (non-IDN) do not reach here so that we don't
+ // need a fast return path.
+ // TODO(jungshik) : Check if there's any character inappropriate
+ // (although allowed) for domain names.
+ // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and
+ // http://www.unicode.org/reports/tr39/data/xidmodifications.txt
+ // For now, we borrow the list from Mozilla and tweaked it slightly.
+ // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because
+ // they're gonna be canonicalized to U+0020 and full stop before
+ // reaching here.)
+ // The original list is available at
+ // http://kb.mozillazine.org/Network.IDN.blacklist_chars and
+ // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703
+
+ UErrorCode status = U_ZERO_ERROR;
+ // The same set pattern is spelled twice: as a wide literal when wchar_t is
+ // UTF-16, and as an invariant-character string with escaped \u sequences
+ // otherwise.
+#ifdef U_WCHAR_IS_UTF16
+ UnicodeSet dangerous_characters(UnicodeString(
+ L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338"
+ L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"
+ L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"
+ L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"
+ L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"
+ L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"
+ L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"
+ L"[\ufffa-\ufffd]]"), status);
+#else
+ UnicodeSet dangerous_characters(UnicodeString(
+ "[[\\ \\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338"
+ "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"
+ "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"
+ "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"
+ "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"
+ "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"
+ "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"
+ "[\\ufffa-\\ufffd]]", -1, US_INV), status);
+#endif
+ DCHECK(U_SUCCESS(status));
+ // Collect the distinct characters of the component and reject it outright
+ // if any of them is in the dangerous set.
+ UnicodeSet component_characters;
+ component_characters.addAll(UnicodeString(str, str_len));
+ if (dangerous_characters.containsSome(component_characters))
+ return false;
+
+ // If the language list is empty, the result is completely determined
+ // by whether a component is a single script or not. This will block
+ // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are
+ // allowed with |languages| (while it blocks Chinese + Latin letters with
+ // an accent as should be the case), but we want to err on the safe side
+ // when |languages| is empty.
+ if (languages.empty())
+ return IsIDNComponentInSingleScript(str, str_len);
+
+ // |common_characters| is made up of ASCII numbers, hyphen, plus and
+ // underscore that are used across scripts and allowed in domain names.
+ // (sync'd with characters allowed in url_canon_host with square
+ // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.
+ // NOTE(review): the pattern below also contains an escaped space ("\\ "),
+ // which the description above does not mention.
+ UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),
+ status);
+ DCHECK(U_SUCCESS(status));
+ // Subtract common characters because they're always allowed so that
+ // we just have to check if a language-specific set contains
+ // the remainder.
+ component_characters.removeAll(common_characters);
+
+ USet *lang_set = uset_open(1, 0); // create an empty set
+ UnicodeSet ascii_letters(0x61, 0x7a); // [a-z]
+ bool safe = false;
+ std::string languages_list(WideToASCII(languages));
+ StringTokenizer t(languages_list, ",");
+ // The component is safe as soon as one language's exemplar set covers it.
+ while (t.GetNext()) {
+ std::string lang = t.token();
+ status = U_ZERO_ERROR;
+ // TODO(jungshik) Cache exemplar sets for locales.
+ ULocaleData* uld = ulocdata_open(lang.c_str(), &status);
+ if (U_SUCCESS(status)) {
+ // Should we use auxiliary set, instead?
+ ulocdata_getExemplarSet(uld, lang_set, 0, ULOCDATA_ES_STANDARD, &status);
+ ulocdata_close(uld);
+ if (U_SUCCESS(status)) {
+ // NOTE(review): this treats the C handle (USet*) as the C++ UnicodeSet
+ // object -- presumably relying on how ICU implements USet; confirm.
+ UnicodeSet* allowed_characters =
+ reinterpret_cast<UnicodeSet*>(lang_set);
+ // If |lang| is compatible with ASCII Latin letters, add them.
+ if (IsCompatibleWithASCIILetters(lang))
+ allowed_characters->addAll(ascii_letters);
+ if (allowed_characters->containsAll(component_characters)) {
+ safe = true;
+ break;
+ }
+ }
+ }
+ }
+ uset_close(lang_set);
+ return safe;
+}
+
+// Converts one component of a host (between dots) to IDN if safe. The result
+// will be APPENDED to the given output string and will be the same as the
+// input if it is not IDN or the IDN is unsafe to display.
+//
+// |comp| / |comp_len| is the component, |languages| is forwarded to
+// IsIDNComponentSafe(), and |out| receives the result. An empty component
+// appends nothing.
+void IDNToUnicodeOneComponent(const wchar_t* comp,
+                              int comp_len,
+                              const std::wstring& languages,
+                              std::wstring* out) {
+  DCHECK(comp_len >= 0);
+  if (comp_len == 0)
+    return;
+
+  const size_t host_begin_in_output = out->size();
+
+  // Only a component starting with "xn--" can be an IDN component.
+  if (comp_len >= 4 && wcsncmp(comp, L"xn--", 4) == 0) {
+    // Room for a possibly longer string; doubled below if still too small.
+    int extra_space = 64;
+    while (true) {
+      // Size the buffer relative to where this component starts, so that a
+      // retry grows it to exactly |extra_space| instead of stacking the new
+      // size on top of the previous attempt's.
+      out->resize(host_begin_in_output + extra_space);
+      UErrorCode status = U_ZERO_ERROR;
+      int output_chars = uidna_IDNToUnicode(
+          comp, comp_len, &(*out)[host_begin_in_output], extra_space,
+          UIDNA_DEFAULT, NULL, &status);
+      if (status == U_ZERO_ERROR) {
+        // Converted successfully.
+        out->resize(host_begin_in_output + output_chars);
+        if (IsIDNComponentSafe(&out->data()[host_begin_in_output],
+                               output_chars,
+                               languages))
+          return;
+        break;  // Unsafe to display: fall back to the literal input below.
+      }
+      if (status != U_BUFFER_OVERFLOW_ERROR)
+        break;
+
+      // Need to loop again with a bigger buffer. It looks like ICU will
+      // return the required size of the buffer, but that's not documented,
+      // so we'll just grow by 2x. This should be rare and is not on a
+      // critical path.
+      extra_space *= 2;
+    }
+  }
+
+  // Either the component is not IDN or the conversion failed / was unsafe:
+  // drop anything that was added and append the literal input.
+  out->resize(host_begin_in_output);
+  out->append(comp, comp_len);
+}
+
+// Convert a FILETIME to a localized string. |filetime| may be NULL.
+// Returns an empty string when |filetime| is NULL, when no date formatter
+// could be created, or when copying the formatted text out of ICU fails.
+// TODO(tc): Remove this once bug 1164516 is fixed.
+std::wstring LocalizedDateTime(const FILETIME* filetime) {
+  if (!filetime)
+    return std::wstring();
+
+  Time time = Time::FromFileTime(*filetime);
+  scoped_ptr<DateFormat> formatter(DateFormat::createDateTimeInstance(
+      DateFormat::kShort));
+  // Do not dereference the formatter if the factory handed back nothing.
+  if (!formatter.get())
+    return std::wstring();
+
+  UnicodeString date_string;
+  // ToDoubleT() is scaled by 1000 before being handed to ICU as a UDate.
+  formatter->format(static_cast<UDate>(time.ToDoubleT() * 1000), date_string);
+
+  std::wstring formatted;
+  int capacity = date_string.length() + 1;
+  UErrorCode error = U_ZERO_ERROR;
+  date_string.extract(static_cast<UChar*>(WriteInto(&formatted, capacity)),
+                      capacity, error);
+  if (U_FAILURE(error))
+    return std::wstring();
+  return formatted;
+}
+
+} // namespace
+
+namespace net_util {
+
+// Builds a file: URL from the literal file name |file_path|.
+GURL FilePathToFileURL(const std::wstring& file_path) {
+  // Produce a URL like "file:///C:/foo" for a regular file, or
+  // "file://///server/path" for UNC. The URL canonicalizer will fix up the
+  // latter case to be the canonical UNC form: "file://server/path"
+  std::wstring url_str = std::wstring(kFileURLPrefix) + file_path;
+
+  // Since we assume the input is a literal filename, anything the URL parser
+  // might consider special is escaped here. The percent sign must come first
+  // since the other substitutions introduce percents as the escape character.
+  // The semicolon is supposed to be some kind of separator according to
+  // RFC 2396.
+  static const wchar_t* const kRaw[] = { L"%", L";", L"#" };
+  static const wchar_t* const kEscaped[] = { L"%25", L"%3B", L"%23" };
+  for (size_t i = 0; i < sizeof(kRaw) / sizeof(kRaw[0]); ++i)
+    ReplaceSubstringsAfterOffset(&url_str, 0, kRaw[i], kEscaped[i]);
+
+  return GURL(url_str);
+}
+
+// Converts the file URL |url| into a wide file path stored in |file_path|.
+// |file_path| is cleared first. Returns false if |url| is invalid, if it
+// yields an empty path, or if a conversion to wide characters produces an
+// empty string. A URL with a host is turned into a UNC path
+// ("\\host\path"); forward slashes are replaced by backslashes.
+bool FileURLToFilePath(const GURL& url, std::wstring* file_path) {
+ file_path->clear();
+
+ if (!url.is_valid())
+ return false;
+
+ std::string path;
+ std::string host = url.host();
+ if (host.empty()) {
+ // URL contains no host, the path is the filename. In this case, the path
+ // will probably be preceded with a slash, as in "/C:/foo.txt", so we
+ // trim out that here.
+ path = url.path();
+ size_t first_non_slash = path.find_first_not_of("/\\");
+ if (first_non_slash != std::string::npos && first_non_slash > 0)
+ path.erase(0, first_non_slash);
+ } else {
+ // URL contains a host: this means it's UNC. We keep the preceding slash
+ // on the path.
+ path = "\\\\";
+ path.append(host);
+ path.append(url.path());
+ }
+
+ if (path.empty())
+ return false;
+ // Slashes are converted before unescaping, so an escaped slash is not
+ // turned into a path separator by this step.
+ std::replace(path.begin(), path.end(), '/', '\\');
+
+ // GURL stores strings as percent-encoded UTF-8, this will undo if possible.
+ path = UnescapeURLComponent(path,
+ UnescapeRule::SPACES | UnescapeRule::PERCENTS);
+
+ if (!IsStringUTF8(path.c_str())) {
+ // Not UTF-8, assume encoding is native codepage and we're done. We know we
+ // are giving the conversion function a nonempty string, and it may fail if
+ // the given string is not in the current encoding and give us an empty
+ // string back. We detect this and report failure.
+ *file_path = NativeMBToWide(path);
+ return !file_path->empty();
+ }
+ file_path->assign(UTF8ToWide(path));
+
+ // Now we have an unescaped filename, but are still not sure about its
+ // encoding. For example, each character could be part of a UTF-8 string.
+ if (file_path->empty() || !IsString8Bit(*file_path)) {
+ // assume our 16-bit encoding is correct if it won't fit into an 8-bit
+ // string
+ // NOTE(review): this also returns true when |file_path| is empty.
+ return true;
+ }
+
+ // Convert our narrow string into the native wide path.
+ std::string narrow;
+ if (!WideToLatin1(*file_path, &narrow)) {
+ NOTREACHED() << "Should have filtered out non-8-bit strings above.";
+ return false;
+ }
+ if (IsStringUTF8(narrow.c_str())) {
+ // Our string actually looks like it could be UTF-8, convert to 8-bit
+ // UTF-8 and then to the corresponding wide string.
+ *file_path = UTF8ToWide(narrow);
+ } else {
+ // Our wide string contains only 8-bit characters and it's not UTF-8, so
+ // we assume it's in the native codepage.
+ *file_path = NativeMBToWide(narrow);
+ }
+
+ // Fail if 8-bit -> wide conversion failed and gave us an empty string back
+ // (we already filtered out empty strings above).
+ return !file_path->empty();
+}
+
+// Wide-string entry point: forwards to the shared GetSpecificHeaderT()
+// template.
+std::wstring GetSpecificHeader(const std::wstring& headers,
+                               const std::wstring& name) {
+  return GetSpecificHeaderT<std::wstring>(headers, name);
+}
+
+// Narrow-string entry point: forwards to the shared GetSpecificHeaderT()
+// template.
+std::string GetSpecificHeader(const std::string& headers,
+                              const std::string& name) {
+  return GetSpecificHeaderT<std::string>(headers, name);
+}
+
+// Extracts the file name carried by |header|: the "filename" parameter is
+// preferred and "name" is the fallback. The value is decoded with
+// DecodeParamValue() and returned as a wide string. Returns an empty string
+// if neither parameter has a value or the value cannot be decoded.
+std::wstring GetFileNameFromCD(const std::string& header) {
+  std::string raw_value = GetHeaderParamValue(header, "filename");
+  if (raw_value.empty()) {
+    // Some servers use 'name' parameter.
+    raw_value = GetHeaderParamValue(header, "name");
+  }
+
+  std::string utf8_name;
+  if (raw_value.empty() || !DecodeParamValue(raw_value, &utf8_name))
+    return std::wstring();
+  return UTF8ToWide(utf8_name);
+}
+
+// Wide-string entry point: forwards to the shared GetHeaderParamValueT()
+// template.
+std::wstring GetHeaderParamValue(const std::wstring& field,
+                                 const std::wstring& param_name) {
+  return GetHeaderParamValueT<std::wstring>(field, param_name);
+}
+
+// Narrow-string entry point: forwards to the shared GetHeaderParamValueT()
+// template.
+std::string GetHeaderParamValue(const std::string& field,
+                                const std::string& param_name) {
+  return GetHeaderParamValueT<std::string>(field, param_name);
+}
+
+// TODO(brettw) bug 734373: check the scripts for each host component and
+// don't un-IDN-ize if there is more than one. Alternatively, only IDN for
+// scripts that the user has installed. For now, just put the entire
+// path through IDN. Maybe this feature can be implemented in ICU itself?
+//
+// We may want to skip this step in the case of file URLs to allow unicode
+// UNC hostnames regardless of encodings.
+//
+// Appends to |out| the host |host| (|host_len| chars) with each dot-separated
+// component passed through IDNToUnicodeOneComponent(). Dots are copied
+// through; empty components contribute nothing but their dot.
+void IDNToUnicode(const char* host,
+                  int host_len,
+                  const std::wstring& languages,
+                  std::wstring* out) {
+  // Convert the ASCII input to a wide string for ICU.
+  std::wstring wide_input;
+  wide_input.reserve(host_len);
+  for (int i = 0; i < host_len; i++)
+    wide_input.push_back(host[i]);
+
+  // Do each component of the host separately, since we enforce script matching
+  // on a per-component basis.
+  const size_t input_size = wide_input.size();
+  for (size_t component_begin = 0; component_begin < input_size; ) {
+    // The component ends at the next dot, or at the end of the string for the
+    // last component.
+    const size_t dot_pos = wide_input.find_first_of('.', component_begin);
+    const size_t component_end =
+        (dot_pos == std::wstring::npos) ? input_size : dot_pos;
+
+    if (component_end > component_begin) {
+      IDNToUnicodeOneComponent(
+          &wide_input[component_begin],
+          static_cast<int>(component_end - component_begin),
+          languages,
+          out);
+    }
+
+    if (component_end >= input_size)
+      return;  // No more components left.
+
+    // Copy the dot we just found. This happens even when it is the last
+    // character, in case the URL ends in a dot.
+    out->push_back('.');
+    component_begin = component_end + 1;
+  }
+}
+
+// Canonicalizes |host| with url_canon::CanonicalizeHost() and returns the
+// canonical host, or an empty string if canonicalization fails. When
+// |is_ip_address| is non-NULL it receives whether the canonical host is
+// accepted by url_canon::CanonicalizeIPAddress() (false on failure).
+template <typename str>
+std::string CanonicalizeHost(const str& host, bool* is_ip_address) {
+  // The whole input string is the host component.
+  const url_parse::Component input_range(0, static_cast<int>(host.length()));
+
+  std::string canonical;
+  url_canon::StdStringCanonOutput canonical_sink(&canonical);
+  url_parse::Component canonical_range;
+  const bool canonicalized = url_canon::CanonicalizeHost(
+      host.c_str(), input_range, &canonical_sink, &canonical_range);
+  if (!canonicalized) {
+    if (is_ip_address)
+      *is_ip_address = false;
+    return std::string();
+  }
+  canonical_sink.Complete();
+
+  if (is_ip_address) {
+    // See if the host is an IP address. Only the verdict is of interest; the
+    // canonicalized address itself is thrown away.
+    url_canon::RawCanonOutputT<char, 128> discarded_output;
+    url_parse::Component discarded_range;
+    *is_ip_address = url_canon::CanonicalizeIPAddress(canonical.c_str(),
+                                                      canonical_range,
+                                                      &discarded_output,
+                                                      &discarded_range);
+  }
+
+  // Return the host as a string, stripping any unnecessary bits off the ends.
+  const bool covers_whole_string =
+      (canonical_range.begin == 0) &&
+      (canonical_range.len == canonical.length());
+  return covers_whole_string
+      ? canonical
+      : canonical.substr(canonical_range.begin, canonical_range.len);
+}
+
+// Forcibly instantiate narrow and wide versions of this function so we don't
+// need to put the function definition in the header.
+template std::string CanonicalizeHost<std::string>(const std::string& host,
+                                                   bool* is_ip_address);
+template std::string CanonicalizeHost<std::wstring>(const std::wstring& host,
+                                                    bool* is_ip_address);
+
+// Returns the HTML that opens a directory listing: the IDR_DIR_HEADER_HTML
+// resource followed by a script block calling start() with |title| as a
+// JavaScript double-quoted string.
+std::string GetDirectoryListingHeader(const std::string& title) {
+  std::string page = NetModule::GetResource(IDR_DIR_HEADER_HTML);
+  if (page.empty())
+    NOTREACHED() << "expected resource not found";
+
+  page.append("<script>start(");
+  string_escape::JavascriptDoubleQuote(title, true, &page);
+  page.append(");</script>\n");
+  return page;
+}
+
+// Returns a script block that calls addRow() for one directory entry. The
+// arguments are, in order: |name|, the path-escaped |name|, 1 or 0 depending
+// on whether |attrib| has FILE_ATTRIBUTE_DIRECTORY set, the formatted |size|
+// and the localized |modified| time. All strings are emitted as JavaScript
+// double-quoted strings.
+std::string GetDirectoryListingEntry(const std::string& name,
+                                     DWORD attrib,
+                                     int64 size,
+                                     const FILETIME* modified) {
+  std::string row("<script>addRow(");
+
+  string_escape::JavascriptDoubleQuote(name, true, &row);
+  row.append(",");
+  string_escape::JavascriptDoubleQuote(EscapePath(name), true, &row);
+
+  const bool is_directory = (attrib & FILE_ATTRIBUTE_DIRECTORY) != 0;
+  row.append(is_directory ? ",1," : ",0,");
+
+  string_escape::JavascriptDoubleQuote(
+      FormatBytes(size, GetByteDisplayUnits(size), true), true, &row);
+  row.append(",");
+  string_escape::JavascriptDoubleQuote(
+      LocalizedDateTime(modified), true, &row);
+
+  row.append(");</script>\n");
+  return row;
+}
+
+// Returns |text| without a leading "www." (matched case-sensitively), or
+// |text| unchanged when it does not start with that prefix.
+std::wstring StripWWW(const std::wstring& text) {
+  const std::wstring::size_type kPrefixLength = 4;  // Length of "www.".
+  if (text.compare(0, kPrefixLength, L"www.") != 0)
+    return text;
+  return text.substr(kPrefixLength);
+}
+
+// Picks a file name for saving |url|. Candidates are tried in this order:
+// the name found in |content_disposition|, the file name part of |url|,
+// |default_name|, the host of |url| and finally the literal "download".
+// Leading and trailing dots are trimmed and illegal characters are replaced
+// with '-'.
+std::wstring GetSuggestedFilename(const GURL& url,
+                                  const std::string& content_disposition,
+                                  const std::wstring& default_name) {
+  std::wstring suggestion = GetFileNameFromCD(content_disposition);
+  if (!suggestion.empty()) {
+    // Remove any path information the server may have sent, take the name
+    // only.
+    suggestion = file_util::GetFilenameFromPath(suggestion);
+    // Next, remove "." from the beginning and end of the file name to avoid
+    // tricks with hidden files, "..", and "."
+    TrimString(suggestion, L".", &suggestion);
+  }
+
+  if (suggestion.empty() && url.is_valid()) {
+    suggestion = UnescapeAndDecodeUTF8URLComponent(
+        url.ExtractFileName(), UnescapeRule::SPACES | UnescapeRule::PERCENTS);
+  }
+
+  // Trim '.' once more.
+  TrimString(suggestion, L".", &suggestion);
+
+  // If there's no filename or it gets trimmed to be empty, use
+  // the URL hostname or default_name
+  if (suggestion.empty()) {
+    if (!default_name.empty()) {
+      suggestion = default_name;
+    } else if (url.is_valid()) {
+      // Some schemes (e.g. file) do not have a hostname. Even though it's
+      // not likely to reach here, let's hardcode the last fallback name.
+      // TODO(jungshik) : Decode a 'punycoded' IDN hostname. (bug 1264451)
+      if (url.host().empty())
+        suggestion = L"download";
+      else
+        suggestion = UTF8ToWide(url.host());
+    } else {
+      NOTREACHED();
+    }
+  }
+
+  file_util::ReplaceIllegalCharacters(&suggestion, '-');
+  return suggestion;
+}
+
+// Wide-string convenience overload: converts |content_disposition| to UTF-8
+// and defers to the narrow-string GetSuggestedFilename().
+std::wstring GetSuggestedFilename(const GURL& url,
+                                  const std::wstring& content_disposition,
+                                  const std::wstring& default_name) {
+  const std::string utf8_disposition = WideToUTF8(content_disposition);
+  return GetSuggestedFilename(url, utf8_disposition, default_name);
+}
+
+// Returns false if |port| is listed in kRestrictedPorts, true otherwise.
+bool IsPortAllowedByDefault(int port) {
+  const int* const first = kRestrictedPorts;
+  const int* const last = first + arraysize(kRestrictedPorts);
+  return std::find(first, last, port) == last;
+}
+
+// Returns true if |port| is listed in kAllowedFtpPorts; any other port is
+// subject to the default restrictions of IsPortAllowedByDefault().
+bool IsPortAllowedByFtp(int port) {
+  const int* const first = kAllowedFtpPorts;
+  const int* const last = first + arraysize(kAllowedFtpPorts);
+  if (std::find(first, last, port) != last)
+    return true;
+  // Port not explicitly allowed by FTP, so return the default restrictions.
+  return IsPortAllowedByDefault(port);
+}
+
+} // namespace net_util