diff options
Diffstat (limited to 'net/base/net_util.cc')
-rw-r--r-- | net/base/net_util.cc | 993 |
1 files changed, 993 insertions, 0 deletions
diff --git a/net/base/net_util.cc b/net/base/net_util.cc new file mode 100644 index 0000000..416252c --- /dev/null +++ b/net/base/net_util.cc @@ -0,0 +1,993 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include <algorithm> +#include <unicode/ucnv.h> +#include <unicode/uidna.h> +#include <unicode/ulocdata.h> +#include <unicode/uniset.h> +#include <unicode/uscript.h> +#include <unicode/uset.h> +#include <windows.h> +#include <wininet.h> + +#include "net/base/net_util.h" + +#include "base/basictypes.h" +#include "base/file_util.h" +#include "base/logging.h" +#include "base/path_service.h" +#include "base/scoped_ptr.h" +#include "base/string_tokenizer.h" +#include "base/string_util.h" +#include "base/time.h" +#include "base/string_escape.h" +#include "googleurl/src/gurl.h" +#include "googleurl/src/url_canon.h" +#include "googleurl/src/url_parse.h" +#include "net/base/escape.h" +#include "net/base/net_module.h" +#include "net/base/net_resources.h" +#include "net/base/base64.h" +#include "unicode/datefmt.h" + +namespace { + +// what we prepend to get a file URL +static const wchar_t kFileURLPrefix[] = L"file:///"; + +// The general list of blocked ports. Will be blocked unless a specific +// protocol overrides it. (Ex: ftp can use ports 20 and 21) +static const int kRestrictedPorts[] = { + 1, // tcpmux + 7, // echo + 9, // discard + 11, // systat + 13, // daytime + 15, // netstat + 17, // qotd + 19, // chargen + 20, // ftp data + 21, // ftp access + 22, // ssh + 23, // telnet + 25, // smtp + 37, // time + 42, // name + 43, // nicname + 53, // domain + 77, // priv-rjs + 79, // finger + 87, // ttylink + 95, // supdup + 101, // hostriame + 102, // iso-tsap + 103, // gppitnp + 104, // acr-nema + 109, // pop2 + 110, // pop3 + 111, // sunrpc + 113, // auth + 115, // sftp + 117, // uucp-path + 119, // nntp + 123, // NTP + 135, // loc-srv /epmap + 139, // netbios + 143, // imap2 + 179, // BGP + 389, // ldap + 465, // smtp+ssl + 512, // print / exec + 513, // login + 514, // shell + 515, // printer + 526, // tempo + 530, // courier + 531, // chat + 532, // netnews + 540, // uucp + 556, // remotefs + 563, // nntp+ssl + 587, // stmp? + 601, // ?? + 636, // ldap+ssl + 993, // ldap+ssl + 995, // pop3+ssl + 2049, // nfs + 4045, // lockd + 6000, // X11 +}; + +// FTP overrides the following restricted ports. +static const int kAllowedFtpPorts[] = { + 21, // ftp data + 22, // ssh +}; + +template<typename STR> +STR GetSpecificHeaderT(const STR& headers, const STR& name) { + // We want to grab the Value from the "Key: Value" pairs in the headers, + // which should look like this (no leading spaces, \n-separated) (we format + // them this way in url_request_inet.cc): + // HTTP/1.1 200 OK\n + // ETag: "6d0b8-947-24f35ec0"\n + // Content-Length: 2375\n + // Content-Type: text/html; charset=UTF-8\n + // Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n + if (headers.empty()) + return STR(); + + STR match; + match.push_back('\n'); + match.append(name); + match.push_back(':'); + + STR::const_iterator begin = + search(headers.begin(), headers.end(), match.begin(), match.end(), + CaseInsensitiveCompareASCII<STR::value_type>()); + + if (begin == headers.end()) + return STR(); + + begin += match.length(); + + STR::const_iterator end = find(begin, headers.end(), '\n'); + + STR ret; + TrimWhitespace(STR(begin, end), TRIM_ALL, &ret); + return ret; +} + +// TODO(jungshik): We have almost identical hex-decoding code else where. +// Consider refactoring and moving it somewhere(base?). Bug 1224311 +inline bool IsHexDigit(unsigned char c) { + return ('0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'); +} + +inline unsigned char HexToInt(unsigned char c) { + DCHECK(IsHexDigit(c)); + static unsigned char kOffset[4] = {0, 0x30u, 0x37u, 0x57u}; + return c - kOffset[c / 0x20]; +} + +// Similar to Base64Decode. Decodes a Q-encoded string to a sequence +// of bytes. If input is invalid, return false. +bool QPDecode(const std::string& input, std::string* output) { + std::string temp; + temp.reserve(input.size()); + std::string::const_iterator it = input.begin(); + while (it != input.end()) { + if (*it == '_') { + temp.push_back(' '); + } else if (*it == '=') { + if (input.end() - it < 3) { + return false; + } + if (IsHexDigit(static_cast<unsigned char>(*(it + 1))) && + IsHexDigit(static_cast<unsigned char>(*(it + 2)))) { + unsigned char ch = HexToInt(*(it + 1)) * 16 + HexToInt(*(it + 2)); + temp.push_back(static_cast<char>(ch)); + ++it; + ++it; + } else { + return false; + } + } else if (0x20 < *it && *it < 0x7F) { + // In a Q-encoded word, only printable ASCII characters + // represent themselves. Besides, space, '=', '_' and '?' are + // not allowed, but they're already filtered out. + DCHECK(*it != 0x3D && *it != 0x5F && *it != 0x3F); + temp.push_back(*it); + } else { + return false; + } + ++it; + } + output->swap(temp); + return true; +} + +enum RFC2047EncodingType {Q_ENCODING, B_ENCODING}; +bool DecodeBQEncoding(const std::string& part, RFC2047EncodingType enc_type, + const std::string& charset, std::string* output) { + std::string decoded; + if (enc_type == B_ENCODING) { + if (!Base64Decode(part, &decoded)) { + return false; + } + } else { + if (!QPDecode(part, &decoded)) { + return false; + } + } + + UErrorCode err = U_ZERO_ERROR; + UConverter* converter(ucnv_open(charset.c_str(), &err)); + if (U_FAILURE(err)) { + return false; + } + + // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8. + // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes + // in UTF-8. Therefore, the expansion ratio is 3 at most. + int length = static_cast<int>(decoded.length()); + char* buf = WriteInto(output, length * 3); + length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, length * 3, + decoded.data(), length, &err); + ucnv_close(converter); + if (U_FAILURE(err)) { + return false; + } + output->resize(length); + return true; +} + +bool DecodeWord(const std::string& encoded_word, + bool *is_rfc2047, + std::string* output) { + // TODO(jungshik) : Revisit this later. Do we want to pass through non-ASCII + // strings which can be mozibake? WinHTTP converts a raw 8bit string + // UTF-16 assuming it's in the OS default encoding. + if (!IsStringASCII(encoded_word)) { + // Try falling back to the NativeMB encoding if the raw input is not UTF-8. + if (IsStringUTF8(encoded_word.c_str())) { + *output = encoded_word; + } else { + *output = WideToUTF8(NativeMBToWide(encoded_word)); + } + *is_rfc2047 = false; + return true; + } + + // RFC 2047 : one of encoding methods supported by Firefox and relatively + // widely used by web servers. + // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'. + // We don't care about the length restriction (72 bytes) because + // many web servers generate encoded words longer than the limit. + std::string tmp; + *is_rfc2047 = true; + int part_index = 0; + std::string charset; + StringTokenizer t(encoded_word, "?"); + RFC2047EncodingType enc_type = Q_ENCODING; + while (*is_rfc2047 && t.GetNext()) { + std::string part = t.token(); + switch (part_index) { + case 0: + if (part != "=") { + *is_rfc2047 = false; + break; + } + ++part_index; + break; + case 1: + // Do we need charset validity check here? + charset = part; + ++part_index; + break; + case 2: + if (part.size() > 1 || + part.find_first_of("bBqQ") == std::string::npos) { + *is_rfc2047 = false; + break; + } + if (part[0] == 'b' || part[0] == 'B') { + enc_type = B_ENCODING; + } + ++part_index; + break; + case 3: + *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp); + if (!*is_rfc2047) { + // Last minute failure. Invalid B/Q encoding. Rather than + // passing it through, return now. + return false; + } + ++part_index; + break; + case 4: + if (part != "=") { + // Another last minute failure ! + // Likely to be a case of two encoded-words in a row or + // an encoded word followed by a non-encoded word. We can be + // generous, but it does not help much in terms of compatibility, + // I believe. Return immediately. + *is_rfc2047 = false; + return false; + } + ++part_index; + break; + default: + *is_rfc2047 = false; + return false; + } + } + + if (*is_rfc2047) { + if (*(encoded_word.end() - 1) == '=') { + output->swap(tmp); + return true; + } + // encoded_word ending prematurelly with '?' or extra '?' + *is_rfc2047 = false; + return false; + } + + // We're not handling 'especial' characters quoted with '\', but + // it should be Ok because we're not an email client but a + // web browser. + + // What IE6/7 does: %-escaped UTF-8. We could extend this to + // support a rudimentary form of RFC 2231 with charset label, but + // it'd gain us little in terms of compatibility. + tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES); + if (IsStringUTF8(tmp.c_str())) { + output->swap(tmp); + return true; + // We can try either the OS default charset or 'origin charset' here, + // As far as I can tell, IE does not support it. However, I've seen + // web servers emit %-escaped string in a legacy encoding (usually + // origin charset). + // TODO(jungshik) : Test IE further and consider adding a fallback here. + } + return false; +} + +bool DecodeParamValue(const std::string& input, std::string* output) { + std::string tmp; + // Tokenize with whitespace characters. + StringTokenizer t(input, " \t\n\r"); + t.set_options(StringTokenizer::RETURN_DELIMS); + bool is_previous_token_rfc2047 = true; + while (t.GetNext()) { + if (t.token_is_delim()) { + // If the previous non-delimeter token is not RFC2047-encoded, + // put in a space in its place. Otheriwse, skip over it. + if (!is_previous_token_rfc2047) { + tmp.push_back(' '); + } + continue; + } + // We don't support a single multibyte character split into + // adjacent encoded words. Some broken mail clients emit headers + // with that problem, but most web servers usually encode a filename + // in a single encoded-word. Firefox/Thunderbird do not support + // it, either. + std::string decoded; + if (!DecodeWord(t.token(), &is_previous_token_rfc2047, &decoded)) + return false; + tmp.append(decoded); + } + output->swap(tmp); + return true; +} + +// TODO(mpcomplete): This is a quick and dirty implementation for now. I'm +// sure this doesn't properly handle all (most?) cases. +template<typename STR> +STR GetHeaderParamValueT(const STR& header, const STR& param_name) { + // This assumes args are formatted exactly like "bla; arg1=value; arg2=value". + STR::const_iterator param_begin = + search(header.begin(), header.end(), param_name.begin(), param_name.end(), + CaseInsensitiveCompareASCII<STR::value_type>()); + + if (param_begin == header.end()) + return STR(); + param_begin += param_name.length(); + + STR whitespace; + whitespace.push_back(' '); + whitespace.push_back('\t'); + const STR::size_type equals_offset = + header.find_first_not_of(whitespace, param_begin - header.begin()); + if (equals_offset == STR::npos || header.at(equals_offset) != '=') + return STR(); + + param_begin = header.begin() + equals_offset + 1; + if (param_begin == header.end()) + return STR(); + + STR::const_iterator param_end; + if (*param_begin == '"') { + param_end = find(param_begin+1, header.end(), '"'); + if (param_end == header.end()) + return STR(); // poorly formatted param? + + ++param_begin; // skip past the quote. + } else { + param_end = find(param_begin+1, header.end(), ';'); + } + + return STR(param_begin, param_end); +} + +// Does some simple normalization of scripts so we can allow certain scripts +// to exist together. +// TODO(brettw) bug 880223: we should allow some other languages to be +// oombined such as Chinese and Latin. We will probably need a more +// complicated system of language pairs to have more fine-grained control. +UScriptCode NormalizeScript(UScriptCode code) { + switch (code) { + case USCRIPT_KATAKANA: + case USCRIPT_HIRAGANA: + case USCRIPT_KATAKANA_OR_HIRAGANA: + case USCRIPT_HANGUL: // This one is arguable. + return USCRIPT_HAN; + default: + return code; + } +} + +bool IsIDNComponentInSingleScript(const wchar_t* str, int str_len) { + UScriptCode first_script; + bool is_first = true; + + int i = 0; + while (i < str_len) { + unsigned code_point; + U16_NEXT(str, i, str_len, code_point); + + UErrorCode err = U_ZERO_ERROR; + UScriptCode cur_script = uscript_getScript(code_point, &err); + if (err != U_ZERO_ERROR) + return false; // Report mixed on error. + cur_script = NormalizeScript(cur_script); + + // TODO(brettw) We may have to check for USCRIPT_INHERENT as well. + if (is_first && cur_script != USCRIPT_COMMON) { + first_script = cur_script; + is_first = false; + } else { + if (cur_script != USCRIPT_COMMON && cur_script != first_script) + return false; + } + } + return true; +} + +// Check if the script of a language can be 'safely' mixed with +// Latin letters in the ASCII range. +bool IsCompatibleWithASCIILetters(const std::string& lang) { + // For now, just list Chinese, Japanese and Korean (positive list). + // An alternative is negative-listing (languages using Greek and + // Cyrillic letters), but it can be more dangerous. + return !lang.substr(0,2).compare("zh") || + !lang.substr(0,2).compare("ja") || + !lang.substr(0,2).compare("ko"); +} + +// Returns true if the given Unicode host component is safe to display to the +// user. +bool IsIDNComponentSafe(const wchar_t* str, + int str_len, + const std::wstring& languages) { + // Most common cases (non-IDN) do not reach here so that we don't + // need a fast return path. + // TODO(jungshik) : Check if there's any character inappropriate + // (although allowed) for domain names. + // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and + // http://www.unicode.org/reports/tr39/data/xidmodifications.txt + // For now, we borrow the list from Mozilla and tweaked it slightly. + // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because + // they're gonna be canonicalized to U+0020 and full stop before + // reaching here.) + // The original list is available at + // http://kb.mozillazine.org/Network.IDN.blacklist_chars and + // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703 + + UErrorCode status = U_ZERO_ERROR; +#ifdef U_WCHAR_IS_UTF16 + UnicodeSet dangerous_characters(UnicodeString( + L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338" + L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]" + L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]" + L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae" + L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014" + L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14" + L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]" + L"[\ufffa-\ufffd]]"), status); +#else + UnicodeSet dangerous_characters(UnicodeString( + "[[\\ \\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338" + "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]" + "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]" + "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae" + "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014" + "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14" + "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]" + "[\\ufffa-\\ufffd]]", -1, US_INV), status); +#endif + DCHECK(U_SUCCESS(status)); + UnicodeSet component_characters; + component_characters.addAll(UnicodeString(str, str_len)); + if (dangerous_characters.containsSome(component_characters)) + return false; + + // If the language list is empty, the result is completely determined + // by whether a component is a single script or not. This will block + // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are + // allowed with |languages| (while it blocks Chinese + Latin letters with + // an accent as should be the case), but we want to err on the safe side + // when |languages| is empty. + if (languages.empty()) + return IsIDNComponentInSingleScript(str, str_len); + + // |common_characters| is made up of ASCII numbers, hyphen, plus and + // underscore that are used across scripts and allowed in domain names. + // (sync'd with characters allowed in url_canon_host with square + // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. + UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), + status); + DCHECK(U_SUCCESS(status)); + // Subtract common characters because they're always allowed so that + // we just have to check if a language-specific set contains + // the remainder. + component_characters.removeAll(common_characters); + + USet *lang_set = uset_open(1, 0); // create an empty set + UnicodeSet ascii_letters(0x61, 0x7a); // [a-z] + bool safe = false; + std::string languages_list(WideToASCII(languages)); + StringTokenizer t(languages_list, ","); + while (t.GetNext()) { + std::string lang = t.token(); + status = U_ZERO_ERROR; + // TODO(jungshik) Cache exemplar sets for locales. + ULocaleData* uld = ulocdata_open(lang.c_str(), &status); + if (U_SUCCESS(status)) { + // Should we use auxiliary set, instead? + ulocdata_getExemplarSet(uld, lang_set, 0, ULOCDATA_ES_STANDARD, &status); + ulocdata_close(uld); + if (U_SUCCESS(status)) { + UnicodeSet* allowed_characters = + reinterpret_cast<UnicodeSet*>(lang_set); + // If |lang| is compatible with ASCII Latin letters, add them. + if (IsCompatibleWithASCIILetters(lang)) + allowed_characters->addAll(ascii_letters); + if (allowed_characters->containsAll(component_characters)) { + safe = true; + break; + } + } + } + } + uset_close(lang_set); + return safe; +} + +// Converts one component of a host (between dots) to IDN if safe. The result +// will be APPENDED to the given output string and will be the same as the +// input if it is not IDN or the IDN is unsafe to display. +void IDNToUnicodeOneComponent(const wchar_t* comp, + int comp_len, + const std::wstring& languages, + std::wstring* out) { + DCHECK(comp_len >= 0); + if (comp_len == 0) + return; + + // Expand the output string to make room for a possibly longer string + // (we'll expand if it's still not big enough below). + int extra_space = 64; + size_t host_begin_in_output = out->size(); + + // Just copy the input if it can't be an IDN component. + if (comp_len < 4 || wcsncmp(comp, L"xn--", 4)) { + out->resize(host_begin_in_output + comp_len); + for (int i = 0; i < comp_len; i++) + (*out)[host_begin_in_output + i] = comp[i]; + return; + } + + while (true) { + out->resize(out->size() + extra_space); + UErrorCode status = U_ZERO_ERROR; + int output_chars = uidna_IDNToUnicode( + comp, comp_len, &(*out)[host_begin_in_output], extra_space, + UIDNA_DEFAULT, NULL, &status); + if (status == U_ZERO_ERROR) { + // Converted successfully. + out->resize(host_begin_in_output + output_chars); + if (!IsIDNComponentSafe(&out->data()[host_begin_in_output], + output_chars, + languages)) + break; // The error handling below will undo the IDN. + return; + } + if (status != U_BUFFER_OVERFLOW_ERROR) + break; + + // Need to loop again with a bigger buffer. It looks like ICU will + // return the required size of the buffer, but that's not documented, + // so we'll just grow by 2x. This should be rare and is not on a + // critical path. + extra_space *= 2; + } + + // We get here on error, in which case we replace anything that was added + // with the literal input. + out->resize(host_begin_in_output + comp_len); + for (int i = 0; i < comp_len; i++) + (*out)[host_begin_in_output + i] = comp[i]; +} + +// Convert a FILETIME to a localized string. |filetime| may be NULL. +// TODO(tc): Remove this once bug 1164516 is fixed. +std::wstring LocalizedDateTime(const FILETIME* filetime) { + if (!filetime) + return std::wstring(); + + Time time = Time::FromFileTime(*filetime); + scoped_ptr<DateFormat> formatter(DateFormat::createDateTimeInstance( + DateFormat::kShort)); + UnicodeString date_string; + formatter->format(static_cast<UDate>(time.ToDoubleT() * 1000), date_string); + + std::wstring formatted; + int capacity = date_string.length() + 1; + UErrorCode error = U_ZERO_ERROR; + date_string.extract(static_cast<UChar*>(WriteInto(&formatted, capacity)), + capacity, error); + return formatted; +} + +} // namespace + +namespace net_util { + +GURL FilePathToFileURL(const std::wstring& file_path) { + // Produce a URL like "file:///C:/foo" for a regular file, or + // "file://///server/path" for UNC. The URL canonicalizer will fix up the + // latter case to be the canonical UNC form: "file://server/path" + std::wstring url_str(kFileURLPrefix); + url_str.append(file_path); + + // Now do replacement of some characters. Since we assume the input is a + // literal filename, anything the URL parser might consider special should + // be escaped here. + + // must be the first substitution since others will introduce percents as the + // escape character + ReplaceSubstringsAfterOffset(&url_str, 0, L"%", L"%25"); + + // semicolon is supposed to be some kind of separator according to RFC 2396 + ReplaceSubstringsAfterOffset(&url_str, 0, L";", L"%3B"); + + ReplaceSubstringsAfterOffset(&url_str, 0, L"#", L"%23"); + + return GURL(url_str); +} + +bool FileURLToFilePath(const GURL& url, std::wstring* file_path) { + file_path->clear(); + + if (!url.is_valid()) + return false; + + std::string path; + std::string host = url.host(); + if (host.empty()) { + // URL contains no host, the path is the filename. In this case, the path + // will probably be preceeded with a slash, as in "/C:/foo.txt", so we + // trim out that here. + path = url.path(); + size_t first_non_slash = path.find_first_not_of("/\\"); + if (first_non_slash != std::string::npos && first_non_slash > 0) + path.erase(0, first_non_slash); + } else { + // URL contains a host: this means it's UNC. We keep the preceeding slash + // on the path. + path = "\\\\"; + path.append(host); + path.append(url.path()); + } + + if (path.empty()) + return false; + std::replace(path.begin(), path.end(), '/', '\\'); + + // GURL stores strings as percent-encoded UTF-8, this will undo if possible. + path = UnescapeURLComponent(path, + UnescapeRule::SPACES | UnescapeRule::PERCENTS); + + if (!IsStringUTF8(path.c_str())) { + // Not UTF-8, assume encoding is native codepage and we're done. We know we + // are giving the conversion function a nonempty string, and it may fail if + // the given string is not in the current encoding and give us an empty + // string back. We detect this and report failure. + *file_path = NativeMBToWide(path); + return !file_path->empty(); + } + file_path->assign(UTF8ToWide(path)); + + // Now we have an unescaped filename, but are still not sure about its + // encoding. For example, each character could be part of a UTF-8 string. + if (file_path->empty() || !IsString8Bit(*file_path)) { + // assume our 16-bit encoding is correct if it won't fit into an 8-bit + // string + return true; + } + + // Convert our narrow string into the native wide path. + std::string narrow; + if (!WideToLatin1(*file_path, &narrow)) { + NOTREACHED() << "Should have filtered out non-8-bit strings above."; + return false; + } + if (IsStringUTF8(narrow.c_str())) { + // Our string actually looks like it could be UTF-8, convert to 8-bit + // UTF-8 and then to the corresponding wide string. + *file_path = UTF8ToWide(narrow); + } else { + // Our wide string contains only 8-bit characters and it's not UTF-8, so + // we assume it's in the native codepage. + *file_path = NativeMBToWide(narrow); + } + + // Fail if 8-bit -> wide conversion failed and gave us an empty string back + // (we already filtered out empty strings above). + return !file_path->empty(); +} + +std::wstring GetSpecificHeader(const std::wstring& headers, + const std::wstring& name) { + return GetSpecificHeaderT(headers, name); +} + +std::string GetSpecificHeader(const std::string& headers, + const std::string& name) { + return GetSpecificHeaderT(headers, name); +} + +std::wstring GetFileNameFromCD(const std::string& header) { + std::string param_value = GetHeaderParamValue(header, "filename"); + if (param_value.empty()) { + // Some servers use 'name' parameter. + param_value = GetHeaderParamValue(header, "name"); + } + if (param_value.empty()) + return std::wstring(); + std::string decoded; + if (DecodeParamValue(param_value, &decoded)) + return UTF8ToWide(decoded); + return std::wstring(); +} + +std::wstring GetHeaderParamValue(const std::wstring& field, + const std::wstring& param_name) { + return GetHeaderParamValueT(field, param_name); +} + +std::string GetHeaderParamValue(const std::string& field, + const std::string& param_name) { + return GetHeaderParamValueT(field, param_name); +} + +// TODO(brettw) bug 734373: check the scripts for each host component and +// don't un-IDN-ize if there is more than one. Alternatively, only IDN for +// scripts that the user has installed. For now, just put the entire +// path through IDN. Maybe this feature can be implemented in ICU itself? +// +// We may want to skip this step in the case of file URLs to allow unicode +// UNC hostnames regardless of encodings. +void IDNToUnicode(const char* host, + int host_len, + const std::wstring& languages, + std::wstring* out) { + // Convert the ASCII input to a wide string for ICU. + std::wstring wide_input; + wide_input.reserve(host_len); + for (int i = 0; i < host_len; i++) + wide_input.push_back(host[i]); + + // Do each component of the host separately, since we enforce script matching + // on a per-component basis. + size_t cur_begin = 0; // Beginning of the current component (inclusive). + while (cur_begin < wide_input.size()) { + // Find the next dot or the end of the string. + size_t next_dot = wide_input.find_first_of('.', cur_begin); + if (next_dot == std::wstring::npos) + next_dot = wide_input.size(); // For getting the last component. + + if (next_dot > cur_begin) { + // Add the substring that we just found. + IDNToUnicodeOneComponent(&wide_input[cur_begin], + static_cast<int>(next_dot - cur_begin), + languages, + out); + } + + // Need to add the dot we just found (if we found one). This needs to be + // done before we break out below in case the URL ends in a dot. + if (next_dot < wide_input.size()) + out->push_back('.'); + else + break; // No more components left. + + cur_begin = next_dot + 1; + } +} + +template <typename str> +std::string CanonicalizeHost(const str& host, bool* is_ip_address) { + // Try to canonicalize the host. + const url_parse::Component raw_host_component(0, + static_cast<int>(host.length())); + std::string canon_host; + url_canon::StdStringCanonOutput canon_host_output(&canon_host); + url_parse::Component canon_host_component; + if (!url_canon::CanonicalizeHost(host.c_str(), raw_host_component, + &canon_host_output, &canon_host_component)) { + if (is_ip_address) + *is_ip_address = false; + return std::string(); + } + canon_host_output.Complete(); + + if (is_ip_address) { + // See if the host is an IP address. + url_canon::RawCanonOutputT<char, 128> ignored_output; + url_parse::Component ignored_component; + *is_ip_address = url_canon::CanonicalizeIPAddress(canon_host.c_str(), + canon_host_component, + &ignored_output, + &ignored_component); + } + + // Return the host as a string, stripping any unnecessary bits off the ends. + if ((canon_host_component.begin == 0) && + (canon_host_component.len == canon_host.length())) + return canon_host; + return canon_host.substr(canon_host_component.begin, + canon_host_component.len); +} + +// Forcibly instantiate narrow and wide versions of this function so we don't +// need to put the function definition in the header. +template std::string CanonicalizeHost<std::string>(const std::string& host, + bool* is_ip_address); +template std::string CanonicalizeHost<std::wstring>(const std::wstring& host, + bool* is_ip_address); + +std::string GetDirectoryListingHeader(const std::string& title) { + std::string result = NetModule::GetResource(IDR_DIR_HEADER_HTML); + if (result.empty()) { + NOTREACHED() << "expected resource not found"; + } + + result.append("<script>start("); + string_escape::JavascriptDoubleQuote(title, true, &result); + result.append(");</script>\n"); + + return result; +} + +std::string GetDirectoryListingEntry(const std::string& name, + DWORD attrib, + int64 size, + const FILETIME* modified) { + std::string result; + result.append("<script>addRow("); + string_escape::JavascriptDoubleQuote(name, true, &result); + result.append(","); + string_escape::JavascriptDoubleQuote( + EscapePath(name), true, &result); + if (attrib & FILE_ATTRIBUTE_DIRECTORY) { + result.append(",1,"); + } else { + result.append(",0,"); + } + + string_escape::JavascriptDoubleQuote( + FormatBytes(size, GetByteDisplayUnits(size), true), true, &result); + + result.append(","); + + string_escape::JavascriptDoubleQuote( + LocalizedDateTime(modified), true, &result); + + result.append(");</script>\n"); + + return result; +} + +std::wstring StripWWW(const std::wstring& text) { + const std::wstring www(L"www."); + return (text.compare(0, www.length(), www) == 0) ? + text.substr(www.length()) : text; +} + +std::wstring GetSuggestedFilename(const GURL& url, + const std::string& content_disposition, + const std::wstring& default_name) { + std::wstring filename = GetFileNameFromCD(content_disposition); + if (!filename.empty()) { + // Remove any path information the server may have sent, take the name + // only. + filename = file_util::GetFilenameFromPath(filename); + // Next, remove "." from the beginning and end of the file name to avoid + // tricks with hidden files, "..", and "." + TrimString(filename, L".", &filename); + } + if (filename.empty()) { + if (url.is_valid()) + filename = UnescapeAndDecodeUTF8URLComponent( + url.ExtractFileName(), UnescapeRule::SPACES | UnescapeRule::PERCENTS); + } + + // Trim '.' once more. + TrimString(filename, L".", &filename); + // If there's no filename or it gets trimed to be empty, use + // the URL hostname or default_name + if (filename.empty()) { + if (!default_name.empty()) + filename = default_name; + else if (url.is_valid()) { + // Some schemes (e.g. file) do not have a hostname. Even though it's + // not likely to reach here, let's hardcode the last fallback name. + // TODO(jungshik) : Decode a 'punycoded' IDN hostname. (bug 1264451) + filename = url.host().empty() ? L"download" : UTF8ToWide(url.host()); + } else + NOTREACHED(); + } + + file_util::ReplaceIllegalCharacters(&filename, '-'); + return filename; +} + +std::wstring GetSuggestedFilename(const GURL& url, + const std::wstring& content_disposition, + const std::wstring& default_name) { + return GetSuggestedFilename( + url, WideToUTF8(content_disposition), default_name); +} + +bool IsPortAllowedByDefault(int port) { + int array_size = arraysize(kRestrictedPorts); + for (int i = 0; i < array_size; i++) { + if (kRestrictedPorts[i] == port) { + return false; + } + } + return true; +} + +bool IsPortAllowedByFtp(int port) { + int array_size = arraysize(kAllowedFtpPorts); + for (int i = 0; i < array_size; i++) { + if (kAllowedFtpPorts[i] == port) { + return true; + } + } + // Port not explicitly allowed by FTP, so return the default restrictions. + return IsPortAllowedByDefault(port); +} + +} // namespace net_util |