diff options
Diffstat (limited to 'net')
-rw-r--r-- | net/base/net_util.cc | 8 | ||||
-rw-r--r-- | net/base/net_util.h | 116 | ||||
-rw-r--r-- | net/base/net_util_icu.cc | 792 | ||||
-rw-r--r-- | net/base/net_util_icu_unittest.cc | 1032 |
4 files changed, 1939 insertions, 9 deletions
diff --git a/net/base/net_util.cc b/net/base/net_util.cc index b27246b..d4b525b 100644 --- a/net/base/net_util.cc +++ b/net/base/net_util.cc @@ -432,6 +432,14 @@ std::string GetHostOrSpecFromURL(const GURL& url) { return url.has_host() ? TrimEndingDot(url.host()) : url.spec(); } +bool CanStripTrailingSlash(const GURL& url) { + // Omit the path only for standard, non-file URLs with nothing but "/" after + // the hostname. + return url.IsStandard() && !url.SchemeIsFile() && + !url.SchemeIsFileSystem() && !url.has_query() && !url.has_ref() + && url.path() == "/"; +} + GURL SimplifyUrlForRequest(const GURL& url) { DCHECK(url.is_valid()); GURL::Replacements replacements; diff --git a/net/base/net_util.h b/net/base/net_util.h index 597e52e..628abe2 100644 --- a/net/base/net_util.h +++ b/net/base/net_util.h @@ -48,11 +48,31 @@ class AddressList; // Keep this in sync. typedef std::vector<unsigned char> IPAddressNumber; +// Used by FormatUrl to specify handling of certain parts of the url. +typedef uint32_t FormatUrlType; +typedef uint32_t FormatUrlTypes; + #if defined(OS_WIN) // Bluetooth address size. Windows Bluetooth is supported via winsock. static const size_t kBluetoothAddressSize = 6; #endif +// Nothing is omitted. +NET_EXPORT extern const FormatUrlType kFormatUrlOmitNothing; + +// If set, any username and password are removed. +NET_EXPORT extern const FormatUrlType kFormatUrlOmitUsernamePassword; + +// If the scheme is 'http://', it's removed. +NET_EXPORT extern const FormatUrlType kFormatUrlOmitHTTP; + +// Omits the path if it is just a slash and there is no query or ref. This is +// meaningful for non-file "standard" URLs. +NET_EXPORT extern const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname; + +// Convenience for omitting all unnecessary types. +NET_EXPORT extern const FormatUrlType kFormatUrlOmitAll; + // Splits an input of the form <host>[":"<port>] into its consitituent parts. // Saves the result into |*host| and |*port|. 
If the input did not have // the optional port, sets |*port| to -1. @@ -127,6 +147,24 @@ NET_EXPORT_PRIVATE void GetIdentityFromURL(const GURL& url, // Returns either the host from |url|, or, if the host is empty, the full spec. NET_EXPORT std::string GetHostOrSpecFromURL(const GURL& url); +// Converts the given host name to unicode characters. This can be called for +// any host name, if the input is not IDN or is invalid in some way, we'll just +// return the ASCII source so it is still usable. +// +// The input should be the canonicalized ASCII host name from GURL. This +// function does NOT accept UTF-8! +// +// |languages| is a comma separated list of ISO 639 language codes. It +// is used to determine whether a hostname is 'comprehensible' to a user +// who understands languages listed. |host| will be converted to a +// human-readable form (Unicode) ONLY when each component of |host| is +// regarded as 'comprehensible'. Script-mixing is not allowed except that +// Latin letters in the ASCII range can be mixed with a limited set of +// script-language pairs (currently Han, Kana and Hangul for zh,ja and ko). +// When |languages| is empty, even that mixing is not allowed. +NET_EXPORT base::string16 IDNToUnicode(const std::string& host, + const std::string& languages); + // Canonicalizes |host| and returns it. Also fills |host_info| with // IP address information. |host_info| must not be NULL. NET_EXPORT std::string CanonicalizeHost(const std::string& host, @@ -176,6 +214,84 @@ NET_EXPORT base::string16 StripWWWFromHost(const GURL& url); // Set socket to non-blocking mode NET_EXPORT int SetNonBlocking(int fd); +// Formats the host in |url| and appends it to |output|. The host formatter +// takes the same accept languages component as ElideURL(). +NET_EXPORT void AppendFormattedHost(const GURL& url, + const std::string& languages, + base::string16* output); + +// Creates a string representation of |url|. 
The IDN host name may be in Unicode +// if |languages| allows the Unicode representation. |format_types| is a bitmask +// of FormatUrlTypes, see it for details. |unescape_rules| defines how to clean +// the URL for human readability. You will generally want |UnescapeRule::SPACES| +// for display to the user if you can handle spaces, or |UnescapeRule::NORMAL| +// if not. If the path part and the query part seem to be encoded in %-encoded +// UTF-8, decodes %-encoding and UTF-8. +// +// The last three parameters may be NULL. +// +// |new_parsed| will be set to the parsing parameters of the resultant URL. +// +// |prefix_end| will be the length before the hostname of the resultant URL. +// +// |offset[s]_for_adjustment| specifies one or more offsets into the original +// URL, representing insertion or selection points between characters: if the +// input is "http://foo.com/", offset 0 is before the entire URL, offset 7 is +// between the scheme and the host, and offset 15 is after the end of the URL. +// Valid input offsets range from 0 to the length of the input URL string. On +// exit, each offset will have been modified to reflect any changes made to the +// output string. For example, if |url| is "http://a:b@c.com/", +// username/password are omitted, and an offset is 12 (pointing between 'c' +// and '.'), then on return the output string will be "http://c.com/" and the +// offset will be 8. If an offset cannot be successfully adjusted (e.g. because +// it points into the middle of a component that was entirely removed or into +// the middle of an encoding sequence), it will be set to base::string16::npos. +// For consistency, if an input offset points between the scheme and the +// username/password, and both are removed, on output this offset will be 0 +// rather than npos; this means that offsets at the starts and ends of removed +// components are always transformed the same way regardless of what other +// components are adjacent. 
+NET_EXPORT base::string16 FormatUrl(const GURL& url, + const std::string& languages, + FormatUrlTypes format_types, + UnescapeRule::Type unescape_rules, + url::Parsed* new_parsed, + size_t* prefix_end, + size_t* offset_for_adjustment); +NET_EXPORT base::string16 FormatUrlWithOffsets( + const GURL& url, + const std::string& languages, + FormatUrlTypes format_types, + UnescapeRule::Type unescape_rules, + url::Parsed* new_parsed, + size_t* prefix_end, + std::vector<size_t>* offsets_for_adjustment); +// This function is like those above except it takes |adjustments| rather +// than |offset[s]_for_adjustment|. |adjustments| will be set to reflect all +// the transformations that happened to |url| to convert it into the returned +// value. +NET_EXPORT base::string16 FormatUrlWithAdjustments( + const GURL& url, + const std::string& languages, + FormatUrlTypes format_types, + UnescapeRule::Type unescape_rules, + url::Parsed* new_parsed, + size_t* prefix_end, + base::OffsetAdjuster::Adjustments* adjustments); + +// This is a convenience function for FormatUrl() with +// format_types = kFormatUrlOmitAll and unescape = SPACES. This is the typical +// set of flags for "URLs to display to the user". You should be cautious about +// using this for URLs which will be parsed or sent to other applications. +inline base::string16 FormatUrl(const GURL& url, const std::string& languages) { + return FormatUrl(url, languages, kFormatUrlOmitAll, UnescapeRule::SPACES, + NULL, NULL, NULL); +} + +// Returns whether FormatUrl() would strip a trailing slash from |url|, given a +// format flag including kFormatUrlOmitTrailingSlashOnBareHostname. +NET_EXPORT bool CanStripTrailingSlash(const GURL& url); + // Strip the portions of |url| that aren't core to the network request. 
// - user name / password // - reference section diff --git a/net/base/net_util_icu.cc b/net/base/net_util_icu.cc index c174c92..259baba 100644 --- a/net/base/net_util_icu.cc +++ b/net/base/net_util_icu.cc @@ -4,19 +4,594 @@ #include "net/base/net_util.h" +#include <map> +#include <vector> + #include "base/i18n/time_formatting.h" #include "base/json/string_escape.h" +#include "base/lazy_instance.h" +#include "base/logging.h" +#include "base/memory/singleton.h" +#include "base/stl_util.h" +#include "base/strings/string_tokenizer.h" #include "base/strings/string_util.h" +#include "base/strings/utf_offset_string_conversions.h" #include "base/strings/utf_string_conversions.h" -#include "net/base/escape.h" +#include "base/time/time.h" +#include "url/gurl.h" +#include "third_party/icu/source/common/unicode/uidna.h" +#include "third_party/icu/source/common/unicode/uniset.h" +#include "third_party/icu/source/common/unicode/uscript.h" +#include "third_party/icu/source/common/unicode/uset.h" +#include "third_party/icu/source/i18n/unicode/datefmt.h" +#include "third_party/icu/source/i18n/unicode/regex.h" +#include "third_party/icu/source/i18n/unicode/ulocdata.h" + +using base::Time; namespace net { +namespace { + +typedef std::vector<size_t> Offsets; + +// Does some simple normalization of scripts so we can allow certain scripts +// to exist together. +// TODO(brettw) bug 880223: we should allow some other languages to be +// combined such as Chinese and Latin. We will probably need a more +// complicated system of language pairs to have more fine-grained control. +UScriptCode NormalizeScript(UScriptCode code) { + switch (code) { + case USCRIPT_KATAKANA: + case USCRIPT_HIRAGANA: + case USCRIPT_KATAKANA_OR_HIRAGANA: + case USCRIPT_HANGUL: // This one is arguable. 
+ return USCRIPT_HAN; + default: + return code; + } +} + +bool IsIDNComponentInSingleScript(const base::char16* str, int str_len) { + UScriptCode first_script = USCRIPT_INVALID_CODE; + bool is_first = true; + + int i = 0; + while (i < str_len) { + unsigned code_point; + U16_NEXT(str, i, str_len, code_point); + + UErrorCode err = U_ZERO_ERROR; + UScriptCode cur_script = uscript_getScript(code_point, &err); + if (err != U_ZERO_ERROR) + return false; // Report mixed on error. + cur_script = NormalizeScript(cur_script); + + // TODO(brettw) We may have to check for USCRIPT_INHERENT as well. + if (is_first && cur_script != USCRIPT_COMMON) { + first_script = cur_script; + is_first = false; + } else { + if (cur_script != USCRIPT_COMMON && cur_script != first_script) + return false; + } + } + return true; +} + +// Check if the script of a language can be 'safely' mixed with +// Latin letters in the ASCII range. +bool IsCompatibleWithASCIILetters(const std::string& lang) { + // For now, just list Chinese, Japanese and Korean (positive list). + // An alternative is negative-listing (languages using Greek and + // Cyrillic letters), but it can be more dangerous. 
+ return !lang.substr(0, 2).compare("zh") || + !lang.substr(0, 2).compare("ja") || + !lang.substr(0, 2).compare("ko"); +} + +typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap; + +class LangToExemplarSet { + public: + static LangToExemplarSet* GetInstance() { + return Singleton<LangToExemplarSet>::get(); + } + + private: + LangToExemplarSetMap map; + LangToExemplarSet() { } + ~LangToExemplarSet() { + STLDeleteContainerPairSecondPointers(map.begin(), map.end()); + } + + friend class Singleton<LangToExemplarSet>; + friend struct DefaultSingletonTraits<LangToExemplarSet>; + friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**); + friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*); + + DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet); +}; + +bool GetExemplarSetForLang(const std::string& lang, + icu::UnicodeSet** lang_set) { + const LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map; + LangToExemplarSetMap::const_iterator pos = map.find(lang); + if (pos != map.end()) { + *lang_set = pos->second; + return true; + } + return false; +} + +void SetExemplarSetForLang(const std::string& lang, + icu::UnicodeSet* lang_set) { + LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map; + map.insert(std::make_pair(lang, lang_set)); +} + +static base::LazyInstance<base::Lock>::Leaky + g_lang_set_lock = LAZY_INSTANCE_INITIALIZER; + +// Returns true if all the characters in component_characters are used by +// the language |lang|. +bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters, + const std::string& lang) { + CR_DEFINE_STATIC_LOCAL( + const icu::UnicodeSet, kASCIILetters, ('a', 'z')); + icu::UnicodeSet* lang_set = nullptr; + // We're called from both the UI thread and the history thread. 
+ { + base::AutoLock lock(g_lang_set_lock.Get()); + if (!GetExemplarSetForLang(lang, &lang_set)) { + UErrorCode status = U_ZERO_ERROR; + ULocaleData* uld = ulocdata_open(lang.c_str(), &status); + // TODO(jungshik) Turn this check on when the ICU data file is + // rebuilt with the minimal subset of locale data for languages + // to which Chrome is not localized but which we offer in the list + // of languages selectable for Accept-Languages. With the rebuilt ICU + // data, ulocdata_open never should fall back to the default locale. + // (issue 2078) + // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING); + if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) { + lang_set = reinterpret_cast<icu::UnicodeSet*>(ulocdata_getExemplarSet( + uld, nullptr, 0, ULOCDATA_ES_STANDARD, &status)); + // On success, if |lang| is compatible with ASCII Latin letters, add + // them. + if (lang_set && IsCompatibleWithASCIILetters(lang)) + lang_set->addAll(kASCIILetters); + } + + if (!lang_set) + lang_set = new icu::UnicodeSet(1, 0); + + lang_set->freeze(); + SetExemplarSetForLang(lang, lang_set); + ulocdata_close(uld); + } + } + return !lang_set->isEmpty() && lang_set->containsAll(component_characters); +} + +// Returns true if the given Unicode host component is safe to display to the +// user. +bool IsIDNComponentSafe(const base::char16* str, + int str_len, + const std::string& languages) { + // Most common cases (non-IDN) do not reach here so that we don't + // need a fast return path. + // TODO(jungshik) : Check if there's any character inappropriate + // (although allowed) for domain names. + // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and + // http://www.unicode.org/reports/tr39/data/xidmodifications.txt + // For now, we borrow the list from Mozilla and tweaked it slightly. + // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because + // they're gonna be canonicalized to U+0020 and full stop before + // reaching here.) 
+ // The original list is available at + // http://kb.mozillazine.org/Network.IDN.blacklist_chars and + // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703 + + UErrorCode status = U_ZERO_ERROR; +#ifdef U_WCHAR_IS_UTF16 + icu::UnicodeSet dangerous_characters( + icu::UnicodeString( + L"[[\\ \u00ad\u00bc\u00bd\u01c3\u0337\u0338" + L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]" + L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]" + L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae" + L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014" + L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14" + L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]" + L"[\ufffa-\ufffd]\U0001f50f\U0001f510\U0001f512\U0001f513]"), + status); + DCHECK(U_SUCCESS(status)); + icu::RegexMatcher dangerous_patterns(icu::UnicodeString( + // Lone katakana no, so, or n + L"[^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]" + // Repeating Japanese accent characters + L"|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c]"), + 0, status); +#else + icu::UnicodeSet dangerous_characters(icu::UnicodeString( + "[[\\u0020\\u00ad\\u00bc\\u00bd\\u01c3\\u0337\\u0338" + "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]" + "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]" + "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae" + "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014" + "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14" + "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]" + "[\\ufffa-\\ufffd]\\U0001f50f\\U0001f510\\U0001f512\\U0001f513]", -1, + US_INV), status); + DCHECK(U_SUCCESS(status)); + icu::RegexMatcher dangerous_patterns(icu::UnicodeString( + // Lone katakana no, so, or n + "[^\\p{Katakana}][\\u30ce\\u30f3\\u30bd][^\\p{Katakana}]" + // Repeating Japanese accent characters + "|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c]"), + 0, status); 
+#endif + DCHECK(U_SUCCESS(status)); + icu::UnicodeSet component_characters; + icu::UnicodeString component_string(str, str_len); + component_characters.addAll(component_string); + if (dangerous_characters.containsSome(component_characters)) + return false; + + DCHECK(U_SUCCESS(status)); + dangerous_patterns.reset(component_string); + if (dangerous_patterns.find()) + return false; + + // If the language list is empty, the result is completely determined + // by whether a component is a single script or not. This will block + // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are + // allowed with |languages| (while it blocks Chinese + Latin letters with + // an accent as should be the case), but we want to err on the safe side + // when |languages| is empty. + if (languages.empty()) + return IsIDNComponentInSingleScript(str, str_len); + + // |common_characters| is made up of ASCII numbers, hyphen, plus and + // underscore that are used across scripts and allowed in domain names. + // (sync'd with characters allowed in url_canon_host with square + // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc. + icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"), + status); + DCHECK(U_SUCCESS(status)); + // Subtract common characters because they're always allowed so that + // we just have to check if a language-specific set contains + // the remainder. + component_characters.removeAll(common_characters); + + base::StringTokenizer t(languages, ","); + while (t.GetNext()) { + if (IsComponentCoveredByLang(component_characters, t.token())) + return true; + } + return false; +} + +// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to +// a UTS46/IDNA 2008 handling object opened with uidna_openUTS46(). +// +// We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with +// the backward compatibility in mind. What it does: +// +// 1. Use the up-to-date Unicode data. +// 2. 
Define a case folding/mapping with the up-to-date Unicode data as +// in IDNA 2003. +// 3. Use transitional mechanism for 4 deviation characters (sharp-s, +// final sigma, ZWJ and ZWNJ) for now. +// 4. Continue to allow symbols and punctuations. +// 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDi rules. +// 6. Do not apply STD3 rules. +// 7. Do not allow unassigned code points. +// +// It also closely matches what IE 10 does except for the BiDi check ( +// http://goo.gl/3XBhqw ). +// See http://unicode.org/reports/tr46/ and references therein +// for more details. +struct UIDNAWrapper { + UIDNAWrapper() { + UErrorCode err = U_ZERO_ERROR; + // TODO(jungshik): Change options as different parties (browsers, + // registrars, search engines) converge toward a consensus. + value = uidna_openUTS46(UIDNA_CHECK_BIDI, &err); + if (U_FAILURE(err)) + value = NULL; + } + + UIDNA* value; +}; + +static base::LazyInstance<UIDNAWrapper>::Leaky + g_uidna = LAZY_INSTANCE_INITIALIZER; + +// Converts one component of a host (between dots) to IDN if safe. The result +// will be APPENDED to the given output string and will be the same as the input +// if it is not IDN or the IDN is unsafe to display. Returns whether any +// conversion was performed. +bool IDNToUnicodeOneComponent(const base::char16* comp, + size_t comp_len, + const std::string& languages, + base::string16* out) { + DCHECK(out); + if (comp_len == 0) + return false; + + // Only transform if the input can be an IDN component. 
+ static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'}; + if ((comp_len > arraysize(kIdnPrefix)) && + !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(base::char16))) { + UIDNA* uidna = g_uidna.Get().value; + DCHECK(uidna != NULL); + size_t original_length = out->length(); + int output_length = 64; + UIDNAInfo info = UIDNA_INFO_INITIALIZER; + UErrorCode status; + do { + out->resize(original_length + output_length); + status = U_ZERO_ERROR; + // This returns the actual length required. If this is more than 64 + // code units, |status| will be U_BUFFER_OVERFLOW_ERROR and we'll try + // the conversion again, but with a sufficiently large buffer. + output_length = uidna_labelToUnicode( + uidna, comp, static_cast<int32_t>(comp_len), &(*out)[original_length], + output_length, &info, &status); + } while ((status == U_BUFFER_OVERFLOW_ERROR && info.errors == 0)); + + if (U_SUCCESS(status) && info.errors == 0) { + // Converted successfully. Ensure that the converted component + // can be safely displayed to the user. + out->resize(original_length + output_length); + if (IsIDNComponentSafe(out->data() + original_length, output_length, + languages)) + return true; + } + + // Something went wrong. Revert to original string. + out->resize(original_length); + } + + // We get here with no IDN or on error, in which case we just append the + // literal input. + out->append(comp, comp_len); + return false; +} + +// TODO(brettw) bug 734373: check the scripts for each host component and +// don't un-IDN-ize if there is more than one. Alternatively, only IDN for +// scripts that the user has installed. For now, just put the entire +// path through IDN. Maybe this feature can be implemented in ICU itself? +// +// We may want to skip this step in the case of file URLs to allow unicode +// UNC hostnames regardless of encodings. 
+base::string16 IDNToUnicodeWithAdjustments( + const std::string& host, + const std::string& languages, + base::OffsetAdjuster::Adjustments* adjustments) { + if (adjustments) + adjustments->clear(); + // Convert the ASCII input to a base::string16 for ICU. + base::string16 input16; + input16.reserve(host.length()); + input16.insert(input16.end(), host.begin(), host.end()); + + // Do each component of the host separately, since we enforce script matching + // on a per-component basis. + base::string16 out16; + { + for (size_t component_start = 0, component_end; + component_start < input16.length(); + component_start = component_end + 1) { + // Find the end of the component. + component_end = input16.find('.', component_start); + if (component_end == base::string16::npos) + component_end = input16.length(); // For getting the last component. + size_t component_length = component_end - component_start; + size_t new_component_start = out16.length(); + bool converted_idn = false; + if (component_end > component_start) { + // Add the substring that we just found. + converted_idn = IDNToUnicodeOneComponent( + input16.data() + component_start, component_length, languages, + &out16); + } + size_t new_component_length = out16.length() - new_component_start; + + if (converted_idn && adjustments) { + adjustments->push_back(base::OffsetAdjuster::Adjustment( + component_start, component_length, new_component_length)); + } + + // Need to add the dot we just found (if we found one). + if (component_end < input16.length()) + out16.push_back('.'); + } + } + return out16; +} + +// If |component| is valid, its begin is incremented by |delta|. +void AdjustComponent(int delta, url::Component* component) { + if (!component->is_valid()) + return; + + DCHECK(delta >= 0 || component->begin >= -delta); + component->begin += delta; +} + +// Adjusts all the components of |parsed| by |delta|, except for the scheme. 
+void AdjustAllComponentsButScheme(int delta, url::Parsed* parsed) { + AdjustComponent(delta, &(parsed->username)); + AdjustComponent(delta, &(parsed->password)); + AdjustComponent(delta, &(parsed->host)); + AdjustComponent(delta, &(parsed->port)); + AdjustComponent(delta, &(parsed->path)); + AdjustComponent(delta, &(parsed->query)); + AdjustComponent(delta, &(parsed->ref)); +} + +// Helper for FormatUrlWithOffsets(). +base::string16 FormatViewSourceUrl( + const GURL& url, + const std::string& languages, + FormatUrlTypes format_types, + UnescapeRule::Type unescape_rules, + url::Parsed* new_parsed, + size_t* prefix_end, + base::OffsetAdjuster::Adjustments* adjustments) { + DCHECK(new_parsed); + const char kViewSource[] = "view-source:"; + const size_t kViewSourceLength = arraysize(kViewSource) - 1; + + // Format the underlying URL and record adjustments. + const std::string& url_str(url.possibly_invalid_spec()); + adjustments->clear(); + base::string16 result(base::ASCIIToUTF16(kViewSource) + + FormatUrlWithAdjustments(GURL(url_str.substr(kViewSourceLength)), + languages, format_types, unescape_rules, + new_parsed, prefix_end, adjustments)); + // Revise |adjustments| by shifting to the offsets to prefix that the above + // call to FormatUrl didn't get to see. + for (base::OffsetAdjuster::Adjustments::iterator it = adjustments->begin(); + it != adjustments->end(); ++it) + it->original_offset += kViewSourceLength; + + // Adjust positions of the parsed components. + if (new_parsed->scheme.is_nonempty()) { + // Assume "view-source:real-scheme" as a scheme. 
+ new_parsed->scheme.len += kViewSourceLength; + } else { + new_parsed->scheme.begin = 0; + new_parsed->scheme.len = kViewSourceLength - 1; + } + AdjustAllComponentsButScheme(kViewSourceLength, new_parsed); + + if (prefix_end) + *prefix_end += kViewSourceLength; + + return result; +} + +class AppendComponentTransform { + public: + AppendComponentTransform() {} + virtual ~AppendComponentTransform() {} + + virtual base::string16 Execute( + const std::string& component_text, + base::OffsetAdjuster::Adjustments* adjustments) const = 0; + + // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an + // accessible copy constructor in order to call AppendFormattedComponent() + // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ). +}; + +class HostComponentTransform : public AppendComponentTransform { + public: + explicit HostComponentTransform(const std::string& languages) + : languages_(languages) { + } + + private: + base::string16 Execute( + const std::string& component_text, + base::OffsetAdjuster::Adjustments* adjustments) const override { + return IDNToUnicodeWithAdjustments(component_text, languages_, + adjustments); + } + + const std::string& languages_; +}; + +class NonHostComponentTransform : public AppendComponentTransform { + public: + explicit NonHostComponentTransform(UnescapeRule::Type unescape_rules) + : unescape_rules_(unescape_rules) { + } + + private: + base::string16 Execute( + const std::string& component_text, + base::OffsetAdjuster::Adjustments* adjustments) const override { + return (unescape_rules_ == UnescapeRule::NONE) ? + base::UTF8ToUTF16WithAdjustments(component_text, adjustments) : + UnescapeAndDecodeUTF8URLComponentWithAdjustments(component_text, + unescape_rules_, adjustments); + } + + const UnescapeRule::Type unescape_rules_; +}; + +// Transforms the portion of |spec| covered by |original_component| according to +// |transform|. Appends the result to |output|. 
If |output_component| is +// non-NULL, its start and length are set to the transformed component's new +// start and length. If |adjustments| is non-NULL, appends adjustments (if +// any) that reflect the transformation the original component underwent to +// become the transformed value appended to |output|. +void AppendFormattedComponent(const std::string& spec, + const url::Component& original_component, + const AppendComponentTransform& transform, + base::string16* output, + url::Component* output_component, + base::OffsetAdjuster::Adjustments* adjustments) { + DCHECK(output); + if (original_component.is_nonempty()) { + size_t original_component_begin = + static_cast<size_t>(original_component.begin); + size_t output_component_begin = output->length(); + std::string component_str(spec, original_component_begin, + static_cast<size_t>(original_component.len)); + + // Transform |component_str| and modify |adjustments| appropriately. + base::OffsetAdjuster::Adjustments component_transform_adjustments; + output->append( + transform.Execute(component_str, &component_transform_adjustments)); + + // Shift all the adjustments made for this component so the offsets are + // valid for the original string and add them to |adjustments|. + for (base::OffsetAdjuster::Adjustments::iterator comp_iter = + component_transform_adjustments.begin(); + comp_iter != component_transform_adjustments.end(); ++comp_iter) + comp_iter->original_offset += original_component_begin; + if (adjustments) { + adjustments->insert(adjustments->end(), + component_transform_adjustments.begin(), + component_transform_adjustments.end()); + } + + // Set positions of the parsed component. 
+ if (output_component) { + output_component->begin = static_cast<int>(output_component_begin); + output_component->len = + static_cast<int>(output->length() - output_component_begin); + } + } else if (output_component) { + output_component->reset(); + } +} + +} // namespace + +const FormatUrlType kFormatUrlOmitNothing = 0; +const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0; +const FormatUrlType kFormatUrlOmitHTTP = 1 << 1; +const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2; +const FormatUrlType kFormatUrlOmitAll = kFormatUrlOmitUsernamePassword | + kFormatUrlOmitHTTP | kFormatUrlOmitTrailingSlashOnBareHostname; + +base::string16 IDNToUnicode(const std::string& host, + const std::string& languages) { + return IDNToUnicodeWithAdjustments(host, languages, NULL); +} + std::string GetDirectoryListingEntry(const base::string16& name, const std::string& raw_bytes, bool is_dir, int64_t size, - base::Time modified) { + Time modified) { std::string result; result.append("<script>addRow("); base::EscapeJSONString(name, true, &result); @@ -26,7 +601,6 @@ std::string GetDirectoryListingEntry(const base::string16& name, } else { base::EscapeJSONString(EscapePath(raw_bytes), true, &result); } - if (is_dir) { result.append(",1,"); } else { @@ -43,8 +617,9 @@ std::string GetDirectoryListingEntry(const base::string16& name, base::string16 modified_str; // |modified| can be NULL in FTP listings. 
- if (!modified.is_null()) + if (!modified.is_null()) { modified_str = base::TimeFormatShortDateAndTime(modified); + } base::EscapeJSONString(modified_str, true, &result); result.append(");</script>\n"); @@ -52,4 +627,213 @@ std::string GetDirectoryListingEntry(const base::string16& name, return result; } +void AppendFormattedHost(const GURL& url, + const std::string& languages, + base::string16* output) { + AppendFormattedComponent(url.possibly_invalid_spec(), + url.parsed_for_possibly_invalid_spec().host, + HostComponentTransform(languages), output, NULL, NULL); +} + +base::string16 FormatUrlWithOffsets( + const GURL& url, + const std::string& languages, + FormatUrlTypes format_types, + UnescapeRule::Type unescape_rules, + url::Parsed* new_parsed, + size_t* prefix_end, + std::vector<size_t>* offsets_for_adjustment) { + base::OffsetAdjuster::Adjustments adjustments; + const base::string16& format_url_return_value = + FormatUrlWithAdjustments(url, languages, format_types, unescape_rules, + new_parsed, prefix_end, &adjustments); + base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment); + if (offsets_for_adjustment) { + std::for_each( + offsets_for_adjustment->begin(), + offsets_for_adjustment->end(), + base::LimitOffset<std::string>(format_url_return_value.length())); + } + return format_url_return_value; +} + +base::string16 FormatUrlWithAdjustments( + const GURL& url, + const std::string& languages, + FormatUrlTypes format_types, + UnescapeRule::Type unescape_rules, + url::Parsed* new_parsed, + size_t* prefix_end, + base::OffsetAdjuster::Adjustments* adjustments) { + DCHECK(adjustments != NULL); + adjustments->clear(); + url::Parsed parsed_temp; + if (!new_parsed) + new_parsed = &parsed_temp; + else + *new_parsed = url::Parsed(); + + // Special handling for view-source:. Don't use content::kViewSourceScheme + // because this library shouldn't depend on chrome. + const char kViewSource[] = "view-source"; + // Reject "view-source:view-source:..." 
to avoid deep recursion. + const char kViewSourceTwice[] = "view-source:view-source:"; + if (url.SchemeIs(kViewSource) && + !base::StartsWith(url.possibly_invalid_spec(), kViewSourceTwice, + base::CompareCase::INSENSITIVE_ASCII)) { + return FormatViewSourceUrl(url, languages, format_types, + unescape_rules, new_parsed, prefix_end, + adjustments); + } + + // We handle both valid and invalid URLs (this will give us the spec + // regardless of validity). + const std::string& spec = url.possibly_invalid_spec(); + const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec(); + + // Scheme & separators. These are ASCII. + base::string16 url_string; + url_string.insert( + url_string.end(), spec.begin(), + spec.begin() + parsed.CountCharactersBefore(url::Parsed::USERNAME, true)); + const char kHTTP[] = "http://"; + const char kFTP[] = "ftp."; + // url_fixer::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com. This + // means that if we trim "http://" off a URL whose host starts with "ftp." and + // the user inputs this into any field subject to fixup (which is basically + // all input fields), the meaning would be changed. (In fact, often the + // formatted URL is directly pre-filled into an input field.) For this reason + // we avoid stripping "http://" in this case. + bool omit_http = + (format_types & kFormatUrlOmitHTTP) && + base::EqualsASCII(url_string, kHTTP) && + !base::StartsWith(url.host(), kFTP, base::CompareCase::SENSITIVE); + new_parsed->scheme = parsed.scheme; + + // Username & password. + if ((format_types & kFormatUrlOmitUsernamePassword) != 0) { + // Remove the username and password fields. We don't want to display those + // to the user since they can be used for attacks, + // e.g. "http://google.com:search@evil.ru/" + new_parsed->username.reset(); + new_parsed->password.reset(); + // Update the adjustments based on removed username and/or password. 
+ if (parsed.username.is_nonempty() || parsed.password.is_nonempty()) { + if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) { + // The seeming off-by-two is to account for the ':' after the username + // and '@' after the password. + adjustments->push_back(base::OffsetAdjuster::Adjustment( + static_cast<size_t>(parsed.username.begin), + static_cast<size_t>(parsed.username.len + parsed.password.len + 2), + 0)); + } else { + const url::Component* nonempty_component = + parsed.username.is_nonempty() ? &parsed.username : &parsed.password; + // The seeming off-by-one is to account for the '@' after the + // username/password. + adjustments->push_back(base::OffsetAdjuster::Adjustment( + static_cast<size_t>(nonempty_component->begin), + static_cast<size_t>(nonempty_component->len + 1), + 0)); + } + } + } else { + AppendFormattedComponent(spec, parsed.username, + NonHostComponentTransform(unescape_rules), + &url_string, &new_parsed->username, adjustments); + if (parsed.password.is_valid()) + url_string.push_back(':'); + AppendFormattedComponent(spec, parsed.password, + NonHostComponentTransform(unescape_rules), + &url_string, &new_parsed->password, adjustments); + if (parsed.username.is_valid() || parsed.password.is_valid()) + url_string.push_back('@'); + } + if (prefix_end) + *prefix_end = static_cast<size_t>(url_string.length()); + + // Host. + AppendFormattedComponent(spec, parsed.host, HostComponentTransform(languages), + &url_string, &new_parsed->host, adjustments); + + // Port. + if (parsed.port.is_nonempty()) { + url_string.push_back(':'); + new_parsed->port.begin = url_string.length(); + url_string.insert(url_string.end(), + spec.begin() + parsed.port.begin, + spec.begin() + parsed.port.end()); + new_parsed->port.len = url_string.length() - new_parsed->port.begin; + } else { + new_parsed->port.reset(); + } + + // Path & query. Both get the same general unescape & convert treatment. 
+ if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) || + !CanStripTrailingSlash(url)) { + AppendFormattedComponent(spec, parsed.path, + NonHostComponentTransform(unescape_rules), + &url_string, &new_parsed->path, adjustments); + } else { + if (parsed.path.len > 0) { + adjustments->push_back(base::OffsetAdjuster::Adjustment( + parsed.path.begin, parsed.path.len, 0)); + } + } + if (parsed.query.is_valid()) + url_string.push_back('?'); + AppendFormattedComponent(spec, parsed.query, + NonHostComponentTransform(unescape_rules), + &url_string, &new_parsed->query, adjustments); + + // Ref. This is valid, unescaped UTF-8, so we can just convert. + if (parsed.ref.is_valid()) + url_string.push_back('#'); + AppendFormattedComponent(spec, parsed.ref, + NonHostComponentTransform(UnescapeRule::NONE), + &url_string, &new_parsed->ref, adjustments); + + // If we need to strip out http do it after the fact. + if (omit_http && + base::StartsWith(url_string, base::ASCIIToUTF16(kHTTP), + base::CompareCase::SENSITIVE)) { + const size_t kHTTPSize = arraysize(kHTTP) - 1; + url_string = url_string.substr(kHTTPSize); + // Because offsets in the |adjustments| are already calculated with respect + // to the string with the http:// prefix in it, those offsets remain correct + // after stripping the prefix. The only thing necessary is to add an + // adjustment to reflect the stripped prefix. + adjustments->insert(adjustments->begin(), + base::OffsetAdjuster::Adjustment(0, kHTTPSize, 0)); + + if (prefix_end) + *prefix_end -= kHTTPSize; + + // Adjust new_parsed. + DCHECK(new_parsed->scheme.is_valid()); + int delta = -(new_parsed->scheme.len + 3); // +3 for ://. 
+ new_parsed->scheme.reset(); + AdjustAllComponentsButScheme(delta, new_parsed); + } + + return url_string; +} + +base::string16 FormatUrl(const GURL& url, + const std::string& languages, + FormatUrlTypes format_types, + UnescapeRule::Type unescape_rules, + url::Parsed* new_parsed, + size_t* prefix_end, + size_t* offset_for_adjustment) { + Offsets offsets; + if (offset_for_adjustment) + offsets.push_back(*offset_for_adjustment); + base::string16 result = FormatUrlWithOffsets(url, languages, format_types, + unescape_rules, new_parsed, prefix_end, &offsets); + if (offset_for_adjustment) + *offset_for_adjustment = offsets[0]; + return result; +} + } // namespace net diff --git a/net/base/net_util_icu_unittest.cc b/net/base/net_util_icu_unittest.cc index cac922f..f643426 100644 --- a/net/base/net_util_icu_unittest.cc +++ b/net/base/net_util_icu_unittest.cc @@ -4,19 +4,487 @@ #include "net/base/net_util.h" -#include <stdint.h> +#include <string.h> -#include <string> +#include <vector> +#include "base/format_macros.h" +#include "base/strings/string_number_conversions.h" +#include "base/strings/stringprintf.h" #include "base/strings/utf_string_conversions.h" #include "base/time/time.h" #include "testing/gtest/include/gtest/gtest.h" #include "url/gurl.h" +using base::ASCIIToUTF16; +using base::WideToUTF16; + namespace net { namespace { +const size_t kNpos = base::string16::npos; + +const char* const kLanguages[] = { + "", "en", "zh-CN", "ja", "ko", + "he", "ar", "ru", "el", "fr", + "de", "pt", "sv", "th", "hi", + "de,en", "el,en", "zh-TW,en", "ko,ja", "he,ru,en", + "zh,ru,en" +}; + +struct IDNTestCase { + const char* const input; + const wchar_t* unicode_output; + const bool unicode_allowed[arraysize(kLanguages)]; +}; + +// TODO(jungshik) This is just a random sample of languages and is far +// from exhaustive. We may have to generate all the combinations +// of languages (powerset of a set of all the languages). 
+const IDNTestCase idn_cases[] = { + // No IDN + {"www.google.com", L"www.google.com", + {true, true, true, true, true, + true, true, true, true, true, + true, true, true, true, true, + true, true, true, true, true, + true}}, + {"www.google.com.", L"www.google.com.", + {true, true, true, true, true, + true, true, true, true, true, + true, true, true, true, true, + true, true, true, true, true, + true}}, + {".", L".", + {true, true, true, true, true, + true, true, true, true, true, + true, true, true, true, true, + true, true, true, true, true, + true}}, + {"", L"", + {true, true, true, true, true, + true, true, true, true, true, + true, true, true, true, true, + true, true, true, true, true, + true}}, + // IDN + // Hanzi (Traditional Chinese) + {"xn--1lq90ic7f1rc.cn", L"\x5317\x4eac\x5927\x5b78.cn", + {true, false, true, true, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, true, true, false, + true}}, + // Hanzi ('video' in Simplified Chinese : will pass only in zh-CN,zh) + {"xn--cy2a840a.com", L"\x89c6\x9891.com", + {true, false, true, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + true}}, + // Hanzi + '123' + {"www.xn--123-p18d.com", L"www.\x4e00" L"123.com", + {true, false, true, true, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, true, true, false, + true}}, + // Hanzi + Latin : U+56FD is simplified and is regarded + // as not supported in zh-TW. 
+ {"www.xn--hello-9n1hm04c.com", L"www.hello\x4e2d\x56fd.com", + {false, false, true, true, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, true, false, + true}}, + // Kanji + Kana (Japanese) + {"xn--l8jvb1ey91xtjb.jp", L"\x671d\x65e5\x3042\x3055\x3072.jp", + {true, false, false, true, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, true, false, + false}}, + // Katakana including U+30FC + {"xn--tckm4i2e.jp", L"\x30b3\x30de\x30fc\x30b9.jp", + {true, false, false, true, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, true, false, + }}, + {"xn--3ck7a7g.jp", L"\u30ce\u30f3\u30bd.jp", + {true, false, false, true, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, true, false, + }}, + // Katakana + Latin (Japanese) + // TODO(jungshik): Change 'false' in the first element to 'true' + // after upgrading to ICU 4.2.1 to use new uspoof_* APIs instead + // of our IsIDNComponentInSingleScript(). 
+ {"xn--e-efusa1mzf.jp", L"e\x30b3\x30de\x30fc\x30b9.jp", + {false, false, false, true, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, true, false, + }}, + {"xn--3bkxe.jp", L"\x30c8\x309a.jp", + {false, false, false, true, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, true, false, + }}, + // Hangul (Korean) + {"www.xn--or3b17p6jjc.kr", L"www.\xc804\xc790\xc815\xbd80.kr", + {true, false, false, false, true, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, true, false, + false}}, + // b<u-umlaut>cher (German) + {"xn--bcher-kva.de", L"b\x00fc" L"cher.de", + {true, false, false, false, false, + false, false, false, false, true, + true, false, false, false, false, + true, false, false, false, false, + false}}, + // a with diaeresis + {"www.xn--frgbolaget-q5a.se", L"www.f\x00e4rgbolaget.se", + {true, false, false, false, false, + false, false, false, false, false, + true, false, true, false, false, + true, false, false, false, false, + false}}, + // c-cedilla (French) + {"www.xn--alliancefranaise-npb.fr", L"www.alliancefran\x00e7" L"aise.fr", + {true, false, false, false, false, + false, false, false, false, true, + false, true, false, false, false, + false, false, false, false, false, + false}}, + // caf'e with acute accent' (French) + {"xn--caf-dma.fr", L"caf\x00e9.fr", + {true, false, false, false, false, + false, false, false, false, true, + false, true, true, false, false, + false, false, false, false, false, + false}}, + // c-cedillla and a with tilde (Portuguese) + {"xn--poema-9qae5a.com.br", L"p\x00e3oema\x00e7\x00e3.com.br", + {true, false, false, false, false, + false, false, false, false, false, + false, true, false, false, false, + false, false, false, false, false, + false}}, + // s with caron + {"xn--achy-f6a.com", L"\x0161" L"achy.com", + {true, false, false, false, false, + false, false, false, 
false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + // TODO(jungshik) : Add examples with Cyrillic letters + // only used in some languages written in Cyrillic. + // Eutopia (Greek) + {"xn--kxae4bafwg.gr", L"\x03bf\x03c5\x03c4\x03bf\x03c0\x03af\x03b1.gr", + {true, false, false, false, false, + false, false, false, true, false, + false, false, false, false, false, + false, true, false, false, false, + false}}, + // Eutopia + 123 (Greek) + {"xn---123-pldm0haj2bk.gr", + L"\x03bf\x03c5\x03c4\x03bf\x03c0\x03af\x03b1-123.gr", + {true, false, false, false, false, + false, false, false, true, false, + false, false, false, false, false, + false, true, false, false, false, + false}}, + // Cyrillic (Russian) + {"xn--n1aeec9b.ru", L"\x0442\x043e\x0440\x0442\x044b.ru", + {true, false, false, false, false, + false, false, true, false, false, + false, false, false, false, false, + false, false, false, false, true, + true}}, + // Cyrillic + 123 (Russian) + {"xn---123-45dmmc5f.ru", L"\x0442\x043e\x0440\x0442\x044b-123.ru", + {true, false, false, false, false, + false, false, true, false, false, + false, false, false, false, false, + false, false, false, false, true, + true}}, + // Arabic + {"xn--mgba1fmg.ar", L"\x0627\x0641\x0644\x0627\x0645.ar", + {true, false, false, false, false, + false, true, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + // Hebrew + {"xn--4dbib.he", L"\x05d5\x05d0\x05d4.he", + {true, false, false, false, false, + true, false, false, false, false, + false, false, false, false, false, + false, false, false, false, true, + false}}, + // Thai + {"xn--12c2cc4ag3b4ccu.th", + L"\x0e2a\x0e32\x0e22\x0e01\x0e32\x0e23\x0e1a\x0e34\x0e19.th", + {true, false, false, false, false, + false, false, false, false, false, + false, false, false, true, false, + false, false, false, false, false, + false}}, + // Devangari (Hindi) + {"www.xn--l1b6a9e1b7c.in", 
L"www.\x0905\x0915\x094b\x0932\x093e.in", + {true, false, false, false, false, + false, false, false, false, false, + false, false, false, false, true, + false, false, false, false, false, + false}}, + // Invalid IDN + {"xn--hello?world.com", NULL, + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + // Unsafe IDNs + // "payp<alpha>l.com" + {"www.xn--paypl-g9d.com", L"payp\x03b1l.com", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + // google.gr with Greek omicron and epsilon + {"xn--ggl-6xc1ca.gr", L"g\x03bf\x03bfgl\x03b5.gr", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + // google.ru with Cyrillic o + {"xn--ggl-tdd6ba.ru", L"g\x043e\x043egl\x0435.ru", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + // h<e with acute>llo<China in Han>.cn + {"xn--hllo-bpa7979ih5m.cn", L"h\x00e9llo\x4e2d\x56fd.cn", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + // <Greek rho><Cyrillic a><Cyrillic u>.ru + {"xn--2xa6t2b.ru", L"\x03c1\x0430\x0443.ru", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + // One that's really long that will force a buffer realloc + {"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + "aaaaaaa", + L"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa" + L"aaaaaaaa", + {true, true, true, true, true, + true, true, true, true, true, + true, true, 
true, true, true, + true, true, true, true, true, + true}}, + // Test cases for characters we blacklisted although allowed in IDN. + // Embedded spaces will be turned to %20 in the display. + // TODO(jungshik): We need to have more cases. This is a typical + // data-driven trap. The following test cases need to be separated + // and tested only for a couple of languages. + {"xn--osd3820f24c.kr", L"\xac00\xb098\x115f.kr", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false}}, + {"www.xn--google-ho0coa.com", L"www.\x2039google\x203a.com", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + }}, + {"google.xn--comabc-k8d", L"google.com\x0338" L"abc", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + }}, + {"google.xn--com-oh4ba.evil.jp", L"google.com\x309a\x309a.evil.jp", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + }}, + {"google.xn--comevil-v04f.jp", L"google.com\x30ce" L"evil.jp", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + }}, + // Padlock icon spoof. + {"xn--google-hj64e", L"\U0001f512google.com", + {false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + }}, + // Ensure that blacklisting "\xd83d\xdd12" did not inadvertently blacklist + // all strings with the surrogate '\xdd12'. 
+ {"xn--fk9c.com", L"\U00010912.com", + {true, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, false, false, false, + }}, +#if 0 + // These two cases are special. We need a separate test. + // U+3000 and U+3002 are normalized to ASCII space and dot. + {"xn-- -kq6ay5z.cn", L"\x4e2d\x56fd\x3000.cn", + {false, false, true, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, true, false, false, + true}}, + {"xn--fiqs8s.cn", L"\x4e2d\x56fd\x3002" L"cn", + {false, false, true, false, false, + false, false, false, false, false, + false, false, false, false, false, + false, false, true, false, false, + true}}, +#endif +}; + +struct AdjustOffsetCase { + size_t input_offset; + size_t output_offset; +}; + +struct UrlTestData { + const char* const description; + const char* const input; + const char* const languages; + FormatUrlTypes format_types; + UnescapeRule::Type escape_rules; + const wchar_t* output; // Use |wchar_t| to handle Unicode constants easily. + size_t prefix_len; +}; + +// A helper for IDN*{Fast,Slow}. +// Append "::<language list>" to |expected| and |actual| to make it +// easy to tell which sub-case fails without debugging. +void AppendLanguagesToOutputs(const char* languages, + base::string16* expected, + base::string16* actual) { + base::string16 to_append = ASCIIToUTF16("::") + ASCIIToUTF16(languages); + expected->append(to_append); + actual->append(to_append); +} + +// A pair of helpers for the FormatUrlWithOffsets() test. 
+// Compares |expected| with |actual| via EXPECT_EQ and, on mismatch, reports
+// which input offset (|position|) of |original_url| was being checked together
+// with the |formatted_url| that was produced.
+void VerboseExpect(size_t expected,
+                   size_t actual,
+                   const std::string& original_url,
+                   size_t position,
+                   const base::string16& formatted_url) {
+  EXPECT_EQ(expected, actual) << "Original URL: " << original_url
+      << " (at char " << position << ")\nFormatted URL: " << formatted_url;
+}
+
+// Formats |url_string| with FormatUrlWithOffsets() and verifies how every
+// input offset was adjusted.
+//
+// |output_offsets| must hold at least |url_string.length()| entries; entry i is
+// the expected adjusted value of input offset i. In addition to those, this
+// checks that:
+//   * the offset equal to the input length maps to the formatted length;
+//   * offsets beyond the input (length + 1, and an arbitrary huge value) map
+//     to npos;
+//   * an input offset of npos stays npos.
+void CheckAdjustedOffsets(const std::string& url_string,
+                          const std::string& languages,
+                          FormatUrlTypes format_types,
+                          UnescapeRule::Type unescape_rules,
+                          const size_t* output_offsets) {
+  GURL url(url_string);
+  size_t url_length = url_string.length();
+  // Layout of |offsets|:
+  //   [0, url_length + 1]  -> the values 0 .. url_length + 1
+  //   [url_length + 2]     -> 500000
+  //   [url_length + 3]     -> npos
+  std::vector<size_t> offsets;
+  for (size_t i = 0; i <= url_length + 1; ++i)
+    offsets.push_back(i);
+  offsets.push_back(500000);  // Something larger than any input length.
+  offsets.push_back(std::string::npos);
+  base::string16 formatted_url = FormatUrlWithOffsets(url, languages,
+      format_types, unescape_rules, NULL, NULL, &offsets);
+  for (size_t i = 0; i < url_length; ++i)
+    VerboseExpect(output_offsets[i], offsets[i], url_string, i, formatted_url);
+  VerboseExpect(formatted_url.length(), offsets[url_length], url_string,
+                url_length, formatted_url);
+  // Each out-of-range input is reported under its real input position so a
+  // failure message points at the offset that actually misbehaved.
+  VerboseExpect(base::string16::npos, offsets[url_length + 1], url_string,
+                url_length + 1, formatted_url);
+  VerboseExpect(base::string16::npos, offsets[url_length + 2], url_string,
+                500000, formatted_url);
+  VerboseExpect(base::string16::npos, offsets[url_length + 3], url_string,
+                std::string::npos, formatted_url);
+}
+
+}  // anonymous namespace
+
+// Runs every IDN case against every language list except the three that are
+// left to IDNToUnicodeSlow. When Unicode display is not allowed for a language
+// list, the expected output is the unchanged input.
+TEST(NetUtilTest, IDNToUnicodeFast) {
+  for (size_t i = 0; i < arraysize(idn_cases); i++) {
+    for (size_t j = 0; j < arraysize(kLanguages); j++) {
+      // ja || zh-TW,en || ko,ja -> IDNToUnicodeSlow
+      if (j == 3 || j == 17 || j == 18)
+        continue;
+      base::string16 output(IDNToUnicode(idn_cases[i].input, kLanguages[j]));
+      base::string16 expected(idn_cases[i].unicode_allowed[j] ?
+          WideToUTF16(idn_cases[i].unicode_output) :
+          ASCIIToUTF16(idn_cases[i].input));
+      AppendLanguagesToOutputs(kLanguages[j], &expected, &output);
+      EXPECT_EQ(expected, output) << "input: \"" << idn_cases[i].input
+          << "\", languages: \"" << kLanguages[j]
+          << "\"";
+    }
+  }
+}
+
+// Complement of IDNToUnicodeFast: runs every IDN case against only the three
+// language lists that the fast test skips.
+TEST(NetUtilTest, IDNToUnicodeSlow) {
+  for (size_t i = 0; i < arraysize(idn_cases); i++) {
+    for (size_t j = 0; j < arraysize(kLanguages); j++) {
+      // !(ja || zh-TW,en || ko,ja) -> IDNToUnicodeFast
+      if (!(j == 3 || j == 17 || j == 18))
+        continue;
+      base::string16 output(IDNToUnicode(idn_cases[i].input, kLanguages[j]));
+      base::string16 expected(idn_cases[i].unicode_allowed[j] ?
+          WideToUTF16(idn_cases[i].unicode_output) :
+          ASCIIToUTF16(idn_cases[i].input));
+      AppendLanguagesToOutputs(kLanguages[j], &expected, &output);
+      EXPECT_EQ(expected, output) << "input: \"" << idn_cases[i].input
+          << "\", languages: \"" << kLanguages[j]
+          << "\"";
+    }
+  }
+}
+
+// ulocdata_getExemplarSet may fail with some locales (currently bn, gu, and
+// te), which was causing a crash (See http://crbug.com/510551). This may be an
+// icu bug, but regardless, that should not cause a crash.
+TEST(NetUtilTest, IDNToUnicodeNeverCrashes) {
+  // Try every two-letter lowercase language code. The result is deliberately
+  // discarded: the test passes as long as none of the calls crashes.
+  for (char c1 = 'a'; c1 <= 'z'; c1++) {
+    for (char c2 = 'a'; c2 <= 'z'; c2++) {
+      std::string lang = base::StringPrintf("%c%c", c1, c2);
+      base::string16 output(IDNToUnicode("xn--74h", lang));
+    }
+  }
+}
+
+// StripWWW() removes a leading "www." and leaves other input untouched.
+TEST(NetUtilTest, StripWWW) {
+  EXPECT_EQ(base::string16(), StripWWW(base::string16()));
+  EXPECT_EQ(base::string16(), StripWWW(ASCIIToUTF16("www.")));
+  EXPECT_EQ(ASCIIToUTF16("blah"), StripWWW(ASCIIToUTF16("www.blah")));
+  EXPECT_EQ(ASCIIToUTF16("blah"), StripWWW(ASCIIToUTF16("blah")));
+}
+
+// This is currently a windows specific function.
+#if defined(OS_WIN) +namespace { + struct GetDirectoryListingEntryCase { const wchar_t* name; const char* const raw_bytes; @@ -26,6 +494,8 @@ struct GetDirectoryListingEntryCase { const char* const expected; }; +} // namespace + TEST(NetUtilTest, GetDirectoryListingEntry) { const GetDirectoryListingEntryCase test_cases[] = { {L"Foo", @@ -70,12 +540,564 @@ TEST(NetUtilTest, GetDirectoryListingEntry) { for (size_t i = 0; i < arraysize(test_cases); ++i) { const std::string results = GetDirectoryListingEntry( - base::WideToUTF16(test_cases[i].name), test_cases[i].raw_bytes, - test_cases[i].is_dir, test_cases[i].filesize, test_cases[i].time); + WideToUTF16(test_cases[i].name), + test_cases[i].raw_bytes, + test_cases[i].is_dir, + test_cases[i].filesize, + test_cases[i].time); EXPECT_EQ(test_cases[i].expected, results); } } -} // namespace +#endif + +TEST(NetUtilTest, FormatUrl) { + FormatUrlTypes default_format_type = kFormatUrlOmitUsernamePassword; + const UrlTestData tests[] = { + {"Empty URL", "", "", default_format_type, UnescapeRule::NORMAL, L"", 0}, + + {"Simple URL", + "http://www.google.com/", "", default_format_type, UnescapeRule::NORMAL, + L"http://www.google.com/", 7}, + + {"With a port number and a reference", + "http://www.google.com:8080/#\xE3\x82\xB0", "", default_format_type, + UnescapeRule::NORMAL, + L"http://www.google.com:8080/#\x30B0", 7}, + + // -------- IDN tests -------- + {"Japanese IDN with ja", + "http://xn--l8jvb1ey91xtjb.jp", "ja", default_format_type, + UnescapeRule::NORMAL, L"http://\x671d\x65e5\x3042\x3055\x3072.jp/", 7}, + + {"Japanese IDN with en", + "http://xn--l8jvb1ey91xtjb.jp", "en", default_format_type, + UnescapeRule::NORMAL, L"http://xn--l8jvb1ey91xtjb.jp/", 7}, + + {"Japanese IDN without any languages", + "http://xn--l8jvb1ey91xtjb.jp", "", default_format_type, + UnescapeRule::NORMAL, + // Single script is safe for empty languages. 
+ L"http://\x671d\x65e5\x3042\x3055\x3072.jp/", 7}, + + {"mailto: with Japanese IDN", + "mailto:foo@xn--l8jvb1ey91xtjb.jp", "ja", default_format_type, + UnescapeRule::NORMAL, + // GURL doesn't assume an email address's domain part as a host name. + L"mailto:foo@xn--l8jvb1ey91xtjb.jp", 7}, + + {"file: with Japanese IDN", + "file://xn--l8jvb1ey91xtjb.jp/config.sys", "ja", default_format_type, + UnescapeRule::NORMAL, + L"file://\x671d\x65e5\x3042\x3055\x3072.jp/config.sys", 7}, + + {"ftp: with Japanese IDN", + "ftp://xn--l8jvb1ey91xtjb.jp/config.sys", "ja", default_format_type, + UnescapeRule::NORMAL, + L"ftp://\x671d\x65e5\x3042\x3055\x3072.jp/config.sys", 6}, + + // -------- omit_username_password flag tests -------- + {"With username and password, omit_username_password=false", + "http://user:passwd@example.com/foo", "", + kFormatUrlOmitNothing, UnescapeRule::NORMAL, + L"http://user:passwd@example.com/foo", 19}, + + {"With username and password, omit_username_password=true", + "http://user:passwd@example.com/foo", "", default_format_type, + UnescapeRule::NORMAL, L"http://example.com/foo", 7}, + + {"With username and no password", + "http://user@example.com/foo", "", default_format_type, + UnescapeRule::NORMAL, L"http://example.com/foo", 7}, + + {"Just '@' without username and password", + "http://@example.com/foo", "", default_format_type, UnescapeRule::NORMAL, + L"http://example.com/foo", 7}, + + // GURL doesn't think local-part of an email address is username for URL. + {"mailto:, omit_username_password=true", + "mailto:foo@example.com", "", default_format_type, UnescapeRule::NORMAL, + L"mailto:foo@example.com", 7}, + + // -------- unescape flag tests -------- + {"Do not unescape", + "http://%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB.jp/" + "%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB" + "?q=%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB", "en", default_format_type, + UnescapeRule::NONE, + // GURL parses %-encoded hostnames into Punycode. 
+ L"http://xn--qcka1pmc.jp/%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB" + L"?q=%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB", 7}, + + {"Unescape normally", + "http://%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB.jp/" + "%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB" + "?q=%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB", "en", default_format_type, + UnescapeRule::NORMAL, + L"http://xn--qcka1pmc.jp/\x30B0\x30FC\x30B0\x30EB" + L"?q=\x30B0\x30FC\x30B0\x30EB", 7}, + + {"Unescape normally with BiDi control character", + "http://example.com/%E2%80%AEabc?q=%E2%80%8Fxy", "en", default_format_type, + UnescapeRule::NORMAL, L"http://example.com/%E2%80%AEabc?q=%E2%80%8Fxy", 7}, + + {"Unescape normally including unescape spaces", + "http://www.google.com/search?q=Hello%20World", "en", default_format_type, + UnescapeRule::SPACES, L"http://www.google.com/search?q=Hello World", 7}, + + /* + {"unescape=true with some special characters", + "http://user%3A:%40passwd@example.com/foo%3Fbar?q=b%26z", "", + kFormatUrlOmitNothing, UnescapeRule::NORMAL, + L"http://user%3A:%40passwd@example.com/foo%3Fbar?q=b%26z", 25}, + */ + // Disabled: the resultant URL becomes "...user%253A:%2540passwd...". 
+ + // -------- omit http: -------- + {"omit http with user name", + "http://user@example.com/foo", "", kFormatUrlOmitAll, + UnescapeRule::NORMAL, L"example.com/foo", 0}, + + {"omit http", + "http://www.google.com/", "en", kFormatUrlOmitHTTP, + UnescapeRule::NORMAL, L"www.google.com/", + 0}, + + {"omit http with https", + "https://www.google.com/", "en", kFormatUrlOmitHTTP, + UnescapeRule::NORMAL, L"https://www.google.com/", + 8}, + + {"omit http starts with ftp.", + "http://ftp.google.com/", "en", kFormatUrlOmitHTTP, + UnescapeRule::NORMAL, L"http://ftp.google.com/", + 7}, + + // -------- omit trailing slash on bare hostname -------- + {"omit slash when it's the entire path", + "http://www.google.com/", "en", + kFormatUrlOmitTrailingSlashOnBareHostname, UnescapeRule::NORMAL, + L"http://www.google.com", 7}, + {"omit slash when there's a ref", + "http://www.google.com/#ref", "en", + kFormatUrlOmitTrailingSlashOnBareHostname, UnescapeRule::NORMAL, + L"http://www.google.com/#ref", 7}, + {"omit slash when there's a query", + "http://www.google.com/?", "en", + kFormatUrlOmitTrailingSlashOnBareHostname, UnescapeRule::NORMAL, + L"http://www.google.com/?", 7}, + {"omit slash when it's not the entire path", + "http://www.google.com/foo", "en", + kFormatUrlOmitTrailingSlashOnBareHostname, UnescapeRule::NORMAL, + L"http://www.google.com/foo", 7}, + {"omit slash for nonstandard URLs", + "data:/", "en", kFormatUrlOmitTrailingSlashOnBareHostname, + UnescapeRule::NORMAL, L"data:/", 5}, + {"omit slash for file URLs", + "file:///", "en", kFormatUrlOmitTrailingSlashOnBareHostname, + UnescapeRule::NORMAL, L"file:///", 7}, + + // -------- view-source: -------- + {"view-source", + "view-source:http://xn--qcka1pmc.jp/", "ja", default_format_type, + UnescapeRule::NORMAL, L"view-source:http://\x30B0\x30FC\x30B0\x30EB.jp/", + 19}, + + {"view-source of view-source", + "view-source:view-source:http://xn--qcka1pmc.jp/", "ja", + default_format_type, UnescapeRule::NORMAL, + 
L"view-source:view-source:http://xn--qcka1pmc.jp/", 12}, + + // view-source should omit http and trailing slash where non-view-source + // would. + {"view-source omit http", + "view-source:http://a.b/c", "en", kFormatUrlOmitAll, + UnescapeRule::NORMAL, L"view-source:a.b/c", + 12}, + {"view-source omit http starts with ftp.", + "view-source:http://ftp.b/c", "en", kFormatUrlOmitAll, + UnescapeRule::NORMAL, L"view-source:http://ftp.b/c", + 19}, + {"view-source omit slash when it's the entire path", + "view-source:http://a.b/", "en", kFormatUrlOmitAll, + UnescapeRule::NORMAL, L"view-source:a.b", + 12}, + }; + + for (size_t i = 0; i < arraysize(tests); ++i) { + size_t prefix_len; + base::string16 formatted = FormatUrl( + GURL(tests[i].input), tests[i].languages, tests[i].format_types, + tests[i].escape_rules, NULL, &prefix_len, NULL); + EXPECT_EQ(WideToUTF16(tests[i].output), formatted) << tests[i].description; + EXPECT_EQ(tests[i].prefix_len, prefix_len) << tests[i].description; + } +} + +TEST(NetUtilTest, FormatUrlParsed) { + // No unescape case. 
+ url::Parsed parsed; + base::string16 formatted = FormatUrl( + GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" + "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), + "ja", kFormatUrlOmitNothing, UnescapeRule::NONE, &parsed, NULL, + NULL); + EXPECT_EQ(WideToUTF16( + L"http://%E3%82%B0:%E3%83%BC@\x30B0\x30FC\x30B0\x30EB.jp:8080" + L"/%E3%82%B0/?q=%E3%82%B0#\x30B0"), formatted); + EXPECT_EQ(WideToUTF16(L"%E3%82%B0"), + formatted.substr(parsed.username.begin, parsed.username.len)); + EXPECT_EQ(WideToUTF16(L"%E3%83%BC"), + formatted.substr(parsed.password.begin, parsed.password.len)); + EXPECT_EQ(WideToUTF16(L"\x30B0\x30FC\x30B0\x30EB.jp"), + formatted.substr(parsed.host.begin, parsed.host.len)); + EXPECT_EQ(WideToUTF16(L"8080"), + formatted.substr(parsed.port.begin, parsed.port.len)); + EXPECT_EQ(WideToUTF16(L"/%E3%82%B0/"), + formatted.substr(parsed.path.begin, parsed.path.len)); + EXPECT_EQ(WideToUTF16(L"q=%E3%82%B0"), + formatted.substr(parsed.query.begin, parsed.query.len)); + EXPECT_EQ(WideToUTF16(L"\x30B0"), + formatted.substr(parsed.ref.begin, parsed.ref.len)); + + // Unescape case. 
+ formatted = FormatUrl( + GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" + "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), + "ja", kFormatUrlOmitNothing, UnescapeRule::NORMAL, &parsed, NULL, + NULL); + EXPECT_EQ(WideToUTF16(L"http://\x30B0:\x30FC@\x30B0\x30FC\x30B0\x30EB.jp:8080" + L"/\x30B0/?q=\x30B0#\x30B0"), formatted); + EXPECT_EQ(WideToUTF16(L"\x30B0"), + formatted.substr(parsed.username.begin, parsed.username.len)); + EXPECT_EQ(WideToUTF16(L"\x30FC"), + formatted.substr(parsed.password.begin, parsed.password.len)); + EXPECT_EQ(WideToUTF16(L"\x30B0\x30FC\x30B0\x30EB.jp"), + formatted.substr(parsed.host.begin, parsed.host.len)); + EXPECT_EQ(WideToUTF16(L"8080"), + formatted.substr(parsed.port.begin, parsed.port.len)); + EXPECT_EQ(WideToUTF16(L"/\x30B0/"), + formatted.substr(parsed.path.begin, parsed.path.len)); + EXPECT_EQ(WideToUTF16(L"q=\x30B0"), + formatted.substr(parsed.query.begin, parsed.query.len)); + EXPECT_EQ(WideToUTF16(L"\x30B0"), + formatted.substr(parsed.ref.begin, parsed.ref.len)); + + // Omit_username_password + unescape case. + formatted = FormatUrl( + GURL("http://\xE3\x82\xB0:\xE3\x83\xBC@xn--qcka1pmc.jp:8080/" + "%E3%82%B0/?q=%E3%82%B0#\xE3\x82\xB0"), + "ja", kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, &parsed, + NULL, NULL); + EXPECT_EQ(WideToUTF16(L"http://\x30B0\x30FC\x30B0\x30EB.jp:8080" + L"/\x30B0/?q=\x30B0#\x30B0"), formatted); + EXPECT_FALSE(parsed.username.is_valid()); + EXPECT_FALSE(parsed.password.is_valid()); + EXPECT_EQ(WideToUTF16(L"\x30B0\x30FC\x30B0\x30EB.jp"), + formatted.substr(parsed.host.begin, parsed.host.len)); + EXPECT_EQ(WideToUTF16(L"8080"), + formatted.substr(parsed.port.begin, parsed.port.len)); + EXPECT_EQ(WideToUTF16(L"/\x30B0/"), + formatted.substr(parsed.path.begin, parsed.path.len)); + EXPECT_EQ(WideToUTF16(L"q=\x30B0"), + formatted.substr(parsed.query.begin, parsed.query.len)); + EXPECT_EQ(WideToUTF16(L"\x30B0"), + formatted.substr(parsed.ref.begin, parsed.ref.len)); + + // View-source case. 
+ formatted = + FormatUrl(GURL("view-source:http://user:passwd@host:81/path?query#ref"), + std::string(), + kFormatUrlOmitUsernamePassword, + UnescapeRule::NORMAL, + &parsed, + NULL, + NULL); + EXPECT_EQ(WideToUTF16(L"view-source:http://host:81/path?query#ref"), + formatted); + EXPECT_EQ(WideToUTF16(L"view-source:http"), + formatted.substr(parsed.scheme.begin, parsed.scheme.len)); + EXPECT_FALSE(parsed.username.is_valid()); + EXPECT_FALSE(parsed.password.is_valid()); + EXPECT_EQ(WideToUTF16(L"host"), + formatted.substr(parsed.host.begin, parsed.host.len)); + EXPECT_EQ(WideToUTF16(L"81"), + formatted.substr(parsed.port.begin, parsed.port.len)); + EXPECT_EQ(WideToUTF16(L"/path"), + formatted.substr(parsed.path.begin, parsed.path.len)); + EXPECT_EQ(WideToUTF16(L"query"), + formatted.substr(parsed.query.begin, parsed.query.len)); + EXPECT_EQ(WideToUTF16(L"ref"), + formatted.substr(parsed.ref.begin, parsed.ref.len)); + + // omit http case. + formatted = FormatUrl(GURL("http://host:8000/a?b=c#d"), + std::string(), + kFormatUrlOmitHTTP, + UnescapeRule::NORMAL, + &parsed, + NULL, + NULL); + EXPECT_EQ(WideToUTF16(L"host:8000/a?b=c#d"), formatted); + EXPECT_FALSE(parsed.scheme.is_valid()); + EXPECT_FALSE(parsed.username.is_valid()); + EXPECT_FALSE(parsed.password.is_valid()); + EXPECT_EQ(WideToUTF16(L"host"), + formatted.substr(parsed.host.begin, parsed.host.len)); + EXPECT_EQ(WideToUTF16(L"8000"), + formatted.substr(parsed.port.begin, parsed.port.len)); + EXPECT_EQ(WideToUTF16(L"/a"), + formatted.substr(parsed.path.begin, parsed.path.len)); + EXPECT_EQ(WideToUTF16(L"b=c"), + formatted.substr(parsed.query.begin, parsed.query.len)); + EXPECT_EQ(WideToUTF16(L"d"), + formatted.substr(parsed.ref.begin, parsed.ref.len)); + + // omit http starts with ftp case. 
+ formatted = FormatUrl(GURL("http://ftp.host:8000/a?b=c#d"), + std::string(), + kFormatUrlOmitHTTP, + UnescapeRule::NORMAL, + &parsed, + NULL, + NULL); + EXPECT_EQ(WideToUTF16(L"http://ftp.host:8000/a?b=c#d"), formatted); + EXPECT_TRUE(parsed.scheme.is_valid()); + EXPECT_FALSE(parsed.username.is_valid()); + EXPECT_FALSE(parsed.password.is_valid()); + EXPECT_EQ(WideToUTF16(L"http"), + formatted.substr(parsed.scheme.begin, parsed.scheme.len)); + EXPECT_EQ(WideToUTF16(L"ftp.host"), + formatted.substr(parsed.host.begin, parsed.host.len)); + EXPECT_EQ(WideToUTF16(L"8000"), + formatted.substr(parsed.port.begin, parsed.port.len)); + EXPECT_EQ(WideToUTF16(L"/a"), + formatted.substr(parsed.path.begin, parsed.path.len)); + EXPECT_EQ(WideToUTF16(L"b=c"), + formatted.substr(parsed.query.begin, parsed.query.len)); + EXPECT_EQ(WideToUTF16(L"d"), + formatted.substr(parsed.ref.begin, parsed.ref.len)); + + // omit http starts with 'f' case. + formatted = FormatUrl(GURL("http://f/"), + std::string(), + kFormatUrlOmitHTTP, + UnescapeRule::NORMAL, + &parsed, + NULL, + NULL); + EXPECT_EQ(WideToUTF16(L"f/"), formatted); + EXPECT_FALSE(parsed.scheme.is_valid()); + EXPECT_FALSE(parsed.username.is_valid()); + EXPECT_FALSE(parsed.password.is_valid()); + EXPECT_FALSE(parsed.port.is_valid()); + EXPECT_TRUE(parsed.path.is_valid()); + EXPECT_FALSE(parsed.query.is_valid()); + EXPECT_FALSE(parsed.ref.is_valid()); + EXPECT_EQ(WideToUTF16(L"f"), + formatted.substr(parsed.host.begin, parsed.host.len)); + EXPECT_EQ(WideToUTF16(L"/"), + formatted.substr(parsed.path.begin, parsed.path.len)); +} + +// Make sure that calling FormatUrl on a GURL and then converting back to a GURL +// results in the original GURL, for each ASCII character in the path. 
+TEST(NetUtilTest, FormatUrlRoundTripPathASCII) { + for (unsigned char test_char = 32; test_char < 128; ++test_char) { + GURL url(std::string("http://www.google.com/") + + static_cast<char>(test_char)); + size_t prefix_len; + base::string16 formatted = FormatUrl(url, + std::string(), + kFormatUrlOmitUsernamePassword, + UnescapeRule::NORMAL, + NULL, + &prefix_len, + NULL); + EXPECT_EQ(url.spec(), GURL(formatted).spec()); + } +} + +// Make sure that calling FormatUrl on a GURL and then converting back to a GURL +// results in the original GURL, for each escaped ASCII character in the path. +TEST(NetUtilTest, FormatUrlRoundTripPathEscaped) { + for (unsigned char test_char = 32; test_char < 128; ++test_char) { + std::string original_url("http://www.google.com/"); + original_url.push_back('%'); + original_url.append(base::HexEncode(&test_char, 1)); + + GURL url(original_url); + size_t prefix_len; + base::string16 formatted = FormatUrl(url, + std::string(), + kFormatUrlOmitUsernamePassword, + UnescapeRule::NORMAL, + NULL, + &prefix_len, + NULL); + EXPECT_EQ(url.spec(), GURL(formatted).spec()); + } +} + +// Make sure that calling FormatUrl on a GURL and then converting back to a GURL +// results in the original GURL, for each ASCII character in the query. +TEST(NetUtilTest, FormatUrlRoundTripQueryASCII) { + for (unsigned char test_char = 32; test_char < 128; ++test_char) { + GURL url(std::string("http://www.google.com/?") + + static_cast<char>(test_char)); + size_t prefix_len; + base::string16 formatted = FormatUrl(url, + std::string(), + kFormatUrlOmitUsernamePassword, + UnescapeRule::NORMAL, + NULL, + &prefix_len, + NULL); + EXPECT_EQ(url.spec(), GURL(formatted).spec()); + } +} + +// Make sure that calling FormatUrl on a GURL and then converting back to a GURL +// only results in a different GURL for certain characters. 
+TEST(NetUtilTest, FormatUrlRoundTripQueryEscaped) { + // A full list of characters which FormatURL should unescape and GURL should + // not escape again, when they appear in a query string. + const char kUnescapedCharacters[] = + "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz-_~"; + for (unsigned char test_char = 0; test_char < 128; ++test_char) { + std::string original_url("http://www.google.com/?"); + original_url.push_back('%'); + original_url.append(base::HexEncode(&test_char, 1)); + + GURL url(original_url); + size_t prefix_len; + base::string16 formatted = FormatUrl(url, + std::string(), + kFormatUrlOmitUsernamePassword, + UnescapeRule::NORMAL, + NULL, + &prefix_len, + NULL); + + if (test_char && + strchr(kUnescapedCharacters, static_cast<char>(test_char))) { + EXPECT_NE(url.spec(), GURL(formatted).spec()); + } else { + EXPECT_EQ(url.spec(), GURL(formatted).spec()); + } + } +} + +TEST(NetUtilTest, FormatUrlWithOffsets) { + CheckAdjustedOffsets(std::string(), "en", kFormatUrlOmitNothing, + UnescapeRule::NORMAL, NULL); + + const size_t basic_offsets[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25 + }; + CheckAdjustedOffsets("http://www.google.com/foo/", "en", + kFormatUrlOmitNothing, UnescapeRule::NORMAL, + basic_offsets); + + const size_t omit_auth_offsets_1[] = { + 0, 1, 2, 3, 4, 5, 6, 7, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 7, + 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + }; + CheckAdjustedOffsets("http://foo:bar@www.google.com/", "en", + kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, + omit_auth_offsets_1); + + const size_t omit_auth_offsets_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, kNpos, kNpos, kNpos, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21 + }; + CheckAdjustedOffsets("http://foo@www.google.com/", "en", + kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, + omit_auth_offsets_2); + + const size_t dont_omit_auth_offsets[] = { + 0, 1, 2, 3, 4, 
5, 6, 7, 8, 9, 10, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, + kNpos, kNpos, 11, 12, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, + kNpos, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31 + }; + // Unescape to "http://foo\x30B0:\x30B0bar@www.google.com". + CheckAdjustedOffsets("http://foo%E3%82%B0:%E3%82%B0bar@www.google.com/", "en", + kFormatUrlOmitNothing, UnescapeRule::NORMAL, + dont_omit_auth_offsets); + + const size_t view_source_offsets[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, kNpos, + kNpos, kNpos, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33 + }; + CheckAdjustedOffsets("view-source:http://foo@www.google.com/", "en", + kFormatUrlOmitUsernamePassword, UnescapeRule::NORMAL, + view_source_offsets); + + const size_t idn_hostname_offsets_1[] = { + 0, 1, 2, 3, 4, 5, 6, 7, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, + kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 12, + 13, 14, 15, 16, 17, 18, 19 + }; + // Convert punycode to "http://\x671d\x65e5\x3042\x3055\x3072.jp/foo/". + CheckAdjustedOffsets("http://xn--l8jvb1ey91xtjb.jp/foo/", "ja", + kFormatUrlOmitNothing, UnescapeRule::NORMAL, + idn_hostname_offsets_1); + + const size_t idn_hostname_offsets_2[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, kNpos, kNpos, kNpos, kNpos, kNpos, + kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 14, 15, kNpos, kNpos, kNpos, + kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, + kNpos, 19, 20, 21, 22, 23, 24 + }; + // Convert punycode to + // "http://test.\x89c6\x9891.\x5317\x4eac\x5927\x5b78.test/". 
+ CheckAdjustedOffsets("http://test.xn--cy2a840a.xn--1lq90ic7f1rc.test/", + "zh-CN", kFormatUrlOmitNothing, UnescapeRule::NORMAL, + idn_hostname_offsets_2); + + const size_t unescape_offsets[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, kNpos, kNpos, 26, 27, 28, 29, 30, kNpos, kNpos, kNpos, + kNpos, kNpos, kNpos, kNpos, kNpos, 31, kNpos, kNpos, kNpos, kNpos, kNpos, + kNpos, kNpos, kNpos, 32, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, + kNpos, 33, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos + }; + // Unescape to "http://www.google.com/foo bar/\x30B0\x30FC\x30B0\x30EB". + CheckAdjustedOffsets( + "http://www.google.com/foo%20bar/%E3%82%B0%E3%83%BC%E3%82%B0%E3%83%AB", + "en", kFormatUrlOmitNothing, UnescapeRule::SPACES, unescape_offsets); + + const size_t ref_offsets[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, + 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, kNpos, kNpos, 32, kNpos, kNpos, + 33 + }; + // Unescape to "http://www.google.com/foo.html#\x30B0\x30B0z". 
+ CheckAdjustedOffsets( + "http://www.google.com/foo.html#\xE3\x82\xB0\xE3\x82\xB0z", "en", + kFormatUrlOmitNothing, UnescapeRule::NORMAL, ref_offsets); + + const size_t omit_http_offsets[] = { + 0, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14 + }; + CheckAdjustedOffsets("http://www.google.com/", "en", kFormatUrlOmitHTTP, + UnescapeRule::NORMAL, omit_http_offsets); + + const size_t omit_http_start_with_ftp_offsets[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21 + }; + CheckAdjustedOffsets("http://ftp.google.com/", "en", kFormatUrlOmitHTTP, + UnescapeRule::NORMAL, omit_http_start_with_ftp_offsets); + + const size_t omit_all_offsets[] = { + 0, kNpos, kNpos, kNpos, kNpos, kNpos, kNpos, 0, kNpos, kNpos, kNpos, kNpos, + 0, 1, 2, 3, 4, 5, 6, 7 + }; + CheckAdjustedOffsets("http://user@foo.com/", "en", kFormatUrlOmitAll, + UnescapeRule::NORMAL, omit_all_offsets); +} } // namespace net |