diff options
Diffstat (limited to 'net/base/net_util_icu.cc')
-rw-r--r-- | net/base/net_util_icu.cc | 792 |
1 files changed, 788 insertions, 4 deletions
diff --git a/net/base/net_util_icu.cc b/net/base/net_util_icu.cc
index c174c92..259baba 100644
--- a/net/base/net_util_icu.cc
+++ b/net/base/net_util_icu.cc
@@ -4,19 +4,594 @@
 #include "net/base/net_util.h"
+#include <map>
+#include <vector>
+
 #include "base/i18n/time_formatting.h"
 #include "base/json/string_escape.h"
+#include "base/lazy_instance.h"
+#include "base/logging.h"
+#include "base/memory/singleton.h"
+#include "base/stl_util.h"
+#include "base/strings/string_tokenizer.h"
 #include "base/strings/string_util.h"
+#include "base/strings/utf_offset_string_conversions.h"
 #include "base/strings/utf_string_conversions.h"
-#include "net/base/escape.h"
+#include "base/time/time.h"
+#include "url/gurl.h"
+#include "third_party/icu/source/common/unicode/uidna.h"
+#include "third_party/icu/source/common/unicode/uniset.h"
+#include "third_party/icu/source/common/unicode/uscript.h"
+#include "third_party/icu/source/common/unicode/uset.h"
+#include "third_party/icu/source/i18n/unicode/datefmt.h"
+#include "third_party/icu/source/i18n/unicode/regex.h"
+#include "third_party/icu/source/i18n/unicode/ulocdata.h"
+
+using base::Time;
 namespace net {
+namespace {
+
+// List of string offsets; used by FormatUrl() to carry a single offset
+// through the vector-based FormatUrlWithOffsets().
+typedef std::vector<size_t> Offsets;
+
+// Does some simple normalization of scripts so we can allow certain scripts
+// to exist together: Katakana, Hiragana, Katakana-or-Hiragana and Hangul are
+// all reported as USCRIPT_HAN; every other script code is returned unchanged.
+// TODO(brettw) bug 880223: we should allow some other languages to be
+// combined such as Chinese and Latin. We will probably need a more
+// complicated system of language pairs to have more fine-grained control.
+UScriptCode NormalizeScript(UScriptCode code) {
+  switch (code) {
+    case USCRIPT_KATAKANA:
+    case USCRIPT_HIRAGANA:
+    case USCRIPT_KATAKANA_OR_HIRAGANA:
+    case USCRIPT_HANGUL:  // This one is arguable.
+      return USCRIPT_HAN;
+    default:
+      return code;
+  }
+}
+
+// Returns true if every code point in the UTF-16 buffer |str| (|str_len| code
+// units) has the same normalized script, ignoring code points whose script is
+// USCRIPT_COMMON. Returns false as soon as a second script is seen or ICU
+// reports an error for a code point. An empty buffer yields true.
+bool IsIDNComponentInSingleScript(const base::char16* str, int str_len) {
+  UScriptCode first_script = USCRIPT_INVALID_CODE;
+  bool is_first = true;
+
+  int i = 0;
+  while (i < str_len) {
+    unsigned code_point;
+    // Decodes one code point and advances |i| past it.
+    U16_NEXT(str, i, str_len, code_point);
+
+    UErrorCode err = U_ZERO_ERROR;
+    UScriptCode cur_script = uscript_getScript(code_point, &err);
+    if (err != U_ZERO_ERROR)
+      return false;  // Report mixed on error.
+    cur_script = NormalizeScript(cur_script);
+
+    // TODO(brettw) We may have to check for USCRIPT_INHERENT as well.
+    // NOTE(review): "USCRIPT_INHERENT" presumably means ICU's
+    // USCRIPT_INHERITED -- confirm.
+    if (is_first && cur_script != USCRIPT_COMMON) {
+      // The first non-common script becomes the reference for the rest.
+      first_script = cur_script;
+      is_first = false;
+    } else {
+      if (cur_script != USCRIPT_COMMON && cur_script != first_script)
+        return false;
+    }
+  }
+  return true;
+}
+
+// Check if the script of a language can be 'safely' mixed with
+// Latin letters in the ASCII range. Only the first two characters of |lang|
+// are inspected, so "zh-TW" is treated like "zh".
+bool IsCompatibleWithASCIILetters(const std::string& lang) {
+  // For now, just list Chinese, Japanese and Korean (positive list).
+  // An alternative is negative-listing (languages using Greek and
+  // Cyrillic letters), but it can be more dangerous.
+  return !lang.substr(0, 2).compare("zh") ||
+         !lang.substr(0, 2).compare("ja") ||
+         !lang.substr(0, 2).compare("ko");
+}
+
+// Maps a language code to the UnicodeSet cached for it. The mapped pointers
+// are deleted by ~LangToExemplarSet().
+typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap;
+
+// Singleton that holds the per-language exemplar-set cache. The map is
+// private; only the two friend functions below read and write it.
+class LangToExemplarSet {
+ public:
+  static LangToExemplarSet* GetInstance() {
+    return Singleton<LangToExemplarSet>::get();
+  }
+
+ private:
+  LangToExemplarSetMap map;
+  LangToExemplarSet() { }
+  ~LangToExemplarSet() {
+    // Frees every UnicodeSet stored as a map value.
+    STLDeleteContainerPairSecondPointers(map.begin(), map.end());
+  }
+
+  friend class Singleton<LangToExemplarSet>;
+  friend struct DefaultSingletonTraits<LangToExemplarSet>;
+  friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**);
+  friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*);
+
+  DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet);
+};
+
+// Looks up the cached set for |lang|. On a hit stores it in |*lang_set| and
+// returns true; on a miss returns false and leaves |*lang_set| untouched.
+bool GetExemplarSetForLang(const std::string& lang,
+                           icu::UnicodeSet** lang_set) {
+  const LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;
+  LangToExemplarSetMap::const_iterator pos = map.find(lang);
+  if (pos != map.end()) {
+    *lang_set = pos->second;
+    return true;
+  }
+  return false;
+}
+
+// Caches |lang_set| for |lang|. Uses map::insert, so an entry that already
+// exists for |lang| is not replaced.
+void SetExemplarSetForLang(const std::string& lang,
+                           icu::UnicodeSet* lang_set) {
+  LangToExemplarSetMap& map = LangToExemplarSet::GetInstance()->map;
+  map.insert(std::make_pair(lang, lang_set));
+}
+
+// Held by IsComponentCoveredByLang() around its cache lookup and fill.
+static base::LazyInstance<base::Lock>::Leaky
+    g_lang_set_lock = LAZY_INSTANCE_INITIALIZER;
+
+// Returns true if all the characters in component_characters are used by
+// the language |lang|. The exemplar set for |lang| is built on first use and
+// cached; when ICU has no data for |lang| an empty set is cached instead, in
+// which case this returns false.
+bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters,
+                              const std::string& lang) {
+  CR_DEFINE_STATIC_LOCAL(
+      const icu::UnicodeSet, kASCIILetters, ('a', 'z'));
+  icu::UnicodeSet* lang_set = nullptr;
+  // We're called from both the UI thread and the history thread.
+  {
+    base::AutoLock lock(g_lang_set_lock.Get());
+    if (!GetExemplarSetForLang(lang, &lang_set)) {
+      UErrorCode status = U_ZERO_ERROR;
+      ULocaleData* uld = ulocdata_open(lang.c_str(), &status);
+      // TODO(jungshik) Turn this check on when the ICU data file is
+      // rebuilt with the minimal subset of locale data for languages
+      // to which Chrome is not localized but which we offer in the list
+      // of languages selectable for Accept-Languages. With the rebuilt ICU
+      // data, ulocdata_open never should fall back to the default locale.
+      // (issue 2078)
+      // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING);
+      if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) {
+        lang_set = reinterpret_cast<icu::UnicodeSet*>(ulocdata_getExemplarSet(
+            uld, nullptr, 0, ULOCDATA_ES_STANDARD, &status));
+        // On success, if |lang| is compatible with ASCII Latin letters, add
+        // them.
+        if (lang_set && IsCompatibleWithASCIILetters(lang))
+          lang_set->addAll(kASCIILetters);
+      }
+
+      // Range (1, 0) has start > end; the isEmpty() test at the bottom of
+      // this function relies on the resulting set being empty.
+      if (!lang_set)
+        lang_set = new icu::UnicodeSet(1, 0);
+
+      lang_set->freeze();
+      SetExemplarSetForLang(lang, lang_set);
+      ulocdata_close(uld);
+    }
+  }
+  return !lang_set->isEmpty() && lang_set->containsAll(component_characters);
+}
+
+// Returns true if the given Unicode host component is safe to display to the
+// user. |str| / |str_len| is the already-decoded UTF-16 component and
+// |languages| is a comma-separated list of language codes. The checks run in
+// this order:
+//   1. reject if any character is in the dangerous-character set;
+//   2. reject if the dangerous-pattern regex matches;
+//   3. with an empty |languages|, accept only single-script components;
+//   4. otherwise accept if, after removing the always-allowed characters,
+//      some listed language's exemplar set covers the remainder.
+bool IsIDNComponentSafe(const base::char16* str,
+                        int str_len,
+                        const std::string& languages) {
+  // Most common cases (non-IDN) do not reach here so that we don't
+  // need a fast return path.
+  // TODO(jungshik) : Check if there's any character inappropriate
+  // (although allowed) for domain names.
+  // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and
+  // http://www.unicode.org/reports/tr39/data/xidmodifications.txt
+  // For now, we borrow the list from Mozilla and tweaked it slightly.
+  // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because
+  //  they're gonna be canonicalized to U+0020 and full stop before
+  //  reaching here.)
+  // The original list is available at
+  // http://kb.mozillazine.org/Network.IDN.blacklist_chars and
+  // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703
+
+  UErrorCode status = U_ZERO_ERROR;
+  // Both branches build the same set and regex; they differ only in whether
+  // the pattern is given as a wide literal or as invariant-charset text with
+  // \u escapes.
+#ifdef U_WCHAR_IS_UTF16
+  icu::UnicodeSet dangerous_characters(
+      icu::UnicodeString(
+          L"[[\\ \u00ad\u00bc\u00bd\u01c3\u0337\u0338"
+          L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"
+          L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"
+          L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"
+          L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"
+          L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"
+          L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"
+          L"[\ufffa-\ufffd]\U0001f50f\U0001f510\U0001f512\U0001f513]"),
+      status);
+  DCHECK(U_SUCCESS(status));
+  icu::RegexMatcher dangerous_patterns(icu::UnicodeString(
+      // Lone katakana no, so, or n
+      L"[^\\p{Katakana}][\u30ce\u30f3\u30bd][^\\p{Katakana}]"
+      // Repeating Japanese accent characters
+      L"|[\u3099\u309a\u309b\u309c][\u3099\u309a\u309b\u309c]"),
+      0, status);
+#else
+  icu::UnicodeSet dangerous_characters(icu::UnicodeString(
+      "[[\\u0020\\u00ad\\u00bc\\u00bd\\u01c3\\u0337\\u0338"
+      "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"
+      "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"
+      "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"
+      "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"
+      "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"
+      "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"
+      "[\\ufffa-\\ufffd]\\U0001f50f\\U0001f510\\U0001f512\\U0001f513]", -1,
+      US_INV), status);
+  DCHECK(U_SUCCESS(status));
+  icu::RegexMatcher dangerous_patterns(icu::UnicodeString(
+      // Lone katakana no, so, or n
+      "[^\\p{Katakana}][\\u30ce\\u30f3\\u30bd][^\\p{Katakana}]"
+      // Repeating Japanese accent characters
+      "|[\\u3099\\u309a\\u309b\\u309c][\\u3099\\u309a\\u309b\\u309c]"),
+      0, status);
+#endif
+  DCHECK(U_SUCCESS(status));
+  // Collect the distinct characters of the component for the set tests.
+  icu::UnicodeSet component_characters;
+  icu::UnicodeString component_string(str, str_len);
+  component_characters.addAll(component_string);
+  if (dangerous_characters.containsSome(component_characters))
+    return false;
+
+  DCHECK(U_SUCCESS(status));
+  dangerous_patterns.reset(component_string);
+  if (dangerous_patterns.find())
+    return false;
+
+  // If the language list is empty, the result is completely determined
+  // by whether a component is a single script or not. This will block
+  // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are
+  // allowed with |languages| (while it blocks Chinese + Latin letters with
+  // an accent as should be the case), but we want to err on the safe side
+  // when |languages| is empty.
+  if (languages.empty())
+    return IsIDNComponentInSingleScript(str, str_len);
+
+  // |common_characters| is made up of ASCII numbers, hyphen, plus and
+  // underscore that are used across scripts and allowed in domain names.
+  // (sync'd with characters allowed in url_canon_host with square
+  // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.
+  // Note that the pattern below also lists an escaped space.
+  icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),
+                                    status);
+  DCHECK(U_SUCCESS(status));
+  // Subtract common characters because they're always allowed so that
+  // we just have to check if a language-specific set contains
+  // the remainder.
+  component_characters.removeAll(common_characters);
+
+  // One covering language is enough to accept the component.
+  base::StringTokenizer t(languages, ",");
+  while (t.GetNext()) {
+    if (IsComponentCoveredByLang(component_characters, t.token()))
+      return true;
+  }
+  return false;
+}
+
+// A wrapper to use LazyInstance<>::Leaky with ICU's UIDNA, a C pointer to
+// a UTS46/IDNA 2008 handling object opened with uidna_openUTS46().
+//
+// We use UTS46 with BiDiCheck to migrate from IDNA 2003 to IDNA 2008 with
+// the backward compatibility in mind. What it does:
+//
+// 1. Use the up-to-date Unicode data.
+// 2.
Define a case folding/mapping with the up-to-date Unicode data as
+//    in IDNA 2003.
+// 3. Use transitional mechanism for 4 deviation characters (sharp-s,
+//    final sigma, ZWJ and ZWNJ) for now.
+// 4. Continue to allow symbols and punctuations.
+// 5. Apply new BiDi check rules more permissive than the IDNA 2003 BiDI rules.
+// 6. Do not apply STD3 rules
+// 7. Do not allow unassigned code points.
+//
+// It also closely matches what IE 10 does except for the BiDi check (
+// http://goo.gl/3XBhqw ).
+// See http://unicode.org/reports/tr46/ and references therein
+// for more details.
+struct UIDNAWrapper {
+  // Opens the UTS46 handle with BiDi checking enabled; |value| ends up NULL
+  // if ICU reports a failure.
+  UIDNAWrapper() {
+    UErrorCode open_status = U_ZERO_ERROR;
+    // TODO(jungshik): Change options as different parties (browsers,
+    // registrars, search engines) converge toward a consensus.
+    UIDNA* handle = uidna_openUTS46(UIDNA_CHECK_BIDI, &open_status);
+    value = U_FAILURE(open_status) ? NULL : handle;
+  }
+
+  UIDNA* value;
+};
+
+static base::LazyInstance<UIDNAWrapper>::Leaky
+    g_uidna = LAZY_INSTANCE_INITIALIZER;
+
+// Handles a single dot-free host label. A label that starts with "xn--" is
+// decoded with UTS46 and, if the decoded text passes IsIDNComponentSafe() for
+// |languages|, the decoded form is APPENDED to |out| and true is returned.
+// In every other case (not an IDN label, decode error, unsafe result) the
+// label is appended to |out| unchanged and false is returned. An empty label
+// appends nothing and returns false.
+bool IDNToUnicodeOneComponent(const base::char16* comp,
+                              size_t comp_len,
+                              const std::string& languages,
+                              base::string16* out) {
+  DCHECK(out);
+  if (comp_len == 0)
+    return false;
+
+  // Only labels that are longer than, and begin with, the ACE prefix can be
+  // IDN; anything else is copied through as-is.
+  static const base::char16 kIdnPrefix[] = {'x', 'n', '-', '-'};
+  const bool has_idn_prefix =
+      comp_len > arraysize(kIdnPrefix) &&
+      memcmp(comp, kIdnPrefix,
+             arraysize(kIdnPrefix) * sizeof(base::char16)) == 0;
+  if (!has_idn_prefix) {
+    out->append(comp, comp_len);
+    return false;
+  }
+
+  UIDNA* uidna = g_uidna.Get().value;
+  DCHECK(uidna != NULL);
+  const size_t base_length = out->length();
+  int converted_length = 64;
+  UIDNAInfo idna_info = UIDNA_INFO_INITIALIZER;
+  UErrorCode icu_status;
+  for (;;) {
+    out->resize(base_length + converted_length);
+    icu_status = U_ZERO_ERROR;
+    // The call reports the length it actually needs. When that exceeds the
+    // space offered, |icu_status| becomes U_BUFFER_OVERFLOW_ERROR and the
+    // next pass retries with a buffer of the reported size.
+    converted_length = uidna_labelToUnicode(
+        uidna, comp, static_cast<int32_t>(comp_len), &(*out)[base_length],
+        converted_length, &idna_info, &icu_status);
+    const bool retry_with_bigger_buffer =
+        icu_status == U_BUFFER_OVERFLOW_ERROR && idna_info.errors == 0;
+    if (!retry_with_bigger_buffer)
+      break;
+  }
+
+  const bool decoded_ok = U_SUCCESS(icu_status) && idna_info.errors == 0;
+  if (decoded_ok) {
+    // Trim to the decoded text, then make sure it is fit for display.
+    out->resize(base_length + converted_length);
+    if (IsIDNComponentSafe(out->data() + base_length, converted_length,
+                           languages))
+      return true;
+  }
+
+  // Decoding failed or the result is unsafe: drop whatever was written and
+  // fall back to the literal input.
+  out->resize(base_length);
+  out->append(comp, comp_len);
+  return false;
+}
+
+// TODO(brettw) bug 734373: check the scripts for each host component and
+// don't un-IDN-ize if there is more than one. Alternatively, only IDN for
+// scripts that the user has installed. For now, just put the entire
+// path through IDN. Maybe this feature can be implemented in ICU itself?
+//
+// We may want to skip this step in the case of file URLs to allow unicode
+// UNC hostnames regardless of encodings.
+// Converts |host| label by label: each dot-separated label goes through
+// IDNToUnicodeOneComponent() and the dots are copied across unchanged.
+// |adjustments| may be NULL; otherwise it is cleared first and receives one
+// entry (original offset, original length, new length) per label that was
+// actually converted.
+base::string16 IDNToUnicodeWithAdjustments(
+    const std::string& host,
+    const std::string& languages,
+    base::OffsetAdjuster::Adjustments* adjustments) {
+  if (adjustments)
+    adjustments->clear();
+
+  // ICU wants UTF-16; the ASCII input widens element by element.
+  const base::string16 wide_host(host.begin(), host.end());
+
+  // Script matching is enforced per label, so labels are handled one by one.
+  base::string16 result;
+  size_t label_begin = 0;
+  while (label_begin < wide_host.length()) {
+    // A label runs up to the next dot, or to the end for the last one.
+    size_t label_end = wide_host.find('.', label_begin);
+    if (label_end == base::string16::npos)
+      label_end = wide_host.length();
+    const size_t label_length = label_end - label_begin;
+    const size_t result_begin = result.length();
+
+    // Empty labels (consecutive dots) are skipped without a call.
+    const bool was_converted =
+        label_length > 0 &&
+        IDNToUnicodeOneComponent(wide_host.data() + label_begin, label_length,
+                                 languages, &result);
+
+    if (was_converted && adjustments) {
+      const size_t result_length = result.length() - result_begin;
+      adjustments->push_back(base::OffsetAdjuster::Adjustment(
+          label_begin, label_length, result_length));
+    }
+
+    // Re-emit the separator if this label ended at a dot.
+    if (label_end < wide_host.length())
+      result.push_back('.');
+
+    label_begin = label_end + 1;
+  }
+  return result;
+}
+
+// Shifts the start of |component| by |delta|; an invalid component is left
+// alone.
+void AdjustComponent(int delta, url::Component* component) {
+  if (component->is_valid()) {
+    DCHECK(delta >= 0 || component->begin >= -delta);
+    component->begin += delta;
+  }
+}
+
+// Adjusts all the components of |parsed| by |delta|, except for the scheme.
+void AdjustAllComponentsButScheme(int delta, url::Parsed* parsed) {
+  url::Component* const shifted_components[] = {
+      &parsed->username, &parsed->password, &parsed->host, &parsed->port,
+      &parsed->path,     &parsed->query,    &parsed->ref,
+  };
+  for (url::Component* component : shifted_components)
+    AdjustComponent(delta, component);
+}
+
+// Helper for FormatUrlWithAdjustments(): formats a "view-source:" URL by
+// formatting the URL that follows the prefix and then shifting every
+// adjustment, parsed component and |prefix_end| by the prefix length.
+base::string16 FormatViewSourceUrl(
+    const GURL& url,
+    const std::string& languages,
+    FormatUrlTypes format_types,
+    UnescapeRule::Type unescape_rules,
+    url::Parsed* new_parsed,
+    size_t* prefix_end,
+    base::OffsetAdjuster::Adjustments* adjustments) {
+  DCHECK(new_parsed);
+  const char kViewSource[] = "view-source:";
+  const size_t kViewSourceLength = arraysize(kViewSource) - 1;
+
+  // Format the wrapped URL; this also fills in |adjustments|.
+  adjustments->clear();
+  const GURL inner_url(
+      url.possibly_invalid_spec().substr(kViewSourceLength));
+  base::string16 result = base::ASCIIToUTF16(kViewSource);
+  result += FormatUrlWithAdjustments(inner_url, languages, format_types,
+                                     unescape_rules, new_parsed, prefix_end,
+                                     adjustments);
+
+  // The inner call never saw the prefix, so its offsets are short by the
+  // prefix length.
+  for (auto& adjustment : *adjustments)
+    adjustment.original_offset += kViewSourceLength;
+
+  // Fix up the parsed components the same way.
+  url::Component& scheme = new_parsed->scheme;
+  if (!scheme.is_nonempty()) {
+    scheme.begin = 0;
+    scheme.len = kViewSourceLength - 1;
+  } else {
+    // Treat "view-source:real-scheme" as the scheme.
+    scheme.len += kViewSourceLength;
+  }
+  AdjustAllComponentsButScheme(kViewSourceLength, new_parsed);
+
+  if (prefix_end)
+    *prefix_end += kViewSourceLength;
+
+  return result;
+}
+
+// Interface for the per-component conversion applied by
+// AppendFormattedComponent().
+class AppendComponentTransform {
+ public:
+  AppendComponentTransform() {}
+  virtual ~AppendComponentTransform() {}
+
+  // Converts |component_text| to UTF-16, recording offset changes in
+  // |adjustments|.
+  virtual base::string16 Execute(
+      const std::string& component_text,
+      base::OffsetAdjuster::Adjustments* adjustments) const = 0;
+
+  // NOTE: No DISALLOW_COPY_AND_ASSIGN here, since gcc < 4.3.0 requires an
+  // accessible copy constructor in order to call AppendFormattedComponent()
+  // with an inline temporary (see http://gcc.gnu.org/bugs/#cxx%5Frvalbind ).
+};
+
+// Transform for the host: runs it through IDNToUnicodeWithAdjustments().
+// Keeps a reference to |languages|, which must therefore outlive this object.
+class HostComponentTransform : public AppendComponentTransform {
+ public:
+  explicit HostComponentTransform(const std::string& languages)
+      : languages_(languages) {}
+
+ private:
+  base::string16 Execute(
+      const std::string& component_text,
+      base::OffsetAdjuster::Adjustments* adjustments) const override {
+    return IDNToUnicodeWithAdjustments(component_text, languages_,
+                                       adjustments);
+  }
+
+  const std::string& languages_;
+};
+
+// Transform for everything but the host: a plain UTF-8 to UTF-16 conversion
+// when no unescaping is requested, otherwise unescape-and-decode with the
+// given rules.
+class NonHostComponentTransform : public AppendComponentTransform {
+ public:
+  explicit NonHostComponentTransform(UnescapeRule::Type unescape_rules)
+      : unescape_rules_(unescape_rules) {}
+
+ private:
+  base::string16 Execute(
+      const std::string& component_text,
+      base::OffsetAdjuster::Adjustments* adjustments) const override {
+    if (unescape_rules_ == UnescapeRule::NONE)
+      return base::UTF8ToUTF16WithAdjustments(component_text, adjustments);
+    return UnescapeAndDecodeUTF8URLComponentWithAdjustments(
+        component_text, unescape_rules_, adjustments);
+  }
+
+  const UnescapeRule::Type unescape_rules_;
+};
+
+// Transforms the portion of |spec| covered by |original_component| according to
+// |transform|. Appends the result to |output|.
If |output_component| is
+// non-NULL, its start and length are set to the transformed component's new
+// start and length (or it is reset when |original_component| is empty). If
+// |adjustments| is non-NULL, appends adjustments (if any) that reflect the
+// transformation the original component underwent to become the transformed
+// value appended to |output|.
+void AppendFormattedComponent(const std::string& spec,
+                              const url::Component& original_component,
+                              const AppendComponentTransform& transform,
+                              base::string16* output,
+                              url::Component* output_component,
+                              base::OffsetAdjuster::Adjustments* adjustments) {
+  DCHECK(output);
+
+  // Nothing to append: just report an empty output component.
+  if (!original_component.is_nonempty()) {
+    if (output_component)
+      output_component->reset();
+    return;
+  }
+
+  const size_t source_begin = static_cast<size_t>(original_component.begin);
+  const size_t source_length = static_cast<size_t>(original_component.len);
+  const size_t output_begin = output->length();
+
+  // Run the transform on the component's text alone.
+  const std::string component_text(spec, source_begin, source_length);
+  base::OffsetAdjuster::Adjustments local_adjustments;
+  output->append(transform.Execute(component_text, &local_adjustments));
+
+  // The transform's offsets are relative to the component; rebase them onto
+  // |spec| before handing them to the caller.
+  for (auto& adjustment : local_adjustments)
+    adjustment.original_offset += source_begin;
+  if (adjustments) {
+    adjustments->insert(adjustments->end(), local_adjustments.begin(),
+                        local_adjustments.end());
+  }
+
+  // Report where the transformed text landed in |output|.
+  if (output_component) {
+    output_component->begin = static_cast<int>(output_begin);
+    output_component->len = static_cast<int>(output->length() - output_begin);
+  }
+}
+
+}  // namespace
+
+// Bit flags accepted in the |format_types| argument of the FormatUrl*()
+// functions.
+const FormatUrlType kFormatUrlOmitNothing = 0;
+const FormatUrlType kFormatUrlOmitUsernamePassword = 1 << 0;
+const FormatUrlType kFormatUrlOmitHTTP = 1 << 1;
+const FormatUrlType kFormatUrlOmitTrailingSlashOnBareHostname = 1 << 2;
+const FormatUrlType kFormatUrlOmitAll =
+    kFormatUrlOmitUsernamePassword |
+    kFormatUrlOmitHTTP |
+    kFormatUrlOmitTrailingSlashOnBareHostname;
+
+// Convenience form of IDNToUnicodeWithAdjustments() for callers that do not
+// need offset adjustments.
+base::string16 IDNToUnicode(const std::string& host,
+                            const std::string& languages) {
+  return IDNToUnicodeWithAdjustments(host, languages, NULL);
+}
+
 std::string GetDirectoryListingEntry(const base::string16& name,
                                      const std::string& raw_bytes,
                                      bool is_dir,
                                      int64_t size,
-                                     base::Time modified) {
+                                     Time modified) {
   std::string result;
   result.append("<script>addRow(");
   base::EscapeJSONString(name, true, &result);
@@ -26,7 +601,6 @@ std::string GetDirectoryListingEntry(const base::string16& name,
   } else {
     base::EscapeJSONString(EscapePath(raw_bytes), true, &result);
   }
-
   if (is_dir) {
     result.append(",1,");
   } else {
@@ -43,8 +617,9 @@ std::string GetDirectoryListingEntry(const base::string16& name,
   base::string16 modified_str;
   // |modified| can be NULL in FTP listings.
-  if (!modified.is_null())
+  if (!modified.is_null()) {
     modified_str = base::TimeFormatShortDateAndTime(modified);
+  }
   base::EscapeJSONString(modified_str, true, &result);
   result.append(");</script>\n");
@@ -52,4 +627,213 @@ std::string GetDirectoryListingEntry(const base::string16& name,
   return result;
 }
+// Appends the display form of |url|'s host (IDN-decoded where safe for
+// |languages|) to |output|. No parsed component or adjustments are reported.
+void AppendFormattedHost(const GURL& url,
+                         const std::string& languages,
+                         base::string16* output) {
+  AppendFormattedComponent(url.possibly_invalid_spec(),
+      url.parsed_for_possibly_invalid_spec().host,
+      HostComponentTransform(languages), output, NULL, NULL);
+}
+
+// Formats |url| via FormatUrlWithAdjustments() and then maps every entry of
+// |offsets_for_adjustment| from the original spec to the formatted string.
+// Offsets are finally passed through base::LimitOffset with the formatted
+// string's length as the limit.
+base::string16 FormatUrlWithOffsets(
+    const GURL& url,
+    const std::string& languages,
+    FormatUrlTypes format_types,
+    UnescapeRule::Type unescape_rules,
+    url::Parsed* new_parsed,
+    size_t* prefix_end,
+    std::vector<size_t>* offsets_for_adjustment) {
+  base::OffsetAdjuster::Adjustments adjustments;
+  const base::string16& format_url_return_value =
+      FormatUrlWithAdjustments(url, languages, format_types, unescape_rules,
+                               new_parsed, prefix_end, &adjustments);
+  base::OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
+  if (offsets_for_adjustment) {
+    std::for_each(
+        offsets_for_adjustment->begin(),
+        offsets_for_adjustment->end(),
+        base::LimitOffset<std::string>(format_url_return_value.length()));
+  }
+  return format_url_return_value;
+}
+
+// Builds the display string for |url| piece by piece (scheme, credentials,
+// host, port, path, query, ref), honouring the kFormatUrlOmit* bits in
+// |format_types| and applying |unescape_rules| to the non-host components.
+//   |new_parsed|   may be NULL; otherwise it is reset and then filled with
+//                  the component positions in the returned string.
+//   |prefix_end|   may be NULL; otherwise receives the length of everything
+//                  that precedes the host in the returned string.
+//   |adjustments|  must be non-NULL; it is cleared and then filled with the
+//                  offset changes between the original spec and the result.
+base::string16 FormatUrlWithAdjustments(
+    const GURL& url,
+    const std::string& languages,
+    FormatUrlTypes format_types,
+    UnescapeRule::Type unescape_rules,
+    url::Parsed* new_parsed,
+    size_t* prefix_end,
+    base::OffsetAdjuster::Adjustments* adjustments) {
+  DCHECK(adjustments != NULL);
+  adjustments->clear();
+  // Give the rest of the function a non-NULL |new_parsed| to write to.
+  url::Parsed parsed_temp;
+  if (!new_parsed)
+    new_parsed = &parsed_temp;
+  else
+    *new_parsed = url::Parsed();
+
+  // Special handling for view-source:. Don't use content::kViewSourceScheme
+  // because this library shouldn't depend on chrome.
+  const char kViewSource[] = "view-source";
+  // Reject "view-source:view-source:..." to avoid deep recursion.
+  const char kViewSourceTwice[] = "view-source:view-source:";
+  if (url.SchemeIs(kViewSource) &&
+      !base::StartsWith(url.possibly_invalid_spec(), kViewSourceTwice,
+                        base::CompareCase::INSENSITIVE_ASCII)) {
+    return FormatViewSourceUrl(url, languages, format_types,
+                               unescape_rules, new_parsed, prefix_end,
+                               adjustments);
+  }
+
+  // We handle both valid and invalid URLs (this will give us the spec
+  // regardless of validity).
+  const std::string& spec = url.possibly_invalid_spec();
+  const url::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
+
+  // Scheme & separators. These are ASCII.
+  base::string16 url_string;
+  url_string.insert(
+      url_string.end(), spec.begin(),
+      spec.begin() + parsed.CountCharactersBefore(url::Parsed::USERNAME, true));
+  const char kHTTP[] = "http://";
+  const char kFTP[] = "ftp.";
+  // url_fixer::FixupURL() treats "ftp.foo.com" as ftp://ftp.foo.com. This
+  // means that if we trim "http://" off a URL whose host starts with "ftp." and
+  // the user inputs this into any field subject to fixup (which is basically
+  // all input fields), the meaning would be changed. (In fact, often the
+  // formatted URL is directly pre-filled into an input field.) For this reason
+  // we avoid stripping "http://" in this case.
+  // At this point |url_string| holds only the scheme and separators, so the
+  // EqualsASCII() test below checks that the prefix is exactly "http://".
+  bool omit_http =
+      (format_types & kFormatUrlOmitHTTP) &&
+      base::EqualsASCII(url_string, kHTTP) &&
+      !base::StartsWith(url.host(), kFTP, base::CompareCase::SENSITIVE);
+  new_parsed->scheme = parsed.scheme;
+
+  // Username & password.
+  if ((format_types & kFormatUrlOmitUsernamePassword) != 0) {
+    // Remove the username and password fields. We don't want to display those
+    // to the user since they can be used for attacks,
+    // e.g. "http://google.com:search@evil.ru/"
+    new_parsed->username.reset();
+    new_parsed->password.reset();
+    // Update the adjustments based on removed username and/or password.
+    if (parsed.username.is_nonempty() || parsed.password.is_nonempty()) {
+      if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
+        // The seeming off-by-two is to account for the ':' after the username
+        // and '@' after the password.
+        adjustments->push_back(base::OffsetAdjuster::Adjustment(
+            static_cast<size_t>(parsed.username.begin),
+            static_cast<size_t>(parsed.username.len + parsed.password.len + 2),
+            0));
+      } else {
+        const url::Component* nonempty_component =
+            parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
+        // The seeming off-by-one is to account for the '@' after the
+        // username/password.
+        adjustments->push_back(base::OffsetAdjuster::Adjustment(
+            static_cast<size_t>(nonempty_component->begin),
+            static_cast<size_t>(nonempty_component->len + 1),
+            0));
+      }
+    }
+  } else {
+    // Credentials are kept: append them with their ':' and '@' separators.
+    AppendFormattedComponent(spec, parsed.username,
+                             NonHostComponentTransform(unescape_rules),
+                             &url_string, &new_parsed->username, adjustments);
+    if (parsed.password.is_valid())
+      url_string.push_back(':');
+    AppendFormattedComponent(spec, parsed.password,
+                             NonHostComponentTransform(unescape_rules),
+                             &url_string, &new_parsed->password, adjustments);
+    if (parsed.username.is_valid() || parsed.password.is_valid())
+      url_string.push_back('@');
+  }
+  // Everything appended so far is the prefix that precedes the host.
+  if (prefix_end)
+    *prefix_end = static_cast<size_t>(url_string.length());
+
+  // Host.
+  AppendFormattedComponent(spec, parsed.host, HostComponentTransform(languages),
+                           &url_string, &new_parsed->host, adjustments);
+
+  // Port.
+  if (parsed.port.is_nonempty()) {
+    url_string.push_back(':');
+    new_parsed->port.begin = url_string.length();
+    url_string.insert(url_string.end(),
+                      spec.begin() + parsed.port.begin,
+                      spec.begin() + parsed.port.end());
+    new_parsed->port.len = url_string.length() - new_parsed->port.begin;
+  } else {
+    new_parsed->port.reset();
+  }
+
+  // Path & query. Both get the same general unescape & convert treatment.
+  if (!(format_types & kFormatUrlOmitTrailingSlashOnBareHostname) ||
+      !CanStripTrailingSlash(url)) {
+    AppendFormattedComponent(spec, parsed.path,
+                             NonHostComponentTransform(unescape_rules),
+                             &url_string, &new_parsed->path, adjustments);
+  } else {
+    // The path is dropped from the output; record its removal.
+    if (parsed.path.len > 0) {
+      adjustments->push_back(base::OffsetAdjuster::Adjustment(
+          parsed.path.begin, parsed.path.len, 0));
+    }
+  }
+  if (parsed.query.is_valid())
+    url_string.push_back('?');
+  AppendFormattedComponent(spec, parsed.query,
+                           NonHostComponentTransform(unescape_rules),
+                           &url_string, &new_parsed->query, adjustments);
+
+  // Ref. This is valid, unescaped UTF-8, so we can just convert.
+  if (parsed.ref.is_valid())
+    url_string.push_back('#');
+  AppendFormattedComponent(spec, parsed.ref,
+                           NonHostComponentTransform(UnescapeRule::NONE),
+                           &url_string, &new_parsed->ref, adjustments);
+
+  // If we need to strip out http do it after the fact.
+  if (omit_http &&
+      base::StartsWith(url_string, base::ASCIIToUTF16(kHTTP),
+                       base::CompareCase::SENSITIVE)) {
+    const size_t kHTTPSize = arraysize(kHTTP) - 1;
+    url_string = url_string.substr(kHTTPSize);
+    // Because offsets in the |adjustments| are already calculated with respect
+    // to the string with the http:// prefix in it, those offsets remain correct
+    // after stripping the prefix. The only thing necessary is to add an
+    // adjustment to reflect the stripped prefix.
+    adjustments->insert(adjustments->begin(),
+                        base::OffsetAdjuster::Adjustment(0, kHTTPSize, 0));
+
+    if (prefix_end)
+      *prefix_end -= kHTTPSize;
+
+    // Adjust new_parsed.
+    DCHECK(new_parsed->scheme.is_valid());
+    int delta = -(new_parsed->scheme.len + 3);  // +3 for ://.
+    new_parsed->scheme.reset();
+    AdjustAllComponentsButScheme(delta, new_parsed);
+  }
+
+  return url_string;
+}
+
+// Single-offset convenience wrapper around FormatUrlWithOffsets().
+// |offset_for_adjustment| may be NULL; otherwise it is read as an offset into
+// the original spec and overwritten with the adjusted offset.
+base::string16 FormatUrl(const GURL& url,
+                         const std::string& languages,
+                         FormatUrlTypes format_types,
+                         UnescapeRule::Type unescape_rules,
+                         url::Parsed* new_parsed,
+                         size_t* prefix_end,
+                         size_t* offset_for_adjustment) {
+  Offsets offsets;
+  if (offset_for_adjustment)
+    offsets.push_back(*offset_for_adjustment);
+  base::string16 result = FormatUrlWithOffsets(url, languages, format_types,
+      unescape_rules, new_parsed, prefix_end, &offsets);
+  if (offset_for_adjustment)
+    *offset_for_adjustment = offsets[0];
+  return result;
+}
+
 }  // namespace net |