diff options
-rw-r--r-- | base/base.gyp | 3 | ||||
-rw-r--r-- | base/i18n/string_search.cc | 45 | ||||
-rw-r--r-- | base/i18n/string_search.h | 27 | ||||
-rw-r--r-- | base/i18n/string_search_unittest.cc | 154 | ||||
-rw-r--r-- | content/browser/download/download_item.cc | 18 |
5 files changed, 236 insertions, 11 deletions
diff --git a/base/base.gyp b/base/base.gyp index 30dd274..8bca61bee 100644 --- a/base/base.gyp +++ b/base/base.gyp @@ -56,6 +56,8 @@ 'i18n/number_formatting.h', 'i18n/rtl.cc', 'i18n/rtl.h', + 'i18n/string_search.cc', + 'i18n/string_search.h', 'i18n/time_formatting.cc', 'i18n/time_formatting.h', ], @@ -142,6 +144,7 @@ 'i18n/icu_string_conversions_unittest.cc', 'i18n/number_formatting_unittest.cc', 'i18n/rtl_unittest.cc', + 'i18n/string_search_unittest.cc', 'i18n/time_formatting_unittest.cc', 'json/json_reader_unittest.cc', 'json/json_writer_unittest.cc', diff --git a/base/i18n/string_search.cc b/base/i18n/string_search.cc new file mode 100644 index 0000000..b2b29677 --- /dev/null +++ b/base/i18n/string_search.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <unicode/usearch.h> + +#include "base/i18n/string_search.h" + +namespace { + +bool CollationSensitiveStringSearch(const string16& find_this, + const string16& in_this, + UCollationStrength strength) { + UErrorCode status = U_ZERO_ERROR; + + UStringSearch* search = usearch_open(find_this.data(), -1, in_this.data(), -1, + uloc_getDefault(), NULL, &status); + + // Default to basic substring search if usearch fails. According to + // http://icu-project.org/apiref/icu4c/usearch_8h.html, usearch_open will fail + // if either |find_this| or |in_this| are empty. In either case basic + // substring search will give the correct return value. + if (!U_SUCCESS(status)) + return in_this.find(find_this) != string16::npos; + + UCollator* collator = usearch_getCollator(search); + ucol_setStrength(collator, strength); + usearch_reset(search); + + return usearch_first(search, &status) != USEARCH_DONE; +} + +} // namespace + +namespace base { +namespace i18n { + +bool StringSearchIgnoringCaseAndAccents(const string16& find_this, + const string16& in_this) { + return CollationSensitiveStringSearch(find_this, in_this, UCOL_PRIMARY); +} + +} // namespace i18n +} // namespace base + diff --git a/base/i18n/string_search.h b/base/i18n/string_search.h new file mode 100644 index 0000000..6602451 --- /dev/null +++ b/base/i18n/string_search.h @@ -0,0 +1,27 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_I18N_STRING_SEARCH_H_ +#define BASE_I18N_STRING_SEARCH_H_ +#pragma once + +#include "base/i18n/base_i18n_export.h" +#include "base/string16.h" + +namespace base { +namespace i18n { + +// Returns true if |in_this| contains |find_this|. Only differences between base +// letters are taken into consideration. Case and accent differences are +// ignored. Please refer to 'primary level' in +// http://userguide.icu-project.org/collation/concepts for additional details. +BASE_I18N_EXPORT + bool StringSearchIgnoringCaseAndAccents(const string16& find_this, + const string16& in_this); + +} // namespace i18n +} // namespace base + +#endif // BASE_I18N_STRING_SEARCH_H_ + diff --git a/base/i18n/string_search_unittest.cc b/base/i18n/string_search_unittest.cc new file mode 100644 index 0000000..fe877c0 --- /dev/null +++ b/base/i18n/string_search_unittest.cc @@ -0,0 +1,154 @@ +// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <string> +#include <unicode/usearch.h> + +#include "base/i18n/rtl.h" +#include "base/i18n/string_search.h" +#include "base/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace base { +namespace i18n { + +class StringSearchTest : public testing::Test { +}; + +// Note on setting default locale for testing: The current default locale on +// the Mac trybot is en_US_POSIX, with which primary-level collation strength +// string search is case-sensitive, when normally it should be +// case-insensitive. In other locales (including en_US which English speakers +// in the U.S. use), this search would be case-insensitive as expected. + +TEST_F(StringSearchTest, ASCII) { + std::string default_locale(uloc_getDefault()); + bool locale_is_posix = (default_locale == "en_US_POSIX"); + if (locale_is_posix) + SetICUDefaultLocale("en_US"); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + ASCIIToUTF16("hello"), ASCIIToUTF16("hello world"))); + + EXPECT_FALSE(StringSearchIgnoringCaseAndAccents( + ASCIIToUTF16("h e l l o"), ASCIIToUTF16("h e l l o"))); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + ASCIIToUTF16("aabaaa"), ASCIIToUTF16("aaabaabaaa"))); + + EXPECT_FALSE(StringSearchIgnoringCaseAndAccents( + ASCIIToUTF16("searching within empty string"), ASCIIToUTF16(""))); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + ASCIIToUTF16(""), ASCIIToUTF16("searching for empty string"))); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + ASCIIToUTF16("case insensitivity"), ASCIIToUTF16("CaSe InSeNsItIvItY"))); + + if (locale_is_posix) + SetICUDefaultLocale(default_locale.data()); +} + +TEST_F(StringSearchTest, UnicodeLocaleIndependent) { + // Base characters + const string16 e_base = WideToUTF16(L"e"); + const string16 E_base = WideToUTF16(L"E"); + const string16 a_base = WideToUTF16(L"a"); + + // Composed characters + const string16 e_with_accute_accent = WideToUTF16(L"\u00e9"); + const string16 E_with_accute_accent = WideToUTF16(L"\u00c9"); + const string16 e_with_grave_accent = WideToUTF16(L"\u00e8"); + const string16 E_with_grave_accent = WideToUTF16(L"\u00c8"); + const string16 a_with_accute_accent = WideToUTF16(L"\u00e1"); + + // Decomposed characters + const string16 e_with_accute_combining_mark = WideToUTF16(L"e\u0301"); + const string16 E_with_accute_combining_mark = WideToUTF16(L"E\u0301"); + const string16 e_with_grave_combining_mark = WideToUTF16(L"e\u0300"); + const string16 E_with_grave_combining_mark = WideToUTF16(L"E\u0300"); + const string16 a_with_accute_combining_mark = WideToUTF16(L"a\u0301"); + + std::string default_locale(uloc_getDefault()); + bool locale_is_posix = (default_locale == "en_US_POSIX"); + if (locale_is_posix) + SetICUDefaultLocale("en_US"); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_base, e_with_accute_accent)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_with_accute_accent, e_base)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_base, e_with_accute_combining_mark)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_with_accute_combining_mark, e_base)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_with_accute_combining_mark, e_with_accute_accent)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_with_accute_accent, e_with_accute_combining_mark)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_with_accute_combining_mark, e_with_grave_combining_mark)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_with_grave_combining_mark, e_with_accute_combining_mark)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_with_accute_combining_mark, e_with_grave_accent)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + e_with_grave_accent, e_with_accute_combining_mark)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + E_with_accute_accent, e_with_accute_accent)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + E_with_grave_accent, e_with_accute_accent)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + E_with_accute_combining_mark, e_with_grave_accent)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + E_with_grave_combining_mark, e_with_accute_accent)); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + E_base, e_with_grave_accent)); + + EXPECT_FALSE(StringSearchIgnoringCaseAndAccents( + a_with_accute_accent, e_with_accute_accent)); + + EXPECT_FALSE(StringSearchIgnoringCaseAndAccents( + a_with_accute_combining_mark, e_with_accute_combining_mark)); + + if (locale_is_posix) + SetICUDefaultLocale(default_locale.data()); +} + +TEST_F(StringSearchTest, UnicodeLocaleDependent) { + // Base characters + const string16 a_base = WideToUTF16(L"a"); + + // Composed characters + const string16 a_with_ring = WideToUTF16(L"\u00e5"); + + EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( + a_base, a_with_ring)); + + const char* default_locale = uloc_getDefault(); + SetICUDefaultLocale("da"); + + EXPECT_FALSE(StringSearchIgnoringCaseAndAccents( + a_base, a_with_ring)); + + SetICUDefaultLocale(default_locale); +} + +} // namespace i18n +} // namespace base + diff --git a/content/browser/download/download_item.cc b/content/browser/download/download_item.cc index 0809872..d37cd51 100644 --- a/content/browser/download/download_item.cc +++ b/content/browser/download/download_item.cc @@ -8,6 +8,7 @@ #include "base/file_util.h" #include "base/format_macros.h" #include "base/i18n/case_conversion.h" +#include "base/i18n/string_search.h" #include "base/logging.h" #include "base/metrics/histogram.h" #include "base/stringprintf.h" @@ -614,8 +615,8 @@ bool DownloadItem::MatchesQuery(const string16& query) const { DCHECK_EQ(query, base::i18n::ToLower(query)); - string16 url_raw(base::i18n::ToLower(UTF8ToUTF16(GetURL().spec()))); - if (url_raw.find(query) != string16::npos) + string16 url_raw(UTF8ToUTF16(GetURL().spec())); + if (base::i18n::StringSearchIgnoringCaseAndAccents(query, url_raw)) return true; // TODO(phajdan.jr): write a test case for the following code. @@ -627,17 +628,12 @@ bool DownloadItem::MatchesQuery(const string16& query) const { TabContents* tab = request_handle_.GetTabContents(); if (tab) languages = content::GetContentClient()->browser()->GetAcceptLangs(tab); - string16 url_formatted( - base::i18n::ToLower(net::FormatUrl(GetURL(), languages))); - if (url_formatted.find(query) != string16::npos) + string16 url_formatted(net::FormatUrl(GetURL(), languages)); + if (base::i18n::StringSearchIgnoringCaseAndAccents(query, url_formatted)) return true; - string16 path(base::i18n::ToLower(full_path().LossyDisplayName())); - // This shouldn't just do a substring match; it is wrong for Unicode - // due to normalization and we have a fancier search-query system - // used elsewhere. - // http://code.google.com/p/chromium/issues/detail?id=71982 - return (path.find(query) != string16::npos); + string16 path(full_path().LossyDisplayName()); + return base::i18n::StringSearchIgnoringCaseAndAccents(query, path); } void DownloadItem::SetFileCheckResults(const DownloadStateInfo& state) { |