Searching for downloads currently does a plain substring search, which does not take into account different Unicode representations of characters. Used the ICU API to handle this.

BUG=chromium:71982
TEST=Check that MatchesQuery uses substring search taking into account Unicode normalization at primary level.
Review URL: http://codereview.chromium.org/7782009
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@101548 0039d316-1c4b-4281-b951-d872f2087c98
author: vanlam@google.com <vanlam@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2011-09-16 19:48:32 +0000
committer: vanlam@google.com <vanlam@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2011-09-16 19:48:32 +0000
commit: 5a3b50a265fca82189c0947cfef94ef84e351e81 (patch)
tree: 962ccc58d1052d40bc8c5de22bc5e343f9abe466 /base/i18n
parent: 4ee0c305219d65cb8602666b580337fd63eb7aa8 (diff)
download: chromium_src-5a3b50a265fca82189c0947cfef94ef84e351e81.zip
chromium_src-5a3b50a265fca82189c0947cfef94ef84e351e81.tar.gz
chromium_src-5a3b50a265fca82189c0947cfef94ef84e351e81.tar.bz2
3 files changed, 226 insertions, 0 deletions
diff --git a/base/i18n/string_search.cc b/base/i18n/string_search.cc
new file mode 100644
index 0000000..b2b29677
--- /dev/null
+++ b/base/i18n/string_search.cc
@@ -0,0 +1,45 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <unicode/usearch.h>
+
+#include "base/i18n/string_search.h"
+
+namespace {
+
+// Returns true if |in_this| contains |find_this| when the two are compared
+// with an ICU collator set to |strength|, using the collation rules of ICU's
+// current default locale (uloc_getDefault()).
+//
+// If ICU cannot create the searcher, this falls back to an exact
+// string16::find() substring search.
+bool CollationSensitiveStringSearch(const string16& find_this,
+                                    const string16& in_this,
+                                    UCollationStrength strength) {
+  UErrorCode status = U_ZERO_ERROR;
+
+  // Pass explicit lengths rather than -1 ("NUL-terminated"): string16 is a
+  // counted string that may contain embedded NULs, and data() is not required
+  // to be NUL-terminated.
+  UStringSearch* search = usearch_open(
+      find_this.data(), static_cast<int32_t>(find_this.size()),
+      in_this.data(), static_cast<int32_t>(in_this.size()),
+      uloc_getDefault(), NULL, &status);
+
+  // Default to basic substring search if usearch fails. According to
+  // http://icu-project.org/apiref/icu4c/usearch_8h.html, usearch_open will fail
+  // if either |find_this| or |in_this| are empty. In either case basic
+  // substring search will give the correct return value.
+  if (!U_SUCCESS(status)) {
+    // Do not leak a searcher in case one was handed back despite the error.
+    if (search)
+      usearch_close(search);
+    return in_this.find(find_this) != string16::npos;
+  }
+
+  // Adjust the strength of the collator that |search| uses, then reset the
+  // searcher so the new setting is in effect before searching.
+  UCollator* collator = usearch_getCollator(search);
+  ucol_setStrength(collator, strength);
+  usearch_reset(search);
+
+  const bool found = usearch_first(search, &status) != USEARCH_DONE;
+
+  // usearch_open() allocated |search|; release it before returning.
+  usearch_close(search);
+  return found;
+}
+
+}  // namespace
+
+namespace base {
+namespace i18n {
+
+// Reports whether |find_this| occurs in |in_this| when the strings are
+// compared at primary collation strength (UCOL_PRIMARY).
+bool StringSearchIgnoringCaseAndAccents(const string16& find_this,
+                                        const string16& in_this) {
+  const UCollationStrength kStrength = UCOL_PRIMARY;
+  const bool contains =
+      CollationSensitiveStringSearch(find_this, in_this, kStrength);
+  return contains;
+}
+
+}  // namespace i18n
+}  // namespace base
+
diff --git a/base/i18n/string_search.h b/base/i18n/string_search.h
new file mode 100644
index 0000000..6602451
--- /dev/null
+++ b/base/i18n/string_search.h
@@ -0,0 +1,27 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_STRING_SEARCH_H_
+#define BASE_I18N_STRING_SEARCH_H_
+#pragma once
+
+#include "base/i18n/base_i18n_export.h"
+#include "base/string16.h"
+
+namespace base {
+namespace i18n {
+
+// Returns true if |in_this| contains |find_this|. Only differences between base
+// letters are taken into consideration. Case and accent differences are
+// ignored. Please refer to 'primary level' in
+// http://userguide.icu-project.org/collation/concepts for additional details.
+//
+// The comparison uses the collation rules of ICU's current default locale, so
+// which characters count as the same base letter can vary by locale. An empty
+// |find_this| matches any |in_this|; a non-empty |find_this| never matches an
+// empty |in_this|.
+BASE_I18N_EXPORT
+    bool StringSearchIgnoringCaseAndAccents(const string16& find_this,
+                                            const string16& in_this);
+
+}  // namespace i18n
+}  // namespace base
+
+#endif  // BASE_I18N_STRING_SEARCH_H_
+
diff --git a/base/i18n/string_search_unittest.cc b/base/i18n/string_search_unittest.cc
new file mode 100644
index 0000000..fe877c0
--- /dev/null
+++ b/base/i18n/string_search_unittest.cc
@@ -0,0 +1,154 @@
+// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <string>
+#include <unicode/usearch.h>
+
+#include "base/i18n/rtl.h"
+#include "base/i18n/string_search.h"
+#include "base/utf_string_conversions.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+namespace base {
+namespace i18n {
+
+// Test fixture for the StringSearchIgnoringCaseAndAccents tests. It declares
+// no members or setup of its own; it only groups the TEST_F cases.
+class StringSearchTest : public testing::Test {
+};
+
+// Note on setting default locale for testing: The current default locale on
+// the Mac trybot is en_US_POSIX, with which primary-level collation strength
+// string search is case-sensitive, when normally it should be
+// case-insensitive. In other locales (including en_US which English speakers
+// in the U.S. use), this search would be case-insensitive as expected.
+
+// Checks ASCII needles and haystacks: plain containment, whitespace
+// sensitivity, overlapping prefixes, empty strings on either side, and
+// case-insensitive matching.
+TEST_F(StringSearchTest, ASCII) {
+  // Under en_US_POSIX the primary-strength search is case-sensitive, so switch
+  // to en_US for the duration of the test and restore the original afterwards.
+  std::string default_locale(uloc_getDefault());
+  bool locale_is_posix = (default_locale == "en_US_POSIX");
+  if (locale_is_posix)
+    SetICUDefaultLocale("en_US");
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      ASCIIToUTF16("hello"), ASCIIToUTF16("hello world")));
+
+  // The needle has more spaces after 'h' than the haystack, so it must not
+  // match.
+  EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(
+      ASCIIToUTF16("h    e l l o"), ASCIIToUTF16("h   e l l o")));
+
+  // The match starts after an initial partial match of the needle's prefix.
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      ASCIIToUTF16("aabaaa"), ASCIIToUTF16("aaabaabaaa")));
+
+  // Nothing non-empty can be found in an empty string.
+  EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(
+      ASCIIToUTF16("searching within empty string"), ASCIIToUTF16("")));
+
+  // The empty string is contained in every string.
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      ASCIIToUTF16(""), ASCIIToUTF16("searching for empty string")));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      ASCIIToUTF16("case insensitivity"), ASCIIToUTF16("CaSe InSeNsItIvItY")));
+
+  // c_str() (not data()) guarantees the NUL terminator a C-string consumer
+  // needs.
+  if (locale_is_posix)
+    SetICUDefaultLocale(default_locale.c_str());
+}
+
+// Checks that precomposed characters, base-letter-plus-combining-mark
+// sequences, and unaccented base letters all match each other regardless of
+// case and accent, and that different base letters ('a' vs. 'e') do not match.
+TEST_F(StringSearchTest, UnicodeLocaleIndependent) {
+  // Base characters
+  const string16 e_base = WideToUTF16(L"e");
+  const string16 E_base = WideToUTF16(L"E");
+
+  // Composed characters
+  const string16 e_with_acute_accent = WideToUTF16(L"\u00e9");
+  const string16 E_with_acute_accent = WideToUTF16(L"\u00c9");
+  const string16 e_with_grave_accent = WideToUTF16(L"\u00e8");
+  const string16 E_with_grave_accent = WideToUTF16(L"\u00c8");
+  const string16 a_with_acute_accent = WideToUTF16(L"\u00e1");
+
+  // Decomposed characters
+  const string16 e_with_acute_combining_mark = WideToUTF16(L"e\u0301");
+  const string16 E_with_acute_combining_mark = WideToUTF16(L"E\u0301");
+  const string16 e_with_grave_combining_mark = WideToUTF16(L"e\u0300");
+  const string16 E_with_grave_combining_mark = WideToUTF16(L"E\u0300");
+  const string16 a_with_acute_combining_mark = WideToUTF16(L"a\u0301");
+
+  // Under en_US_POSIX the primary-strength search is case-sensitive, so switch
+  // to en_US for the duration of the test and restore the original afterwards.
+  std::string default_locale(uloc_getDefault());
+  bool locale_is_posix = (default_locale == "en_US_POSIX");
+  if (locale_is_posix)
+    SetICUDefaultLocale("en_US");
+
+  // Base letter vs. precomposed accented letter, in both directions.
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_base, e_with_acute_accent));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_with_acute_accent, e_base));
+
+  // Base letter vs. decomposed accented letter, in both directions.
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_base, e_with_acute_combining_mark));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_with_acute_combining_mark, e_base));
+
+  // Decomposed vs. precomposed form of the same accented letter.
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_with_acute_combining_mark, e_with_acute_accent));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_with_acute_accent, e_with_acute_combining_mark));
+
+  // Different accents on the same base letter.
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_with_acute_combining_mark, e_with_grave_combining_mark));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_with_grave_combining_mark, e_with_acute_combining_mark));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_with_acute_combining_mark, e_with_grave_accent));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      e_with_grave_accent, e_with_acute_combining_mark));
+
+  // Case differences combined with accent and composition differences.
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      E_with_acute_accent, e_with_acute_accent));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      E_with_grave_accent, e_with_acute_accent));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      E_with_acute_combining_mark, e_with_grave_accent));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      E_with_grave_combining_mark, e_with_acute_accent));
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      E_base, e_with_grave_accent));
+
+  // Different base letters must not match even when the accent is the same.
+  EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(
+      a_with_acute_accent, e_with_acute_accent));
+
+  EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(
+      a_with_acute_combining_mark, e_with_acute_combining_mark));
+
+  // c_str() (not data()) guarantees the NUL terminator a C-string consumer
+  // needs.
+  if (locale_is_posix)
+    SetICUDefaultLocale(default_locale.c_str());
+}
+
+// Checks that the result depends on ICU's default locale: "a" is found in
+// U+00E5 (a with ring above) under the initial default locale, but not once
+// the default locale has been switched to "da".
+TEST_F(StringSearchTest, UnicodeLocaleDependent) {
+  // Base characters
+  const string16 a_base = WideToUTF16(L"a");
+
+  // Composed characters
+  const string16 a_with_ring = WideToUTF16(L"\u00e5");
+
+  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(
+      a_base, a_with_ring));
+
+  // Copy the locale name before changing the default, as the other tests in
+  // this file do, so the restore below does not depend on the lifetime of the
+  // buffer returned by uloc_getDefault().
+  const std::string default_locale(uloc_getDefault());
+  SetICUDefaultLocale("da");
+
+  // Presumably "da" collation treats a-with-ring as a distinct base letter;
+  // the expectation below encodes that the match no longer succeeds.
+  EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(
+      a_base, a_with_ring));
+
+  SetICUDefaultLocale(default_locale.c_str());
+}
+
+}  // namespace i18n
+}  // namespace base
+
author	vanlam@google.com <vanlam@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2011-09-16 19:48:32 +0000
committer	vanlam@google.com <vanlam@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2011-09-16 19:48:32 +0000
commit	5a3b50a265fca82189c0947cfef94ef84e351e81 (patch)
tree	962ccc58d1052d40bc8c5de22bc5e343f9abe466 /base/i18n
parent	4ee0c305219d65cb8602666b580337fd63eb7aa8 (diff)
download	chromium_src-5a3b50a265fca82189c0947cfef94ef84e351e81.zip chromium_src-5a3b50a265fca82189c0947cfef94ef84e351e81.tar.gz chromium_src-5a3b50a265fca82189c0947cfef94ef84e351e81.tar.bz2