Make the glob matcher support UTF8 strings.

This generalizes the existing pattern matching code to support UTF8 strings.

BUG=53158
TEST=string_util_unittests.cc

Review URL: http://codereview.chromium.org/3295018

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@59071 0039d316-1c4b-4281-b951-d872f2087c98
author: mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-09-10 08:18:46 +0000
committer: mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-09-10 08:18:46 +0000
commit: 4493574d71becba0ad1cbb74ce600466cc4f33e0 (patch)
tree: f50e1a49a49043de9e134087cb98f3b337fdf3b1 /base
parent: a15cbd47814d62163e2d66e64e7cb9e144fa5ca1 (diff)
download: chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.zip
chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.gz
chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.bz2
4 files changed, 139 insertions, 58 deletions
diff --git a/base/string_util.cc b/base/string_util.cc
index a7f5258..68f3d91 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -1038,102 +1038,116 @@ string16 ReplaceStringPlaceholders(const string16& format_string,
   return result;
 }
 
-template <class CHAR>
-static bool IsWildcard(CHAR character) {
+static bool IsWildcard(base_icu::UChar32 character) {
   return character == '*' || character == '?';
 }
 
 // Move the strings pointers to the point where they start to differ.
-template <class CHAR>
-static void EatSameChars(const CHAR** pattern, const CHAR** string) {
-  bool escaped = false;
-  while (**pattern && **string) {
-    if (!escaped && IsWildcard(**pattern)) {
+template <typename CHAR, typename NEXT>
+static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
+                         const CHAR** string, const CHAR* string_end,
+                         NEXT next) {
+  const CHAR* escape = NULL;
+  while (*pattern != pattern_end && *string != string_end) {
+    if (!escape && IsWildcard(**pattern)) {
       // We don't want to match wildcard here, except if it's escaped.
       return;
     }
 
     // Check if the escapement char is found. If so, skip it and move to the
     // next character.
-    if (!escaped && **pattern == L'\\') {
-      escaped = true;
-      (*pattern)++;
+    if (!escape && **pattern == '\\') {
+      escape = *pattern;
+      next(pattern, pattern_end);
       continue;
     }
 
     // Check if the chars match, if so, increment the ptrs.
-    if (**pattern == **string) {
-      (*pattern)++;
-      (*string)++;
+    const CHAR* pattern_next = *pattern;
+    const CHAR* string_next = *string;
+    base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
+    if (pattern_char == next(&string_next, string_end) &&
+        pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
+      *pattern = pattern_next;
+      *string = string_next;
     } else {
       // Uh ho, it did not match, we are done. If the last char was an
       // escapement, that means that it was an error to advance the ptr here,
       // let's put it back where it was. This also mean that the MatchPattern
       // function will return false because if we can't match an escape char
       // here, then no one will.
-      if (escaped) {
-        (*pattern)--;
+      if (escape) {
+        *pattern = escape;
       }
       return;
     }
 
-    escaped = false;
+    escape = NULL;
   }
 }
 
-template <class CHAR>
-static void EatWildcard(const CHAR** pattern) {
-  while (**pattern) {
+template <typename CHAR, typename NEXT>
+static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
+  while (*pattern != end) {
     if (!IsWildcard(**pattern))
       return;
-    (*pattern)++;
+    next(pattern, end);
   }
 }
 
-template <class CHAR>
-static bool MatchPatternT(const CHAR* eval, const CHAR* pattern, int depth) {
+template <typename CHAR, typename NEXT>
+static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
+                          const CHAR* pattern, const CHAR* pattern_end,
+                          int depth,
+                          NEXT next) {
   const int kMaxDepth = 16;
   if (depth > kMaxDepth)
     return false;
 
   // Eat all the matching chars.
-  EatSameChars(&pattern, &eval);
+  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
 
   // If the string is empty, then the pattern must be empty too, or contains
   // only wildcards.
-  if (*eval == 0) {
-    EatWildcard(&pattern);
-    if (*pattern)
-      return false;
-    return true;
+  if (eval == eval_end) {
+    EatWildcard(&pattern, pattern_end, next);
+    return pattern == pattern_end;
   }
 
   // Pattern is empty but not string, this is not a match.
-  if (*pattern == 0)
+  if (pattern == pattern_end)
     return false;
 
   // If this is a question mark, then we need to compare the rest with
   // the current string or the string with one character eaten.
+  const CHAR* next_pattern = pattern;
+  next(&next_pattern, pattern_end);
   if (pattern[0] == '?') {
-    if (MatchPatternT(eval, pattern + 1, depth + 1) ||
-        MatchPatternT(eval + 1, pattern + 1, depth + 1))
+    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
+                      depth + 1, next))
+      return true;
+    const CHAR* next_eval = eval;
+    next(&next_eval, eval_end);
+    if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
+                      depth + 1, next))
       return true;
   }
 
   // This is a *, try to match all the possible substrings with the remainder
   // of the pattern.
   if (pattern[0] == '*') {
-    while (*eval) {
-      if (MatchPatternT(eval, pattern + 1, depth + 1))
+    while (eval != eval_end) {
+      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
+                        depth + 1, next))
         return true;
       eval++;
     }
 
     // We reached the end of the string, let see if the pattern contains only
     // wildcards.
-    if (*eval == 0) {
-      EatWildcard(&pattern);
-      if (*pattern)
+    if (eval == eval_end) {
+      EatWildcard(&pattern, pattern_end, next);
+      if (pattern != pattern_end)
         return false;
       return true;
     }
@@ -1142,13 +1156,36 @@ static bool MatchPatternT(const CHAR* eval, const CHAR* pattern, int depth) {
   return false;
 }
 
-bool MatchPatternWide(const std::wstring& eval, const std::wstring& pattern) {
-  return MatchPatternT(eval.c_str(), pattern.c_str(), 0);
+struct NextCharUTF8 {
+  base_icu::UChar32 operator()(const char** p, const char* end) {
+    base_icu::UChar32 c;
+    int offset = 0;
+    CBU8_NEXT(*p, offset, end - *p, c);
+    *p += offset;
+    return c;
+  }
+};
+
+struct NextCharUTF16 {
+  base_icu::UChar32 operator()(const char16** p, const char16* end) {
+    base_icu::UChar32 c;
+    int offset = 0;
+    CBU16_NEXT(*p, offset, end - *p, c);
+    *p += offset;
+    return c;
+  }
+};
+
+bool MatchPattern(const std::string& eval, const std::string& pattern) {
+  return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
+                       pattern.c_str(), pattern.c_str() + pattern.size(),
+                       0, NextCharUTF8());
 }
 
-bool MatchPatternASCII(const std::string& eval, const std::string& pattern) {
-  DCHECK(IsStringASCII(eval) && IsStringASCII(pattern));
-  return MatchPatternT(eval.c_str(), pattern.c_str(), 0);
+bool MatchPattern(const string16& eval, const string16& pattern) {
+  return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
+                       pattern.c_str(), pattern.c_str() + pattern.size(),
+                       0, NextCharUTF16());
 }
 
 // The following code is compatible with the OpenBSD lcpy interface.  See:
diff --git a/base/string_util.h b/base/string_util.h
index 7788562..8370f8a 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -599,8 +599,8 @@ bool ElideString(const std::wstring& input, int max_len, std::wstring* output);
 // string can contain wildcards like * and ?
 // The backslash character (\) is an escape character for * and ?
 // We limit the patterns to having a max of 16 * or ? characters.
-bool MatchPatternWide(const std::wstring& string, const std::wstring& pattern);
-bool MatchPatternASCII(const std::string& string, const std::string& pattern);
+bool MatchPattern(const std::string& string, const std::string& pattern);
+bool MatchPattern(const string16& string, const string16& pattern);
 
 // Hack to convert any char-like type to its unsigned counterpart.
 // For example, it will convert char, signed char and unsigned char to unsigned
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index f0524bee..23b1f53 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -1064,22 +1064,36 @@ TEST(StringUtilTest, SplitStringAlongWhitespace) {
 }
 
 TEST(StringUtilTest, MatchPatternTest) {
-  EXPECT_EQ(MatchPatternASCII("www.google.com", "*.com"), true);
-  EXPECT_EQ(MatchPatternASCII("www.google.com", "*"), true);
-  EXPECT_EQ(MatchPatternASCII("www.google.com", "www*.g*.org"), false);
-  EXPECT_EQ(MatchPatternASCII("Hello", "H?l?o"), true);
-  EXPECT_EQ(MatchPatternASCII("www.google.com", "http://*)"), false);
-  EXPECT_EQ(MatchPatternASCII("www.msn.com", "*.COM"), false);
-  EXPECT_EQ(MatchPatternASCII("Hello*1234", "He??o\\*1*"), true);
-  EXPECT_EQ(MatchPatternASCII("", "*.*"), false);
-  EXPECT_EQ(MatchPatternASCII("", "*"), true);
-  EXPECT_EQ(MatchPatternASCII("", "?"), true);
-  EXPECT_EQ(MatchPatternASCII("", ""), true);
-  EXPECT_EQ(MatchPatternASCII("Hello", ""), false);
-  EXPECT_EQ(MatchPatternASCII("Hello*", "Hello*"), true);
+  EXPECT_TRUE(MatchPattern("www.google.com", "*.com"));
+  EXPECT_TRUE(MatchPattern("www.google.com", "*"));
+  EXPECT_FALSE(MatchPattern("www.google.com", "www*.g*.org"));
+  EXPECT_TRUE(MatchPattern("Hello", "H?l?o"));
+  EXPECT_FALSE(MatchPattern("www.google.com", "http://*)"));
+  EXPECT_FALSE(MatchPattern("www.msn.com", "*.COM"));
+  EXPECT_TRUE(MatchPattern("Hello*1234", "He??o\\*1*"));
+  EXPECT_FALSE(MatchPattern("", "*.*"));
+  EXPECT_TRUE(MatchPattern("", "*"));
+  EXPECT_TRUE(MatchPattern("", "?"));
+  EXPECT_TRUE(MatchPattern("", ""));
+  EXPECT_FALSE(MatchPattern("Hello", ""));
+  EXPECT_TRUE(MatchPattern("Hello*", "Hello*"));
   // Stop after a certain recursion depth.
-  EXPECT_EQ(MatchPatternASCII("12345678901234567890", "???????????????????*"),
-                              false);
+  EXPECT_FALSE(MatchPattern("123456789012345678", "?????????????????*"));
+
+  // Test UTF8 matching.
+  EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0", "*\xe2\x99\xa0"));
+  EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0.", "heart: ?."));
+  EXPECT_TRUE(MatchPattern("hearts: \xe2\x99\xa0\xe2\x99\xa0", "*"));
+  // Invalid sequences should be handled as a single invalid character.
+  EXPECT_TRUE(MatchPattern("invalid: \xef\xbf\xbe", "invalid: ?"));
+  // If the pattern has invalid characters, it shouldn't match anything.
+  EXPECT_FALSE(MatchPattern("\xf4\x90\x80\x80", "\xf4\x90\x80\x80"));
+
+  // Test UTF16 character matching.
+  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("www.google.com"),
+                           UTF8ToUTF16("*.com")));
+  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello*1234"),
+                           UTF8ToUTF16("He??o\\*1*")));
 }
 
 TEST(StringUtilTest, LcpyTest) {
diff --git a/base/third_party/icu/icu_utf.h b/base/third_party/icu/icu_utf.h
index 4d63eca..43b4967 100644
--- a/base/third_party/icu/icu_utf.h
+++ b/base/third_party/icu/icu_utf.h
@@ -332,6 +332,36 @@ UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c
 #define CBU16_MAX_LENGTH 2
 
 /**
+ * Get a code point from a string at a code point boundary offset,
+ * and advance the offset to the next code point boundary.
+ * (Post-incrementing forward iteration.)
+ * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
+ *
+ * The offset may point to the lead surrogate unit
+ * for a supplementary code point, in which case the macro will read
+ * the following trail surrogate as well.
+ * If the offset points to a trail surrogate or
+ * to a single, unpaired lead surrogate, then that itself
+ * will be returned as the code point.
+ *
+ * @param s const UChar * string
+ * @param i string offset, i<length
+ * @param length string length
+ * @param c output UChar32 variable
+ * @stable ICU 2.4
+ */
+#define CBU16_NEXT(s, i, length, c) { \
+    (c)=(s)[(i)++]; \
+    if(CBU16_IS_LEAD(c)) { \
+        uint16 __c2; \
+        if((i)<(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \
+            ++(i); \
+            (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \
+        } \
+    } \
+}
+
+/**
  * Append a code point to a string, overwriting 1 or 2 code units.
  * The offset points to the current end of the string contents
  * and is advanced (post-increment).
author	mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-09-10 08:18:46 +0000
committer	mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-09-10 08:18:46 +0000
commit	4493574d71becba0ad1cbb74ce600466cc4f33e0 (patch)
tree	f50e1a49a49043de9e134087cb98f3b337fdf3b1 /base
parent	a15cbd47814d62163e2d66e64e7cb9e144fa5ca1 (diff)
download	chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.zip chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.gz chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.bz2