Make the glob matcher support UTF8 strings.

This generalizes the existing pattern matching code to support UTF8 strings. BUG=53158 TEST=string_util_unittests.cc Review URL: http://codereview.chromium.org/3295018 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@59071 0039d316-1c4b-4281-b951-d872f2087c98
author: mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-09-10 08:18:46 +0000
committer: mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-09-10 08:18:46 +0000
commit: 4493574d71becba0ad1cbb74ce600466cc4f33e0 (patch)
tree: f50e1a49a49043de9e134087cb98f3b337fdf3b1 /base/string_util.cc
parent: a15cbd47814d62163e2d66e64e7cb9e144fa5ca1 (diff)
download: chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.zip
chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.gz
chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.bz2
1 files changed, 78 insertions, 41 deletions
diff --git a/base/string_util.cc b/base/string_util.cc
index a7f5258..68f3d91 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -1038,102 +1038,116 @@ string16 ReplaceStringPlaceholders(const string16& format_string,
   return result;
 }
 
-template <class CHAR>
-static bool IsWildcard(CHAR character) {
+static bool IsWildcard(base_icu::UChar32 character) {
   return character == '*' || character == '?';
 }
 
 // Move the strings pointers to the point where they start to differ.
-template <class CHAR>
-static void EatSameChars(const CHAR** pattern, const CHAR** string) {
-  bool escaped = false;
-  while (**pattern && **string) {
-    if (!escaped && IsWildcard(**pattern)) {
+template <typename CHAR, typename NEXT>
+static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end,
+                         const CHAR** string, const CHAR* string_end,
+                         NEXT next) {
+  const CHAR* escape = NULL;
+  while (*pattern != pattern_end && *string != string_end) {
+    if (!escape && IsWildcard(**pattern)) {
       // We don't want to match wildcard here, except if it's escaped.
       return;
     }
 
     // Check if the escapement char is found. If so, skip it and move to the
     // next character.
-    if (!escaped && **pattern == L'\\') {
-      escaped = true;
-      (*pattern)++;
+    if (!escape && **pattern == '\\') {
+      escape = *pattern;
+      next(pattern, pattern_end);
       continue;
     }
 
     // Check if the chars match, if so, increment the ptrs.
-    if (**pattern == **string) {
-      (*pattern)++;
-      (*string)++;
+    const CHAR* pattern_next = *pattern;
+    const CHAR* string_next = *string;
+    base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end);
+    if (pattern_char == next(&string_next, string_end) &&
+        pattern_char != (base_icu::UChar32) CBU_SENTINEL) {
+      *pattern = pattern_next;
+      *string = string_next;
     } else {
       // Uh ho, it did not match, we are done. If the last char was an
       // escapement, that means that it was an error to advance the ptr here,
       // let's put it back where it was. This also mean that the MatchPattern
       // function will return false because if we can't match an escape char
       // here, then no one will.
-      if (escaped) {
-        (*pattern)--;
+      if (escape) {
+        *pattern = escape;
       }
       return;
     }
 
-    escaped = false;
+    escape = NULL;
   }
 }
 
-template <class CHAR>
-static void EatWildcard(const CHAR** pattern) {
-  while (**pattern) {
+template <typename CHAR, typename NEXT>
+static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) {
+  while (*pattern != end) {
     if (!IsWildcard(**pattern))
       return;
-    (*pattern)++;
+    next(pattern, end);
   }
 }
 
-template <class CHAR>
-static bool MatchPatternT(const CHAR* eval, const CHAR* pattern, int depth) {
+template <typename CHAR, typename NEXT>
+static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end,
+                          const CHAR* pattern, const CHAR* pattern_end,
+                          int depth,
+                          NEXT next) {
   const int kMaxDepth = 16;
   if (depth > kMaxDepth)
     return false;
 
   // Eat all the matching chars.
-  EatSameChars(&pattern, &eval);
+  EatSameChars(&pattern, pattern_end, &eval, eval_end, next);
 
   // If the string is empty, then the pattern must be empty too, or contains
   // only wildcards.
-  if (*eval == 0) {
-    EatWildcard(&pattern);
-    if (*pattern)
-      return false;
-    return true;
+  if (eval == eval_end) {
+    EatWildcard(&pattern, pattern_end, next);
+    return pattern == pattern_end;
   }
 
   // Pattern is empty but not string, this is not a match.
-  if (*pattern == 0)
+  if (pattern == pattern_end)
     return false;
 
   // If this is a question mark, then we need to compare the rest with
   // the current string or the string with one character eaten.
+  const CHAR* next_pattern = pattern;
+  next(&next_pattern, pattern_end);
   if (pattern[0] == '?') {
-    if (MatchPatternT(eval, pattern + 1, depth + 1) ||
-        MatchPatternT(eval + 1, pattern + 1, depth + 1))
+    if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
+                      depth + 1, next))
+      return true;
+    const CHAR* next_eval = eval;
+    next(&next_eval, eval_end);
+    if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end,
+                      depth + 1, next))
       return true;
   }
 
   // This is a *, try to match all the possible substrings with the remainder
   // of the pattern.
   if (pattern[0] == '*') {
-    while (*eval) {
-      if (MatchPatternT(eval, pattern + 1, depth + 1))
+    while (eval != eval_end) {
+      if (MatchPatternT(eval, eval_end, next_pattern, pattern_end,
+                        depth + 1, next))
         return true;
       eval++;
     }
 
     // We reached the end of the string, let see if the pattern contains only
     // wildcards.
-    if (*eval == 0) {
-      EatWildcard(&pattern);
-      if (*pattern)
+    if (eval == eval_end) {
+      EatWildcard(&pattern, pattern_end, next);
+      if (pattern != pattern_end)
         return false;
       return true;
     }
@@ -1142,13 +1156,36 @@ static bool MatchPatternT(const CHAR* eval, const CHAR* pattern, int depth) {
   return false;
 }
 
-bool MatchPatternWide(const std::wstring& eval, const std::wstring& pattern) {
-  return MatchPatternT(eval.c_str(), pattern.c_str(), 0);
+struct NextCharUTF8 {
+  base_icu::UChar32 operator()(const char** p, const char* end) {
+    base_icu::UChar32 c;
+    int offset = 0;
+    CBU8_NEXT(*p, offset, end - *p, c);
+    *p += offset;
+    return c;
+  }
+};
+
+struct NextCharUTF16 {
+  base_icu::UChar32 operator()(const char16** p, const char16* end) {
+    base_icu::UChar32 c;
+    int offset = 0;
+    CBU16_NEXT(*p, offset, end - *p, c);
+    *p += offset;
+    return c;
+  }
+};
+
+bool MatchPattern(const std::string& eval, const std::string& pattern) {
+  return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
+                       pattern.c_str(), pattern.c_str() + pattern.size(),
+                       0, NextCharUTF8());
 }
 
-bool MatchPatternASCII(const std::string& eval, const std::string& pattern) {
-  DCHECK(IsStringASCII(eval) && IsStringASCII(pattern));
-  return MatchPatternT(eval.c_str(), pattern.c_str(), 0);
+bool MatchPattern(const string16& eval, const string16& pattern) {
+  return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(),
+                       pattern.c_str(), pattern.c_str() + pattern.size(),
+                       0, NextCharUTF16());
 }
 
 // The following code is compatible with the OpenBSD lcpy interface.  See:
author	mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-09-10 08:18:46 +0000
committer	mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-09-10 08:18:46 +0000
commit	4493574d71becba0ad1cbb74ce600466cc4f33e0 (patch)
tree	f50e1a49a49043de9e134087cb98f3b337fdf3b1 /base/string_util.cc
parent	a15cbd47814d62163e2d66e64e7cb9e144fa5ca1 (diff)
download	chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.zip chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.gz chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.bz2