diff options
author | mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-09-10 08:18:46 +0000 |
---|---|---|
committer | mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-09-10 08:18:46 +0000 |
commit | 4493574d71becba0ad1cbb74ce600466cc4f33e0 (patch) | |
tree | f50e1a49a49043de9e134087cb98f3b337fdf3b1 /base/string_util.cc | |
parent | a15cbd47814d62163e2d66e64e7cb9e144fa5ca1 (diff) | |
download | chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.zip chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.gz chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.bz2 |
Make the glob matcher support UTF8 strings.
This generalizes the existing pattern matching code to support UTF8 strings.
BUG=53158
TEST=string_util_unittests.cc
Review URL: http://codereview.chromium.org/3295018
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@59071 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/string_util.cc')
-rw-r--r-- | base/string_util.cc | 119 |
1 files changed, 78 insertions, 41 deletions
diff --git a/base/string_util.cc b/base/string_util.cc index a7f5258..68f3d91 100644 --- a/base/string_util.cc +++ b/base/string_util.cc @@ -1038,102 +1038,116 @@ string16 ReplaceStringPlaceholders(const string16& format_string, return result; } -template <class CHAR> -static bool IsWildcard(CHAR character) { +static bool IsWildcard(base_icu::UChar32 character) { return character == '*' || character == '?'; } // Move the strings pointers to the point where they start to differ. -template <class CHAR> -static void EatSameChars(const CHAR** pattern, const CHAR** string) { - bool escaped = false; - while (**pattern && **string) { - if (!escaped && IsWildcard(**pattern)) { +template <typename CHAR, typename NEXT> +static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end, + const CHAR** string, const CHAR* string_end, + NEXT next) { + const CHAR* escape = NULL; + while (*pattern != pattern_end && *string != string_end) { + if (!escape && IsWildcard(**pattern)) { // We don't want to match wildcard here, except if it's escaped. return; } // Check if the escapement char is found. If so, skip it and move to the // next character. - if (!escaped && **pattern == L'\\') { - escaped = true; - (*pattern)++; + if (!escape && **pattern == '\\') { + escape = *pattern; + next(pattern, pattern_end); continue; } // Check if the chars match, if so, increment the ptrs. - if (**pattern == **string) { - (*pattern)++; - (*string)++; + const CHAR* pattern_next = *pattern; + const CHAR* string_next = *string; + base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end); + if (pattern_char == next(&string_next, string_end) && + pattern_char != (base_icu::UChar32) CBU_SENTINEL) { + *pattern = pattern_next; + *string = string_next; } else { // Uh ho, it did not match, we are done. If the last char was an // escapement, that means that it was an error to advance the ptr here, // let's put it back where it was. This also mean that the MatchPattern // function will return false because if we can't match an escape char // here, then no one will. - if (escaped) { - (*pattern)--; + if (escape) { + *pattern = escape; } return; } - escaped = false; + escape = NULL; } } -template <class CHAR> -static void EatWildcard(const CHAR** pattern) { - while (**pattern) { +template <typename CHAR, typename NEXT> +static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) { + while (*pattern != end) { if (!IsWildcard(**pattern)) return; - (*pattern)++; + next(pattern, end); } } -template <class CHAR> -static bool MatchPatternT(const CHAR* eval, const CHAR* pattern, int depth) { +template <typename CHAR, typename NEXT> +static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end, + const CHAR* pattern, const CHAR* pattern_end, + int depth, + NEXT next) { const int kMaxDepth = 16; if (depth > kMaxDepth) return false; // Eat all the matching chars. - EatSameChars(&pattern, &eval); + EatSameChars(&pattern, pattern_end, &eval, eval_end, next); // If the string is empty, then the pattern must be empty too, or contains // only wildcards. - if (*eval == 0) { - EatWildcard(&pattern); - if (*pattern) - return false; - return true; + if (eval == eval_end) { + EatWildcard(&pattern, pattern_end, next); + return pattern == pattern_end; } // Pattern is empty but not string, this is not a match. - if (*pattern == 0) + if (pattern == pattern_end) return false; // If this is a question mark, then we need to compare the rest with // the current string or the string with one character eaten. + const CHAR* next_pattern = pattern; + next(&next_pattern, pattern_end); if (pattern[0] == '?') { - if (MatchPatternT(eval, pattern + 1, depth + 1) || - MatchPatternT(eval + 1, pattern + 1, depth + 1)) + if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) + return true; + const CHAR* next_eval = eval; + next(&next_eval, eval_end); + if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) return true; } // This is a *, try to match all the possible substrings with the remainder // of the pattern. if (pattern[0] == '*') { - while (*eval) { - if (MatchPatternT(eval, pattern + 1, depth + 1)) + while (eval != eval_end) { + if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) return true; eval++; } // We reached the end of the string, let see if the pattern contains only // wildcards. - if (*eval == 0) { - EatWildcard(&pattern); - if (*pattern) + if (eval == eval_end) { + EatWildcard(&pattern, pattern_end, next); + if (pattern != pattern_end) return false; return true; } @@ -1142,13 +1156,36 @@ static bool MatchPatternT(const CHAR* eval, const CHAR* pattern, int depth) { return false; } -bool MatchPatternWide(const std::wstring& eval, const std::wstring& pattern) { - return MatchPatternT(eval.c_str(), pattern.c_str(), 0); +struct NextCharUTF8 { + base_icu::UChar32 operator()(const char** p, const char* end) { + base_icu::UChar32 c; + int offset = 0; + CBU8_NEXT(*p, offset, end - *p, c); + *p += offset; + return c; + } +}; + +struct NextCharUTF16 { + base_icu::UChar32 operator()(const char16** p, const char16* end) { + base_icu::UChar32 c; + int offset = 0; + CBU16_NEXT(*p, offset, end - *p, c); + *p += offset; + return c; + } +}; + +bool MatchPattern(const std::string& eval, const std::string& pattern) { + return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), + pattern.c_str(), pattern.c_str() + pattern.size(), + 0, NextCharUTF8()); } -bool MatchPatternASCII(const std::string& eval, const std::string& pattern) { - DCHECK(IsStringASCII(eval) && IsStringASCII(pattern)); - return MatchPatternT(eval.c_str(), pattern.c_str(), 0); +bool MatchPattern(const string16& eval, const string16& pattern) { + return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), + pattern.c_str(), pattern.c_str() + pattern.size(), + 0, NextCharUTF16()); } // The following code is compatible with the OpenBSD lcpy interface. See: |