diff options
author | mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-09-10 08:18:46 +0000 |
---|---|---|
committer | mnissler@chromium.org <mnissler@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-09-10 08:18:46 +0000 |
commit | 4493574d71becba0ad1cbb74ce600466cc4f33e0 (patch) | |
tree | f50e1a49a49043de9e134087cb98f3b337fdf3b1 /base | |
parent | a15cbd47814d62163e2d66e64e7cb9e144fa5ca1 (diff) | |
download | chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.zip chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.gz chromium_src-4493574d71becba0ad1cbb74ce600466cc4f33e0.tar.bz2 |
Make the glob matcher support UTF8 strings.
This generalizes the existing pattern matching code to support UTF8 strings.
BUG=53158
TEST=string_util_unittests.cc
Review URL: http://codereview.chromium.org/3295018
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@59071 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/string_util.cc | 119 |
-rw-r--r-- | base/string_util.h | 4 |
-rw-r--r-- | base/string_util_unittest.cc | 44 |
-rw-r--r-- | base/third_party/icu/icu_utf.h | 30 |
4 files changed, 139 insertions, 58 deletions
diff --git a/base/string_util.cc b/base/string_util.cc index a7f5258..68f3d91 100644 --- a/base/string_util.cc +++ b/base/string_util.cc @@ -1038,102 +1038,116 @@ string16 ReplaceStringPlaceholders(const string16& format_string, return result; } -template <class CHAR> -static bool IsWildcard(CHAR character) { +static bool IsWildcard(base_icu::UChar32 character) { return character == '*' || character == '?'; } // Move the strings pointers to the point where they start to differ. -template <class CHAR> -static void EatSameChars(const CHAR** pattern, const CHAR** string) { - bool escaped = false; - while (**pattern && **string) { - if (!escaped && IsWildcard(**pattern)) { +template <typename CHAR, typename NEXT> +static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end, + const CHAR** string, const CHAR* string_end, + NEXT next) { + const CHAR* escape = NULL; + while (*pattern != pattern_end && *string != string_end) { + if (!escape && IsWildcard(**pattern)) { // We don't want to match wildcard here, except if it's escaped. return; } // Check if the escapement char is found. If so, skip it and move to the // next character. - if (!escaped && **pattern == L'\\') { - escaped = true; - (*pattern)++; + if (!escape && **pattern == '\\') { + escape = *pattern; + next(pattern, pattern_end); continue; } // Check if the chars match, if so, increment the ptrs. - if (**pattern == **string) { - (*pattern)++; - (*string)++; + const CHAR* pattern_next = *pattern; + const CHAR* string_next = *string; + base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end); + if (pattern_char == next(&string_next, string_end) && + pattern_char != (base_icu::UChar32) CBU_SENTINEL) { + *pattern = pattern_next; + *string = string_next; } else { // Uh ho, it did not match, we are done. If the last char was an // escapement, that means that it was an error to advance the ptr here, // let's put it back where it was. 
This also mean that the MatchPattern // function will return false because if we can't match an escape char // here, then no one will. - if (escaped) { - (*pattern)--; + if (escape) { + *pattern = escape; } return; } - escaped = false; + escape = NULL; } } -template <class CHAR> -static void EatWildcard(const CHAR** pattern) { - while (**pattern) { +template <typename CHAR, typename NEXT> +static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) { + while (*pattern != end) { if (!IsWildcard(**pattern)) return; - (*pattern)++; + next(pattern, end); } } -template <class CHAR> -static bool MatchPatternT(const CHAR* eval, const CHAR* pattern, int depth) { +template <typename CHAR, typename NEXT> +static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end, + const CHAR* pattern, const CHAR* pattern_end, + int depth, + NEXT next) { const int kMaxDepth = 16; if (depth > kMaxDepth) return false; // Eat all the matching chars. - EatSameChars(&pattern, &eval); + EatSameChars(&pattern, pattern_end, &eval, eval_end, next); // If the string is empty, then the pattern must be empty too, or contains // only wildcards. - if (*eval == 0) { - EatWildcard(&pattern); - if (*pattern) - return false; - return true; + if (eval == eval_end) { + EatWildcard(&pattern, pattern_end, next); + return pattern == pattern_end; } // Pattern is empty but not string, this is not a match. - if (*pattern == 0) + if (pattern == pattern_end) return false; // If this is a question mark, then we need to compare the rest with // the current string or the string with one character eaten. 
+ const CHAR* next_pattern = pattern; + next(&next_pattern, pattern_end); if (pattern[0] == '?') { - if (MatchPatternT(eval, pattern + 1, depth + 1) || - MatchPatternT(eval + 1, pattern + 1, depth + 1)) + if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) + return true; + const CHAR* next_eval = eval; + next(&next_eval, eval_end); + if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) return true; } // This is a *, try to match all the possible substrings with the remainder // of the pattern. if (pattern[0] == '*') { - while (*eval) { - if (MatchPatternT(eval, pattern + 1, depth + 1)) + while (eval != eval_end) { + if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) return true; eval++; } // We reached the end of the string, let see if the pattern contains only // wildcards. - if (*eval == 0) { - EatWildcard(&pattern); - if (*pattern) + if (eval == eval_end) { + EatWildcard(&pattern, pattern_end, next); + if (pattern != pattern_end) return false; return true; } @@ -1142,13 +1156,36 @@ static bool MatchPatternT(const CHAR* eval, const CHAR* pattern, int depth) { return false; } -bool MatchPatternWide(const std::wstring& eval, const std::wstring& pattern) { - return MatchPatternT(eval.c_str(), pattern.c_str(), 0); +struct NextCharUTF8 { + base_icu::UChar32 operator()(const char** p, const char* end) { + base_icu::UChar32 c; + int offset = 0; + CBU8_NEXT(*p, offset, end - *p, c); + *p += offset; + return c; + } +}; + +struct NextCharUTF16 { + base_icu::UChar32 operator()(const char16** p, const char16* end) { + base_icu::UChar32 c; + int offset = 0; + CBU16_NEXT(*p, offset, end - *p, c); + *p += offset; + return c; + } +}; + +bool MatchPattern(const std::string& eval, const std::string& pattern) { + return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), + pattern.c_str(), pattern.c_str() + pattern.size(), + 0, NextCharUTF8()); } -bool MatchPatternASCII(const 
std::string& eval, const std::string& pattern) { - DCHECK(IsStringASCII(eval) && IsStringASCII(pattern)); - return MatchPatternT(eval.c_str(), pattern.c_str(), 0); +bool MatchPattern(const string16& eval, const string16& pattern) { + return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), + pattern.c_str(), pattern.c_str() + pattern.size(), + 0, NextCharUTF16()); } // The following code is compatible with the OpenBSD lcpy interface. See: diff --git a/base/string_util.h b/base/string_util.h index 7788562..8370f8a 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -599,8 +599,8 @@ bool ElideString(const std::wstring& input, int max_len, std::wstring* output); // string can contain wildcards like * and ? // The backslash character (\) is an escape character for * and ? // We limit the patterns to having a max of 16 * or ? characters. -bool MatchPatternWide(const std::wstring& string, const std::wstring& pattern); -bool MatchPatternASCII(const std::string& string, const std::string& pattern); +bool MatchPattern(const std::string& string, const std::string& pattern); +bool MatchPattern(const string16& string, const string16& pattern); // Hack to convert any char-like type to its unsigned counterpart. 
// For example, it will convert char, signed char and unsigned char to unsigned diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index f0524bee..23b1f53 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -1064,22 +1064,36 @@ TEST(StringUtilTest, SplitStringAlongWhitespace) { } TEST(StringUtilTest, MatchPatternTest) { - EXPECT_EQ(MatchPatternASCII("www.google.com", "*.com"), true); - EXPECT_EQ(MatchPatternASCII("www.google.com", "*"), true); - EXPECT_EQ(MatchPatternASCII("www.google.com", "www*.g*.org"), false); - EXPECT_EQ(MatchPatternASCII("Hello", "H?l?o"), true); - EXPECT_EQ(MatchPatternASCII("www.google.com", "http://*)"), false); - EXPECT_EQ(MatchPatternASCII("www.msn.com", "*.COM"), false); - EXPECT_EQ(MatchPatternASCII("Hello*1234", "He??o\\*1*"), true); - EXPECT_EQ(MatchPatternASCII("", "*.*"), false); - EXPECT_EQ(MatchPatternASCII("", "*"), true); - EXPECT_EQ(MatchPatternASCII("", "?"), true); - EXPECT_EQ(MatchPatternASCII("", ""), true); - EXPECT_EQ(MatchPatternASCII("Hello", ""), false); - EXPECT_EQ(MatchPatternASCII("Hello*", "Hello*"), true); + EXPECT_TRUE(MatchPattern("www.google.com", "*.com")); + EXPECT_TRUE(MatchPattern("www.google.com", "*")); + EXPECT_FALSE(MatchPattern("www.google.com", "www*.g*.org")); + EXPECT_TRUE(MatchPattern("Hello", "H?l?o")); + EXPECT_FALSE(MatchPattern("www.google.com", "http://*)")); + EXPECT_FALSE(MatchPattern("www.msn.com", "*.COM")); + EXPECT_TRUE(MatchPattern("Hello*1234", "He??o\\*1*")); + EXPECT_FALSE(MatchPattern("", "*.*")); + EXPECT_TRUE(MatchPattern("", "*")); + EXPECT_TRUE(MatchPattern("", "?")); + EXPECT_TRUE(MatchPattern("", "")); + EXPECT_FALSE(MatchPattern("Hello", "")); + EXPECT_TRUE(MatchPattern("Hello*", "Hello*")); // Stop after a certain recursion depth. - EXPECT_EQ(MatchPatternASCII("12345678901234567890", "???????????????????*"), - false); + EXPECT_FALSE(MatchPattern("123456789012345678", "?????????????????*")); + + // Test UTF8 matching. 
+ EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0", "*\xe2\x99\xa0")); + EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0.", "heart: ?.")); + EXPECT_TRUE(MatchPattern("hearts: \xe2\x99\xa0\xe2\x99\xa0", "*")); + // Invalid sequences should be handled as a single invalid character. + EXPECT_TRUE(MatchPattern("invalid: \xef\xbf\xbe", "invalid: ?")); + // If the pattern has invalid characters, it shouldn't match anything. + EXPECT_FALSE(MatchPattern("\xf4\x90\x80\x80", "\xf4\x90\x80\x80")); + + // Test UTF16 character matching. + EXPECT_TRUE(MatchPattern(UTF8ToUTF16("www.google.com"), + UTF8ToUTF16("*.com"))); + EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello*1234"), + UTF8ToUTF16("He??o\\*1*"))); } TEST(StringUtilTest, LcpyTest) { diff --git a/base/third_party/icu/icu_utf.h b/base/third_party/icu/icu_utf.h index 4d63eca..43b4967 100644 --- a/base/third_party/icu/icu_utf.h +++ b/base/third_party/icu/icu_utf.h @@ -332,6 +332,36 @@ UChar32 utf8_nextCharSafeBody(const uint8 *s, int32 *pi, int32 length, UChar32 c #define CBU16_MAX_LENGTH 2 /** + * Get a code point from a string at a code point boundary offset, + * and advance the offset to the next code point boundary. + * (Post-incrementing forward iteration.) + * "Safe" macro, handles unpaired surrogates and checks for string boundaries. + * + * The offset may point to the lead surrogate unit + * for a supplementary code point, in which case the macro will read + * the following trail surrogate as well. + * If the offset points to a trail surrogate or + * to a single, unpaired lead surrogate, then that itself + * will be returned as the code point. 
+ * + * @param s const UChar * string + * @param i string offset, i<length + * @param length string length + * @param c output UChar32 variable + * @stable ICU 2.4 + */ +#define CBU16_NEXT(s, i, length, c) { \ + (c)=(s)[(i)++]; \ + if(CBU16_IS_LEAD(c)) { \ + uint16 __c2; \ + if((i)<(length) && CBU16_IS_TRAIL(__c2=(s)[(i)])) { \ + ++(i); \ + (c)=CBU16_GET_SUPPLEMENTARY((c), __c2); \ + } \ + } \ +} + +/** * Append a code point to a string, overwriting 1 or 2 code units. * The offset points to the current end of the string contents * and is advanced (post-increment). |