diff options
author | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-09-04 08:25:11 +0000 |
---|---|---|
committer | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-09-04 08:25:11 +0000 |
commit | 2b7568c160aa04ea8c1ef952c85dfc441d953473 (patch) | |
tree | 3d2fda02798786312d25bda3d2ddb5baa543e431 /chrome/renderer/spellchecker | |
parent | f34706be3d46c4b7f0aaf243c1e27e0015665838 (diff) | |
download | chromium_src-2b7568c160aa04ea8c1ef952c85dfc441d953473.zip chromium_src-2b7568c160aa04ea8c1ef952c85dfc441d953473.tar.gz chromium_src-2b7568c160aa04ea8c1ef952c85dfc441d953473.tar.bz2 |
Treats ASCII numbers as word characters only on LTR languages.
This change is a follow-up to my r145277 <http://crrev.com/145277>. That change always treats ASCII numbers as word characters, but this is not the expected behavior for RTL languages. This change stops treating ASCII numbers as word characters for RTL languages (Arabic and Hebrew).
BUG=145028
TEST=SpellCheckWordIteratorTest.TreatNumbersAsWordCharacters
Review URL: https://chromiumcodereview.appspot.com/10908031
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@154731 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer/spellchecker')
-rw-r--r-- | chrome/renderer/spellchecker/spellcheck_worditerator.cc | 9 | ||||
-rw-r--r-- | chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc | 64 |
2 files changed, 72 insertions, 1 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc index 96f1cf8..d6e6abe 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc +++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc @@ -64,7 +64,7 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { // For instance, U+05F4 is MidLetter. So, this may be // better, but it leads to an empty set error in Thai. // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" - "$ALetter = [\\p{script=%s} [0123456789]];" + "$ALetter = [\\p{script=%s}%s];" "$MidNumLet = [\\p{Word_Break = MidNumLet}];" "$MidLetter = [\\p{Word_Break = MidLetter}%s];" "$MidNum = [\\p{Word_Break = MidNum}];" @@ -154,6 +154,11 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) aletter_plus = kWithDictionary; + // Treat numbers as word characters except for Arabic and Hebrew. + const char* aletter_extra = " [0123456789]"; + if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC) + aletter_extra = ""; + const char kMidLetterExtra[] = ""; // For Hebrew, treat single/double quoation marks as MidLetter. 
const char kMidLetterExtraHebrew[] = "\"'"; @@ -171,12 +176,14 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { ruleset_allow_contraction_ = ASCIIToUTF16( base::StringPrintf(kRuleTemplate, aletter, + aletter_extra, midletter_extra, aletter_plus, kAllowContraction)); ruleset_disallow_contraction_ = ASCIIToUTF16( base::StringPrintf(kRuleTemplate, aletter, + aletter_extra, midletter_extra, aletter_plus, kDisallowContraction)); diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc index 37fbc71..1dc8614 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc +++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc @@ -164,3 +164,67 @@ TEST(SpellcheckWordIteratorTest, RuleSetConsistency) { EXPECT_EQ(0, actual_start); EXPECT_EQ(0, actual_end); } + +// Vertify our SpellcheckWordIterator can treat ASCII numbers as word characters +// on LTR languages. On the other hand, it should not treat ASCII numbers as +// word characters on RTL languages because they change the text direction from +// RTL to LTR. +TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) { + // A set of a language, a dummy word, and a text direction used in this test. + // For each language, this test splits a dummy word, which consists of ASCII + // numbers and an alphabet of the language, into words. When ASCII numbers are + // treated as word characters, the split word becomes equal to the dummy word. + // Otherwise, the split word does not include ASCII numbers. 
+ static const struct { + const char* language; + const wchar_t* text; + bool left_to_right; + } kTestCases[] = { + { + // English + "en-US", L"0123456789" L"a", true, + }, { + // Greek + "el-GR", L"0123456789" L"\x03B1", true, + }, { + // Russian + "ru-RU", L"0123456789" L"\x0430", true, + }, { + // Hebrew + "he-IL", L"0123456789" L"\x05D0", false, + }, { + // Arabic + "ar", L"0123456789" L"\x0627", false, + }, { + // Hindi + "hi-IN", L"0123456789" L"\x0905", true, + }, { + // Thai + "th-TH", L"0123456789" L"\x0e01", true, + }, { + // Korean + "ko-KR", L"0123456789" L"\x1100\x1161", true, + }, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kTestCases); ++i) { + SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i, + kTestCases[i].language)); + + SpellcheckCharAttribute attributes; + attributes.SetDefaultLanguage(kTestCases[i].language); + + string16 input_word(WideToUTF16(kTestCases[i].text)); + SpellcheckWordIterator iterator; + EXPECT_TRUE(iterator.Initialize(&attributes, true)); + EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length())); + + string16 actual_word; + int actual_start, actual_end; + EXPECT_TRUE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end)); + if (kTestCases[i].left_to_right) + EXPECT_EQ(input_word, actual_word); + else + EXPECT_NE(input_word, actual_word); + } +} |