summaryrefslogtreecommitdiffstats
path: root/chrome/renderer/spellchecker
diff options
context:
space:
mode:
authorhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-09-04 08:25:11 +0000
committerhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-09-04 08:25:11 +0000
commit2b7568c160aa04ea8c1ef952c85dfc441d953473 (patch)
tree3d2fda02798786312d25bda3d2ddb5baa543e431 /chrome/renderer/spellchecker
parentf34706be3d46c4b7f0aaf243c1e27e0015665838 (diff)
downloadchromium_src-2b7568c160aa04ea8c1ef952c85dfc441d953473.zip
chromium_src-2b7568c160aa04ea8c1ef952c85dfc441d953473.tar.gz
chromium_src-2b7568c160aa04ea8c1ef952c85dfc441d953473.tar.bz2
Treats ASCII numbers as word characters only on LTR languages.
This change is a follow-up to my r145277 <http://crrev.com/145277>. That change always treats ASCII numbers as word characters, which is not the expected behavior for RTL languages. This change prevents ASCII numbers from being treated as word characters for RTL languages (Arabic and Hebrew). BUG=145028 TEST=SpellCheckWordIteratorTest.TreatNumbersAsWordCharacters Review URL: https://chromiumcodereview.appspot.com/10908031 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@154731 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer/spellchecker')
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator.cc9
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc64
2 files changed, 72 insertions, 1 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
index 96f1cf8..d6e6abe 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -64,7 +64,7 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
// For instance, U+05F4 is MidLetter. So, this may be
// better, but it leads to an empty set error in Thai.
// "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"
- "$ALetter = [\\p{script=%s} [0123456789]];"
+ "$ALetter = [\\p{script=%s}%s];"
"$MidNumLet = [\\p{Word_Break = MidNumLet}];"
"$MidLetter = [\\p{Word_Break = MidLetter}%s];"
"$MidNum = [\\p{Word_Break = MidNum}];"
@@ -154,6 +154,11 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
aletter_plus = kWithDictionary;
+ // Treat numbers as word characters except for Arabic and Hebrew.
+ const char* aletter_extra = " [0123456789]";
+ if (script_code_ == USCRIPT_HEBREW || script_code_ == USCRIPT_ARABIC)
+ aletter_extra = "";
+
const char kMidLetterExtra[] = "";
// For Hebrew, treat single/double quoation marks as MidLetter.
const char kMidLetterExtraHebrew[] = "\"'";
@@ -171,12 +176,14 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
ruleset_allow_contraction_ = ASCIIToUTF16(
base::StringPrintf(kRuleTemplate,
aletter,
+ aletter_extra,
midletter_extra,
aletter_plus,
kAllowContraction));
ruleset_disallow_contraction_ = ASCIIToUTF16(
base::StringPrintf(kRuleTemplate,
aletter,
+ aletter_extra,
midletter_extra,
aletter_plus,
kDisallowContraction));
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
index 37fbc71..1dc8614 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
@@ -164,3 +164,67 @@ TEST(SpellcheckWordIteratorTest, RuleSetConsistency) {
EXPECT_EQ(0, actual_start);
EXPECT_EQ(0, actual_end);
}
+
+// Verify our SpellcheckWordIterator can treat ASCII numbers as word characters
+// on LTR languages. On the other hand, it should not treat ASCII numbers as
+// word characters on RTL languages because they change the text direction from
+// RTL to LTR.
+TEST(SpellcheckWordIteratorTest, TreatNumbersAsWordCharacters) {
+ // Each entry is a language, a dummy word, and a text direction for this test.
+ // For each language, this test splits a dummy word, which consists of ASCII
+ // numbers and a letter of the language, into words. When ASCII numbers are
+ // treated as word characters, the split word becomes equal to the dummy word.
+ // Otherwise, the split word does not include ASCII numbers.
+ static const struct {
+ const char* language;
+ const wchar_t* text;
+ bool left_to_right;
+ } kTestCases[] = {
+ {
+ // English
+ "en-US", L"0123456789" L"a", true,
+ }, {
+ // Greek
+ "el-GR", L"0123456789" L"\x03B1", true,
+ }, {
+ // Russian
+ "ru-RU", L"0123456789" L"\x0430", true,
+ }, {
+ // Hebrew
+ "he-IL", L"0123456789" L"\x05D0", false,
+ }, {
+ // Arabic
+ "ar", L"0123456789" L"\x0627", false,
+ }, {
+ // Hindi
+ "hi-IN", L"0123456789" L"\x0905", true,
+ }, {
+ // Thai
+ "th-TH", L"0123456789" L"\x0e01", true,
+ }, {
+ // Korean
+ "ko-KR", L"0123456789" L"\x1100\x1161", true,
+ },
+ };
+
+ for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kTestCases); ++i) {
+ SCOPED_TRACE(base::StringPrintf("kTestCases[%" PRIuS "]: language=%s", i,
+ kTestCases[i].language));
+
+ SpellcheckCharAttribute attributes;
+ attributes.SetDefaultLanguage(kTestCases[i].language);
+
+ string16 input_word(WideToUTF16(kTestCases[i].text));
+ SpellcheckWordIterator iterator;
+ EXPECT_TRUE(iterator.Initialize(&attributes, true));
+ EXPECT_TRUE(iterator.SetText(input_word.c_str(), input_word.length()));
+
+ string16 actual_word;
+ int actual_start, actual_end;
+ EXPECT_TRUE(iterator.GetNextWord(&actual_word, &actual_start, &actual_end));
+ if (kTestCases[i].left_to_right)
+ EXPECT_EQ(input_word, actual_word);
+ else
+ EXPECT_NE(input_word, actual_word);
+ }
+}