Customize Hebrew spellcheck word break iterator

1. Treat ASCII double/single quotation marks between Hebrew letters as MidLetter for the Hebrew spellchecker because they're commonly used in place of Geresh and Gershayim. 2. Pass through ASCII double/single quotation marks and Geresh and Gershayim in OutputHebrew. See http://www.unicode.org/reports/tr29/proposed.html (version 6.0.0.0 draft2) about Hebrew tailoring. This alone does not fix bug 22909 completely. This CL will be followed with an ICU data fix. BUG=22909 TEST=unit_tests --gtest_filter=SpellcheckWordIteratorTest.* Review URL: http://codereview.chromium.org/3112015 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@57223 0039d316-1c4b-4281-b951-d872f2087c98
author: jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-24 19:53:54 +0000
committer: jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-24 19:53:54 +0000
commit: 1f9d8817a82e1375901d13bd484bbf220e406170 (patch)
tree: 0bc31066a3ae7cbf4e6c46cfaf0303ffdfe460fe /chrome/renderer/spellchecker
parent: b5977a0c447bfcf0605a28a05ddf3f017feb3ceb (diff)
download: chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.zip
chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.tar.gz
chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.tar.bz2
2 files changed, 36 insertions, 12 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
index 9647b87..3f5d69c 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -61,15 +61,19 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
       "$Extend       = [\\p{Word_Break = Extend}];"
       "$Format       = [\\p{Word_Break = Format}];"
       "$Katakana     = [\\p{Word_Break = Katakana}];"
+      // Not all the characters in a given script are ALetter.
+      // For instance, U+05F4 is MidLetter. So, this may be
+      // better, but it leads to an empty set error in Thai.
+      // "$ALetter   = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"
       "$ALetter      = [\\p{script=%s}];"
       "$MidNumLet    = [\\p{Word_Break = MidNumLet}];"
-      "$MidLetter    = [\\p{Word_Break = MidLetter}];"
+      "$MidLetter    = [\\p{Word_Break = MidLetter}%s];"
       "$MidNum       = [\\p{Word_Break = MidNum}];"
       "$Numeric      = [\\p{Word_Break = Numeric}];"
       "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"
 
       "$Control        = [\\p{Grapheme_Cluster_Break = Control}]; "
-      "%s"
+      "%s"  // ALetterPlus
 
       "$KatakanaEx     = $Katakana     ($Extend |  $Format)*;"
       "$ALetterEx      = $ALetterPlus  ($Extend |  $Format)*;"
@@ -89,7 +93,7 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
       "[^$CR $LF $Newline]? ($Extend |  $Format)+;"
       "$ALetterEx {200};"
       "$ALetterEx $ALetterEx {200};"
-      "%s"
+      "%s"  // (Allow|Disallow) Contraction
 
       "!!reverse;"
       "$BackALetterEx     = ($Format | $Extend)* $ALetterPlus;"
@@ -151,6 +155,13 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
   if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
     aletter_plus = kWithDictionary;
 
+  const char kMidLetterExtra[] = "";
+  // For Hebrew, treat single/double quotation marks as MidLetter.
+  const char kMidLetterExtraHebrew[] = "\"'";
+  const char* midletter_extra = kMidLetterExtra;
+  if (script_code_ == USCRIPT_HEBREW)
+    midletter_extra = kMidLetterExtraHebrew;
+
   // Create two custom rule-sets: one allows contraction and the other does not.
   // We save these strings in UTF-16 so we can use it without conversions. (ICU
   // needs UTF-16 strings.)
@@ -159,9 +170,9 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
   const char kDisallowContraction[] = "";
 
   ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,
-      aletter, aletter_plus, kAllowContraction));
+      aletter, midletter_extra, aletter_plus, kAllowContraction));
   ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,
-      aletter, aletter_plus, kDisallowContraction));
+      aletter, midletter_extra, aletter_plus, kDisallowContraction));
 }
 
 bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {
@@ -246,7 +257,10 @@ bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {
   // niqquds as misspelled. (Same as Arabic vowel marks, we need to check
   // niqquds manually and filter them out since their script codes are
   // USCRIPT_HEBREW.)
-  if (0x05D0 <= c && c <= 0x05EA)
+  // Pass through ASCII single/double quotation marks and Hebrew Geresh and
+  // Gershayim.
+  if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 ||
+      c == 0x05F4 || c == 0x05F3)
     output->push_back(c);
   return true;
 }
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
index da279fa..619850f 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
@@ -39,7 +39,13 @@ TEST(SpellcheckWordIteratorTest, SplitWord) {
       L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
       L"\x0443\x0439\x0442\x0435"
       // Hebrew (including niqquds)
-      L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd"
+      L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
+      // Hebrew words with U+0027 and U+05F3
+      L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
+      // Hebrew words with U+0022 and U+05F4
+      L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
+      // Hebrew words enclosed with ASCII quotes.
+      L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
       // Arabic (including vowel marks)
       L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
       L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
@@ -55,17 +61,18 @@ TEST(SpellcheckWordIteratorTest, SplitWord) {
       L"\x4F60\x597D"
       // Hangul Syllables
       L"\xC548\xB155\xD558\xC138\xC694"
-      // Full-width latin
-      L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F";
+      // Full-width latin : Hello
+      L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
+      L"e.g.,";
 
   // The languages and expected results used in this test.
   static const TestCase kTestCases[] = {
     {
       // English (keep contraction words)
-      "en-US", true, L"hello:hello affix Hello"
+      "en-US", true, L"hello:hello affix Hello e.g"
     }, {
       // English (split contraction words)
-      "en-US", false, L"hello hello affix Hello"
+      "en-US", false, L"hello hello affix Hello e g"
     }, {
       // Greek
       "el-GR", true,
@@ -78,7 +85,10 @@ TEST(SpellcheckWordIteratorTest, SplitWord) {
     }, {
       // Hebrew
       "he-IL", true,
-      L"\x05e9\x05dc\x05d5\x05dd"
+      L"\x05e9\x05dc\x05d5\x05dd "
+      L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
+      L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
+      L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
     }, {
       // Arabic
       "ar", true,
author	jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-08-24 19:53:54 +0000
committer	jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-08-24 19:53:54 +0000
commit	1f9d8817a82e1375901d13bd484bbf220e406170 (patch)
tree	0bc31066a3ae7cbf4e6c46cfaf0303ffdfe460fe /chrome/renderer/spellchecker
parent	b5977a0c447bfcf0605a28a05ddf3f017feb3ceb (diff)
download	chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.zip chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.tar.gz chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.tar.bz2