diff options
author | jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-24 19:53:54 +0000 |
---|---|---|
committer | jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-24 19:53:54 +0000 |
commit | 1f9d8817a82e1375901d13bd484bbf220e406170 (patch) | |
tree | 0bc31066a3ae7cbf4e6c46cfaf0303ffdfe460fe /chrome/renderer/spellchecker | |
parent | b5977a0c447bfcf0605a28a05ddf3f017feb3ceb (diff) | |
download | chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.zip chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.tar.gz chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.tar.bz2 |
Customize Hebrew spellcheck word break iterator
1. Treat ASCII double/single quotation marks between Hebrew letters as MidLetter
for the Hebrew spellchecker because they're commonly used in place of Geresh and Gershayim.
2. Pass through ASCII double/single quotation marks and Geresh and Gershayim in OutputHebrew.
See http://www.unicode.org/reports/tr29/proposed.html (version 6.0.0.0 draft2) about Hebrew
tailoring.
This alone does not fix bug 22909 completely. This CL will be followed by an ICU data fix.
BUG=22909
TEST=unit_tests --gtest_filter=SpellcheckWordIteratorTest.*
Review URL: http://codereview.chromium.org/3112015
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@57223 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer/spellchecker')
-rw-r--r-- | chrome/renderer/spellchecker/spellcheck_worditerator.cc | 26 | ||||
-rw-r--r-- | chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc | 22 |
2 files changed, 36 insertions, 12 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc index 9647b87..3f5d69c 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc +++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc @@ -61,15 +61,19 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { "$Extend = [\\p{Word_Break = Extend}];" "$Format = [\\p{Word_Break = Format}];" "$Katakana = [\\p{Word_Break = Katakana}];" + // Not all the characters in a given script are ALetter. + // For instance, U+05F4 is MidLetter. So, this may be + // better, but it leads to an empty set error in Thai. + // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];" "$ALetter = [\\p{script=%s}];" "$MidNumLet = [\\p{Word_Break = MidNumLet}];" - "$MidLetter = [\\p{Word_Break = MidLetter}];" + "$MidLetter = [\\p{Word_Break = MidLetter}%s];" "$MidNum = [\\p{Word_Break = MidNum}];" "$Numeric = [\\p{Word_Break = Numeric}];" "$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];" "$Control = [\\p{Grapheme_Cluster_Break = Control}]; " - "%s" + "%s" // ALetterPlus "$KatakanaEx = $Katakana ($Extend | $Format)*;" "$ALetterEx = $ALetterPlus ($Extend | $Format)*;" @@ -89,7 +93,7 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { "[^$CR $LF $Newline]? ($Extend | $Format)+;" "$ALetterEx {200};" "$ALetterEx $ALetterEx {200};" - "%s" + "%s" // (Allow|Disallow) Contraction "!!reverse;" "$BackALetterEx = ($Format | $Extend)* $ALetterPlus;" @@ -151,6 +155,13 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI) aletter_plus = kWithDictionary; + const char kMidLetterExtra[] = ""; + // For Hebrew, treat single/double quoation marks as MidLetter. 
+ const char kMidLetterExtraHebrew[] = "\"'"; + const char* midletter_extra = kMidLetterExtra; + if (script_code_ == USCRIPT_HEBREW) + midletter_extra = kMidLetterExtraHebrew; + // Create two custom rule-sets: one allows contraction and the other does not. // We save these strings in UTF-16 so we can use it without conversions. (ICU // needs UTF-16 strings.) @@ -159,9 +170,9 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) { const char kDisallowContraction[] = ""; ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, - aletter, aletter_plus, kAllowContraction)); + aletter, midletter_extra, aletter_plus, kAllowContraction)); ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate, - aletter, aletter_plus, kDisallowContraction)); + aletter, midletter_extra, aletter_plus, kDisallowContraction)); } bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const { @@ -246,7 +257,10 @@ bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const { // niqquds as misspelled. (Same as Arabic vowel marks, we need to check // niqquds manually and filter them out since their script codes are // USCRIPT_HEBREW.) - if (0x05D0 <= c && c <= 0x05EA) + // Pass through ASCII single/double quotation marks and Hebrew Geresh and + // Gershayim. 
+ if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 || + c == 0x05F4 || c == 0x05F3) output->push_back(c); return true; } diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc index da279fa..619850f 100644 --- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc +++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc @@ -39,7 +39,13 @@ TEST(SpellcheckWordIteratorTest, SplitWord) { L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432" L"\x0443\x0439\x0442\x0435" // Hebrew (including niqquds) - L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd" + L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd " + // Hebrew words with U+0027 and U+05F3 + L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 " + // Hebrew words with U+0022 and U+05F4 + L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc " + // Hebrew words enclosed with ASCII quotes. + L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'" // Arabic (including vowel marks) L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627" L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a" @@ -55,17 +61,18 @@ TEST(SpellcheckWordIteratorTest, SplitWord) { L"\x4F60\x597D" // Hangul Syllables L"\xC548\xB155\xD558\xC138\xC694" - // Full-width latin - L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F"; + // Full-width latin : Hello + L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F " + L"e.g.,"; // The languages and expected results used in this test. 
static const TestCase kTestCases[] = { { // English (keep contraction words) - "en-US", true, L"hello:hello affix Hello" + "en-US", true, L"hello:hello affix Hello e.g" }, { // English (split contraction words) - "en-US", false, L"hello hello affix Hello" + "en-US", false, L"hello hello affix Hello e g" }, { // Greek "el-GR", true, @@ -78,7 +85,10 @@ TEST(SpellcheckWordIteratorTest, SplitWord) { }, { // Hebrew "he-IL", true, - L"\x05e9\x05dc\x05d5\x05dd" + L"\x05e9\x05dc\x05d5\x05dd " + L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 " + L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc " + L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5" }, { // Arabic "ar", true, |