summaryrefslogtreecommitdiffstats
path: root/chrome/renderer/spellchecker
diff options
context:
space:
mode:
authorjshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-08-24 19:53:54 +0000
committerjshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-08-24 19:53:54 +0000
commit1f9d8817a82e1375901d13bd484bbf220e406170 (patch)
tree0bc31066a3ae7cbf4e6c46cfaf0303ffdfe460fe /chrome/renderer/spellchecker
parentb5977a0c447bfcf0605a28a05ddf3f017feb3ceb (diff)
downloadchromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.zip
chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.tar.gz
chromium_src-1f9d8817a82e1375901d13bd484bbf220e406170.tar.bz2
Customize Hebrew spellcheck word break iterator
1. Treats ASCII double/single quotation marks between Hebrew letters as MidLetter for the Hebrew spellchecker because they're commonly used in place of Geresh and Gershayim. 2. Passes through ASCII double/single quotation marks and Geresh and Gershayim in OutputHebrew. See http://www.unicode.org/reports/tr29/proposed.html (version 6.0.0.0 draft2) about Hebrew tailoring. This alone does not fix bug 22909 completely. This CL will be followed by an ICU data fix. BUG=22909 TEST=unit_tests --gtest_filter=SpellcheckWordIteratorTest.* Review URL: http://codereview.chromium.org/3112015 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@57223 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer/spellchecker')
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator.cc26
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc22
2 files changed, 36 insertions, 12 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
index 9647b87..3f5d69c 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -61,15 +61,19 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
"$Extend = [\\p{Word_Break = Extend}];"
"$Format = [\\p{Word_Break = Format}];"
"$Katakana = [\\p{Word_Break = Katakana}];"
+ // Not all the characters in a given script are ALetter.
+ // For instance, U+05F4 is MidLetter. So, this may be
+ // better, but it leads to an empty set error in Thai.
+ // "$ALetter = [[\\p{script=%s}] & [\\p{Word_Break = ALetter}]];"
"$ALetter = [\\p{script=%s}];"
"$MidNumLet = [\\p{Word_Break = MidNumLet}];"
- "$MidLetter = [\\p{Word_Break = MidLetter}];"
+ "$MidLetter = [\\p{Word_Break = MidLetter}%s];"
"$MidNum = [\\p{Word_Break = MidNum}];"
"$Numeric = [\\p{Word_Break = Numeric}];"
"$ExtendNumLet = [\\p{Word_Break = ExtendNumLet}];"
"$Control = [\\p{Grapheme_Cluster_Break = Control}]; "
- "%s"
+ "%s" // ALetterPlus
"$KatakanaEx = $Katakana ($Extend | $Format)*;"
"$ALetterEx = $ALetterPlus ($Extend | $Format)*;"
@@ -89,7 +93,7 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
"[^$CR $LF $Newline]? ($Extend | $Format)+;"
"$ALetterEx {200};"
"$ALetterEx $ALetterEx {200};"
- "%s"
+ "%s" // (Allow|Disallow) Contraction
"!!reverse;"
"$BackALetterEx = ($Format | $Extend)* $ALetterPlus;"
@@ -151,6 +155,13 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
if (script_code_ == USCRIPT_HANGUL || script_code_ == USCRIPT_THAI)
aletter_plus = kWithDictionary;
+ const char kMidLetterExtra[] = "";
+ // For Hebrew, treat single/double quotation marks as MidLetter.
+ const char kMidLetterExtraHebrew[] = "\"'";
+ const char* midletter_extra = kMidLetterExtra;
+ if (script_code_ == USCRIPT_HEBREW)
+ midletter_extra = kMidLetterExtraHebrew;
+
// Create two custom rule-sets: one allows contraction and the other does not.
// We save these strings in UTF-16 so we can use it without conversions. (ICU
// needs UTF-16 strings.)
@@ -159,9 +170,9 @@ void SpellcheckCharAttribute::CreateRuleSets(const std::string& language) {
const char kDisallowContraction[] = "";
ruleset_allow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,
- aletter, aletter_plus, kAllowContraction));
+ aletter, midletter_extra, aletter_plus, kAllowContraction));
ruleset_disallow_contraction_ = ASCIIToUTF16(StringPrintf(kRuleTemplate,
- aletter, aletter_plus, kDisallowContraction));
+ aletter, midletter_extra, aletter_plus, kDisallowContraction));
}
bool SpellcheckCharAttribute::OutputChar(UChar c, string16* output) const {
@@ -246,7 +257,10 @@ bool SpellcheckCharAttribute::OutputHebrew(UChar c, string16* output) const {
// niqquds as misspelled. (Same as Arabic vowel marks, we need to check
// niqquds manually and filter them out since their script codes are
// USCRIPT_HEBREW.)
- if (0x05D0 <= c && c <= 0x05EA)
+ // Pass through ASCII single/double quotation marks and Hebrew Geresh and
+ // Gershayim.
+ if ((0x05D0 <= c && c <= 0x05EA) || c == 0x22 || c == 0x27 ||
+ c == 0x05F4 || c == 0x05F3)
output->push_back(c);
return true;
}
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
index da279fa..619850f 100644
--- a/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator_unittest.cc
@@ -39,7 +39,13 @@ TEST(SpellcheckWordIteratorTest, SplitWord) {
L"\x0437\x0434\x0440\x0430\x0432\x0441\x0442\x0432"
L"\x0443\x0439\x0442\x0435"
// Hebrew (including niqquds)
- L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd"
+ L"\x05e9\x05c1\x05b8\x05dc\x05d5\x05b9\x05dd "
+ // Hebrew words with U+0027 and U+05F3
+ L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
+ // Hebrew words with U+0022 and U+05F4
+ L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
+ // Hebrew words enclosed with ASCII quotes.
+ L"\"\x05e6\x05d4\x0022\x05dc\" '\x05e9\x05c1\x05b8\x05dc\x05d5'"
// Arabic (including vowel marks)
L"\x0627\x064e\x0644\x0633\x064e\x0651\x0644\x0627"
L"\x0645\x064f\x0020\x0639\x064e\x0644\x064e\x064a"
@@ -55,17 +61,18 @@ TEST(SpellcheckWordIteratorTest, SplitWord) {
L"\x4F60\x597D"
// Hangul Syllables
L"\xC548\xB155\xD558\xC138\xC694"
- // Full-width latin
- L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F";
+ // Full-width latin : Hello
+ L"\xFF28\xFF45\xFF4C\xFF4C\xFF4F "
+ L"e.g.,";
// The languages and expected results used in this test.
static const TestCase kTestCases[] = {
{
// English (keep contraction words)
- "en-US", true, L"hello:hello affix Hello"
+ "en-US", true, L"hello:hello affix Hello e.g"
}, {
// English (split contraction words)
- "en-US", false, L"hello hello affix Hello"
+ "en-US", false, L"hello hello affix Hello e g"
}, {
// Greek
"el-GR", true,
@@ -78,7 +85,10 @@ TEST(SpellcheckWordIteratorTest, SplitWord) {
}, {
// Hebrew
"he-IL", true,
- L"\x05e9\x05dc\x05d5\x05dd"
+ L"\x05e9\x05dc\x05d5\x05dd "
+ L"\x05e6\x0027\x05d9\x05e4\x05e1 \x05e6\x05F3\x05d9\x05e4\x05e1 "
+ L"\x05e6\x05d4\x0022\x05dc \x05e6\x05d4\x05f4\x05dc "
+ L"\x05e6\x05d4\x0022\x05dc \x05e9\x05dc\x05d5"
}, {
// Arabic
"ar", true,