summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorrouslan@chromium.org <rouslan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-01-08 18:04:31 +0000
committerrouslan@chromium.org <rouslan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-01-08 18:04:31 +0000
commitccdc3010506fb0b5612071e367fe9ec36df7585b (patch)
tree43a787740bea5c3669f9877dfb4a5e1328322fa9
parentb980432bf58f0cdb31482a20c8c0937a98f2f3a2 (diff)
downloadchromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.zip
chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.tar.gz
chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.tar.bz2
Bump dictionary versions to 3-0
Bumps the dictionary versions to 3-0 to: 1) Use the dictionaries with checksums. 2) Add dictionaries for ko, sq, and ta. 3) Update dictionaries for lv, nl, ru, and sv. BUG=8397,8803,20083,61206,65115,104891,112227,113821 Review URL: https://chromiumcodereview.appspot.com/11566003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@175549 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--DEPS2
-rw-r--r--chrome/common/spellcheck_common.cc50
-rw-r--r--chrome/renderer/spellchecker/spellcheck_unittest.cc137
-rw-r--r--chrome/tools/convert_dict/aff_reader.cc55
-rw-r--r--chrome/tools/convert_dict/dic_reader.cc11
5 files changed, 157 insertions, 98 deletions
diff --git a/DEPS b/DEPS
index 218786f..96b4d56 100644
--- a/DEPS
+++ b/DEPS
@@ -84,7 +84,7 @@ deps = {
"/trunk/deps/third_party/hunspell@174476",
"src/third_party/hunspell_dictionaries":
- "/trunk/deps/third_party/hunspell_dictionaries@168258",
+ "/trunk/deps/third_party/hunspell_dictionaries@174658",
"src/third_party/safe_browsing/testing":
(Var("googlecode_url") % "google-safe-browsing") + "/trunk/testing@112",
diff --git a/chrome/common/spellcheck_common.cc b/chrome/common/spellcheck_common.cc
index e2cbdea..1c0976b 100644
--- a/chrome/common/spellcheck_common.cc
+++ b/chrome/common/spellcheck_common.cc
@@ -21,7 +21,7 @@ struct LanguageVersion {
static const LanguageRegion g_supported_spellchecker_languages[] = {
// Several languages are not to be included in the spellchecker list:
- // th-TH
+ // th-TH, vi-VI.
{"af", "af-ZA"},
{"bg", "bg-BG"},
{"ca", "ca-ES"},
@@ -43,6 +43,7 @@ static const LanguageRegion g_supported_spellchecker_languages[] = {
{"hu", "hu-HU"},
{"id", "id-ID"},
{"it", "it-IT"},
+ {"ko", "ko"},
{"lt", "lt-LT"},
{"lv", "lv-LV"},
{"nb", "nb-NO"},
@@ -52,11 +53,13 @@ static const LanguageRegion g_supported_spellchecker_languages[] = {
{"pt-PT", "pt-PT"},
{"ro", "ro-RO"},
{"ru", "ru-RU"},
+ {"sh", "sh"},
{"sk", "sk-SK"},
{"sl", "sl-SI"},
- {"sh", "sh"},
+ {"sq", "sq"},
{"sr", "sr"},
{"sv", "sv-SE"},
+ {"ta", "ta-IN"},
{"tr", "tr-TR"},
{"uk", "uk-UA"},
{"vi", "vi-VN"},
@@ -87,40 +90,17 @@ std::string GetSpellCheckLanguageRegion(const std::string& input_language) {
FilePath GetVersionedFileName(const std::string& input_language,
const FilePath& dict_dir) {
- // The default dictionary version is 1-2. These versions have been augmented
- // with additional words found by the translation team.
- static const char kDefaultVersionString[] = "-1-2";
-
+ // The default dictionary version is 3-0. This version indicates that the bdic
+ // file contains a checksum.
+ static const char kDefaultVersionString[] = "-3-0";
+
+ // Add non-default version strings here. Use the same version for all the
+ // dictionaries that you add at the same time. Increment the major version
+ // number if you're updating either dic or aff files. Increment the minor
+ // version number if you're updating only dic_delta files.
static LanguageVersion special_version_string[] = {
- {"es-ES", "-1-1"}, // 1-1: Have not been augmented with addtional words.
- {"nl-NL", "-1-1"},
- {"sv-SE", "-1-1"},
- {"he-IL", "-1-1"},
- {"el-GR", "-1-1"},
- {"hi-IN", "-1-1"},
- {"tr-TR", "-1-1"},
- {"et-EE", "-1-1"},
- {"lt-LT", "-1-3"}, // 1-3 (Feb 2009): new words, as well as an upgraded
- // dictionary.
- {"pl-PL", "-1-3"},
- {"fr-FR", "-2-0"}, // 2-0 (2010): upgraded dictionaries.
- {"hu-HU", "-2-0"},
- {"ro-RO", "-2-0"},
- {"ru-RU", "-2-0"},
- {"bg-BG", "-2-0"},
- {"sr", "-2-0"},
- {"uk-UA", "-2-0"},
- {"pt-BR", "-2-2"}, // 2-2 (Mar 2011): upgraded a dictionary.
- {"sh", "-2-2"}, // 2-2 (Mar 2011): added a dictionary.
- {"ca-ES", "-2-3"}, // 2-3 (May 2012): upgraded a dictionary.
- {"sv-SE", "-2-3"}, // 2-3 (May 2012): upgraded a dictionary.
- {"af-ZA", "-2-3"}, // 2-3 (May 2012): added a dictionary.
- {"fo-FO", "-2-3"}, // 2-3 (May 2012): added a dictionary.
- {"en-US", "-2-4"}, // 2-4 (October 2012): add more words.
- {"en-CA", "-2-4"},
- {"en-GB", "-2-5"}, // 2-5 (Nov 2012): Added NOSUGGEST flag = !.
- {"en-AU", "-2-5"}, // Marked 1 word in each.
-
+ {"et-EE", "-1-1"}, // No dic/aff files
+ {"tr-TR", "-1-1"}, // No dic/aff files
};
// Generate the bdict file name using default version string or special
diff --git a/chrome/renderer/spellchecker/spellcheck_unittest.cc b/chrome/renderer/spellchecker/spellcheck_unittest.cc
index e581a0c..34d3b0e 100644
--- a/chrome/renderer/spellchecker/spellcheck_unittest.cc
+++ b/chrome/renderer/spellchecker/spellcheck_unittest.cc
@@ -18,6 +18,7 @@
#include "testing/gtest/include/gtest/gtest.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebTextCheckingCompletion.h"
#include "third_party/WebKit/Source/WebKit/chromium/public/WebTextCheckingResult.h"
+#include "ui/base/l10n/l10n_util.h"
namespace {
@@ -136,14 +137,14 @@ class MockTextCheckingCompletion : public WebKit::WebTextCheckingCompletion {
// space character;
// * Tests for the function with an invalid English word with a preceding
// non-English word;
-// * Tests for the function with2 an invalid English word with a following
+// * Tests for the function with an invalid English word with a following
// space character;
// * Tests for the function with an invalid English word with a following
// non-English word, and;
// * Tests for the function with two invalid English words concatenated
// with space characters or non-English words.
// A test with a "[ROBUSTNESS]" mark shows it is a robustness test and it uses
-// grammartically incorrect string.
+// grammatically incorrect string.
// TODO(hbono): Please feel free to add more tests.
TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) {
static const struct {
@@ -165,7 +166,7 @@ TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) {
// A valid English word "hello".
{L"hello", true},
- // A valid Chinese word (meaning "hello") consisiting of two CJKV
+ // A valid Chinese word (meaning "hello") consisting of two CJKV
// ideographs
{L"\x4F60\x597D", true},
// A valid Korean word (meaning "hello") consisting of five hangul
@@ -184,7 +185,7 @@ TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) {
// Two valid Greek words (meaning "hello") consisting of seven Greek
// letters
{L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5", true},
- // A valid Russian word (meainng "hello") consisting of twelve Cyrillic
+ // A valid Russian word (meaning "hello") consisting of twelve Cyrillic
// letters
{L"\x0437\x0434\x0440\x0430\x0432\x0441"
L"\x0442\x0432\x0443\x0439\x0442\x0435", true},
@@ -747,6 +748,31 @@ TEST_F(SpellCheckTest, SpellCheckText) {
L"c\x00E1\x0063 th\x00F4ng tin c\x1EE7\x0061 "
L"th\x1EBF gi\x1EDBi va l\x00E0m cho n\x00F3 universal c\x00F3 "
L"th\x1EC3 truy c\x1EADp va h\x1EEFu d\x1EE5ng h\x01A1n."
+ }, {
+ // Korean
+ "ko",
+ L"Google\xC758 \xBAA9\xD45C\xB294 \xC804\xC138\xACC4\xC758 "
+ L"\xC815\xBCF4\xB97C \xCCB4\xACC4\xD654\xD558\xC5EC \xBAA8\xB450\xAC00 "
+ L"\xD3B8\xB9AC\xD558\xAC8C \xC774\xC6A9\xD560 \xC218 "
+ L"\xC788\xB3C4\xB85D \xD558\xB294 \xAC83\xC785\xB2C8\xB2E4."
+ }, {
+ // Albanian
+ "sq",
+ L"Misioni i Google \x00EBsht\x00EB q\x00EB t\x00EB organizoj\x00EB "
+ L"informacionin e bot\x00EBs dhe t\x00EB b\x00EBjn\x00EB at\x00EB "
+ L"universalisht t\x00EB arritshme dhe t\x00EB dobishme."
+ }, {
+ // Tamil
+ "ta",
+ L"Google \x0B87\x0BA9\x0BCD "
+ L"\x0BA8\x0BC7\x0BBE\x0B95\x0BCD\x0B95\x0BAE\x0BCD "
+ L"\x0B89\x0BB2\x0B95\x0BBF\x0BA9\x0BCD \x0BA4\x0B95\x0BB5\x0BB2\x0BCD "
+ L"\x0B8F\x0BB1\x0BCD\x0BAA\x0BBE\x0B9F\x0BC1 \x0B87\x0BA4\x0BC1 "
+ L"\u0B89\u0BB2\u0B95\u0BB3\u0BBE\u0BB5\u0BBF\u0BAF "
+ L"\x0B85\x0BA3\x0BC1\x0B95\x0B95\x0BCD \x0B95\x0BC2\x0B9F\x0BBF\x0BAF "
+ L"\x0BAE\x0BB1\x0BCD\x0BB1\x0BC1\x0BAE\x0BCD "
+ L"\x0BAA\x0BAF\x0BA9\x0BC1\x0BB3\x0BCD\x0BB3 "
+ L"\x0B9A\x0BC6\x0BAF\x0BCD\x0BAF \x0B89\x0BB3\x0BCD\x0BB3\x0BA4\x0BC1."
},
};
@@ -765,7 +791,13 @@ TEST_F(SpellCheckTest, SpellCheckText) {
&misspelling_start,
&misspelling_length, NULL);
- EXPECT_TRUE(result) << kTestCases[i].language;
+ EXPECT_TRUE(result)
+ << "\""
+ << std::wstring(kTestCases[i].input).substr(
+ misspelling_start, misspelling_length)
+ << "\" is misspelled in "
+ << kTestCases[i].language
+ << ".";
EXPECT_EQ(0, misspelling_start);
EXPECT_EQ(0, misspelling_length);
}
@@ -1156,62 +1188,91 @@ TEST_F(SpellCheckTest, EnglishWords) {
// Checks that NOSUGGEST works in English dictionaries.
TEST_F(SpellCheckTest, NoSuggest) {
static const struct {
+ const char* misspelling;
const char* input;
+ const char* locale;
bool should_pass;
} kTestCases[] = {
- {"cocksucker", true},
- {"cocksuckers", true},
+ {"suckerbert", "cocksucker", "en-GB", true},
+ {"suckerbert", "cocksucker", "en-US", true},
+ {"suckerbert", "cocksucker", "en-CA", true},
+ {"suckerbert", "cocksucker", "en-AU", true},
+ {"suckerbert", "cocksuckers", "en-GB", true},
+ {"suckerbert", "cocksuckers", "en-US", true},
+ {"suckerbert", "cocksuckers", "en-CA", true},
+ {"suckerbert", "cocksuckers", "en-AU", true},
+ {"Batasunaa", "Batasuna", "ca-ES", true},
+ {"pornoo", "porno", "it-IT", true},
+ {"catass", "catas", "lt-LT", true},
+ {"kuracc", "kurac", "sl-SI", true},
+ {"pittt", "pitt", "sv-SE", true},
};
- static const char* kLocales[] = { "en-GB", "en-US", "en-CA", "en-AU" };
-
- // First check that the NOSUGGEST flag didn't mark these words as not
- // being in the dictionary.
size_t test_cases_size = ARRAYSIZE_UNSAFE(kTestCases);
- for (size_t j = 0; j < arraysize(kLocales); ++j) {
- ReinitializeSpellCheck(kLocales[j]);
- for (size_t i = 0; i < test_cases_size; ++i) {
- size_t input_length = 0;
- if (kTestCases[i].input != NULL)
- input_length = strlen(kTestCases[i].input);
+ for (size_t i = 0; i < test_cases_size; ++i) {
+ ReinitializeSpellCheck(kTestCases[i].locale);
+ size_t input_length = 0;
+ if (kTestCases[i].input != NULL)
+ input_length = strlen(kTestCases[i].input);
- int misspelling_start = 0;
- int misspelling_length = 0;
- bool result = spell_check()->SpellCheckWord(
- ASCIIToUTF16(kTestCases[i].input).c_str(),
- static_cast<int>(input_length),
- 0,
- &misspelling_start,
- &misspelling_length, NULL);
+ // First check that the NOSUGGEST flag didn't mark this word as not being in
+ // the dictionary.
+ int misspelling_start = 0;
+ int misspelling_length = 0;
+ bool result = spell_check()->SpellCheckWord(
+ ASCIIToUTF16(kTestCases[i].input).c_str(),
+ static_cast<int>(input_length),
+ 0,
+ &misspelling_start,
+ &misspelling_length, NULL);
- EXPECT_EQ(kTestCases[i].should_pass, result) << kTestCases[i].input <<
- " in " << kLocales[j];
- }
- }
+ EXPECT_EQ(kTestCases[i].should_pass, result) << kTestCases[i].input <<
+ " in " << kTestCases[i].locale;
- // Now verify that neither of testCases show up as suggestions.
- for (size_t d = 0; d < arraysize(kLocales); ++d) {
- ReinitializeSpellCheck(kLocales[d]);
- int misspelling_start;
- int misspelling_length;
+ // Now verify that this test case does not show up as a suggestion.
std::vector<string16> suggestions;
- spell_check()->SpellCheckWord(
- ASCIIToUTF16("suckerbert").c_str(),
- 10,
+ input_length = 0;
+ if (kTestCases[i].misspelling != NULL)
+ input_length = strlen(kTestCases[i].misspelling);
+ result = spell_check()->SpellCheckWord(
+ ASCIIToUTF16(kTestCases[i].misspelling).c_str(),
+ static_cast<int>(input_length),
0,
&misspelling_start,
&misspelling_length,
&suggestions);
// Check if the suggested words occur.
+ EXPECT_FALSE(result) << kTestCases[i].misspelling
+ << " is not a misspelling in "
+ << kTestCases[i].locale;
for (int j = 0; j < static_cast<int>(suggestions.size()); j++) {
for (size_t t = 0; t < test_cases_size; t++) {
int compare_result =
suggestions.at(j).compare(ASCIIToUTF16(kTestCases[t].input));
EXPECT_FALSE(compare_result == 0) << kTestCases[t].input <<
- " in " << kLocales[d];
+ " in " << kTestCases[i].locale;
}
}
}
}
+// Check that the correct dictionary files are checked in.
+TEST_F(SpellCheckTest, DictionaryFiles) {
+ std::vector<std::string> locale_codes;
+ l10n_util::GetAcceptLanguagesForLocale("C", &locale_codes);
+ EXPECT_FALSE(locale_codes.empty());
+
+ std::vector<std::string> spellcheck_languages;
+ chrome::spellcheck_common::SpellCheckLanguages(&spellcheck_languages);
+ EXPECT_FALSE(spellcheck_languages.empty());
+ EXPECT_LE(spellcheck_languages.size(), locale_codes.size());
+
+ FilePath hunspell = GetHunspellDirectory();
+ for (size_t i = 0; i < spellcheck_languages.size(); ++i) {
+ FilePath dict = chrome::spellcheck_common::GetVersionedFileName(
+ spellcheck_languages[i], hunspell);
+ EXPECT_TRUE(file_util::PathExists(dict)) << dict.value() << " not found";
+ }
+}
+
#endif
diff --git a/chrome/tools/convert_dict/aff_reader.cc b/chrome/tools/convert_dict/aff_reader.cc
index b3b0381..b24a0d8 100644
--- a/chrome/tools/convert_dict/aff_reader.cc
+++ b/chrome/tools/convert_dict/aff_reader.cc
@@ -122,7 +122,7 @@ bool AffReader::Read() {
exit(1);
} else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
printf("We don't support the COMPLEXPREFIXES command yet. This would "
- "mean we have to insert words backwords as well (I think)\n");
+ "mean we have to insert words backwards as well (I think)\n");
exit(1);
} else {
// All other commands get stored in the other commands list.
@@ -241,7 +241,7 @@ void AffReader::AddAffix(std::string* rule) {
// so that means that this prefix would be a compound one.
//
// It expects these rules to use the same alias rules as the .dic
- // file. We've forced it to use aliases, which is a numberical index
+ // file. We've forced it to use aliases, which is a numerical index
// instead of these character flags, and this needs to be consistent.
std::string before_flags = part.substr(0, slash_index + 1);
@@ -250,13 +250,21 @@ void AffReader::AddAffix(std::string* rule) {
// that tells us what to strip.
std::vector<std::string> after_slash;
base::SplitString(part.substr(slash_index + 1), ' ', &after_slash);
- if (after_slash.size() < 2) {
- // Note that we may get a third term here which is the
- // morphological description of this rule. This happens in the tests
- // only, so we can just ignore it.
- printf("ERROR: Didn't get enough after the slash\n");
+ if (after_slash.size() == 0) {
+ printf("ERROR: Found 0 terms after slash in affix rule '%s', "
+ "but need at least 2.\n",
+ part.c_str());
return;
}
+ if (after_slash.size() == 1) {
+ printf("WARNING: Found 1 term after slash in affix rule '%s', "
+ "but expected at least 2. Adding '.'.\n",
+ part.c_str());
+ after_slash.push_back(".");
+ }
+ // Note that we may get a third term here which is the morphological
+ // description of this rule. This happens in the tests only, so we can
+ // just ignore it.
part = base::StringPrintf("%s%d %s",
before_flags.c_str(),
@@ -266,8 +274,11 @@ void AffReader::AddAffix(std::string* rule) {
// Reencode from here
std::string reencoded;
- if (!EncodingToUTF8(part, &reencoded))
+ if (!EncodingToUTF8(part, &reencoded)) {
+ printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n",
+ part.c_str());
break;
+ }
*rule = rule->substr(0, part_start) + reencoded;
break;
@@ -283,19 +294,26 @@ void AffReader::AddAffix(std::string* rule) {
void AffReader::AddReplacement(std::string* rule) {
TrimLine(rule);
+ CollapseDuplicateSpaces(rule);
std::string utf8rule;
- if (!EncodingToUTF8(*rule, &utf8rule))
+ if (!EncodingToUTF8(*rule, &utf8rule)) {
+ printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n",
+ rule->c_str());
return;
+ }
- std::vector<std::string> split;
- base::SplitString(utf8rule, ' ', &split);
-
- // There should be two parts.
- if (split.size() != 2)
+ // The first space separates key and value.
+ size_t space_index = utf8rule.find(' ');
+ if (space_index == std::string::npos) {
+ printf("ERROR: Did not find a space in '%s'.\n", utf8rule.c_str());
return;
+ }
+ std::vector<std::string> split;
+ split.push_back(utf8rule.substr(0, space_index));
+ split.push_back(utf8rule.substr(space_index + 1));
- // Underscores are used to represent spaces
+ // Underscores are used to represent spaces in most aff files
// (since the line is parsed on spaces).
std::replace(split[0].begin(), split[0].end(), '_', ' ');
std::replace(split[1].begin(), split[1].end(), '_', ' ');
@@ -309,8 +327,11 @@ void AffReader::HandleRawCommand(const std::string& line) {
void AffReader::HandleEncodedCommand(const std::string& line) {
std::string utf8;
- if (EncodingToUTF8(line, &utf8))
- other_commands_.push_back(utf8);
+ if (!EncodingToUTF8(line, &utf8)) {
+ printf("ERROR: Cannot encode command '%s' to utf8.\n", line.c_str());
+ return;
+ }
+ other_commands_.push_back(utf8);
}
} // namespace convert_dict
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc
index 967f07e..5ed5cfa 100644
--- a/chrome/tools/convert_dict/dic_reader.cc
+++ b/chrome/tools/convert_dict/dic_reader.cc
@@ -117,13 +117,10 @@ bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
std::set<int> affix_vector;
affix_vector.insert(affix_index);
- if (found == word_set->end()) {
+ if (found == word_set->end())
word_set->insert(std::make_pair(utf8word, affix_vector));
- } else {
- // The affixes of the delta file should override those in the
- // dictionary file.
- found->second.swap(affix_vector);
- }
+ else
+ found->second.insert(affix_index);
}
return true;
@@ -170,7 +167,6 @@ bool DicReader::Read(AffReader* aff_reader) {
PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
"UTF-8", false);
}
-
// Make sure the words are sorted, they may be unsorted in the input.
for (WordSet::iterator word = word_set.begin(); word != word_set.end();
++word) {
@@ -182,6 +178,7 @@ bool DicReader::Read(AffReader* aff_reader) {
// Double check that the affixes are sorted. This isn't strictly necessary
// but it's nice for the file to have a fixed layout.
std::sort(affixes.begin(), affixes.end());
+ std::reverse(affixes.begin(), affixes.end());
words_.push_back(std::make_pair(word->first, affixes));
}