diff options
author | rouslan@chromium.org <rouslan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-01-08 18:04:31 +0000 |
---|---|---|
committer | rouslan@chromium.org <rouslan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-01-08 18:04:31 +0000 |
commit | ccdc3010506fb0b5612071e367fe9ec36df7585b (patch) | |
tree | 43a787740bea5c3669f9877dfb4a5e1328322fa9 | |
parent | b980432bf58f0cdb31482a20c8c0937a98f2f3a2 (diff) | |
download | chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.zip chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.tar.gz chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.tar.bz2 |
Bump dictionary versions to 3-0
Bumps the dictionary versions to 3-0 to:
1) Use the dictionaries with checksums.
2) Add dictionaries for ko, sq, and ta.
3) Update dictionaries for lv, nl, ru, and sv.
BUG=8397,8803,20083,61206,65115,104891,112227,113821
Review URL: https://chromiumcodereview.appspot.com/11566003
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@175549 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | DEPS | 2 | ||||
-rw-r--r-- | chrome/common/spellcheck_common.cc | 50 | ||||
-rw-r--r-- | chrome/renderer/spellchecker/spellcheck_unittest.cc | 137 | ||||
-rw-r--r-- | chrome/tools/convert_dict/aff_reader.cc | 55 | ||||
-rw-r--r-- | chrome/tools/convert_dict/dic_reader.cc | 11 |
5 files changed, 157 insertions, 98 deletions
@@ -84,7 +84,7 @@ deps = { "/trunk/deps/third_party/hunspell@174476", "src/third_party/hunspell_dictionaries": - "/trunk/deps/third_party/hunspell_dictionaries@168258", + "/trunk/deps/third_party/hunspell_dictionaries@174658", "src/third_party/safe_browsing/testing": (Var("googlecode_url") % "google-safe-browsing") + "/trunk/testing@112", diff --git a/chrome/common/spellcheck_common.cc b/chrome/common/spellcheck_common.cc index e2cbdea..1c0976b 100644 --- a/chrome/common/spellcheck_common.cc +++ b/chrome/common/spellcheck_common.cc @@ -21,7 +21,7 @@ struct LanguageVersion { static const LanguageRegion g_supported_spellchecker_languages[] = { // Several languages are not to be included in the spellchecker list: - // th-TH + // th-TH, vi-VI. {"af", "af-ZA"}, {"bg", "bg-BG"}, {"ca", "ca-ES"}, @@ -43,6 +43,7 @@ static const LanguageRegion g_supported_spellchecker_languages[] = { {"hu", "hu-HU"}, {"id", "id-ID"}, {"it", "it-IT"}, + {"ko", "ko"}, {"lt", "lt-LT"}, {"lv", "lv-LV"}, {"nb", "nb-NO"}, @@ -52,11 +53,13 @@ static const LanguageRegion g_supported_spellchecker_languages[] = { {"pt-PT", "pt-PT"}, {"ro", "ro-RO"}, {"ru", "ru-RU"}, + {"sh", "sh"}, {"sk", "sk-SK"}, {"sl", "sl-SI"}, - {"sh", "sh"}, + {"sq", "sq"}, {"sr", "sr"}, {"sv", "sv-SE"}, + {"ta", "ta-IN"}, {"tr", "tr-TR"}, {"uk", "uk-UA"}, {"vi", "vi-VN"}, @@ -87,40 +90,17 @@ std::string GetSpellCheckLanguageRegion(const std::string& input_language) { FilePath GetVersionedFileName(const std::string& input_language, const FilePath& dict_dir) { - // The default dictionary version is 1-2. These versions have been augmented - // with additional words found by the translation team. - static const char kDefaultVersionString[] = "-1-2"; - + // The default dictionary version is 3-0. This version indicates that the bdic + // file contains a checksum. + static const char kDefaultVersionString[] = "-3-0"; + + // Add non-default version strings here. Use the same version for all the + // dictionaries that you add at the same time. Increment the major version + // number if you're updating either dic or aff files. Increment the minor + // version number if you're updating only dic_delta files. static LanguageVersion special_version_string[] = { - {"es-ES", "-1-1"}, // 1-1: Have not been augmented with addtional words. - {"nl-NL", "-1-1"}, - {"sv-SE", "-1-1"}, - {"he-IL", "-1-1"}, - {"el-GR", "-1-1"}, - {"hi-IN", "-1-1"}, - {"tr-TR", "-1-1"}, - {"et-EE", "-1-1"}, - {"lt-LT", "-1-3"}, // 1-3 (Feb 2009): new words, as well as an upgraded - // dictionary. - {"pl-PL", "-1-3"}, - {"fr-FR", "-2-0"}, // 2-0 (2010): upgraded dictionaries. - {"hu-HU", "-2-0"}, - {"ro-RO", "-2-0"}, - {"ru-RU", "-2-0"}, - {"bg-BG", "-2-0"}, - {"sr", "-2-0"}, - {"uk-UA", "-2-0"}, - {"pt-BR", "-2-2"}, // 2-2 (Mar 2011): upgraded a dictionary. - {"sh", "-2-2"}, // 2-2 (Mar 2011): added a dictionary. - {"ca-ES", "-2-3"}, // 2-3 (May 2012): upgraded a dictionary. - {"sv-SE", "-2-3"}, // 2-3 (May 2012): upgraded a dictionary. - {"af-ZA", "-2-3"}, // 2-3 (May 2012): added a dictionary. - {"fo-FO", "-2-3"}, // 2-3 (May 2012): added a dictionary. - {"en-US", "-2-4"}, // 2-4 (October 2012): add more words. - {"en-CA", "-2-4"}, - {"en-GB", "-2-5"}, // 2-5 (Nov 2012): Added NOSUGGEST flag = !. - {"en-AU", "-2-5"}, // Marked 1 word in each. - + {"et-EE", "-1-1"}, // No dic/aff files + {"tr-TR", "-1-1"}, // No dic/aff files }; // Generate the bdict file name using default version string or special diff --git a/chrome/renderer/spellchecker/spellcheck_unittest.cc b/chrome/renderer/spellchecker/spellcheck_unittest.cc index e581a0c..34d3b0e 100644 --- a/chrome/renderer/spellchecker/spellcheck_unittest.cc +++ b/chrome/renderer/spellchecker/spellcheck_unittest.cc @@ -18,6 +18,7 @@ #include "testing/gtest/include/gtest/gtest.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebTextCheckingCompletion.h" #include "third_party/WebKit/Source/WebKit/chromium/public/WebTextCheckingResult.h" +#include "ui/base/l10n/l10n_util.h" namespace { @@ -136,14 +137,14 @@ class MockTextCheckingCompletion : public WebKit::WebTextCheckingCompletion { // space character; // * Tests for the function with an invalid English word with a preceding // non-English word; -// * Tests for the function with2 an invalid English word with a following +// * Tests for the function with an invalid English word with a following // space character; // * Tests for the function with an invalid English word with a following // non-English word, and; // * Tests for the function with two invalid English words concatenated // with space characters or non-English words. // A test with a "[ROBUSTNESS]" mark shows it is a robustness test and it uses -// grammartically incorrect string. +// grammatically incorrect string. // TODO(hbono): Please feel free to add more tests. TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) { static const struct { @@ -165,7 +166,7 @@ TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) { // A valid English word "hello". {L"hello", true}, - // A valid Chinese word (meaning "hello") consisiting of two CJKV + // A valid Chinese word (meaning "hello") consisting of two CJKV // ideographs {L"\x4F60\x597D", true}, // A valid Korean word (meaning "hello") consisting of five hangul @@ -184,7 +185,7 @@ TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) { // Two valid Greek words (meaning "hello") consisting of seven Greek // letters {L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5", true}, - // A valid Russian word (meainng "hello") consisting of twelve Cyrillic + // A valid Russian word (meaning "hello") consisting of twelve Cyrillic // letters {L"\x0437\x0434\x0440\x0430\x0432\x0441" L"\x0442\x0432\x0443\x0439\x0442\x0435", true}, @@ -747,6 +748,31 @@ TEST_F(SpellCheckTest, SpellCheckText) { L"c\x00E1\x0063 th\x00F4ng tin c\x1EE7\x0061 " L"th\x1EBF gi\x1EDBi va l\x00E0m cho n\x00F3 universal c\x00F3 " L"th\x1EC3 truy c\x1EADp va h\x1EEFu d\x1EE5ng h\x01A1n." + }, { + // Korean + "ko", + L"Google\xC758 \xBAA9\xD45C\xB294 \xC804\xC138\xACC4\xC758 " + L"\xC815\xBCF4\xB97C \xCCB4\xACC4\xD654\xD558\xC5EC \xBAA8\xB450\xAC00 " + L"\xD3B8\xB9AC\xD558\xAC8C \xC774\xC6A9\xD560 \xC218 " + L"\xC788\xB3C4\xB85D \xD558\xB294 \xAC83\xC785\xB2C8\xB2E4." + }, { + // Albanian + "sq", + L"Misioni i Google \x00EBsht\x00EB q\x00EB t\x00EB organizoj\x00EB " + L"informacionin e bot\x00EBs dhe t\x00EB b\x00EBjn\x00EB at\x00EB " + L"universalisht t\x00EB arritshme dhe t\x00EB dobishme." + }, { + // Tamil + "ta", + L"Google \x0B87\x0BA9\x0BCD " + L"\x0BA8\x0BC7\x0BBE\x0B95\x0BCD\x0B95\x0BAE\x0BCD " + L"\x0B89\x0BB2\x0B95\x0BBF\x0BA9\x0BCD \x0BA4\x0B95\x0BB5\x0BB2\x0BCD " + L"\x0B8F\x0BB1\x0BCD\x0BAA\x0BBE\x0B9F\x0BC1 \x0B87\x0BA4\x0BC1 " + L"\u0B89\u0BB2\u0B95\u0BB3\u0BBE\u0BB5\u0BBF\u0BAF " + L"\x0B85\x0BA3\x0BC1\x0B95\x0B95\x0BCD \x0B95\x0BC2\x0B9F\x0BBF\x0BAF " + L"\x0BAE\x0BB1\x0BCD\x0BB1\x0BC1\x0BAE\x0BCD " + L"\x0BAA\x0BAF\x0BA9\x0BC1\x0BB3\x0BCD\x0BB3 " + L"\x0B9A\x0BC6\x0BAF\x0BCD\x0BAF \x0B89\x0BB3\x0BCD\x0BB3\x0BA4\x0BC1." }, }; @@ -765,7 +791,13 @@ TEST_F(SpellCheckTest, SpellCheckText) { &misspelling_start, &misspelling_length, NULL); - EXPECT_TRUE(result) << kTestCases[i].language; + EXPECT_TRUE(result) + << "\"" + << std::wstring(kTestCases[i].input).substr( + misspelling_start, misspelling_length) + << "\" is misspelled in " + << kTestCases[i].language + << "."; EXPECT_EQ(0, misspelling_start); EXPECT_EQ(0, misspelling_length); } @@ -1156,62 +1188,91 @@ TEST_F(SpellCheckTest, EnglishWords) { // Checks that NOSUGGEST works in English dictionaries. TEST_F(SpellCheckTest, NoSuggest) { static const struct { + const char* misspelling; const char* input; + const char* locale; bool should_pass; } kTestCases[] = { - {"cocksucker", true}, - {"cocksuckers", true}, + {"suckerbert", "cocksucker", "en-GB", true}, + {"suckerbert", "cocksucker", "en-US", true}, + {"suckerbert", "cocksucker", "en-CA", true}, + {"suckerbert", "cocksucker", "en-AU", true}, + {"suckerbert", "cocksuckers", "en-GB", true}, + {"suckerbert", "cocksuckers", "en-US", true}, + {"suckerbert", "cocksuckers", "en-CA", true}, + {"suckerbert", "cocksuckers", "en-AU", true}, + {"Batasunaa", "Batasuna", "ca-ES", true}, + {"pornoo", "porno", "it-IT", true}, + {"catass", "catas", "lt-LT", true}, + {"kuracc", "kurac", "sl-SI", true}, + {"pittt", "pitt", "sv-SE", true}, }; - static const char* kLocales[] = { "en-GB", "en-US", "en-CA", "en-AU" }; - - // First check that the NOSUGGEST flag didn't mark these words as not - // being in the dictionary. size_t test_cases_size = ARRAYSIZE_UNSAFE(kTestCases); - for (size_t j = 0; j < arraysize(kLocales); ++j) { - ReinitializeSpellCheck(kLocales[j]); - for (size_t i = 0; i < test_cases_size; ++i) { - size_t input_length = 0; - if (kTestCases[i].input != NULL) - input_length = strlen(kTestCases[i].input); + for (size_t i = 0; i < test_cases_size; ++i) { + ReinitializeSpellCheck(kTestCases[i].locale); + size_t input_length = 0; + if (kTestCases[i].input != NULL) + input_length = strlen(kTestCases[i].input); - int misspelling_start = 0; - int misspelling_length = 0; - bool result = spell_check()->SpellCheckWord( - ASCIIToUTF16(kTestCases[i].input).c_str(), - static_cast<int>(input_length), - 0, - &misspelling_start, - &misspelling_length, NULL); + // First check that the NOSUGGEST flag didn't mark this word as not being in + // the dictionary. + int misspelling_start = 0; + int misspelling_length = 0; + bool result = spell_check()->SpellCheckWord( + ASCIIToUTF16(kTestCases[i].input).c_str(), + static_cast<int>(input_length), + 0, + &misspelling_start, + &misspelling_length, NULL); - EXPECT_EQ(kTestCases[i].should_pass, result) << kTestCases[i].input << - " in " << kLocales[j]; - } - } + EXPECT_EQ(kTestCases[i].should_pass, result) << kTestCases[i].input << + " in " << kTestCases[i].input; - // Now verify that neither of testCases show up as suggestions. - for (size_t d = 0; d < arraysize(kLocales); ++d) { - ReinitializeSpellCheck(kLocales[d]); - int misspelling_start; - int misspelling_length; + // Now verify that this test case does not show up as a suggestion. std::vector<string16> suggestions; - spell_check()->SpellCheckWord( - ASCIIToUTF16("suckerbert").c_str(), - 10, + input_length = 0; + if (kTestCases[i].misspelling != NULL) + input_length = strlen(kTestCases[i].misspelling); + result = spell_check()->SpellCheckWord( + ASCIIToUTF16(kTestCases[i].misspelling).c_str(), + static_cast<int>(input_length), 0, &misspelling_start, &misspelling_length, &suggestions); // Check if the suggested words occur. + EXPECT_FALSE(result) << kTestCases[i].misspelling + << " is not a misspelling in " + << kTestCases[i].locale; for (int j = 0; j < static_cast<int>(suggestions.size()); j++) { for (size_t t = 0; t < test_cases_size; t++) { int compare_result = suggestions.at(j).compare(ASCIIToUTF16(kTestCases[t].input)); EXPECT_FALSE(compare_result == 0) << kTestCases[t].input << - " in " << kLocales[d]; + " in " << kTestCases[i].locale; } } } } +// Check that the correct dictionary files are checked in. +TEST_F(SpellCheckTest, DictionaryFiles) { + std::vector<std::string> locale_codes; + l10n_util::GetAcceptLanguagesForLocale("C", &locale_codes); + EXPECT_FALSE(locale_codes.empty()); + + std::vector<std::string> spellcheck_languages; + chrome::spellcheck_common::SpellCheckLanguages(&spellcheck_languages); + EXPECT_FALSE(spellcheck_languages.empty()); + EXPECT_LE(spellcheck_languages.size(), locale_codes.size()); + + FilePath hunspell = GetHunspellDirectory(); + for (size_t i = 0; i < spellcheck_languages.size(); ++i) { + FilePath dict = chrome::spellcheck_common::GetVersionedFileName( + spellcheck_languages[i], hunspell); + EXPECT_TRUE(file_util::PathExists(dict)) << dict.value() << " not found"; + } +} + #endif diff --git a/chrome/tools/convert_dict/aff_reader.cc b/chrome/tools/convert_dict/aff_reader.cc index b3b0381..b24a0d8 100644 --- a/chrome/tools/convert_dict/aff_reader.cc +++ b/chrome/tools/convert_dict/aff_reader.cc @@ -122,7 +122,7 @@ bool AffReader::Read() { exit(1); } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) { printf("We don't support the COMPLEXPREFIXES command yet. This would " - "mean we have to insert words backwords as well (I think)\n"); + "mean we have to insert words backwards as well (I think)\n"); exit(1); } else { // All other commands get stored in the other commands list. @@ -241,7 +241,7 @@ void AffReader::AddAffix(std::string* rule) { // so that means that this prefix would be a compound one. // // It expects these rules to use the same alias rules as the .dic - // file. We've forced it to use aliases, which is a numberical index + // file. We've forced it to use aliases, which is a numerical index // instead of these character flags, and this needs to be consistent. std::string before_flags = part.substr(0, slash_index + 1); @@ -250,13 +250,21 @@ void AffReader::AddAffix(std::string* rule) { // that tells us what to strip. std::vector<std::string> after_slash; base::SplitString(part.substr(slash_index + 1), ' ', &after_slash); - if (after_slash.size() < 2) { - // Note that we may get a third term here which is the - // morphological description of this rule. This happens in the tests - // only, so we can just ignore it. - printf("ERROR: Didn't get enough after the slash\n"); + if (after_slash.size() == 0) { + printf("ERROR: Found 0 terms after slash in affix rule '%s', " + "but need at least 2.\n", + part.c_str()); return; } + if (after_slash.size() == 1) { + printf("WARNING: Found 1 term after slash in affix rule '%s', " + "but expected at least 2. Adding '.'.\n", + part.c_str()); + after_slash.push_back("."); + } + // Note that we may get a third term here which is the morphological + // description of this rule. This happens in the tests only, so we can + // just ignore it. part = base::StringPrintf("%s%d %s", before_flags.c_str(), @@ -266,8 +274,11 @@ void AffReader::AddAffix(std::string* rule) { // Reencode from here std::string reencoded; - if (!EncodingToUTF8(part, &reencoded)) + if (!EncodingToUTF8(part, &reencoded)) { + printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n", + part.c_str()); break; + } *rule = rule->substr(0, part_start) + reencoded; break; @@ -283,19 +294,26 @@ void AffReader::AddAffix(std::string* rule) { void AffReader::AddReplacement(std::string* rule) { TrimLine(rule); + CollapseDuplicateSpaces(rule); std::string utf8rule; - if (!EncodingToUTF8(*rule, &utf8rule)) + if (!EncodingToUTF8(*rule, &utf8rule)) { + printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n", + rule->c_str()); return; + } - std::vector<std::string> split; - base::SplitString(utf8rule, ' ', &split); - - // There should be two parts. - if (split.size() != 2) + // The first space separates key and value. + size_t space_index = utf8rule.find(' '); + if (space_index == std::string::npos) { + printf("ERROR: Did not find a space in '%s'.\n", utf8rule.c_str()); return; + } + std::vector<std::string> split; + split.push_back(utf8rule.substr(0, space_index)); + split.push_back(utf8rule.substr(space_index + 1)); - // Underscores are used to represent spaces + // Underscores are used to represent spaces in most aff files // (since the line is parsed on spaces). std::replace(split[0].begin(), split[0].end(), '_', ' '); std::replace(split[1].begin(), split[1].end(), '_', ' '); @@ -309,8 +327,11 @@ void AffReader::HandleRawCommand(const std::string& line) { void AffReader::HandleEncodedCommand(const std::string& line) { std::string utf8; - if (EncodingToUTF8(line, &utf8)) - other_commands_.push_back(utf8); + if (!EncodingToUTF8(line, &utf8)) { + printf("ERROR: Cannot encode command '%s' to utf8.\n", line.c_str()); + return; + } + other_commands_.push_back(utf8); } } // namespace convert_dict diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc index 967f07e..5ed5cfa 100644 --- a/chrome/tools/convert_dict/dic_reader.cc +++ b/chrome/tools/convert_dict/dic_reader.cc @@ -117,13 +117,10 @@ bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader, std::set<int> affix_vector; affix_vector.insert(affix_index); - if (found == word_set->end()) { + if (found == word_set->end()) word_set->insert(std::make_pair(utf8word, affix_vector)); - } else { - // The affixes of the delta file should override those in the - // dictionary file. - found->second.swap(affix_vector); - } + else + found->second.insert(affix_index); } return true; @@ -170,7 +167,6 @@ bool DicReader::Read(AffReader* aff_reader) { PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta", "UTF-8", false); } - // Make sure the words are sorted, they may be unsorted in the input. for (WordSet::iterator word = word_set.begin(); word != word_set.end(); ++word) { @@ -182,6 +178,7 @@ bool DicReader::Read(AffReader* aff_reader) { // Double check that the affixes are sorted. This isn't strictly necessary // but it's nice for the file to have a fixed layout. std::sort(affixes.begin(), affixes.end()); + std::reverse(affixes.begin(), affixes.end()); words_.push_back(std::make_pair(word->first, affixes)); } |