Bump dictionary versions to 3-0

Bumps the dictionary versions to 3-0 to:
1) Use the dictionaries with checksums.
2) Add dictionaries for ko, sq, and ta.
3) Update dictionaries for lv, nl, ru, and sv.

BUG=8397,8803,20083,61206,65115,104891,112227,113821

Review URL: https://chromiumcodereview.appspot.com/11566003

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@175549 0039d316-1c4b-4281-b951-d872f2087c98
author: rouslan@chromium.org <rouslan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-01-08 18:04:31 +0000
committer: rouslan@chromium.org <rouslan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-01-08 18:04:31 +0000
commit: ccdc3010506fb0b5612071e367fe9ec36df7585b (patch)
tree: 43a787740bea5c3669f9877dfb4a5e1328322fa9
parent: b980432bf58f0cdb31482a20c8c0937a98f2f3a2 (diff)
download: chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.zip
chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.tar.gz
chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.tar.bz2
5 files changed, 157 insertions, 98 deletions
diff --git a/DEPS b/DEPS
index 218786f..96b4d56 100644
--- a/DEPS
+++ b/DEPS
@@ -84,7 +84,7 @@ deps = {
    "/trunk/deps/third_party/hunspell@174476",
 
   "src/third_party/hunspell_dictionaries":
-    "/trunk/deps/third_party/hunspell_dictionaries@168258",
+    "/trunk/deps/third_party/hunspell_dictionaries@174658",
 
   "src/third_party/safe_browsing/testing":
     (Var("googlecode_url") % "google-safe-browsing") + "/trunk/testing@112",
diff --git a/chrome/common/spellcheck_common.cc b/chrome/common/spellcheck_common.cc
index e2cbdea..1c0976b 100644
--- a/chrome/common/spellcheck_common.cc
+++ b/chrome/common/spellcheck_common.cc
@@ -21,7 +21,7 @@ struct LanguageVersion {
 
 static const LanguageRegion g_supported_spellchecker_languages[] = {
   // Several languages are not to be included in the spellchecker list:
-  // th-TH
+  // th-TH, vi-VI.
   {"af", "af-ZA"},
   {"bg", "bg-BG"},
   {"ca", "ca-ES"},
@@ -43,6 +43,7 @@ static const LanguageRegion g_supported_spellchecker_languages[] = {
   {"hu", "hu-HU"},
   {"id", "id-ID"},
   {"it", "it-IT"},
+  {"ko", "ko"},
   {"lt", "lt-LT"},
   {"lv", "lv-LV"},
   {"nb", "nb-NO"},
@@ -52,11 +53,13 @@ static const LanguageRegion g_supported_spellchecker_languages[] = {
   {"pt-PT", "pt-PT"},
   {"ro", "ro-RO"},
   {"ru", "ru-RU"},
+  {"sh", "sh"},
   {"sk", "sk-SK"},
   {"sl", "sl-SI"},
-  {"sh", "sh"},
+  {"sq", "sq"},
   {"sr", "sr"},
   {"sv", "sv-SE"},
+  {"ta", "ta-IN"},
   {"tr", "tr-TR"},
   {"uk", "uk-UA"},
   {"vi", "vi-VN"},
@@ -87,40 +90,17 @@ std::string GetSpellCheckLanguageRegion(const std::string& input_language) {
 
 FilePath GetVersionedFileName(const std::string& input_language,
                               const FilePath& dict_dir) {
-  // The default dictionary version is 1-2. These versions have been augmented
-  // with additional words found by the translation team.
-  static const char kDefaultVersionString[] = "-1-2";
-
+  // The default dictionary version is 3-0. This version indicates that the bdic
+  // file contains a checksum.
+  static const char kDefaultVersionString[] = "-3-0";
+
+  // Add non-default version strings here. Use the same version for all the
+  // dictionaries that you add at the same time. Increment the major version
+  // number if you're updating either dic or aff files. Increment the minor
+  // version number if you're updating only dic_delta files.
   static LanguageVersion special_version_string[] = {
-    {"es-ES", "-1-1"},  // 1-1: Have not been augmented with addtional words.
-    {"nl-NL", "-1-1"},
-    {"sv-SE", "-1-1"},
-    {"he-IL", "-1-1"},
-    {"el-GR", "-1-1"},
-    {"hi-IN", "-1-1"},
-    {"tr-TR", "-1-1"},
-    {"et-EE", "-1-1"},
-    {"lt-LT", "-1-3"},  // 1-3 (Feb 2009): new words, as well as an upgraded
-                        // dictionary.
-    {"pl-PL", "-1-3"},
-    {"fr-FR", "-2-0"},  // 2-0 (2010): upgraded dictionaries.
-    {"hu-HU", "-2-0"},
-    {"ro-RO", "-2-0"},
-    {"ru-RU", "-2-0"},
-    {"bg-BG", "-2-0"},
-    {"sr",    "-2-0"},
-    {"uk-UA", "-2-0"},
-    {"pt-BR", "-2-2"},  // 2-2 (Mar 2011): upgraded a dictionary.
-    {"sh",    "-2-2"},  // 2-2 (Mar 2011): added a dictionary.
-    {"ca-ES", "-2-3"},  // 2-3 (May 2012): upgraded a dictionary.
-    {"sv-SE", "-2-3"},  // 2-3 (May 2012): upgraded a dictionary.
-    {"af-ZA", "-2-3"},  // 2-3 (May 2012): added a dictionary.
-    {"fo-FO", "-2-3"},  // 2-3 (May 2012): added a dictionary.
-    {"en-US", "-2-4"},  // 2-4 (October 2012): add more words.
-    {"en-CA", "-2-4"},
-    {"en-GB", "-2-5"},  // 2-5 (Nov 2012): Added NOSUGGEST flag = !.
-    {"en-AU", "-2-5"},  // Marked 1 word in each.
-
+    {"et-EE", "-1-1"},  // No dic/aff files
+    {"tr-TR", "-1-1"},  // No dic/aff files
   };
 
   // Generate the bdict file name using default version string or special
diff --git a/chrome/renderer/spellchecker/spellcheck_unittest.cc b/chrome/renderer/spellchecker/spellcheck_unittest.cc
index e581a0c..34d3b0e 100644
--- a/chrome/renderer/spellchecker/spellcheck_unittest.cc
+++ b/chrome/renderer/spellchecker/spellcheck_unittest.cc
@@ -18,6 +18,7 @@
 #include "testing/gtest/include/gtest/gtest.h"
 #include "third_party/WebKit/Source/WebKit/chromium/public/WebTextCheckingCompletion.h"
 #include "third_party/WebKit/Source/WebKit/chromium/public/WebTextCheckingResult.h"
+#include "ui/base/l10n/l10n_util.h"
 
 namespace {
 
@@ -136,14 +137,14 @@ class MockTextCheckingCompletion : public WebKit::WebTextCheckingCompletion {
 //     space character;
 //   * Tests for the function with an invalid English word with a preceding
 //     non-English word;
-//   * Tests for the function with2 an invalid English word with a following
+//   * Tests for the function with an invalid English word with a following
 //     space character;
 //   * Tests for the function with an invalid English word with a following
 //     non-English word, and;
 //   * Tests for the function with two invalid English words concatenated
 //     with space characters or non-English words.
 // A test with a "[ROBUSTNESS]" mark shows it is a robustness test and it uses
-// grammartically incorrect string.
+// grammatically incorrect string.
 // TODO(hbono): Please feel free to add more tests.
 TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) {
   static const struct {
@@ -165,7 +166,7 @@ TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) {
 
     // A valid English word "hello".
     {L"hello", true},
-    // A valid Chinese word (meaning "hello") consisiting of two CJKV
+    // A valid Chinese word (meaning "hello") consisting of two CJKV
     // ideographs
     {L"\x4F60\x597D", true},
     // A valid Korean word (meaning "hello") consisting of five hangul
@@ -184,7 +185,7 @@ TEST_F(SpellCheckTest, SpellCheckStrings_EN_US) {
     // Two valid Greek words (meaning "hello") consisting of seven Greek
     // letters
     {L"\x03B3\x03B5\x03B9\x03AC" L" " L"\x03C3\x03BF\x03C5", true},
-    // A valid Russian word (meainng "hello") consisting of twelve Cyrillic
+    // A valid Russian word (meaning "hello") consisting of twelve Cyrillic
     // letters
     {L"\x0437\x0434\x0440\x0430\x0432\x0441"
      L"\x0442\x0432\x0443\x0439\x0442\x0435", true},
@@ -747,6 +748,31 @@ TEST_F(SpellCheckTest, SpellCheckText) {
       L"c\x00E1\x0063 th\x00F4ng tin c\x1EE7\x0061 "
       L"th\x1EBF gi\x1EDBi va l\x00E0m cho n\x00F3 universal c\x00F3 "
       L"th\x1EC3 truy c\x1EADp va h\x1EEFu d\x1EE5ng h\x01A1n."
+    }, {
+      // Korean
+      "ko",
+      L"Google\xC758 \xBAA9\xD45C\xB294 \xC804\xC138\xACC4\xC758 "
+      L"\xC815\xBCF4\xB97C \xCCB4\xACC4\xD654\xD558\xC5EC \xBAA8\xB450\xAC00 "
+      L"\xD3B8\xB9AC\xD558\xAC8C \xC774\xC6A9\xD560 \xC218 "
+      L"\xC788\xB3C4\xB85D \xD558\xB294 \xAC83\xC785\xB2C8\xB2E4."
+    }, {
+      // Albanian
+      "sq",
+      L"Misioni i Google \x00EBsht\x00EB q\x00EB t\x00EB organizoj\x00EB "
+      L"informacionin e bot\x00EBs dhe t\x00EB b\x00EBjn\x00EB at\x00EB "
+      L"universalisht t\x00EB arritshme dhe t\x00EB dobishme."
+    }, {
+      // Tamil
+      "ta",
+      L"Google \x0B87\x0BA9\x0BCD "
+      L"\x0BA8\x0BC7\x0BBE\x0B95\x0BCD\x0B95\x0BAE\x0BCD "
+      L"\x0B89\x0BB2\x0B95\x0BBF\x0BA9\x0BCD \x0BA4\x0B95\x0BB5\x0BB2\x0BCD "
+      L"\x0B8F\x0BB1\x0BCD\x0BAA\x0BBE\x0B9F\x0BC1 \x0B87\x0BA4\x0BC1 "
+      L"\u0B89\u0BB2\u0B95\u0BB3\u0BBE\u0BB5\u0BBF\u0BAF "
+      L"\x0B85\x0BA3\x0BC1\x0B95\x0B95\x0BCD \x0B95\x0BC2\x0B9F\x0BBF\x0BAF "
+      L"\x0BAE\x0BB1\x0BCD\x0BB1\x0BC1\x0BAE\x0BCD "
+      L"\x0BAA\x0BAF\x0BA9\x0BC1\x0BB3\x0BCD\x0BB3 "
+      L"\x0B9A\x0BC6\x0BAF\x0BCD\x0BAF \x0B89\x0BB3\x0BCD\x0BB3\x0BA4\x0BC1."
     },
   };
 
@@ -765,7 +791,13 @@ TEST_F(SpellCheckTest, SpellCheckText) {
         &misspelling_start,
         &misspelling_length, NULL);
 
-    EXPECT_TRUE(result) << kTestCases[i].language;
+    EXPECT_TRUE(result)
+        << "\""
+        << std::wstring(kTestCases[i].input).substr(
+               misspelling_start, misspelling_length)
+        << "\" is misspelled in "
+        << kTestCases[i].language
+        << ".";
     EXPECT_EQ(0, misspelling_start);
     EXPECT_EQ(0, misspelling_length);
   }
@@ -1156,62 +1188,91 @@ TEST_F(SpellCheckTest, EnglishWords) {
 // Checks that NOSUGGEST works in English dictionaries.
 TEST_F(SpellCheckTest, NoSuggest) {
   static const struct {
+    const char* misspelling;
     const char* input;
+    const char* locale;
     bool should_pass;
   } kTestCases[] = {
-    {"cocksucker", true},
-    {"cocksuckers", true},
+    {"suckerbert", "cocksucker",  "en-GB", true},
+    {"suckerbert", "cocksucker",  "en-US", true},
+    {"suckerbert", "cocksucker",  "en-CA", true},
+    {"suckerbert", "cocksucker",  "en-AU", true},
+    {"suckerbert", "cocksuckers", "en-GB", true},
+    {"suckerbert", "cocksuckers", "en-US", true},
+    {"suckerbert", "cocksuckers", "en-CA", true},
+    {"suckerbert", "cocksuckers", "en-AU", true},
+    {"Batasunaa",  "Batasuna",    "ca-ES", true},
+    {"pornoo",     "porno",       "it-IT", true},
+    {"catass",     "catas",       "lt-LT", true},
+    {"kuracc",     "kurac",       "sl-SI", true},
+    {"pittt",      "pitt",        "sv-SE", true},
   };
 
-  static const char* kLocales[] = { "en-GB", "en-US", "en-CA", "en-AU" };
-
-  // First check that the NOSUGGEST flag didn't mark these words as not
-  // being in the dictionary.
   size_t test_cases_size = ARRAYSIZE_UNSAFE(kTestCases);
-  for (size_t j = 0; j < arraysize(kLocales); ++j) {
-    ReinitializeSpellCheck(kLocales[j]);
-    for (size_t i = 0; i < test_cases_size; ++i) {
-      size_t input_length = 0;
-      if (kTestCases[i].input != NULL)
-        input_length = strlen(kTestCases[i].input);
+  for (size_t i = 0; i < test_cases_size; ++i) {
+    ReinitializeSpellCheck(kTestCases[i].locale);
+    size_t input_length = 0;
+    if (kTestCases[i].input != NULL)
+      input_length = strlen(kTestCases[i].input);
 
-      int misspelling_start = 0;
-      int misspelling_length = 0;
-      bool result = spell_check()->SpellCheckWord(
-          ASCIIToUTF16(kTestCases[i].input).c_str(),
-          static_cast<int>(input_length),
-          0,
-          &misspelling_start,
-          &misspelling_length, NULL);
+    // First check that the NOSUGGEST flag didn't mark this word as not being in
+    // the dictionary.
+    int misspelling_start = 0;
+    int misspelling_length = 0;
+    bool result = spell_check()->SpellCheckWord(
+        ASCIIToUTF16(kTestCases[i].input).c_str(),
+        static_cast<int>(input_length),
+        0,
+        &misspelling_start,
+        &misspelling_length, NULL);
 
-      EXPECT_EQ(kTestCases[i].should_pass, result) << kTestCases[i].input <<
-          " in " << kLocales[j];
-    }
-  }
+    EXPECT_EQ(kTestCases[i].should_pass, result) << kTestCases[i].input <<
+        " in " << kTestCases[i].locale;
 
-  // Now verify that neither of testCases show up as suggestions.
-  for (size_t d = 0; d < arraysize(kLocales); ++d) {
-    ReinitializeSpellCheck(kLocales[d]);
-    int misspelling_start;
-    int misspelling_length;
+    // Now verify that this test case does not show up as a suggestion.
     std::vector<string16> suggestions;
-    spell_check()->SpellCheckWord(
-        ASCIIToUTF16("suckerbert").c_str(),
-        10,
+    input_length = 0;
+    if (kTestCases[i].misspelling != NULL)
+      input_length = strlen(kTestCases[i].misspelling);
+    result = spell_check()->SpellCheckWord(
+        ASCIIToUTF16(kTestCases[i].misspelling).c_str(),
+        static_cast<int>(input_length),
         0,
         &misspelling_start,
         &misspelling_length,
         &suggestions);
     // Check if the suggested words occur.
+    EXPECT_FALSE(result) << kTestCases[i].misspelling
+                         << " is not a misspelling in "
+                         << kTestCases[i].locale;
     for (int j = 0; j < static_cast<int>(suggestions.size()); j++) {
       for (size_t t = 0; t < test_cases_size; t++) {
         int compare_result =
             suggestions.at(j).compare(ASCIIToUTF16(kTestCases[t].input));
         EXPECT_FALSE(compare_result == 0) << kTestCases[t].input <<
-            " in " << kLocales[d];
+            " in " << kTestCases[i].locale;
       }
     }
   }
 }
 
+// Check that the correct dictionary files are checked in.
+TEST_F(SpellCheckTest, DictionaryFiles) {
+  std::vector<std::string> locale_codes;
+  l10n_util::GetAcceptLanguagesForLocale("C", &locale_codes);
+  EXPECT_FALSE(locale_codes.empty());
+
+  std::vector<std::string> spellcheck_languages;
+  chrome::spellcheck_common::SpellCheckLanguages(&spellcheck_languages);
+  EXPECT_FALSE(spellcheck_languages.empty());
+  EXPECT_LE(spellcheck_languages.size(), locale_codes.size());
+
+  FilePath hunspell = GetHunspellDirectory();
+  for (size_t i = 0; i < spellcheck_languages.size(); ++i) {
+    FilePath dict = chrome::spellcheck_common::GetVersionedFileName(
+        spellcheck_languages[i], hunspell);
+    EXPECT_TRUE(file_util::PathExists(dict)) << dict.value() << " not found";
+  }
+}
+
 #endif
diff --git a/chrome/tools/convert_dict/aff_reader.cc b/chrome/tools/convert_dict/aff_reader.cc
index b3b0381..b24a0d8 100644
--- a/chrome/tools/convert_dict/aff_reader.cc
+++ b/chrome/tools/convert_dict/aff_reader.cc
@@ -122,7 +122,7 @@ bool AffReader::Read() {
       exit(1);
     } else if (StringBeginsWith(line, "COMPLEXPREFIXES ")) {
       printf("We don't support the COMPLEXPREFIXES command yet. This would "
-        "mean we have to insert words backwords as well (I think)\n");
+        "mean we have to insert words backwards as well (I think)\n");
       exit(1);
     } else {
       // All other commands get stored in the other commands list.
@@ -241,7 +241,7 @@ void AffReader::AddAffix(std::string* rule) {
           // so that means that this prefix would be a compound one.
           //
           // It expects these rules to use the same alias rules as the .dic
-          // file. We've forced it to use aliases, which is a numberical index
+          // file. We've forced it to use aliases, which is a numerical index
           // instead of these character flags, and this needs to be consistent.
 
           std::string before_flags = part.substr(0, slash_index + 1);
@@ -250,13 +250,21 @@ void AffReader::AddAffix(std::string* rule) {
           // that tells us what to strip.
           std::vector<std::string> after_slash;
           base::SplitString(part.substr(slash_index + 1), ' ', &after_slash);
-          if (after_slash.size() < 2) {
-            // Note that we may get a third term here which is the
-            // morphological description of this rule. This happens in the tests
-            // only, so we can just ignore it.
-            printf("ERROR: Didn't get enough after the slash\n");
+          if (after_slash.size() == 0) {
+            printf("ERROR: Found 0 terms after slash in affix rule '%s', "
+                      "but need at least 2.\n",
+                   part.c_str());
             return;
           }
+          if (after_slash.size() == 1) {
+            printf("WARNING: Found 1 term after slash in affix rule '%s', "
+                      "but expected at least 2. Adding '.'.\n",
+                   part.c_str());
+            after_slash.push_back(".");
+          }
+          // Note that we may get a third term here which is the morphological
+          // description of this rule. This happens in the tests only, so we can
+          // just ignore it.
 
           part = base::StringPrintf("%s%d %s",
                                     before_flags.c_str(),
@@ -266,8 +274,11 @@ void AffReader::AddAffix(std::string* rule) {
 
         // Reencode from here
         std::string reencoded;
-        if (!EncodingToUTF8(part, &reencoded))
+        if (!EncodingToUTF8(part, &reencoded)) {
+          printf("ERROR: Cannot encode affix rule part '%s' to utf8.\n",
+                 part.c_str());
           break;
+        }
 
         *rule = rule->substr(0, part_start) + reencoded;
         break;
@@ -283,19 +294,26 @@ void AffReader::AddAffix(std::string* rule) {
 
 void AffReader::AddReplacement(std::string* rule) {
   TrimLine(rule);
+  CollapseDuplicateSpaces(rule);
 
   std::string utf8rule;
-  if (!EncodingToUTF8(*rule, &utf8rule))
+  if (!EncodingToUTF8(*rule, &utf8rule)) {
+    printf("ERROR: Cannot encode replacement rule '%s' to utf8.\n",
+           rule->c_str());
     return;
+  }
 
-  std::vector<std::string> split;
-  base::SplitString(utf8rule, ' ', &split);
-
-  // There should be two parts.
-  if (split.size() != 2)
+  // The first space separates key and value.
+  size_t space_index = utf8rule.find(' ');
+  if (space_index == std::string::npos) {
+    printf("ERROR: Did not find a space in '%s'.\n", utf8rule.c_str());
     return;
+  }
+  std::vector<std::string> split;
+  split.push_back(utf8rule.substr(0, space_index));
+  split.push_back(utf8rule.substr(space_index + 1));
 
-  // Underscores are used to represent spaces
+  // Underscores are used to represent spaces in most aff files
   // (since the line is parsed on spaces).
   std::replace(split[0].begin(), split[0].end(), '_', ' ');
   std::replace(split[1].begin(), split[1].end(), '_', ' ');
@@ -309,8 +327,11 @@ void AffReader::HandleRawCommand(const std::string& line) {
 
 void AffReader::HandleEncodedCommand(const std::string& line) {
   std::string utf8;
-  if (EncodingToUTF8(line, &utf8))
-    other_commands_.push_back(utf8);
+  if (!EncodingToUTF8(line, &utf8)) {
+    printf("ERROR: Cannot encode command '%s' to utf8.\n", line.c_str());
+    return;
+  }
+  other_commands_.push_back(utf8);
 }
 
 }  // namespace convert_dict
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc
index 967f07e..5ed5cfa 100644
--- a/chrome/tools/convert_dict/dic_reader.cc
+++ b/chrome/tools/convert_dict/dic_reader.cc
@@ -117,13 +117,10 @@ bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
     std::set<int> affix_vector;
     affix_vector.insert(affix_index);
 
-    if (found == word_set->end()) {
+    if (found == word_set->end())
       word_set->insert(std::make_pair(utf8word, affix_vector));
-    } else {
-      // The affixes of the delta file should override those in the
-      // dictionary file.
-      found->second.swap(affix_vector);
-    }
+    else
+      found->second.insert(affix_index);
   }
 
   return true;
@@ -170,7 +167,6 @@ bool DicReader::Read(AffReader* aff_reader) {
     PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
                     "UTF-8", false);
   }
-
   // Make sure the words are sorted, they may be unsorted in the input.
   for (WordSet::iterator word = word_set.begin(); word != word_set.end();
        ++word) {
@@ -182,6 +178,7 @@ bool DicReader::Read(AffReader* aff_reader) {
     // Double check that the affixes are sorted. This isn't strictly necessary
     // but it's nice for the file to have a fixed layout.
     std::sort(affixes.begin(), affixes.end());
+    std::reverse(affixes.begin(), affixes.end());
     words_.push_back(std::make_pair(word->first, affixes));
   }
author	rouslan@chromium.org <rouslan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-01-08 18:04:31 +0000
committer	rouslan@chromium.org <rouslan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-01-08 18:04:31 +0000
commit	ccdc3010506fb0b5612071e367fe9ec36df7585b (patch)
tree	43a787740bea5c3669f9877dfb4a5e1328322fa9
parent	b980432bf58f0cdb31482a20c8c0937a98f2f3a2 (diff)
download	chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.zip chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.tar.gz chromium_src-ccdc3010506fb0b5612071e367fe9ec36df7585b.tar.bz2