Fix Issue 1356872 "Russian bdic is corrupted".

The AffReader::AddAffix() function does not re-encode the "stripping prefix (or suffix)" field of a PFX (or SFX) line. As a result, it creates corrupted prefix (and suffix) rules for dictionaries whose encoding is not UTF-8 (e.g. Russian, Polish, etc.), because the "stripping prefix (or suffix)" field is not only the length of a prefix (or suffix) but also the string to be replaced. To solve this problem, this change checks whether the third token of a PFX (or SFX) line represents a "stripping characters" field or a "cross product" field, and re-encodes that token only if it represents a stripping prefix or suffix. Needless to say, we also have to re-create the bdics and push them to our download server to solve this issue. :) BUG=1428 Review URL: http://codereview.chromium.org/3183 Review URL: http://codereview.chromium.org/3183 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@3903 0039d316-1c4b-4281-b951-d872f2087c98
author: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2008-10-24 02:33:22 +0000
committer: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2008-10-24 02:33:22 +0000
commit: 361504f357e6882891f56aaf84dfa521a87f8739 (patch)
tree: eec77fc4990d6b183833117abb407eacf5ea88ef /chrome
parent: 3453068e7facd042b827a011a56ee516409fa07b (diff)
download: chromium_src-361504f357e6882891f56aaf84dfa521a87f8739.zip
chromium_src-361504f357e6882891f56aaf84dfa521a87f8739.tar.gz
chromium_src-361504f357e6882891f56aaf84dfa521a87f8739.tar.bz2
3 files changed, 20 insertions, 4 deletions
diff --git a/chrome/third_party/hunspell/google/bdict.h b/chrome/third_party/hunspell/google/bdict.h
index b96a2e4..3157616 100644
--- a/chrome/third_party/hunspell/google/bdict.h
+++ b/chrome/third_party/hunspell/google/bdict.h
@@ -98,6 +98,10 @@ class BDict {
  public:
   // File header.
   enum { SIGNATURE = 0x63694442 };
+  enum {
+    MAJOR_VERSION = 1,
+    MINOR_VERSION = 1
+  };
   struct Header {
     uint32 signature;
 
diff --git a/chrome/third_party/hunspell/google/bdict_writer.cc b/chrome/third_party/hunspell/google/bdict_writer.cc
index 67be677..fcb060d 100644
--- a/chrome/third_party/hunspell/google/bdict_writer.cc
+++ b/chrome/third_party/hunspell/google/bdict_writer.cc
@@ -451,8 +451,8 @@ std::string BDictWriter::GetBDict() const {
   hunspell::BDict::Header* header =
       reinterpret_cast<hunspell::BDict::Header*>(&ret[0]);
   header->signature = hunspell::BDict::SIGNATURE;
-  header->major_version = 1;
-  header->minor_version = 0;
+  header->major_version = hunspell::BDict::MAJOR_VERSION;
+  header->minor_version = hunspell::BDict::MINOR_VERSION;
   header->aff_offset = static_cast<uint32>(aff_offset);
   header->dic_offset = static_cast<uint32>(dic_offset);
 
diff --git a/chrome/tools/convert_dict/aff_reader.cc b/chrome/tools/convert_dict/aff_reader.cc
index 06ec131..ea797fb 100644
--- a/chrome/tools/convert_dict/aff_reader.cc
+++ b/chrome/tools/convert_dict/aff_reader.cc
@@ -190,11 +190,20 @@ void AffReader::AddAffix(std::string* rule) {
   // will re-encode the number on the first line, but that will be a NOP. If
   // there are not that many groups, we won't reencode it, but pass it through.
   int found_spaces = 0;
+  std::string token;
   for (size_t i = 0; i < rule->length(); i++) {
     if ((*rule)[i] == ' ') {
       found_spaces++;
       if (found_spaces == 3) {
-        std::string part = rule->substr(i);  // From here to end.
+        size_t part_start = i;
+        std::string part;
+        if (token[0] != 'Y' && token[0] != 'N') {
+          // This token represents a stripping prefix or suffix, which is
+          // either a length or a string to be replaced.
+          // We also reencode them to UTF-8.
+          part_start = i - token.length();
+        }
+        part = rule->substr(part_start);  // From here to end.
 
         size_t slash_index = part.find('/');
         if (slash_index != std::string::npos && !has_indexed_affixes()) {
@@ -233,9 +242,12 @@ void AffReader::AddAffix(std::string* rule) {
         if (!EncodingToUTF8(part, &reencoded))
           break;
 
-        *rule = rule->substr(0, i) + reencoded;
+        *rule = rule->substr(0, part_start) + reencoded;
         break;
       }
+      token.clear();
+    } else {
+      token.push_back((*rule)[i]);
     }
   }
author	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2008-10-24 02:33:22 +0000
committer	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2008-10-24 02:33:22 +0000
commit	361504f357e6882891f56aaf84dfa521a87f8739 (patch)
tree	eec77fc4990d6b183833117abb407eacf5ea88ef /chrome
parent	3453068e7facd042b827a011a56ee516409fa07b (diff)
download	chromium_src-361504f357e6882891f56aaf84dfa521a87f8739.zip chromium_src-361504f357e6882891f56aaf84dfa521a87f8739.tar.gz chromium_src-361504f357e6882891f56aaf84dfa521a87f8739.tar.bz2