Fix the dic reader so that it now accepts additional words from dic_delta files encoded as UTF-8.

Test: Ran the new encoder with the additional words. Words containing special characters are now handled correctly, i.e., they are included in the corresponding spellcheck bdic dictionary file. Review URL: http://codereview.chromium.org/17324 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@7829 0039d316-1c4b-4281-b951-d872f2087c98
author: sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2009-01-09 20:56:44 +0000
committer: sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2009-01-09 20:56:44 +0000
commit: 5f1746bff720c94389dd288f88c80d6f4e9cccc0 (patch)
tree: b0daa1ef06c94575b772fc1a9eafad3ce01cf64d /chrome/tools/convert_dict
parent: 60e3858a7728f1fc46fe95fed3094aa23b1cb843 (diff)
download: chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.zip
chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.tar.gz
chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.tar.bz2
1 file changed, 12 insertions, 7 deletions
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc
index 0cb18f6..1d63532 100644
--- a/chrome/tools/convert_dict/dic_reader.cc
+++ b/chrome/tools/convert_dict/dic_reader.cc
@@ -52,9 +52,9 @@ void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
 // contains the number of words or not. If it does, skip the first line. If it
 // does not, then the first line contains a word.
 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
-                     const char* file_type, 
+                     const char* file_type, const char* encoding,
                      bool file_has_word_count_in_the_first_line) {
-  printf("Extracting words from %s file...\n", file_type);
+  printf("Extracting words from %s file\nEncoding: %s\n", file_type, encoding);
 
   int line_number = 0;
   while (!feof(file)) {
@@ -82,9 +82,12 @@ bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
     // The first part is the word, the second (optional) part is the affix. We
     // always use UTF-8 as the encoding to simplify life.
     std::string utf8word;
-    if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
+    std::string encoding_string(encoding);
+    if (encoding_string == "UTF-8") {
+      utf8word = split[0];
+    } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
       printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
-             line_number, aff_reader->encoding(), file_type);
+             line_number, encoding, file_type);
       return false;
     }
 
@@ -140,14 +143,16 @@ bool DicReader::Read(AffReader* aff_reader) {
 
   // Add words from the dic file to the word set.
   // Note that the first line is the word count in the file.
-  if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))
+  if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", 
+                       aff_reader->encoding(), true))
     return false;
 
-  // Add words from the dic delta file to the word set, if it exists.
+  // Add words from the .dic_delta file to the word set, if it exists.
   // The first line is the first word to add. Word count line is not present.
+  // NOTE: These additional words should be encoded as UTF-8.
   if (additional_words_file_ != NULL) {
     PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
-                    false);
+                    "UTF-8", false);
   }
 
   // Make sure the words are sorted, they may be unsorted in the input.
author	sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2009-01-09 20:56:44 +0000
committer	sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2009-01-09 20:56:44 +0000
commit	5f1746bff720c94389dd288f88c80d6f4e9cccc0 (patch)
tree	b0daa1ef06c94575b772fc1a9eafad3ce01cf64d /chrome/tools/convert_dict
parent	60e3858a7728f1fc46fe95fed3094aa23b1cb843 (diff)
download	chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.zip chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.tar.gz chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.tar.bz2