Part 1 of 'Add common words for each language, and remove forbidden words'.

Updated the Dic Reader to also read additional words from an additional-words dictionary file, created as a text file with UTF-8 encoding and with the extension .dic_delta. Test: Tested with all 21 languages that are currently supported by the spell checker and for which common/additional words have been obtained. The resulting bdic files contain those additional words. Issue=2837. Review URL: http://codereview.chromium.org/14856 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@7492 0039d316-1c4b-4281-b951-d872f2087c98
author: sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-12-29 22:59:42 +0000
committer: sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-12-29 22:59:42 +0000
commit: 589abbb3dc82525779a410794c03cce602ae6d8a (patch)
tree: 5676d9b50e380125a2cb32890433fe6a4fcbef33 /chrome/tools/convert_dict
parent: 56f42e74ce15ebe77c48c10126730d115372546c (diff)
download: chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.zip
chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.tar.gz
chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.tar.bz2
2 files changed, 59 insertions, 27 deletions
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc
index 8bdb8ec..0cb18f6 100644
--- a/chrome/tools/convert_dict/dic_reader.cc
+++ b/chrome/tools/convert_dict/dic_reader.cc
@@ -43,43 +43,39 @@ void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
     output->push_back(line.substr(slash_index + 1));
 }
 
-}  // namespace
-
-DicReader::DicReader(const std::string& filename) {
-  file_ = file_util::OpenFile(filename, "r");
-}
-
-DicReader::~DicReader() {
-  if (file_)
-    file_util::CloseFile(file_);
-}
+// This function reads words from a .dic file, or a .dic_delta file. Note that 
+// we read 'all' the words in the file, irrespective of the word count given
+// in the first non empty line of a .dic file. Also note that, for a .dic_delta
+// file, the first line actually does _not_ have the number of words. In order
+// to control this, we use the |file_has_word_count_in_the_first_line| 
+// parameter to tell this method whether the first non empty line in the file 
+// contains the number of words or not. If it does, skip the first line. If it
+// does not, then the first line contains a word.
+bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
+                     const char* file_type, 
+                     bool file_has_word_count_in_the_first_line) {
+  printf("Extracting words from %s file...\n", file_type);
 
-bool DicReader::Read(AffReader* aff_reader) {
-  if (!file_)
-    return false;
-
-  bool got_count = false;
   int line_number = 0;
-
-  WordSet word_set;
-  while (!feof(file_)) {
-    std::string line = ReadLine(file_);
+  while (!feof(file)) {
+    std::string line = ReadLine(file);
     line_number++;
     StripComment(&line);
     if (line.empty())
       continue;
 
-    if (!got_count) {
+    if (file_has_word_count_in_the_first_line) {
       // Skip the first nonempty line, this is the line count. We don't bother
       // with it and just read all the lines.
-      got_count = true;
+      file_has_word_count_in_the_first_line = false;
       continue;
     }
 
     std::vector<std::string> split;
     SplitDicLine(line, &split);
     if (split.size() == 0 || split.size() > 2) {
-      printf("Line %d has extra slashes in the dic file\n", line_number);
+      printf("Line %d has extra slashes in the %s file\n", line_number,
+             file_type);
       return false;
     }
 
@@ -87,8 +83,8 @@ bool DicReader::Read(AffReader* aff_reader) {
     // always use UTF-8 as the encoding to simplify life.
     std::string utf8word;
     if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
-      printf("Unable to convert line %d from %s to UTF-8 in the dic file\n",
-             line_number, aff_reader->encoding());
+      printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
+             line_number, aff_reader->encoding(), file_type);
       return false;
     }
 
@@ -109,16 +105,51 @@ bool DicReader::Read(AffReader* aff_reader) {
         affix_index = aff_reader->GetAFIndexForAFString(split[1]);
     }
 
-    WordSet::iterator found = word_set.find(utf8word);
-    if (found == word_set.end()) {
+    WordSet::iterator found = word_set->find(utf8word);
+    if (found == word_set->end()) {
       std::set<int> affix_vector;
       affix_vector.insert(affix_index);
-      word_set.insert(std::make_pair(utf8word, affix_vector));
+      word_set->insert(std::make_pair(utf8word, affix_vector));
     } else {
       found->second.insert(affix_index);
     }
   }
 
+  return true;
+}
+
+}  // namespace
+
+DicReader::DicReader(const std::string& filename) {
+  file_ = file_util::OpenFile(filename, "r");
+  additional_words_file_ = file_util::OpenFile(filename + "_delta", "r");
+}
+
+DicReader::~DicReader() {
+  if (file_)
+    file_util::CloseFile(file_);
+  if (additional_words_file_)
+    file_util::CloseFile(additional_words_file_);
+}
+
+bool DicReader::Read(AffReader* aff_reader) {
+  if (!file_)
+    return false;
+
+  WordSet word_set;
+
+  // Add words from the dic file to the word set.
+  // Note that the first line is the word count in the file.
+  if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))
+    return false;
+
+  // Add words from the dic delta file to the word set, if it exists.
+  // The first line is the first word to add. Word count line is not present.
+  if (additional_words_file_ != NULL) {
+    PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
+                    false);
+  }
+
   // Make sure the words are sorted, they may be unsorted in the input.
   for (WordSet::iterator word = word_set.begin(); word != word_set.end();
        ++word) {
diff --git a/chrome/tools/convert_dict/dic_reader.h b/chrome/tools/convert_dict/dic_reader.h
index 99fcdf3..32d6eff 100644
--- a/chrome/tools/convert_dict/dic_reader.h
+++ b/chrome/tools/convert_dict/dic_reader.h
@@ -35,6 +35,7 @@ class DicReader {
 
  private:
   FILE* file_;
+  FILE* additional_words_file_;
 
   // Contains all words and their corresponding affix index.
   WordList words_;
author	sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2008-12-29 22:59:42 +0000
committer	sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2008-12-29 22:59:42 +0000
commit	589abbb3dc82525779a410794c03cce602ae6d8a (patch)
tree	5676d9b50e380125a2cb32890433fe6a4fcbef33 /chrome/tools/convert_dict
parent	56f42e74ce15ebe77c48c10126730d115372546c (diff)
download	chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.zip chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.tar.gz chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.tar.bz2