From 589abbb3dc82525779a410794c03cce602ae6d8a Mon Sep 17 00:00:00 2001 From: "sidchat@google.com" Date: Mon, 29 Dec 2008 22:59:42 +0000 Subject: Part 1 of 'Add common words for each language, and remove forbidden words'. Updated the Dic Reader to read additional words from additional-words dictionary file, created as a text file with UTF-8 encoding, with extension dic_delta. Test: Tested with all the 21 languages that are currently supported by the spell checker and for which common/additional words have been obtained. Resulting bdic files generated have those additional words. Issue=2837. Review URL: http://codereview.chromium.org/14856 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@7492 0039d316-1c4b-4281-b951-d872f2087c98 --- chrome/tools/convert_dict/dic_reader.cc | 85 ++++++++++++++++++++++----------- chrome/tools/convert_dict/dic_reader.h | 1 + 2 files changed, 59 insertions(+), 27 deletions(-) (limited to 'chrome/tools/convert_dict') diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc index 8bdb8ec..0cb18f6 100644 --- a/chrome/tools/convert_dict/dic_reader.cc +++ b/chrome/tools/convert_dict/dic_reader.cc @@ -43,43 +43,39 @@ void SplitDicLine(const std::string& line, std::vector* output) { output->push_back(line.substr(slash_index + 1)); } -} // namespace - -DicReader::DicReader(const std::string& filename) { - file_ = file_util::OpenFile(filename, "r"); -} - -DicReader::~DicReader() { - if (file_) - file_util::CloseFile(file_); -} +// This function reads words from a .dic file, or a .dic_delta file. Note that +// we read 'all' the words in the file, irrespective of the word count given +// in the first non empty line of a .dic file. Also note that, for a .dic_delta +// file, the first line actually does _not_ have the number of words. In order +// to control this, we use the |file_has_word_count_in_the_first_line| +// parameter to tell this method whether the first non empty line in the file +// contains the number of words or not. If it does, skip the first line. If it +// does not, then the first line contains a word. +bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader, + const char* file_type, + bool file_has_word_count_in_the_first_line) { + printf("Extracting words from %s file...\n", file_type); -bool DicReader::Read(AffReader* aff_reader) { - if (!file_) - return false; - - bool got_count = false; int line_number = 0; - - WordSet word_set; - while (!feof(file_)) { - std::string line = ReadLine(file_); + while (!feof(file)) { + std::string line = ReadLine(file); line_number++; StripComment(&line); if (line.empty()) continue; - if (!got_count) { + if (file_has_word_count_in_the_first_line) { // Skip the first nonempty line, this is the line count. We don't bother // with it and just read all the lines. - got_count = true; + file_has_word_count_in_the_first_line = false; continue; } std::vector split; SplitDicLine(line, &split); if (split.size() == 0 || split.size() > 2) { - printf("Line %d has extra slashes in the dic file\n", line_number); + printf("Line %d has extra slashes in the %s file\n", line_number, + file_type); return false; } @@ -87,8 +83,8 @@ bool DicReader::Read(AffReader* aff_reader) { // always use UTF-8 as the encoding to simplify life. std::string utf8word; if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { - printf("Unable to convert line %d from %s to UTF-8 in the dic file\n", - line_number, aff_reader->encoding()); + printf("Unable to convert line %d from %s to UTF-8 in the %s file\n", + line_number, aff_reader->encoding(), file_type); return false; } @@ -109,16 +105,51 @@ bool DicReader::Read(AffReader* aff_reader) { affix_index = aff_reader->GetAFIndexForAFString(split[1]); } - WordSet::iterator found = word_set.find(utf8word); - if (found == word_set.end()) { + WordSet::iterator found = word_set->find(utf8word); + if (found == word_set->end()) { std::set affix_vector; affix_vector.insert(affix_index); - word_set.insert(std::make_pair(utf8word, affix_vector)); + word_set->insert(std::make_pair(utf8word, affix_vector)); } else { found->second.insert(affix_index); } } + return true; +} + +} // namespace + +DicReader::DicReader(const std::string& filename) { + file_ = file_util::OpenFile(filename, "r"); + additional_words_file_ = file_util::OpenFile(filename + "_delta", "r"); +} + +DicReader::~DicReader() { + if (file_) + file_util::CloseFile(file_); + if (additional_words_file_) + file_util::CloseFile(additional_words_file_); +} + +bool DicReader::Read(AffReader* aff_reader) { + if (!file_) + return false; + + WordSet word_set; + + // Add words from the dic file to the word set. + // Note that the first line is the word count in the file. + if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true)) + return false; + + // Add words from the dic delta file to the word set, if it exists. + // The first line is the first word to add. Word count line is not present. + if (additional_words_file_ != NULL) { + PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta", + false); + } + // Make sure the words are sorted, they may be unsorted in the input. for (WordSet::iterator word = word_set.begin(); word != word_set.end(); ++word) { diff --git a/chrome/tools/convert_dict/dic_reader.h b/chrome/tools/convert_dict/dic_reader.h index 99fcdf3..32d6eff 100644 --- a/chrome/tools/convert_dict/dic_reader.h +++ b/chrome/tools/convert_dict/dic_reader.h @@ -35,6 +35,7 @@ class DicReader { private: FILE* file_; + FILE* additional_words_file_; // Contains all words and their corresponding affix index. WordList words_; -- cgit v1.1