summary | refs | log | tree | commit | diff | stats
path: root/chrome/tools/convert_dict
diff options
context:
space:
mode:
author: sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-12-29 22:59:42 +0000
committer: sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-12-29 22:59:42 +0000
commit: 589abbb3dc82525779a410794c03cce602ae6d8a (patch)
tree: 5676d9b50e380125a2cb32890433fe6a4fcbef33 /chrome/tools/convert_dict
parent: 56f42e74ce15ebe77c48c10126730d115372546c (diff)
download: chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.zip
chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.tar.gz
chromium_src-589abbb3dc82525779a410794c03cce602ae6d8a.tar.bz2
Part 1 of 'Add common words for each language, and remove forbidden words'.
Updated the Dic Reader to also read additional words from an additional-words dictionary file, which is a UTF-8 encoded text file with the extension dic_delta.

Test: Tested with all 21 languages that are currently supported by the spell checker and for which common/additional words have been obtained. The resulting bdic files contain those additional words.

Issue=2837
Review URL: http://codereview.chromium.org/14856

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@7492 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/tools/convert_dict')
-rw-r--r-- chrome/tools/convert_dict/dic_reader.cc 85
-rw-r--r-- chrome/tools/convert_dict/dic_reader.h 1
2 files changed, 59 insertions, 27 deletions
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc
index 8bdb8ec..0cb18f6 100644
--- a/chrome/tools/convert_dict/dic_reader.cc
+++ b/chrome/tools/convert_dict/dic_reader.cc
@@ -43,43 +43,39 @@ void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
output->push_back(line.substr(slash_index + 1));
}
-} // namespace
-
-DicReader::DicReader(const std::string& filename) {
- file_ = file_util::OpenFile(filename, "r");
-}
-
-DicReader::~DicReader() {
- if (file_)
- file_util::CloseFile(file_);
-}
+// This function reads words from a .dic file, or a .dic_delta file. Note that
+// we read 'all' the words in the file, irrespective of the word count given
+// in the first non empty line of a .dic file. Also note that, for a .dic_delta
+// file, the first line actually does _not_ have the number of words. In order
+// to control this, we use the |file_has_word_count_in_the_first_line|
+// parameter to tell this method whether the first non empty line in the file
+// contains the number of words or not. If it does, skip the first line. If it
+// does not, then the first line contains a word.
+bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
+ const char* file_type,
+ bool file_has_word_count_in_the_first_line) {
+ printf("Extracting words from %s file...\n", file_type);
-bool DicReader::Read(AffReader* aff_reader) {
- if (!file_)
- return false;
-
- bool got_count = false;
int line_number = 0;
-
- WordSet word_set;
- while (!feof(file_)) {
- std::string line = ReadLine(file_);
+ while (!feof(file)) {
+ std::string line = ReadLine(file);
line_number++;
StripComment(&line);
if (line.empty())
continue;
- if (!got_count) {
+ if (file_has_word_count_in_the_first_line) {
// Skip the first nonempty line, this is the line count. We don't bother
// with it and just read all the lines.
- got_count = true;
+ file_has_word_count_in_the_first_line = false;
continue;
}
std::vector<std::string> split;
SplitDicLine(line, &split);
if (split.size() == 0 || split.size() > 2) {
- printf("Line %d has extra slashes in the dic file\n", line_number);
+ printf("Line %d has extra slashes in the %s file\n", line_number,
+ file_type);
return false;
}
@@ -87,8 +83,8 @@ bool DicReader::Read(AffReader* aff_reader) {
// always use UTF-8 as the encoding to simplify life.
std::string utf8word;
if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
- printf("Unable to convert line %d from %s to UTF-8 in the dic file\n",
- line_number, aff_reader->encoding());
+ printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
+ line_number, aff_reader->encoding(), file_type);
return false;
}
@@ -109,16 +105,51 @@ bool DicReader::Read(AffReader* aff_reader) {
affix_index = aff_reader->GetAFIndexForAFString(split[1]);
}
- WordSet::iterator found = word_set.find(utf8word);
- if (found == word_set.end()) {
+ WordSet::iterator found = word_set->find(utf8word);
+ if (found == word_set->end()) {
std::set<int> affix_vector;
affix_vector.insert(affix_index);
- word_set.insert(std::make_pair(utf8word, affix_vector));
+ word_set->insert(std::make_pair(utf8word, affix_vector));
} else {
found->second.insert(affix_index);
}
}
+ return true;
+}
+
+} // namespace
+
+DicReader::DicReader(const std::string& filename) {
+ file_ = file_util::OpenFile(filename, "r");
+ additional_words_file_ = file_util::OpenFile(filename + "_delta", "r");
+}
+
+DicReader::~DicReader() {
+ if (file_)
+ file_util::CloseFile(file_);
+ if (additional_words_file_)
+ file_util::CloseFile(additional_words_file_);
+}
+
+bool DicReader::Read(AffReader* aff_reader) {
+ if (!file_)
+ return false;
+
+ WordSet word_set;
+
+ // Add words from the dic file to the word set.
+ // Note that the first line is the word count in the file.
+ if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))
+ return false;
+
+ // Add words from the dic delta file to the word set, if it exists.
+ // The first line is the first word to add. Word count line is not present.
+ if (additional_words_file_ != NULL) {
+ PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
+ false);
+ }
+
// Make sure the words are sorted, they may be unsorted in the input.
for (WordSet::iterator word = word_set.begin(); word != word_set.end();
++word) {
diff --git a/chrome/tools/convert_dict/dic_reader.h b/chrome/tools/convert_dict/dic_reader.h
index 99fcdf3..32d6eff 100644
--- a/chrome/tools/convert_dict/dic_reader.h
+++ b/chrome/tools/convert_dict/dic_reader.h
@@ -35,6 +35,7 @@ class DicReader {
private:
FILE* file_;
+ FILE* additional_words_file_;
// Contains all words and their corresponding affix index.
WordList words_;