diff options
author | sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-01-09 20:56:44 +0000 |
---|---|---|
committer | sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-01-09 20:56:44 +0000 |
commit | 5f1746bff720c94389dd288f88c80d6f4e9cccc0 (patch) | |
tree | b0daa1ef06c94575b772fc1a9eafad3ce01cf64d /chrome/tools/convert_dict | |
parent | 60e3858a7728f1fc46fe95fed3094aa23b1cb843 (diff) | |
download | chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.zip chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.tar.gz chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.tar.bz2 |
Fix the dic reader so that it now accepts additional words from dic_delta files encoded as UTF-8.
Test: Tested the new encoder with the additional words. Words containing special characters now work correctly, i.e., they are included in the corresponding spellcheck bdic dictionary file.
Review URL: http://codereview.chromium.org/17324
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@7829 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/tools/convert_dict')
-rw-r--r-- | chrome/tools/convert_dict/dic_reader.cc | 19 |
1 file changed, 12 insertions, 7 deletions
```diff
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc
index 0cb18f6..1d63532 100644
--- a/chrome/tools/convert_dict/dic_reader.cc
+++ b/chrome/tools/convert_dict/dic_reader.cc
@@ -52,9 +52,9 @@ void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
 // contains the number of words or not. If it does, skip the first line. If it
 // does not, then the first line contains a word.
 bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
-                     const char* file_type,
+                     const char* file_type, const char* encoding,
                      bool file_has_word_count_in_the_first_line) {
-  printf("Extracting words from %s file...\n", file_type);
+  printf("Extracting words from %s file\nEncoding: %s\n", file_type, encoding);
 
   int line_number = 0;
   while (!feof(file)) {
@@ -82,9 +82,12 @@ bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
     // The first part is the word, the second (optional) part is the affix. We
     // always use UTF-8 as the encoding to simplify life.
     std::string utf8word;
-    if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
+    std::string encoding_string(encoding);
+    if (encoding_string == "UTF-8") {
+      utf8word = split[0];
+    } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
       printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
-             line_number, aff_reader->encoding(), file_type);
+             line_number, encoding, file_type);
       return false;
     }
 
@@ -140,14 +143,16 @@ bool DicReader::Read(AffReader* aff_reader) {
 
   // Add words from the dic file to the word set.
   // Note that the first line is the word count in the file.
-  if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))
+  if (!PopulateWordSet(&word_set, file_, aff_reader, "dic",
+                       aff_reader->encoding(), true))
     return false;
 
-  // Add words from the dic delta file to the word set, if it exists.
+  // Add words from the .dic_delta file to the word set, if it exists.
   // The first line is the first word to add. Word count line is not present.
+  // NOTE: These additional words should be encoded as UTF-8.
   if (additional_words_file_ != NULL) {
     PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
-                    false);
+                    "UTF-8", false);
   }
 
   // Make sure the words are sorted, they may be unsorted in the input.
```