summary refs log tree commit diff stats
path: root/chrome/tools/convert_dict
diff options
context:
space:
mode:
author sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2009-01-09 20:56:44 +0000
committer sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2009-01-09 20:56:44 +0000
commit 5f1746bff720c94389dd288f88c80d6f4e9cccc0 (patch)
tree b0daa1ef06c94575b772fc1a9eafad3ce01cf64d /chrome/tools/convert_dict
parent 60e3858a7728f1fc46fe95fed3094aa23b1cb843 (diff)
download chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.zip
chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.tar.gz
chromium_src-5f1746bff720c94389dd288f88c80d6f4e9cccc0.tar.bz2
Fix the dic reader so that it now accepts additional words from dic_delta files encoded as UTF-8.
Test: Tested the new encoder with the additional words. Words containing special characters now work correctly, i.e., they are included in the corresponding spellcheck bdic dictionary file.
Review URL: http://codereview.chromium.org/17324
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@7829 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/tools/convert_dict')
-rw-r--r-- chrome/tools/convert_dict/dic_reader.cc 19
1 files changed, 12 insertions, 7 deletions
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc
index 0cb18f6..1d63532 100644
--- a/chrome/tools/convert_dict/dic_reader.cc
+++ b/chrome/tools/convert_dict/dic_reader.cc
@@ -52,9 +52,9 @@ void SplitDicLine(const std::string& line, std::vector<std::string>* output) {
// contains the number of words or not. If it does, skip the first line. If it
// does not, then the first line contains a word.
bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
- const char* file_type,
+ const char* file_type, const char* encoding,
bool file_has_word_count_in_the_first_line) {
- printf("Extracting words from %s file...\n", file_type);
+ printf("Extracting words from %s file\nEncoding: %s\n", file_type, encoding);
int line_number = 0;
while (!feof(file)) {
@@ -82,9 +82,12 @@ bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
// The first part is the word, the second (optional) part is the affix. We
// always use UTF-8 as the encoding to simplify life.
std::string utf8word;
- if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
+ std::string encoding_string(encoding);
+ if (encoding_string == "UTF-8") {
+ utf8word = split[0];
+ } else if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) {
printf("Unable to convert line %d from %s to UTF-8 in the %s file\n",
- line_number, aff_reader->encoding(), file_type);
+ line_number, encoding, file_type);
return false;
}
@@ -140,14 +143,16 @@ bool DicReader::Read(AffReader* aff_reader) {
// Add words from the dic file to the word set.
// Note that the first line is the word count in the file.
- if (!PopulateWordSet(&word_set, file_, aff_reader, "dic", true))
+ if (!PopulateWordSet(&word_set, file_, aff_reader, "dic",
+ aff_reader->encoding(), true))
return false;
- // Add words from the dic delta file to the word set, if it exists.
+ // Add words from the .dic_delta file to the word set, if it exists.
// The first line is the first word to add. Word count line is not present.
+ // NOTE: These additional words should be encoded as UTF-8.
if (additional_words_file_ != NULL) {
PopulateWordSet(&word_set, additional_words_file_, aff_reader, "dic delta",
- false);
+ "UTF-8", false);
}
// Make sure the words are sorted, they may be unsorted in the input.