author     hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2010-02-02 10:02:26 +0000
committer  hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2010-02-02 10:02:26 +0000
commit     bbffa669691c4d2f9f1ab8f95226171be7b2dd04 (patch)
tree       0336c36be3524514fe5ab0f2d7341a6ae270877d /chrome/tools
parent     4f4c43ca4eed4bff261f6e4ff760a02455ef50aa (diff)
The first step towards supporting the Hungarian spell-checking dictionary.
This change fixes a couple of problems that must be solved before Chrome can use a Hungarian dictionary.
1. Use TrimWhitespace() in TrimLine()
Sorry, this was caused by my mistake: I used TrimWhiteSpaceUTF8() without examining it closely.
2. Replace morphing rules with compound rules.
It seems existing Hungarian dictionaries use (language-specific) morphing rules to handle words that have both prefixes and suffixes, e.g. "legjobb" (best). It is better to replace such (language-dependent) morphing rules with (language-independent) compound rules to avoid language-specific issues; an illustrative rewrite is shown below. (As far as I have tested, this change fixes many quality problems caused by Hungarian compounds.)
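To illustrate, using the example from the comments in the aff_reader.cc hunk below: an affix line that carries a trailing morph description, such as

  AFX D 0 d e - M

is rewritten so that the morph flag M becomes a compound flag attached to the affix field:

  AFX D 0 d/M e

(Reading the fields as affix name D, stripping string 0, affix d, and condition e follows the standard hunspell .aff layout; that reading is an assumption, since the patch comments do not label the fields.)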
This change also adds simple tests for our dictionary converter.
BUG=15558
TEST=unit_test --gtest_filter=ConvertDictTest*
Review URL: http://codereview.chromium.org/553087
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@37816 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/tools')
-rw-r--r--  chrome/tools/convert_dict/aff_reader.cc              17
-rw-r--r--  chrome/tools/convert_dict/convert_dict_unittest.cc  197
-rw-r--r--  chrome/tools/convert_dict/dic_reader.cc               7
-rw-r--r--  chrome/tools/convert_dict/hunspell_reader.cc          8
4 files changed, 225 insertions, 4 deletions
diff --git a/chrome/tools/convert_dict/aff_reader.cc b/chrome/tools/convert_dict/aff_reader.cc
index 33fa522..6636252 100644
--- a/chrome/tools/convert_dict/aff_reader.cc
+++ b/chrome/tools/convert_dict/aff_reader.cc
@@ -211,6 +211,23 @@ void AffReader::AddAffix(std::string* rule) {
     }
     part = rule->substr(part_start);  // From here to end.
 
+    if (part.find('-') != std::string::npos) {
+      // This rule has a morph rule used by old Hungarian dictionaries.
+      // When a line has a morph rule, its format becomes as listed below.
+      //   AFX D 0 d e - M
+      // To make hunspell work more happily, replace this morph rule with
+      // a compound flag as listed below.
+      //   AFX D 0 d/M e
+      std::vector<std::string> tokens;
+      SplitString(part, ' ', &tokens);
+      if (tokens.size() >= 5) {
+        part = StringPrintf("%s %s/%s %s",
+                            tokens[0].c_str(),
+                            tokens[1].c_str(), tokens[4].c_str(),
+                            tokens[2].c_str());
+      }
+    }
+
     size_t slash_index = part.find('/');
     if (slash_index != std::string::npos && !has_indexed_affixes()) {
       // This can also have a rule string associated with it following a
diff --git a/chrome/tools/convert_dict/convert_dict_unittest.cc b/chrome/tools/convert_dict/convert_dict_unittest.cc
new file mode 100644
index 0000000..e42800c
--- /dev/null
+++ b/chrome/tools/convert_dict/convert_dict_unittest.cc
@@ -0,0 +1,197 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include <map>
+#include <string>
+
+#include "base/file_util.h"
+#include "base/format_macros.h"
+#include "base/i18n/icu_string_conversions.h"
+#include "base/string_util.h"
+#include "chrome/tools/convert_dict/aff_reader.h"
+#include "chrome/tools/convert_dict/dic_reader.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "third_party/hunspell/google/bdict_reader.h"
+#include "third_party/hunspell/google/bdict_writer.h"
+
+namespace {
+
+// Compares the given word list with the serialized trie to make sure they
+// are the same.
+// (This function is copied from "chrome/tools/convert_dict/convert_dict.cc").
+bool VerifyWords(const convert_dict::DicReader::WordList& org_words,
+                 const std::string& serialized) {
+  hunspell::BDictReader reader;
+  EXPECT_TRUE(
+      reader.Init(reinterpret_cast<const unsigned char*>(serialized.data()),
+                  serialized.size()));
+
+  hunspell::WordIterator iter = reader.GetAllWordIterator();
+
+  int affix_ids[hunspell::BDict::MAX_AFFIXES_PER_WORD];
+
+  static const int kBufSize = 128;
+  char buf[kBufSize];
+  for (size_t i = 0; i < org_words.size(); i++) {
+    SCOPED_TRACE(StringPrintf("org_words[%" PRIuS "]: %s",
+                              i, org_words[i].first.c_str()));
+
+    int affix_matches = iter.Advance(buf, kBufSize, affix_ids);
+    EXPECT_NE(0, affix_matches);
+    EXPECT_EQ(org_words[i].first, std::string(buf));
+    EXPECT_EQ(affix_matches, static_cast<int>(org_words[i].second.size()));
+
+    // Check the individual affix indices.
+    for (size_t affix_index = 0; affix_index < org_words[i].second.size();
+         affix_index++) {
+      EXPECT_EQ(affix_ids[affix_index], org_words[i].second[affix_index]);
+    }
+  }
+
+  return true;
+}
+
+// Implements the test process used by ConvertDictTest.
+// This function encapsulates all complicated operations used by
+// ConvertDictTest so we can conceal them from the tests themselves.
+// This function consists of the following parts:
+//   * Creates a dummy affix file and a dictionary file.
+//   * Reads the dummy files.
+//   * Creates bdict data.
+//   * Verify the bdict data.
+void RunDictionaryTest(const char* codepage,
+                       const std::map<std::wstring, bool>& word_list) {
+  // Create an affix data and a dictionary data.
+  std::string aff_data(StringPrintf("SET %s\n", codepage));
+
+  std::string dic_data(StringPrintf("%" PRIuS "\n", word_list.size()));
+  for (std::map<std::wstring, bool>::const_iterator it = word_list.begin();
+       it != word_list.end(); ++it) {
+    std::string encoded_word;
+    EXPECT_TRUE(WideToCodepage(it->first,
+                               codepage,
+                               base::OnStringConversionError::FAIL,
+                               &encoded_word));
+    dic_data += encoded_word;
+    dic_data += "\n";
+  }
+
+  // Create a temporary affix file and a dictionary file from the test data.
+  FilePath aff_file;
+  file_util::CreateTemporaryFile(&aff_file);
+  file_util::WriteFile(aff_file, aff_data.c_str(), aff_data.length());
+
+  FilePath dic_file;
+  file_util::CreateTemporaryFile(&dic_file);
+  file_util::WriteFile(dic_file, dic_data.c_str(), dic_data.length());
+
+  {
+    // Read the above affix file with AffReader and read the dictionary file
+    // with DicReader, respectively.
+#if defined(OS_WIN)
+    std::string aff_path = WideToUTF8(aff_file.value());
+    std::string dic_path = WideToUTF8(dic_file.value());
+#else
+    std::string aff_path = aff_file.value();
+    std::string dic_path = dic_file.value();
+#endif
+    convert_dict::AffReader aff_reader(aff_path);
+    EXPECT_TRUE(aff_reader.Read());
+
+    convert_dict::DicReader dic_reader(dic_path);
+    EXPECT_TRUE(dic_reader.Read(&aff_reader));
+
+    // Verify this DicReader includes all the input words.
+    EXPECT_EQ(word_list.size(), dic_reader.words().size());
+    for (size_t i = 0; i < dic_reader.words().size(); ++i) {
+      SCOPED_TRACE(StringPrintf("dic_reader.words()[%" PRIuS "]: %s",
+                                i, dic_reader.words()[i].first.c_str()));
+      std::wstring word(UTF8ToWide(dic_reader.words()[i].first));
+      EXPECT_TRUE(word_list.find(word) != word_list.end());
+    }
+
+    // Create BDICT data and verify it.
+    hunspell::BDictWriter writer;
+    writer.SetComment(aff_reader.comments());
+    writer.SetAffixRules(aff_reader.affix_rules());
+    writer.SetAffixGroups(aff_reader.GetAffixGroups());
+    writer.SetReplacements(aff_reader.replacements());
+    writer.SetOtherCommands(aff_reader.other_commands());
+    writer.SetWords(dic_reader.words());
+
+    VerifyWords(dic_reader.words(), writer.GetBDict());
+  }
+
+  // Deletes the temporary files.
+  // We need to delete them after the above AffReader and DicReader are deleted
+  // since they close the input files in their destructors.
+  file_util::Delete(aff_file, false);
+  file_util::Delete(dic_file, false);
+}
+
+}  // namespace
+
+// Tests whether or not our DicReader can read all the input English words.
+TEST(ConvertDictTest, English) {
+  const char kCodepage[] = "UTF-8";
+  const wchar_t* kWords[] = {
+    L"I",
+    L"he",
+    L"she",
+    L"it",
+    L"we",
+    L"you",
+    L"they",
+  };
+
+  std::map<std::wstring, bool> word_list;
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
+    word_list.insert(std::make_pair<std::wstring, bool>(kWords[i], true));
+
+  RunDictionaryTest(kCodepage, word_list);
+}
+
+// Tests whether or not our DicReader can read all the input Russian words.
+TEST(ConvertDictTest, Russian) {
+  const char kCodepage[] = "KOI8-R";
+  const wchar_t* kWords[] = {
+    L"\x044f",
+    L"\x0442\x044b",
+    L"\x043e\x043d",
+    L"\x043e\x043d\x0430",
+    L"\x043e\x043d\x043e",
+    L"\x043c\x044b",
+    L"\x0432\x044b",
+    L"\x043e\x043d\x0438",
+  };
+
+  std::map<std::wstring, bool> word_list;
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
+    word_list.insert(std::make_pair<std::wstring, bool>(kWords[i], true));
+
+  RunDictionaryTest(kCodepage, word_list);
+}
+
+// Tests whether or not our DicReader can read all the input Hungarian words.
+TEST(ConvertDictTest, Hungarian) {
+  const char kCodepage[] = "ISO8859-2";
+  const wchar_t* kWords[] = {
+    L"\x00e9\x006e",
+    L"\x0074\x0065",
+    L"\x0151",
+    L"\x00f6\x006e",
+    L"\x006d\x0061\x0067\x0061",
+    L"\x006d\x0069",
+    L"\x0074\x0069",
+    L"\x0151\x006b",
+    L"\x00f6\x006e\x00f6\x006b",
+    L"\x006d\x0061\x0067\x0075\x006b",
+  };
+
+  std::map<std::wstring, bool> word_list;
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kWords); ++i)
+    word_list.insert(std::make_pair<std::wstring, bool>(kWords[i], true));
+
+  RunDictionaryTest(kCodepage, word_list);
+}
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc
index 70c30a9..2233d04 100644
--- a/chrome/tools/convert_dict/dic_reader.cc
+++ b/chrome/tools/convert_dict/dic_reader.cc
@@ -106,6 +106,13 @@ bool PopulateWordSet(WordSet* word_set, FILE* file, AffReader* aff_reader,
       affix_index = aff_reader->GetAFIndexForAFString(split[1]);
     }
 
+    // Discard the morphological description if it is attached to the first
+    // token. (It is attached to the first token if a word doesn't have affix
+    // rules.)
+    size_t word_tab_offset = utf8word.find('\t');
+    if (word_tab_offset != std::string::npos)
+      utf8word = utf8word.substr(0, word_tab_offset);
+
     WordSet::iterator found = word_set->find(utf8word);
     if (found == word_set->end()) {
       std::set<int> affix_vector;
diff --git a/chrome/tools/convert_dict/hunspell_reader.cc b/chrome/tools/convert_dict/hunspell_reader.cc
index b573b1c..d197c4d 100644
--- a/chrome/tools/convert_dict/hunspell_reader.cc
+++ b/chrome/tools/convert_dict/hunspell_reader.cc
@@ -20,10 +20,10 @@ void TrimLine(std::string* line) {
       static_cast<unsigned char>((*line)[2]) == 0xbf)
     *line = line->substr(3);
 
-  std::wstring line_input_wide = UTF8ToWide(*line);
-  std::wstring line_output_wide;
-  TrimWhitespace(line_input_wide, TRIM_ALL, &line_output_wide);
-  *line = WideToUTF8(line_output_wide);
+  // Treat this text as an ASCII text and trim whitespace characters as
+  // hunspell does. The returned text is to be converted into UTF-8 text with
+  // the encoding defined in an affix file.
+  TrimWhitespace(*line, TRIM_ALL, line);
 }
 
 std::string ReadLine(FILE* file) {
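As an aside, here is a minimal standalone sketch (not part of the patch; the entry "alma\tpo:noun" is made-up sample data) showing how the tab-stripping added in the dic_reader.cc hunk above reduces a .dic entry with a tab-separated morphological description to the bare word:

#include <cassert>
#include <string>

int main() {
  // Hypothetical .dic entry: a word followed by a tab-separated
  // morphological description.
  std::string utf8word = "alma\tpo:noun";

  // Same logic as the dic_reader.cc hunk: keep only the text before the
  // first tab, so only the word itself enters the word set.
  size_t word_tab_offset = utf8word.find('\t');
  if (word_tab_offset != std::string::npos)
    utf8word = utf8word.substr(0, word_tab_offset);

  assert(utf8word == "alma");
  return 0;
}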