diff options
author | initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-07-26 23:55:29 +0000 |
---|---|---|
committer | initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-07-26 23:55:29 +0000 |
commit | 09911bf300f1a419907a9412154760efd0b7abc3 (patch) | |
tree | f131325fb4e2ad12c6d3504ab75b16dd92facfed /chrome/tools/convert_dict/dic_reader.cc | |
parent | 586acc5fe142f498261f52c66862fa417c3d52d2 (diff) | |
download | chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.zip chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.tar.gz chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.tar.bz2 |
Add chrome to the repository.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@15 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/tools/convert_dict/dic_reader.cc')
-rw-r--r-- | chrome/tools/convert_dict/dic_reader.cc | 165 |
1 files changed, 165 insertions, 0 deletions
diff --git a/chrome/tools/convert_dict/dic_reader.cc b/chrome/tools/convert_dict/dic_reader.cc new file mode 100644 index 0000000..8358e13 --- /dev/null +++ b/chrome/tools/convert_dict/dic_reader.cc @@ -0,0 +1,165 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "chrome/tools/convert_dict/dic_reader.h" + +#include <algorithm> +#include <set> + +#include "base/string_util.h" +#include "chrome/tools/convert_dict/aff_reader.h" +#include "chrome/tools/convert_dict/hunspell_reader.h" + +namespace convert_dict { + +namespace { + +// Maps each unique word to the unique affix group IDs associated with it. +typedef std::map<std::string, std::set<int> > WordSet; + +void SplitDicLine(const std::string& line, std::vector<std::string>* output) { + // We split the line on a slash not preceeded by a backslash. A slash at the + // beginning of the line is not a separator either. + size_t slash_index = line.size(); + for (size_t i = 0; i < line.size(); i++) { + if (line[i] == '/' && i > 0 && line[i - 1] != '\\') { + slash_index = i; + break; + } + } + + output->clear(); + + // Everything before the slash index is the first term. We also need to + // convert all escaped slashes ("\/" sequences) to regular slashes. + std::string word = line.substr(0, slash_index); + ReplaceSubstringsAfterOffset(&word, 0, "\\/", "/"); + output->push_back(word); + + // Everything (if anything) after the slash is the second. + if (slash_index < line.size() - 1) + output->push_back(line.substr(slash_index + 1)); +} + +} // namespace + +DicReader::DicReader(const std::string& filename) { + fopen_s(&file_, filename.c_str(), "r"); +} + +DicReader::~DicReader() { + if (file_) + fclose(file_); +} + +bool DicReader::Read(AffReader* aff_reader) { + if (!file_) + return false; + + bool got_count = false; + int line_number = 0; + + WordSet word_set; + while (!feof(file_)) { + std::string line = ReadLine(file_); + line_number++; + StripComment(&line); + if (line.empty()) + continue; + + if (!got_count) { + // Skip the first nonempty line, this is the line count. We don't bother + // with it and just read all the lines. + got_count = true; + continue; + } + + std::vector<std::string> split; + SplitDicLine(line, &split); + if (split.size() == 0 || split.size() > 2) { + printf("Line %d has extra slashes in the dic file\n", line_number); + return false; + } + + // The first part is the word, the second (optional) part is the affix. We + // always use UTF-8 as the encoding to simplify life. + std::string utf8word; + if (!aff_reader->EncodingToUTF8(split[0], &utf8word)) { + printf("Unable to convert line %d from %s to UTF-8 in the dic file\n", + line_number, aff_reader->encoding()); + return false; + } + + // We always convert the affix to an index. 0 means no affix. + int affix_index = 0; + if (split.size() == 2) { + // Got a rule, which is the stuff after the slash. The line may also have + // an optional term separated by a tab. This is the morphological + // description. We don't care about this (it is used in the tests to + // generate a nice dump), so we remove it. + size_t split1_tab_offset = split[1].find('\t'); + if (split1_tab_offset != std::string::npos) + split[1] = split[1].substr(0, split1_tab_offset); + + if (aff_reader->has_indexed_affixes()) + affix_index = atoi(split[1].c_str()); + else + affix_index = aff_reader->GetAFIndexForAFString(split[1]); + } + + WordSet::iterator found = word_set.find(utf8word); + if (found == word_set.end()) { + std::set<int> affix_vector; + affix_vector.insert(affix_index); + word_set.insert(std::make_pair(utf8word, affix_vector)); + } else { + found->second.insert(affix_index); + } + } + + // Make sure the words are sorted, they may be unsorted in the input. + for (WordSet::iterator word = word_set.begin(); word != word_set.end(); + ++word) { + std::vector<int> affixes; + for (std::set<int>::iterator aff = word->second.begin(); + aff != word->second.end(); ++aff) + affixes.push_back(*aff); + + // Double check that the affixes are sorted. This isn't strictly necessary + // but it's nice for the file to have a fixed layout. + std::sort(affixes.begin(), affixes.end()); + words_.push_back(std::make_pair(word->first, affixes)); + } + + // Double-check that the words are sorted. + std::sort(words_.begin(), words_.end()); + return true; +} + +} // namespace convert_dict |