summaryrefslogtreecommitdiffstats
path: root/chrome/renderer/spellchecker/spellcheck_worditerator.cc
diff options
context:
space:
mode:
authorestade@chromium.org <estade@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-11-06 03:05:46 +0000
committerestade@chromium.org <estade@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-11-06 03:05:46 +0000
commit85c55dcd717445cd3763b5c94f9902b4cdd194b0 (patch)
tree2deea721cfac202e3eb8556f66a4cf317a331288 /chrome/renderer/spellchecker/spellcheck_worditerator.cc
parentf1a8b962f0a6f1deb6c8c05a3f86d541e2ba61dd (diff)
downloadchromium_src-85c55dcd717445cd3763b5c94f9902b4cdd194b0.zip
chromium_src-85c55dcd717445cd3763b5c94f9902b4cdd194b0.tar.gz
chromium_src-85c55dcd717445cd3763b5c94f9902b4cdd194b0.tar.bz2
Move the spellchecker to the renderer.
The motivation is that this removes the sync IPC on every call to the spellchecker. Also, currently we spellcheck in the IO thread, which frequently needs to go to disk (in particular, the entire spellcheck dictionary starts paged out), so this will block just the single renderer when that happens, rather than the whole IO thread. This breaks the SpellChecker class into two new classes. 1) On the browser side, we have SpellCheckHost. This class handles browser-wide tasks, such as keeping the custom words list in sync with the on-disk custom words dictionary, downloading missing dictionaries, etc. On Posix, it also opens the bdic file since the renderer isn't allowed to open files. SpellCheckHost is created and destroyed on the UI thread. It is initialized on the file thread. 2) On the renderer side, SpellChecker2. This class will one day be renamed SpellChecker. It handles actual checking of the words, memory maps the dictionary file, loads hunspell, etc. There is one SpellChecker2 per RenderThread (hence one per render process). My intention is for this patch to move Linux to this new approach, and follow up with ports for Windows (which will involve passing a dictionary file name rather than a file descriptor through to the renderer) and Mac (which will involve adding sync ViewHost IPC calls for when the platform spellchecker is enabled). Note that anyone using the platform spellchecker rather than Hunspell will get no benefit out of this refactor. There should be no loss of functionality for Linux (or any other platform) in this patch. The following should all still work: - dictionary is loaded lazily - hunspell is initialized lazily, per renderer - language changes work. - Dynamic downloading of new dictionaries - auto spell correct works (as well as toggling it). - disabling spellcheck works.
- custom words work (including adding in one renderer and immediately having it take effect in other renderers, for certain values of "immediate") TODO: - move spellchecker unit tests to test SpellCheck2 - add sync IPC for platform spellchecker; port to Mac - add dictionary location fallback; port to Windows - remove SpellChecker classes from browser/ BUG=25677 Review URL: http://codereview.chromium.org/357003 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@31199 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer/spellchecker/spellcheck_worditerator.cc')
-rw-r--r--chrome/renderer/spellchecker/spellcheck_worditerator.cc274
1 files changed, 274 insertions, 0 deletions
diff --git a/chrome/renderer/spellchecker/spellcheck_worditerator.cc b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
new file mode 100644
index 0000000..827d9ee
--- /dev/null
+++ b/chrome/renderer/spellchecker/spellcheck_worditerator.cc
@@ -0,0 +1,274 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/renderer/spellchecker/spellcheck_worditerator.h"
+
+#include <map>
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/string_util.h"
+#include "chrome/renderer/spellchecker/spellcheck.h"
+
+#include "third_party/icu/public/common/unicode/normlzr.h"
+#include "third_party/icu/public/common/unicode/schriter.h"
+#include "third_party/icu/public/common/unicode/uchar.h"
+#include "third_party/icu/public/common/unicode/uscript.h"
+#include "third_party/icu/public/common/unicode/uset.h"
+#include "third_party/icu/public/i18n/unicode/ulocdata.h"
+
+// Builds the character-attribute tables: clears the script table, marks
+// USCRIPT_COMMON as a non-word script, and registers the contraction
+// characters.
+SpellcheckCharAttribute::SpellcheckCharAttribute() {
+  InitializeScriptTable();
+
+  // Many dictionaries treat numbers and contractions as words, and therefore
+  // treat USCRIPT_COMMON characters as word characters. SpellcheckWordIterator
+  // deliberately does not, so that contraction characters stay strictly
+  // distinguishable from word characters.
+  SetWordScript(USCRIPT_COMMON, false);
+
+  // Code points usable inside contractions: the 'MidLetter' and 'MidNumLet'
+  // entries of the Unicode word-break property list,
+  // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakProperty.txt
+  static const UChar32 kContractionChars[] = {
+    // MidLetter
+    0x003A,  // COLON
+    0x00B7,  // MIDDLE DOT
+    0x0387,  // GREEK ANO TELEIA
+    0x05F4,  // HEBREW PUNCTUATION GERSHAYIM
+    0x2027,  // HYPHENATION POINT
+    0xFE13,  // PRESENTATION FORM FOR VERTICAL COLON
+    0xFE55,  // SMALL COLON
+    0xFF1A,  // FULLWIDTH COLON
+    // MidNumLet
+    0x0027,  // APOSTROPHE
+    0x002E,  // FULL STOP
+    0x2018,  // LEFT SINGLE QUOTATION MARK
+    0x2019,  // RIGHT SINGLE QUOTATION MARK
+    0x2024,  // ONE DOT LEADER
+    0xFE52,  // SMALL FULL STOP
+    0xFF07,  // FULLWIDTH APOSTROPHE
+    0xFF0E,  // FULLWIDTH FULL STOP
+  };
+  const UChar32* const table_end =
+      kContractionChars + arraysize(kContractionChars);
+  for (const UChar32* entry = kContractionChars; entry != table_end; ++entry)
+    middle_letters_[*entry] = true;
+}
+
+// Nothing to release explicitly; the destructor body is intentionally empty.
+SpellcheckCharAttribute::~SpellcheckCharAttribute() {}
+
+// Selects |language| as the default language of this object.
+// The exemplar set of the language is looked up through ICU, and the script of
+// every exemplar character (and of every character in its decomposition) is
+// marked as a word script. If ICU reports a failure while opening the locale
+// data or fetching the exemplar set, the script table is left unchanged.
+void SpellcheckCharAttribute::SetDefaultLanguage(const std::string& language) {
+  UErrorCode error = U_ZERO_ERROR;
+  ULocaleData* const locale = ulocdata_open(language.c_str(), &error);
+  if (U_FAILURE(error))
+    return;
+
+  // Fetch the standard exemplar set; the locale data is not needed afterwards.
+  USet* const exemplars = uset_open(1, 0);
+  ulocdata_getExemplarSet(locale, exemplars, 0, ULOCDATA_ES_STANDARD, &error);
+  ulocdata_close(locale);
+  if (U_FAILURE(error)) {
+    uset_close(exemplars);
+    return;
+  }
+
+  // Mark the script of each exemplar character as a word script, and collect
+  // the characters into one string for the decomposition step below.
+  const int exemplar_count = uset_size(exemplars);
+  icu::UnicodeString composed;
+  for (int index = 0; index < exemplar_count; ++index) {
+    const UChar32 code_point = uset_charAt(exemplars, index);
+    SetWordScript(GetScriptCode(code_point), true);
+    composed.append(code_point);
+  }
+
+  // Many languages are typed with combining characters, yet the exemplar set
+  // does not always list them. Decomposing the exemplar characters and
+  // registering the scripts of the pieces covers such combining characters.
+  icu::UnicodeString decomposed;
+  icu::Normalizer::decompose(composed, FALSE, 0, decomposed, error);
+  if (U_SUCCESS(error)) {
+    icu::StringCharacterIterator walker(decomposed);
+    for (UChar32 code_point = walker.first32();
+         code_point != icu::CharacterIterator::DONE;
+         code_point = walker.next32()) {
+      SetWordScript(GetScriptCode(code_point), true);
+    }
+  }
+  uset_close(exemplars);
+}
+
+// Tells whether |character| counts as a word character: it must not be a
+// digit, and its script must be marked as a word script.
+bool SpellcheckCharAttribute::IsWordChar(UChar32 character) const {
+  if (u_isdigit(character))
+    return false;
+  return IsWordScript(GetScriptCode(character));
+}
+
+// Tells whether |character| is registered in the contraction-character table.
+// Characters without an entry are reported as non-contraction characters.
+bool SpellcheckCharAttribute::IsContractionChar(UChar32 character) const {
+  const std::map<UChar32, bool>::const_iterator entry =
+      middle_letters_.find(character);
+  return entry != middle_letters_.end() && entry->second;
+}
+
+// Resets the script table so that no script is treated as a word script.
+void SpellcheckCharAttribute::InitializeScriptTable() {
+  size_t remaining = arraysize(script_attributes_);
+  while (remaining > 0)
+    script_attributes_[--remaining] = false;
+}
+
+// Looks up the ICU script code of |character|. Returns USCRIPT_INVALID_CODE
+// when ICU reports a failure for the lookup.
+UScriptCode SpellcheckCharAttribute::GetScriptCode(UChar32 character) const {
+  UErrorCode error = U_ZERO_ERROR;
+  const UScriptCode script = uscript_getScript(character, &error);
+  if (U_FAILURE(error))
+    return USCRIPT_INVALID_CODE;
+  return script;
+}
+
+// Records in the script table whether the script identified by |script_code|
+// is used by the selected dictionary (|in_use|). Codes that fall outside the
+// table are silently ignored.
+void SpellcheckCharAttribute::SetWordScript(const int script_code,
+                                            bool in_use) {
+  const bool in_table =
+      script_code >= 0 &&
+      static_cast<size_t>(script_code) < arraysize(script_attributes_);
+  if (in_table)
+    script_attributes_[script_code] = in_use;
+}
+
+// Reports whether the script identified by |script_code| is marked as used by
+// the selected dictionary. Codes that fall outside the table yield false.
+bool SpellcheckCharAttribute::IsWordScript(
+    const UScriptCode script_code) const {
+  const bool in_table =
+      script_code >= 0 &&
+      static_cast<size_t>(script_code) < arraysize(script_attributes_);
+  return in_table && script_attributes_[script_code];
+}
+
+// Creates an iterator with no text and no attribute table attached.
+// Initialize() supplies both before the iterator is used.
+SpellcheckWordIterator::SpellcheckWordIterator()
+    : word_(NULL),
+      length_(0),
+      position_(0),
+      allow_contraction_(false),
+      attribute_(NULL) {}
+
+// The destructor body is intentionally empty; it releases nothing explicitly.
+SpellcheckWordIterator::~SpellcheckWordIterator() {}
+
+// Points this iterator at a new text buffer and rewinds it to the beginning.
+// |attribute| is the character-attribute table consulted while scanning,
+// |word| and |length| describe the UTF-16 text (stored by pointer, not
+// copied), and |allow_contraction| controls whether contraction characters
+// may appear inside a segment.
+void SpellcheckWordIterator::Initialize(
+    const SpellcheckCharAttribute* attribute,
+    const char16* word,
+    size_t length,
+    bool allow_contraction) {
+  attribute_ = attribute;
+  allow_contraction_ = allow_contraction;
+  word_ = word;
+  length_ = static_cast<int>(length);
+  position_ = 0;
+}
+
+// Retrieves the next word (or contraction) from the text.
+// When a contraction is enclosed with contraction characters (e.g. 'isn't',
+// 'rock'n'roll'), the enclosing characters at the beginning and the end are
+// discarded, but the contraction itself is never split. To achieve this, a
+// segment consisting of word characters and contraction characters is
+// extracted first, and the non-word characters at both of its ends are then
+// trimmed away.
+//
+// On success, returns true, stores the normalized word in |word_string| and
+// its position in the original text in |word_start| and |word_length|.
+// Returns false when no further word is found or when normalization does not
+// finish with U_ZERO_ERROR or U_STRING_NOT_TERMINATED_WARNING.
+bool SpellcheckWordIterator::GetNextWord(string16* word_string,
+                                         int* word_start,
+                                         int* word_length) {
+  // Reset every output up front so that a false return never leaves a stale
+  // word behind. Note that clear() is required here: empty() merely queries
+  // the string and leaves its contents untouched.
+  word_string->clear();
+  *word_start = 0;
+  *word_length = 0;
+  while (position_ < length_) {
+    int segment_start = 0;
+    int segment_end = 0;
+    GetSegment(&segment_start, &segment_end);
+    TrimSegment(segment_start, segment_end, word_start, word_length);
+    // A segment made only of non-word characters trims down to nothing;
+    // keep scanning in that case.
+    if (*word_length > 0)
+      return Normalize(*word_start, *word_length, word_string);
+  }
+
+  return false;
+}
+
+// Extracts the next segment, starting at the current position, and advances
+// the current position past it. A segment is a run of word characters (plus
+// contraction characters when |allow_contraction_| is true). The character
+// that terminates the run is consumed as part of the segment, so a segment
+// starting on a non-word character consists of that character alone; the
+// TrimSegment() function later reduces such a segment to an empty word
+// (i.e. |word_length| == 0).
+void SpellcheckWordIterator::GetSegment(int* segment_start,
+                                        int* segment_end) {
+  int cursor = position_;
+  while (cursor < length_) {
+    UChar32 code_point;
+    U16_NEXT(word_, cursor, length_, code_point);
+    const bool belongs_to_segment =
+        attribute_->IsWordChar(code_point) ||
+        (allow_contraction_ && attribute_->IsContractionChar(code_point));
+    if (!belongs_to_segment)
+      break;
+  }
+  *segment_start = position_;
+  *segment_end = cursor;
+  position_ = cursor;
+}
+
+// Discards non-word characters at the beginning and the end of the segment
+// [|segment_start|, |segment_end|).
+// When the segment contains a word character, |word_start| receives the index
+// of the first word character and |word_length| the number of UTF-16 code
+// units up to and including the last word character. When it contains none,
+// neither output is written, so callers must pre-initialize them.
+void SpellcheckWordIterator::TrimSegment(int segment_start,
+                                         int segment_end,
+                                         int* word_start,
+                                         int* word_length) const {
+  // Skip leading non-word characters.
+  while (segment_start < segment_end) {
+    UChar32 character;
+    int segment_next = segment_start;
+    U16_NEXT(word_, segment_next, segment_end, character);
+    if (attribute_->IsWordChar(character)) {
+      *word_start = segment_start;
+      break;
+    }
+    segment_start = segment_next;
+  }
+  // Skip trailing non-word characters. The comparison is strict so that an
+  // already-empty range is never scanned: U16_PREV would otherwise read the
+  // code unit located before |segment_start|.
+  while (segment_end > segment_start) {
+    UChar32 character;
+    int segment_prev = segment_end;
+    U16_PREV(word_, segment_start, segment_prev, character);
+    if (attribute_->IsWordChar(character)) {
+      *word_length = segment_end - segment_start;
+      break;
+    }
+    segment_end = segment_prev;
+  }
+}
+
+// Converts |input_length| code units of the (non-terminated) text starting at
+// |input_start| into their NFKC form and stores the result in
+// |output_string|, so that a spellchecker can check words which contain
+// ligatures, full-width letters, etc.
+// USCRIPT_LATIN does not only consist of US-ASCII and ISO/IEC 8859-1; it also
+// covers ISO/IEC 8859-{2,3,4,9,10}, ligatures, fullwidth latin, etc. See the
+// script table in "http://www.unicode.org/Public/UNIDATA/Scripts.txt".
+// |output_string| is written only when ICU reports success. The return value
+// is true only for U_ZERO_ERROR or U_STRING_NOT_TERMINATED_WARNING.
+bool SpellcheckWordIterator::Normalize(int input_start,
+                                       int input_length,
+                                       string16* output_string) const {
+  // According to Unicode Standard Annex #15
+  // "http://www.unicode.org/unicode/reports/tr15/", both NFKD and NFKC map
+  // ligatures to their ASCII alternatives, while NFKC additionally keeps the
+  // accents of characters. NFKC therefore seems to be the best option for
+  // hunspell.
+  const icu::UnicodeString source(FALSE, &word_[input_start], input_length);
+  icu::UnicodeString normalized;
+  UErrorCode error = U_ZERO_ERROR;
+  icu::Normalizer::normalize(source, UNORM_NFKC, 0, normalized, error);
+  if (U_SUCCESS(error))
+    output_string->assign(normalized.getTerminatedBuffer());
+  return error == U_ZERO_ERROR || error == U_STRING_NOT_TERMINATED_WARNING;
+}