diff options
author | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-02-20 09:10:03 +0000 |
---|---|---|
committer | hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-02-20 09:10:03 +0000 |
commit | eba18c8ab6c52e1673f65b6d82ee5559e4556f9a (patch) | |
tree | fb9f021426196a4f3b3671a2017580c9abd17fc7 | |
parent | 6448e5aac7ebef9627a330d9c66c3dfb24517ff3 (diff) | |
download | chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.zip chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.tar.gz chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.tar.bz2 |
Fix for Issue 6431 "Two issues about Vietnamese Spell-Checker".
This issue is caused by my stupid mistake in the SpellcheckWordIterator class. Unfortunately, the class does not treat combining characters as word characters for languages that use combining characters (e.g. Vietnamese, Thai, etc.) because the ICU exemplar set is canonicalized and does not include combining characters.
To fix this, this change decomposes the exemplar set and also marks the decomposed characters (including combining characters) as word characters.
BUG=6431
Review URL: http://codereview.chromium.org/21079
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@10087 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | chrome/browser/spellcheck_worditerator.cc | 50 |
1 files changed, 29 insertions, 21 deletions
diff --git a/chrome/browser/spellcheck_worditerator.cc b/chrome/browser/spellcheck_worditerator.cc index 7dc5b4f..2ea5879 100644 --- a/chrome/browser/spellcheck_worditerator.cc +++ b/chrome/browser/spellcheck_worditerator.cc @@ -10,8 +10,9 @@ #include "base/basictypes.h" #include "base/string_util.h" +#include "third_party/icu38/public/common/unicode/normlzr.h" +#include "third_party/icu38/public/common/unicode/schriter.h" #include "third_party/icu38/public/common/unicode/uchar.h" -#include "third_party/icu38/public/common/unicode/unorm.h" #include "third_party/icu38/public/common/unicode/uscript.h" #include "third_party/icu38/public/common/unicode/uset.h" #include "third_party/icu38/public/i18n/unicode/ulocdata.h" @@ -80,6 +81,26 @@ void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) { UChar32 character = uset_charAt(exemplar_set, i); SetWordScript(GetScriptCode(character), true); } + + // Many languages use combining characters to input their characters from + // keyboards. On the other hand, this exemplar set does not always include + // combining characters for such languages. + // To treat such combining characters as word characters, we decompose + // this exemplar set and treat the decomposed characters as word characters. + UnicodeString composed; + for (int i = 0; i < length; ++i) + composed.append(uset_charAt(exemplar_set, i)); + + UnicodeString decomposed; + Normalizer::decompose(composed, FALSE, 0, decomposed, status); + if (U_SUCCESS(status)) { + StringCharacterIterator iterator(decomposed); + UChar32 character = iterator.first32(); + while (character != CharacterIterator::DONE) { + SetWordScript(GetScriptCode(character), true); + character = iterator.next32(); + } + } } uset_close(exemplar_set); } @@ -246,25 +267,12 @@ bool SpellcheckWordIterator::Normalize(int input_start, // does not only write NFKD and NFKC can compose ligatures into their ASCII // alternatives, but also write NFKC keeps accents of characters. 
// Therefore, NFKC seems to be the best option for hunspell. - // To use NKFC for normalization, the length of the output string is mostly - // equal to the one of the input string. (One exception is ligatures.) - // To avoid the unorm_normalize() function from being called always twice, - // we temporarily allocate |input_length| + 1 characters to the output string - // and call the function with it. We re-allocate the output string - // only if it cannot store the normalized string, i.e. the output string is - // longer than the input one. - const char16* input_string = &word_[input_start]; - UErrorCode error_code = U_ZERO_ERROR; - int output_length = input_length + 1; - char16* output_buffer = WriteInto(output_string, output_length); - output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, - output_buffer, output_length, &error_code); - if (error_code == U_BUFFER_OVERFLOW_ERROR) { - error_code = U_ZERO_ERROR; - output_buffer = WriteInto(output_string, ++output_length); - output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0, - output_buffer, output_length, &error_code); - } - return (error_code == U_ZERO_ERROR); + UnicodeString input(FALSE, &word_[input_start], input_length); + UErrorCode status = U_ZERO_ERROR; + UnicodeString output; + Normalizer::normalize(input, UNORM_NFKC, 0, output, status); + if (U_SUCCESS(status)) + output_string->assign(output.getTerminatedBuffer()); + return (status == U_ZERO_ERROR); } |