Diffstat (limited to 'chrome/browser/spellcheck_worditerator.cc')
-rw-r--r-- | chrome/browser/spellcheck_worditerator.cc | 50
1 files changed, 29 insertions, 21 deletions
diff --git a/chrome/browser/spellcheck_worditerator.cc b/chrome/browser/spellcheck_worditerator.cc
index 7dc5b4f..2ea5879 100644
--- a/chrome/browser/spellcheck_worditerator.cc
+++ b/chrome/browser/spellcheck_worditerator.cc
@@ -10,8 +10,9 @@
 #include "base/basictypes.h"
 #include "base/string_util.h"
+#include "third_party/icu38/public/common/unicode/normlzr.h"
+#include "third_party/icu38/public/common/unicode/schriter.h"
 #include "third_party/icu38/public/common/unicode/uchar.h"
-#include "third_party/icu38/public/common/unicode/unorm.h"
 #include "third_party/icu38/public/common/unicode/uscript.h"
 #include "third_party/icu38/public/common/unicode/uset.h"
 #include "third_party/icu38/public/i18n/unicode/ulocdata.h"
@@ -80,6 +81,26 @@ void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {
       UChar32 character = uset_charAt(exemplar_set, i);
       SetWordScript(GetScriptCode(character), true);
     }
+
+    // Many languages use combining characters to input their characters from
+    // keyboards. On the other hand, this exemplar set does not always include
+    // combining characters for such languages.
+    // To treat such combining characters as word characters, we decompose
+    // this exemplar set and treat the decomposed characters as word characters.
+    UnicodeString composed;
+    for (int i = 0; i < length; ++i)
+      composed.append(uset_charAt(exemplar_set, i));
+
+    UnicodeString decomposed;
+    Normalizer::decompose(composed, FALSE, 0, decomposed, status);
+    if (U_SUCCESS(status)) {
+      StringCharacterIterator iterator(decomposed);
+      UChar32 character = iterator.first32();
+      while (character != CharacterIterator::DONE) {
+        SetWordScript(GetScriptCode(character), true);
+        character = iterator.next32();
+      }
+    }
   }
   uset_close(exemplar_set);
 }
@@ -246,25 +267,12 @@ bool SpellcheckWordIterator::Normalize(int input_start,
   // does not only write NFKD and NFKC can compose ligatures into their ASCII
   // alternatives, but also write NFKC keeps accents of characters.
   // Therefore, NFKC seems to be the best option for hunspell.
-  // To use NKFC for normalization, the length of the output string is mostly
-  // equal to the one of the input string. (One exception is ligatures.)
-  // To avoid the unorm_normalize() function from being called always twice,
-  // we temporarily allocate |input_length| + 1 characters to the output string
-  // and call the function with it. We re-allocate the output string
-  // only if it cannot store the normalized string, i.e. the output string is
-  // longer than the input one.
-  const char16* input_string = &word_[input_start];
-  UErrorCode error_code = U_ZERO_ERROR;
-  int output_length = input_length + 1;
-  char16* output_buffer = WriteInto(output_string, output_length);
-  output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
-                                  output_buffer, output_length, &error_code);
-  if (error_code == U_BUFFER_OVERFLOW_ERROR) {
-    error_code = U_ZERO_ERROR;
-    output_buffer = WriteInto(output_string, ++output_length);
-    output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
-                                    output_buffer, output_length, &error_code);
-  }
-  return (error_code == U_ZERO_ERROR);
+  UnicodeString input(FALSE, &word_[input_start], input_length);
+  UErrorCode status = U_ZERO_ERROR;
+  UnicodeString output;
+  Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
+  if (U_SUCCESS(status))
+    output_string->assign(output.getTerminatedBuffer());
+  return (status == U_ZERO_ERROR);
 }
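
The second hunk's reasoning (locale exemplar sets often omit the combining marks that keyboards actually produce, so the set is decomposed and the decomposed code points are also registered as word characters) can be exercised outside Chromium. Below is a minimal standalone sketch, not Chromium code, built on the same ICU calls the patch uses (ulocdata_getExemplarSet, Normalizer::decompose, StringCharacterIterator). The "vi" locale is only an illustrative choice, and the build setup (link against the ICU common and i18n libraries, e.g. -licuuc -licui18n) is an assumption about your environment.

// Standalone sketch (assumed setup): decompose a locale's exemplar set and
// walk the resulting code points, mirroring the pattern added to
// SpellcheckCharAttribute::SetDefaultLanguage().
#include <cstdio>

#include "unicode/normlzr.h"
#include "unicode/schriter.h"
#include "unicode/ulocdata.h"
#include "unicode/unistr.h"
#include "unicode/uset.h"

int main() {
  UErrorCode status = U_ZERO_ERROR;
  // "vi" (Vietnamese) is an illustrative locale with many combining marks.
  ULocaleData* locale_data = ulocdata_open("vi", &status);
  USet* exemplar_set = uset_open(1, 0);  // uset_open(1, 0) creates an empty set.
  ulocdata_getExemplarSet(locale_data, exemplar_set, 0,
                          ULOCDATA_ES_STANDARD, &status);

  if (U_SUCCESS(status)) {
    // Concatenate every character in the exemplar set, then canonically
    // decompose the whole string (false = canonical decomposition, i.e. NFD).
    icu::UnicodeString composed;
    int32_t length = uset_size(exemplar_set);
    for (int32_t i = 0; i < length; ++i)
      composed.append(uset_charAt(exemplar_set, i));

    icu::UnicodeString decomposed;
    icu::Normalizer::decompose(composed, false, 0, decomposed, status);
    if (U_SUCCESS(status)) {
      // Each combining mark (e.g. U+0301 COMBINING ACUTE ACCENT) now appears
      // as its own code point; the patch registers these as word characters.
      icu::StringCharacterIterator iterator(decomposed);
      for (UChar32 c = iterator.first32(); c != icu::CharacterIterator::DONE;
           c = iterator.next32()) {
        std::printf("U+%04X\n", static_cast<unsigned>(c));
      }
    }
  }

  uset_close(exemplar_set);
  ulocdata_close(locale_data);
  return U_SUCCESS(status) ? 0 : 1;
}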
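
The third hunk replaces the C API's two-pass unorm_normalize() buffer handling (guess a length, retry on U_BUFFER_OVERFLOW_ERROR) with the C++ Normalizer::normalize(), which sizes its UnicodeString result itself. Here is a minimal standalone sketch of that NFKC path; it is not Chromium code, and the ligature input is only illustrative.

// Standalone sketch (assumed setup): NFKC-normalize a string with ICU's
// Normalizer class, the call SpellcheckWordIterator::Normalize() switches to.
#include <cstdio>

#include "unicode/normlzr.h"
#include "unicode/unistr.h"
#include "unicode/unorm.h"

int main() {
  // U+FB01 LATIN SMALL LIGATURE FI; NFKC expands it to "f" followed by "i",
  // which is the form hunspell can look up.
  icu::UnicodeString input;
  input.append((UChar32)0xFB01);

  UErrorCode status = U_ZERO_ERROR;
  icu::UnicodeString output;
  // Normalizer::normalize() grows |output| as needed, so no preflight call or
  // U_BUFFER_OVERFLOW_ERROR retry is required.
  icu::Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
  if (U_FAILURE(status))
    return 1;

  // getTerminatedBuffer() returns a NUL-terminated UTF-16 buffer, which is
  // what the patch hands to output_string->assign().
  for (const UChar* p = output.getTerminatedBuffer(); *p != 0; ++p)
    std::printf("U+%04X\n", static_cast<unsigned>(*p));
  return 0;
}

Note that the Normalizer class is the older ICU interface; newer ICU releases offer Normalizer2, but the class used here matches what icu38 in this tree provides.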