summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-02-20 09:10:03 +0000
committerhbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2009-02-20 09:10:03 +0000
commiteba18c8ab6c52e1673f65b6d82ee5559e4556f9a (patch)
treefb9f021426196a4f3b3671a2017580c9abd17fc7
parent6448e5aac7ebef9627a330d9c66c3dfb24517ff3 (diff)
downloadchromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.zip
chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.tar.gz
chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.tar.bz2
Fix for Issue 6431 "Two issues about Vietnamese Spell-Checker".
This issue is caused by my stupid mistake in the SpellcheckWordIterator class. Unfortunately, the class does not treat combining characters as word characters for languages that use combining characters (e.g. Vietnamese and Thai) because the ICU exemplar set is canonicalized and does not include combining characters. To fix this, this change decomposes the exemplar set and also marks the decomposed characters (including combining characters) as word characters. BUG=6431 Review URL: http://codereview.chromium.org/21079 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@10087 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--chrome/browser/spellcheck_worditerator.cc50
1 files changed, 29 insertions, 21 deletions
diff --git a/chrome/browser/spellcheck_worditerator.cc b/chrome/browser/spellcheck_worditerator.cc
index 7dc5b4f..2ea5879 100644
--- a/chrome/browser/spellcheck_worditerator.cc
+++ b/chrome/browser/spellcheck_worditerator.cc
@@ -10,8 +10,9 @@
#include "base/basictypes.h"
#include "base/string_util.h"
+#include "third_party/icu38/public/common/unicode/normlzr.h"
+#include "third_party/icu38/public/common/unicode/schriter.h"
#include "third_party/icu38/public/common/unicode/uchar.h"
-#include "third_party/icu38/public/common/unicode/unorm.h"
#include "third_party/icu38/public/common/unicode/uscript.h"
#include "third_party/icu38/public/common/unicode/uset.h"
#include "third_party/icu38/public/i18n/unicode/ulocdata.h"
@@ -80,6 +81,26 @@ void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {
UChar32 character = uset_charAt(exemplar_set, i);
SetWordScript(GetScriptCode(character), true);
}
+
+ // Many languages use combining characters to input their characters from
+ // keyboards. On the other hand, this exemplar set does not always include
+ // combining characters for such languages.
+ // To treat such combining characters as word characters, we decompose
+ // this exemplar set and treat the decomposed characters as word characters.
+ UnicodeString composed;
+ for (int i = 0; i < length; ++i)
+ composed.append(uset_charAt(exemplar_set, i));
+
+ UnicodeString decomposed;
+ Normalizer::decompose(composed, FALSE, 0, decomposed, status);
+ if (U_SUCCESS(status)) {
+ StringCharacterIterator iterator(decomposed);
+ UChar32 character = iterator.first32();
+ while (character != CharacterIterator::DONE) {
+ SetWordScript(GetScriptCode(character), true);
+ character = iterator.next32();
+ }
+ }
}
uset_close(exemplar_set);
}
@@ -246,25 +267,12 @@ bool SpellcheckWordIterator::Normalize(int input_start,
// does not only write NFKD and NFKC can compose ligatures into their ASCII
// alternatives, but also write NFKC keeps accents of characters.
// Therefore, NFKC seems to be the best option for hunspell.
- // To use NKFC for normalization, the length of the output string is mostly
- // equal to the one of the input string. (One exception is ligatures.)
- // To avoid the unorm_normalize() function from being called always twice,
- // we temporarily allocate |input_length| + 1 characters to the output string
- // and call the function with it. We re-allocate the output string
- // only if it cannot store the normalized string, i.e. the output string is
- // longer than the input one.
- const char16* input_string = &word_[input_start];
- UErrorCode error_code = U_ZERO_ERROR;
- int output_length = input_length + 1;
- char16* output_buffer = WriteInto(output_string, output_length);
- output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
- output_buffer, output_length, &error_code);
- if (error_code == U_BUFFER_OVERFLOW_ERROR) {
- error_code = U_ZERO_ERROR;
- output_buffer = WriteInto(output_string, ++output_length);
- output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
- output_buffer, output_length, &error_code);
- }
- return (error_code == U_ZERO_ERROR);
+ UnicodeString input(FALSE, &word_[input_start], input_length);
+ UErrorCode status = U_ZERO_ERROR;
+ UnicodeString output;
+ Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
+ if (U_SUCCESS(status))
+ output_string->assign(output.getTerminatedBuffer());
+ return (status == U_ZERO_ERROR);
}