Fix for Issue 6431 "Two issues about Vietnamese Spell-Checker".

This issue is caused by my stupid mistake in the SpellcheckWordIterator class. Unfortunately, the class does not treat combining characters as word characters for languages which use combining characters (e.g. Vietnamese, Thai) because the ICU exemplar set is canonicalized and does not include combining characters. To fix this, this change decomposes the exemplar set and also marks the decomposed characters (including combining characters) as word characters. BUG=6431 Review URL: http://codereview.chromium.org/21079 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@10087 0039d316-1c4b-4281-b951-d872f2087c98
author: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-02-20 09:10:03 +0000
committer: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-02-20 09:10:03 +0000
commit: eba18c8ab6c52e1673f65b6d82ee5559e4556f9a (patch)
tree: fb9f021426196a4f3b3671a2017580c9abd17fc7
parent: 6448e5aac7ebef9627a330d9c66c3dfb24517ff3 (diff)
download: chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.zip
chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.tar.gz
chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.tar.bz2
1 files changed, 29 insertions, 21 deletions
diff --git a/chrome/browser/spellcheck_worditerator.cc b/chrome/browser/spellcheck_worditerator.cc
index 7dc5b4f..2ea5879 100644
--- a/chrome/browser/spellcheck_worditerator.cc
+++ b/chrome/browser/spellcheck_worditerator.cc
@@ -10,8 +10,9 @@
 #include "base/basictypes.h"
 #include "base/string_util.h"
 
+#include "third_party/icu38/public/common/unicode/normlzr.h"
+#include "third_party/icu38/public/common/unicode/schriter.h"
 #include "third_party/icu38/public/common/unicode/uchar.h"
-#include "third_party/icu38/public/common/unicode/unorm.h"
 #include "third_party/icu38/public/common/unicode/uscript.h"
 #include "third_party/icu38/public/common/unicode/uset.h"
 #include "third_party/icu38/public/i18n/unicode/ulocdata.h"
@@ -80,6 +81,26 @@ void SpellcheckCharAttribute::SetDefaultLanguage(const std::wstring& language) {
       UChar32 character = uset_charAt(exemplar_set, i);
       SetWordScript(GetScriptCode(character), true);
     }
+
+    // Many languages use combining characters to input their characters from
+    // keyboards. On the other hand, this exemplar set does not always include
+    // combining characters for such languages.
+    // To treat such combining characters as word characters, we decompose
+    // this exemplar set and treat the decomposed characters as word characters.
+    UnicodeString composed;
+    for (int i = 0; i < length; ++i)
+      composed.append(uset_charAt(exemplar_set, i));
+
+    UnicodeString decomposed;
+    Normalizer::decompose(composed, FALSE, 0, decomposed, status);
+    if (U_SUCCESS(status)) {
+      StringCharacterIterator iterator(decomposed);
+      UChar32 character = iterator.first32();
+      while (character != CharacterIterator::DONE) {
+        SetWordScript(GetScriptCode(character), true);
+        character = iterator.next32();
+      }
+    }
   }
   uset_close(exemplar_set);
 }
@@ -246,25 +267,12 @@ bool SpellcheckWordIterator::Normalize(int input_start,
   // does not only write NFKD and NFKC can compose ligatures into their ASCII
   // alternatives, but also write NFKC keeps accents of characters.
   // Therefore, NFKC seems to be the best option for hunspell.
-  // To use NKFC for normalization, the length of the output string is mostly
-  // equal to the one of the input string. (One exception is ligatures.)
-  // To avoid the unorm_normalize() function from being called always twice,
-  // we temporarily allocate |input_length| + 1 characters to the output string
-  // and call the function with it. We re-allocate the output string
-  // only if it cannot store the normalized string, i.e. the output string is
-  // longer than the input one.
-  const char16* input_string = &word_[input_start];
-  UErrorCode error_code = U_ZERO_ERROR;
-  int output_length = input_length + 1;
-  char16* output_buffer = WriteInto(output_string, output_length);
-  output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
-                                  output_buffer, output_length, &error_code);
-  if (error_code == U_BUFFER_OVERFLOW_ERROR) {
-    error_code = U_ZERO_ERROR;
-    output_buffer = WriteInto(output_string, ++output_length);
-    output_length = unorm_normalize(input_string, input_length, UNORM_NFKC, 0,
-                                    output_buffer, output_length, &error_code);
-  }
-  return (error_code == U_ZERO_ERROR);
+  UnicodeString input(FALSE, &word_[input_start], input_length);
+  UErrorCode status = U_ZERO_ERROR;
+  UnicodeString output;
+  Normalizer::normalize(input, UNORM_NFKC, 0, output, status);
+  if (U_SUCCESS(status))
+    output_string->assign(output.getTerminatedBuffer());
+  return (status == U_ZERO_ERROR);
 }
author	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-02-20 09:10:03 +0000
committer	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-02-20 09:10:03 +0000
commit	eba18c8ab6c52e1673f65b6d82ee5559e4556f9a (patch)
tree	fb9f021426196a4f3b3671a2017580c9abd17fc7
parent	6448e5aac7ebef9627a330d9c66c3dfb24517ff3 (diff)
download	chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.zip chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.tar.gz chromium_src-eba18c8ab6c52e1673f65b6d82ee5559e4556f9a.tar.bz2