Changing the text normalization in the CLD to use the ICU library

instead of the Windows LCMapString API that the sanbox is blocking. BUG=32648 TEST=Use the Translate extension on XP. The renderer should not crash. TBR=jshin Review URL: http://codereview.chromium.org/545137 git-svn-id: svn://svn.chromium.org/chrome/branches/249/src@36694 0039d316-1c4b-4281-b951-d872f2087c98
author: jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-01-20 23:00:20 +0000
committer: jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-01-20 23:00:20 +0000
commit: e3ad14ec5b929277ad6546e18b095de66348d6f3 (patch)
tree: 28e86d508e2f98d9540fb4e5b344d0f60d62f4f2
parent: 13ce1416712f6d7385d2485c134916c58b45d30c (diff)
download: chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.zip
chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.tar.gz
chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.tar.bz2
3 files changed, 27 insertions, 138 deletions
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
index 85dae05..64d604f 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
@@ -7,91 +7,34 @@
 #include <tchar.h>
 #include <windows.h>
 
+#include <string>
 #include <vector>  // to compile bar/common/component.h
 
 #include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"
 #include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h"
 #include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h"
 
-std::string NormalizeText(const WCHAR* text,
-                          int* num_languages,
-                          DWORD* error_code) {
-  if (!text || !num_languages) {
-    if (error_code)
-      *error_code = ERROR_INVALID_PARAMETER;
+#include "unicode/normlzr.h"
+#include "unicode/unistr.h"
+#include "unicode/ustring.h"
+
+std::string NormalizeText(const UChar* text) {
+  // To avoid a copy, use the read-only aliasing ctor.
+  icu::UnicodeString source(1, text, -1);
+  icu::UnicodeString normalized;
+  UErrorCode status = U_ZERO_ERROR;
+  icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
+  if (U_FAILURE(status))
     return std::string();
-  }
-
-  // Normalize text here.  We do not check the return value here since there
-  // is no meaningful recovery we can do in case of failure anyway.
-  // Since the vast majority of texts on the Internet is already normalized
-  // and languages which require normalization are easy to recognize by CLD
-  // anyway, we'll benefit more from trying to detect language in non-normalized
-  // text (and, with some probability, fail to recognize it) than to give up
-  // right away and return the unknown language here.
-  NormalizedUnicodeText nomalized_text;
-  nomalized_text.Normalize(NormalizationC, text);
-
-  // Determine the size of the buffer required to store a lowercased text.
-  int lowercase_text_size =
-      ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
-                    nomalized_text.get(), -1,
-                    NULL, 0);
-  if (!lowercase_text_size) {
-    if (error_code)
-      *error_code = ::GetLastError();
-    return std::string();
-  }
-
-  scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]);
-  if (!lowercase_text.get())
-    return std::string();
-
-  // Covert text to lowercase.
-  int lowercasing_result =
-      ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
-                    nomalized_text.get(), -1,
-                    lowercase_text.get(), lowercase_text_size);
-  if (!lowercasing_result) {
-    if (error_code)
-      *error_code = ::GetLastError();
-    return std::string();
-  }
-
-  // Determine the size of the buffer required to covert text to UTF-8.
-  int utf8_encoded_buffer_size =
-      ::WideCharToMultiByte(CP_UTF8, 0,
-                            lowercase_text.get(), -1,
-                            NULL, 0,
-                            NULL, NULL);
-  if (!utf8_encoded_buffer_size) {
-    if (error_code)
-      *error_code = ::GetLastError();
-    return std::string();
-  }
-
-  scoped_array<char> utf8_encoded_buffer(new char[utf8_encoded_buffer_size]);
-
-  // Convert text to UTF-8.
-  int utf8_encoding_result =
-      ::WideCharToMultiByte(CP_UTF8, 0,
-                            lowercase_text.get(), -1,
-                            utf8_encoded_buffer.get(),
-                            utf8_encoded_buffer_size,
-                            NULL, NULL);
-  if (!utf8_encoding_result) {
-    if (error_code)
-      *error_code = ::GetLastError();
-    return std::string();
-  }
-
-  if (error_code)
-    *error_code = 0;
-
-  return std::string(utf8_encoded_buffer.get());
+  normalized.toLower();
+  std::string utf8;
+  // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
+  // for most web pages) and does pre-flighting followed by malloc for larger
+  // strings. We have to switch to obtaining the buffer with the maximum size
+  // (UTF-16 length * 3) without pre-flighting if necessary.
+  return normalized.toUTF8String(utf8);
 }
 
-
 // Detects a language of the UTF-16 encoded zero-terminated text.
 // Returns: Language enum.
 // TODO : make it reuse already allocated buffers to avoid excessive
@@ -102,13 +45,13 @@ std::string NormalizeText(const WCHAR* text,
 Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
                                      bool* is_reliable, int* num_languages,
                                      DWORD* error_code) {
-  // Normalize text.
-  std::string utf8_encoded_string_buffer = NormalizeText(text, num_languages,
-                                                         error_code);
-  if (utf8_encoded_string_buffer.empty())
+  if (!text || !num_languages)
     return NUM_LANGUAGES;
 
-  int utf8_encoded_buffer_size = utf8_encoded_string_buffer.length();
+  // Normalize text to NFC, lowercase and convert to UTF-8.
+  std::string utf8_encoded = NormalizeText(text);
+  if (utf8_encoded.empty())
+    return NUM_LANGUAGES;
 
   // Engage core CLD library language detection.
   Language language3[3] = {
@@ -123,8 +66,8 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
   // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
   // language3 array is always set according to the detection results and
   // is not affected by this heuristic.
-  CompactLangDet::DetectLanguageSummary(utf8_encoded_string_buffer.c_str(),
-                                        utf8_encoded_buffer_size,
+  CompactLangDet::DetectLanguageSummary(utf8_encoded.c_str(),
+                                        utf8_encoded.length(),
                                         is_plain_text, language3, percent3,
                                         &text_bytes, is_reliable);
 
@@ -143,43 +86,3 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
   return language3[0];
 }
 
-void DetectLanguageSummaryOfUnicodeText(const WCHAR* text,
-                                        bool is_plain_text,
-                                        Language language[3],
-                                        int percent[3],
-                                        int* text_bytes,
-                                        bool* is_reliable) {
-  int num_languages;
-  DWORD error_code;
-  std::string utf8_encoded_string_buffer = NormalizeText(text, &num_languages,
-                                                         &error_code);
-
-  // Normalize text.
-  if (utf8_encoded_string_buffer.empty())
-    return;
-
-  int utf8_encoded_buffer_size = utf8_encoded_string_buffer.length();
-
-  // Engage core CLD library language detection.
-  language[0] = language[1] = language[2] = UNKNOWN_LANGUAGE;
-  percent[0] = 100;
-  percent[1] = percent[2] = 0;
-
-  if (utf8_encoded_string_buffer.empty()) {
-    *is_reliable = false;
-    *text_bytes = 0;
-    return;
-  }
-
-  // We ignore return value here due to the problem described in bug 1800161.
-  // For example, translate.google.com was detected as Indonesian.  It happened
-  // due to the heuristic in CLD, which ignores English as a top language
-  // in the presence of another reliably detected language.
-  // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
-  // language3 array is always set according to the detection results and
-  // is not affected by this heuristic.
-  CompactLangDet::DetectLanguageSummary(utf8_encoded_string_buffer.c_str(),
-                                        utf8_encoded_buffer_size,
-                                        is_plain_text, language, percent,
-                                        text_bytes, is_reliable);
-}
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
index 5759dcf..354fbf9 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
@@ -30,19 +30,4 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
                                      bool* is_reliable, int* num_languages,
                                      DWORD* error_code);
 
-// Detects the top 3 languages in the UTF-16 encoded zero-terminated text.
-// [in] text - UTF-16 encoded text.
-// [in] is_plain_text - true if plain text, false otherwise (e.g. HTML).
-// [out] language[3] - Top 3 languages (default: UNKNOWN_LANGUAGE)
-// [out] percent[3] - Percentages of the languages (default: language3[0] = 100.
-//                  language3[1] = language3[2] = 0).
-// [out] is_reliable - true if reliable.
-// See CompactLangDet::DetectLanguageSummary() for more information.
-void DetectLanguageSummaryOfUnicodeText(const WCHAR* text,
-                                        bool is_plain_text,
-                                        Language language[3],
-                                        int percent[3],
-                                        int* text_bytes,
-                                        bool* is_reliable);
-
 #endif  // BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
diff --git a/third_party/cld/cld.gyp b/third_party/cld/cld.gyp
index eb8b695..5e62ff3 100644
--- a/third_party/cld/cld.gyp
+++ b/third_party/cld/cld.gyp
@@ -11,6 +11,7 @@
           'type': '<(library)',
           'dependencies': [
             '../../base/base.gyp:base',
+            '../icu/icu.gyp:icuuc',
           ],
           'msvs_disabled_warnings': [4005, 4006, 4018, 4244, 4309, 4800],
           'defines': [
author	jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-01-20 23:00:20 +0000
committer	jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-01-20 23:00:20 +0000
commit	e3ad14ec5b929277ad6546e18b095de66348d6f3 (patch)
tree	28e86d508e2f98d9540fb4e5b344d0f60d62f4f2
parent	13ce1416712f6d7385d2485c134916c58b45d30c (diff)
download	chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.zip chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.tar.gz chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.tar.bz2