summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorjcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-01-20 23:00:20 +0000
committerjcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2010-01-20 23:00:20 +0000
commite3ad14ec5b929277ad6546e18b095de66348d6f3 (patch)
tree28e86d508e2f98d9540fb4e5b344d0f60d62f4f2
parent13ce1416712f6d7385d2485c134916c58b45d30c (diff)
downloadchromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.zip
chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.tar.gz
chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.tar.bz2
Changing the text normalization in the CLD to use the ICU library
instead of the Windows LCMapString API that the sanbox is blocking. BUG=32648 TEST=Use the Translate extension on XP. The renderer should not crash. TBR=jshin Review URL: http://codereview.chromium.org/545137 git-svn-id: svn://svn.chromium.org/chrome/branches/249/src@36694 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc149
-rw-r--r--third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h15
-rw-r--r--third_party/cld/cld.gyp1
3 files changed, 27 insertions, 138 deletions
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
index 85dae05..64d604f 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
@@ -7,91 +7,34 @@
#include <tchar.h>
#include <windows.h>
+#include <string>
#include <vector> // to compile bar/common/component.h
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h"
-std::string NormalizeText(const WCHAR* text,
- int* num_languages,
- DWORD* error_code) {
- if (!text || !num_languages) {
- if (error_code)
- *error_code = ERROR_INVALID_PARAMETER;
+#include "unicode/normlzr.h"
+#include "unicode/unistr.h"
+#include "unicode/ustring.h"
+
+std::string NormalizeText(const UChar* text) {
+ // To avoid a copy, use the read-only aliasing ctor.
+ icu::UnicodeString source(1, text, -1);
+ icu::UnicodeString normalized;
+ UErrorCode status = U_ZERO_ERROR;
+ icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status);
+ if (U_FAILURE(status))
return std::string();
- }
-
- // Normalize text here. We do not check the return value here since there
- // is no meaningful recovery we can do in case of failure anyway.
- // Since the vast majority of texts on the Internet is already normalized
- // and languages which require normalization are easy to recognize by CLD
- // anyway, we'll benefit more from trying to detect language in non-normalized
- // text (and, with some probability, fail to recognize it) than to give up
- // right away and return the unknown language here.
- NormalizedUnicodeText nomalized_text;
- nomalized_text.Normalize(NormalizationC, text);
-
- // Determine the size of the buffer required to store a lowercased text.
- int lowercase_text_size =
- ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
- nomalized_text.get(), -1,
- NULL, 0);
- if (!lowercase_text_size) {
- if (error_code)
- *error_code = ::GetLastError();
- return std::string();
- }
-
- scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]);
- if (!lowercase_text.get())
- return std::string();
-
- // Covert text to lowercase.
- int lowercasing_result =
- ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING,
- nomalized_text.get(), -1,
- lowercase_text.get(), lowercase_text_size);
- if (!lowercasing_result) {
- if (error_code)
- *error_code = ::GetLastError();
- return std::string();
- }
-
- // Determine the size of the buffer required to covert text to UTF-8.
- int utf8_encoded_buffer_size =
- ::WideCharToMultiByte(CP_UTF8, 0,
- lowercase_text.get(), -1,
- NULL, 0,
- NULL, NULL);
- if (!utf8_encoded_buffer_size) {
- if (error_code)
- *error_code = ::GetLastError();
- return std::string();
- }
-
- scoped_array<char> utf8_encoded_buffer(new char[utf8_encoded_buffer_size]);
-
- // Convert text to UTF-8.
- int utf8_encoding_result =
- ::WideCharToMultiByte(CP_UTF8, 0,
- lowercase_text.get(), -1,
- utf8_encoded_buffer.get(),
- utf8_encoded_buffer_size,
- NULL, NULL);
- if (!utf8_encoding_result) {
- if (error_code)
- *error_code = ::GetLastError();
- return std::string();
- }
-
- if (error_code)
- *error_code = 0;
-
- return std::string(utf8_encoded_buffer.get());
+ normalized.toLower();
+ std::string utf8;
+ // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough
+ // for most web pages) and does pre-flighting followed by malloc for larger
+ // strings. We have to switch to obtaining the buffer with the maximum size
+ // (UTF-16 length * 3) without pre-flighting if necessary.
+ return normalized.toUTF8String(utf8);
}
-
// Detects a language of the UTF-16 encoded zero-terminated text.
// Returns: Language enum.
// TODO : make it reuse already allocated buffers to avoid excessive
@@ -102,13 +45,13 @@ std::string NormalizeText(const WCHAR* text,
Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
bool* is_reliable, int* num_languages,
DWORD* error_code) {
- // Normalize text.
- std::string utf8_encoded_string_buffer = NormalizeText(text, num_languages,
- error_code);
- if (utf8_encoded_string_buffer.empty())
+ if (!text || !num_languages)
return NUM_LANGUAGES;
- int utf8_encoded_buffer_size = utf8_encoded_string_buffer.length();
+ // Normalize text to NFC, lowercase and convert to UTF-8.
+ std::string utf8_encoded = NormalizeText(text);
+ if (utf8_encoded.empty())
+ return NUM_LANGUAGES;
// Engage core CLD library language detection.
Language language3[3] = {
@@ -123,8 +66,8 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
// See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
// language3 array is always set according to the detection results and
// is not affected by this heuristic.
- CompactLangDet::DetectLanguageSummary(utf8_encoded_string_buffer.c_str(),
- utf8_encoded_buffer_size,
+ CompactLangDet::DetectLanguageSummary(utf8_encoded.c_str(),
+ utf8_encoded.length(),
is_plain_text, language3, percent3,
&text_bytes, is_reliable);
@@ -143,43 +86,3 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
return language3[0];
}
-void DetectLanguageSummaryOfUnicodeText(const WCHAR* text,
- bool is_plain_text,
- Language language[3],
- int percent[3],
- int* text_bytes,
- bool* is_reliable) {
- int num_languages;
- DWORD error_code;
- std::string utf8_encoded_string_buffer = NormalizeText(text, &num_languages,
- &error_code);
-
- // Normalize text.
- if (utf8_encoded_string_buffer.empty())
- return;
-
- int utf8_encoded_buffer_size = utf8_encoded_string_buffer.length();
-
- // Engage core CLD library language detection.
- language[0] = language[1] = language[2] = UNKNOWN_LANGUAGE;
- percent[0] = 100;
- percent[1] = percent[2] = 0;
-
- if (utf8_encoded_string_buffer.empty()) {
- *is_reliable = false;
- *text_bytes = 0;
- return;
- }
-
- // We ignore return value here due to the problem described in bug 1800161.
- // For example, translate.google.com was detected as Indonesian. It happened
- // due to the heuristic in CLD, which ignores English as a top language
- // in the presence of another reliably detected language.
- // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
- // language3 array is always set according to the detection results and
- // is not affected by this heuristic.
- CompactLangDet::DetectLanguageSummary(utf8_encoded_string_buffer.c_str(),
- utf8_encoded_buffer_size,
- is_plain_text, language, percent,
- text_bytes, is_reliable);
-}
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
index 5759dcf..354fbf9 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
@@ -30,19 +30,4 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
bool* is_reliable, int* num_languages,
DWORD* error_code);
-// Detects the top 3 languages in the UTF-16 encoded zero-terminated text.
-// [in] text - UTF-16 encoded text.
-// [in] is_plain_text - true if plain text, false otherwise (e.g. HTML).
-// [out] language[3] - Top 3 languages (default: UNKNOWN_LANGUAGE)
-// [out] percent[3] - Percentages of the languages (default: language3[0] = 100.
-// language3[1] = language3[2] = 0).
-// [out] is_reliable - true if reliable.
-// See CompactLangDet::DetectLanguageSummary() for more information.
-void DetectLanguageSummaryOfUnicodeText(const WCHAR* text,
- bool is_plain_text,
- Language language[3],
- int percent[3],
- int* text_bytes,
- bool* is_reliable);
-
#endif // BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_
diff --git a/third_party/cld/cld.gyp b/third_party/cld/cld.gyp
index eb8b695..5e62ff3 100644
--- a/third_party/cld/cld.gyp
+++ b/third_party/cld/cld.gyp
@@ -11,6 +11,7 @@
'type': '<(library)',
'dependencies': [
'../../base/base.gyp:base',
+ '../icu/icu.gyp:icuuc',
],
'msvs_disabled_warnings': [4005, 4006, 4018, 4244, 4309, 4800],
'defines': [