diff options
author | jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-01-20 23:00:20 +0000 |
---|---|---|
committer | jcampan@chromium.org <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-01-20 23:00:20 +0000 |
commit | e3ad14ec5b929277ad6546e18b095de66348d6f3 (patch) | |
tree | 28e86d508e2f98d9540fb4e5b344d0f60d62f4f2 | |
parent | 13ce1416712f6d7385d2485c134916c58b45d30c (diff) | |
download | chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.zip chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.tar.gz chromium_src-e3ad14ec5b929277ad6546e18b095de66348d6f3.tar.bz2 |
Changing the text normalization in the CLD to use the ICU library
instead of the Windows LCMapString API that the sandbox is blocking.
BUG=32648
TEST=Use the Translate extension on XP. The renderer should not crash.
TBR=jshin
Review URL: http://codereview.chromium.org/545137
git-svn-id: svn://svn.chromium.org/chrome/branches/249/src@36694 0039d316-1c4b-4281-b951-d872f2087c98
3 files changed, 27 insertions, 138 deletions
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc index 85dae05..64d604f 100644 --- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc +++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc @@ -7,91 +7,34 @@ #include <tchar.h> #include <windows.h> +#include <string> #include <vector> // to compile bar/common/component.h #include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h" #include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h" #include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h" -std::string NormalizeText(const WCHAR* text, - int* num_languages, - DWORD* error_code) { - if (!text || !num_languages) { - if (error_code) - *error_code = ERROR_INVALID_PARAMETER; +#include "unicode/normlzr.h" +#include "unicode/unistr.h" +#include "unicode/ustring.h" + +std::string NormalizeText(const UChar* text) { + // To avoid a copy, use the read-only aliasing ctor. + icu::UnicodeString source(1, text, -1); + icu::UnicodeString normalized; + UErrorCode status = U_ZERO_ERROR; + icu::Normalizer::normalize(source, UNORM_NFC, 0, normalized, status); + if (U_FAILURE(status)) return std::string(); - } - - // Normalize text here. We do not check the return value here since there - // is no meaningful recovery we can do in case of failure anyway. - // Since the vast majority of texts on the Internet is already normalized - // and languages which require normalization are easy to recognize by CLD - // anyway, we'll benefit more from trying to detect language in non-normalized - // text (and, with some probability, fail to recognize it) than to give up - // right away and return the unknown language here. 
- NormalizedUnicodeText nomalized_text; - nomalized_text.Normalize(NormalizationC, text); - - // Determine the size of the buffer required to store a lowercased text. - int lowercase_text_size = - ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING, - nomalized_text.get(), -1, - NULL, 0); - if (!lowercase_text_size) { - if (error_code) - *error_code = ::GetLastError(); - return std::string(); - } - - scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]); - if (!lowercase_text.get()) - return std::string(); - - // Covert text to lowercase. - int lowercasing_result = - ::LCMapString(NULL, LCMAP_LOWERCASE | LCMAP_LINGUISTIC_CASING, - nomalized_text.get(), -1, - lowercase_text.get(), lowercase_text_size); - if (!lowercasing_result) { - if (error_code) - *error_code = ::GetLastError(); - return std::string(); - } - - // Determine the size of the buffer required to covert text to UTF-8. - int utf8_encoded_buffer_size = - ::WideCharToMultiByte(CP_UTF8, 0, - lowercase_text.get(), -1, - NULL, 0, - NULL, NULL); - if (!utf8_encoded_buffer_size) { - if (error_code) - *error_code = ::GetLastError(); - return std::string(); - } - - scoped_array<char> utf8_encoded_buffer(new char[utf8_encoded_buffer_size]); - - // Convert text to UTF-8. - int utf8_encoding_result = - ::WideCharToMultiByte(CP_UTF8, 0, - lowercase_text.get(), -1, - utf8_encoded_buffer.get(), - utf8_encoded_buffer_size, - NULL, NULL); - if (!utf8_encoding_result) { - if (error_code) - *error_code = ::GetLastError(); - return std::string(); - } - - if (error_code) - *error_code = 0; - - return std::string(utf8_encoded_buffer.get()); + normalized.toLower(); + std::string utf8; + // Internally, toUTF8String uses a 1kB stack buffer (which is not large enough + // for most web pages) and does pre-flighting followed by malloc for larger + // strings. We have to switch to obtaining the buffer with the maximum size + // (UTF-16 length * 3) without pre-flighting if necessary. 
+ return normalized.toUTF8String(utf8); } - // Detects a language of the UTF-16 encoded zero-terminated text. // Returns: Language enum. // TODO : make it reuse already allocated buffers to avoid excessive @@ -102,13 +45,13 @@ std::string NormalizeText(const WCHAR* text, Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text, bool* is_reliable, int* num_languages, DWORD* error_code) { - // Normalize text. - std::string utf8_encoded_string_buffer = NormalizeText(text, num_languages, - error_code); - if (utf8_encoded_string_buffer.empty()) + if (!text || !num_languages) return NUM_LANGUAGES; - int utf8_encoded_buffer_size = utf8_encoded_string_buffer.length(); + // Normalize text to NFC, lowercase and convert to UTF-8. + std::string utf8_encoded = NormalizeText(text); + if (utf8_encoded.empty()) + return NUM_LANGUAGES; // Engage core CLD library language detection. Language language3[3] = { @@ -123,8 +66,8 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text, // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function. // language3 array is always set according to the detection results and // is not affected by this heuristic. - CompactLangDet::DetectLanguageSummary(utf8_encoded_string_buffer.c_str(), - utf8_encoded_buffer_size, + CompactLangDet::DetectLanguageSummary(utf8_encoded.c_str(), + utf8_encoded.length(), is_plain_text, language3, percent3, &text_bytes, is_reliable); @@ -143,43 +86,3 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text, return language3[0]; } -void DetectLanguageSummaryOfUnicodeText(const WCHAR* text, - bool is_plain_text, - Language language[3], - int percent[3], - int* text_bytes, - bool* is_reliable) { - int num_languages; - DWORD error_code; - std::string utf8_encoded_string_buffer = NormalizeText(text, &num_languages, - &error_code); - - // Normalize text. 
- if (utf8_encoded_string_buffer.empty()) - return; - - int utf8_encoded_buffer_size = utf8_encoded_string_buffer.length(); - - // Engage core CLD library language detection. - language[0] = language[1] = language[2] = UNKNOWN_LANGUAGE; - percent[0] = 100; - percent[1] = percent[2] = 0; - - if (utf8_encoded_string_buffer.empty()) { - *is_reliable = false; - *text_bytes = 0; - return; - } - - // We ignore return value here due to the problem described in bug 1800161. - // For example, translate.google.com was detected as Indonesian. It happened - // due to the heuristic in CLD, which ignores English as a top language - // in the presence of another reliably detected language. - // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function. - // language3 array is always set according to the detection results and - // is not affected by this heuristic. - CompactLangDet::DetectLanguageSummary(utf8_encoded_string_buffer.c_str(), - utf8_encoded_buffer_size, - is_plain_text, language, percent, - text_bytes, is_reliable); -} diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h index 5759dcf..354fbf9 100644 --- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h +++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h @@ -30,19 +30,4 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text, bool* is_reliable, int* num_languages, DWORD* error_code); -// Detects the top 3 languages in the UTF-16 encoded zero-terminated text. -// [in] text - UTF-16 encoded text. -// [in] is_plain_text - true if plain text, false otherwise (e.g. HTML). -// [out] language[3] - Top 3 languages (default: UNKNOWN_LANGUAGE) -// [out] percent[3] - Percentages of the languages (default: language3[0] = 100. -// language3[1] = language3[2] = 0). 
-// [out] is_reliable - true if reliable. -// See CompactLangDet::DetectLanguageSummary() for more information. -void DetectLanguageSummaryOfUnicodeText(const WCHAR* text, - bool is_plain_text, - Language language[3], - int percent[3], - int* text_bytes, - bool* is_reliable); - #endif // BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_ diff --git a/third_party/cld/cld.gyp b/third_party/cld/cld.gyp index eb8b695..5e62ff3 100644 --- a/third_party/cld/cld.gyp +++ b/third_party/cld/cld.gyp @@ -11,6 +11,7 @@ 'type': '<(library)', 'dependencies': [ '../../base/base.gyp:base', + '../icu/icu.gyp:icuuc', ], 'msvs_disabled_warnings': [4005, 4006, 4018, 4244, 4309, 4800], 'defines': [ |