diff options
author | jcivelli@chromium.org <jcivelli@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-02 18:40:08 +0000 |
---|---|---|
committer | jcivelli@chromium.org <jcivelli@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2010-08-02 18:40:08 +0000 |
commit | a955868874c52f54e5ad1431cb0d377865257377 (patch) | |
tree | a66f7cdaae9bb44714b8d16cf980b324b6069d15 /third_party | |
parent | 969a777eb646c394744e504f943778f34e624694 (diff) | |
download | chromium_src-a955868874c52f54e5ad1431cb0d377865257377.zip chromium_src-a955868874c52f54e5ad1431cb0d377865257377.tar.gz chromium_src-a955868874c52f54e5ad1431cb0d377865257377.tar.bz2 |
Fix language detection with short text pages.
Changed the CLD wrapper function to return the number of bytes used for language detection, and used that value in the render view to ignore detection for short texts.
BUG=45156
TEST=Visit www.voila.fr, an infobar should show reporting the page is in French.
Visit http://jdvhotels.com/calendar.html?datetime=1274857200000&id=0, no translate infobar should show.
Review URL: http://codereview.chromium.org/3038018
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@54568 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party')
-rw-r--r-- | third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc | 8 | ||||
-rw-r--r-- | third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h | 2 |
2 files changed, 6 insertions, 4 deletions
diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc index 0430cd4..9da01f7 100644 --- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc @@ -37,7 +37,7 @@ Language DetectLanguageOfUnicodeText( const CompactLangDet::DetectionTables* detection_tables, const UChar* text, bool is_plain_text, bool* is_reliable, int* num_languages, - int* error_code) { + int* error_code, int* text_bytes) { if (!text || !num_languages) return NUM_LANGUAGES; // Normalize text to NFC, lowercase and convert to UTF-8. @@ -50,7 +50,7 @@ Language DetectLanguageOfUnicodeText( UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE }; int percent3[3] = { 0, 0, 0 }; - int text_bytes = 0; + int text_bytes_tmp = 0; // We ignore return value here due to the problem described in bug 1800161. // For example, translate.google.com was detected as Indonesian. It happened // due to the heuristic in CLD, which ignores English as a top language @@ -62,11 +62,13 @@ Language DetectLanguageOfUnicodeText( utf8_encoded.c_str(), utf8_encoded.length(), is_plain_text, language3, percent3, - &text_bytes, is_reliable); + &text_bytes_tmp, is_reliable); // Calcualte a number of languages detected in more than 20% of the text. 
const int kMinTextPercentToCountLanguage = 20; *num_languages = 0; + if (text_bytes) + *text_bytes = text_bytes_tmp; COMPILE_ASSERT(arraysize(language3) == arraysize(percent3), language3_and_percent3_should_be_of_the_same_size); for (int i = 0; i < arraysize(language3); ++i) { diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h index f79bad8..2bd4127 100644 --- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h +++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h @@ -34,7 +34,7 @@ Language DetectLanguageOfUnicodeText( const CompactLangDet::DetectionTables* detection_tables, const UChar* text, bool is_plain_text, bool* is_reliable, int* num_languages, - int* error_code); + int* error_code, int* text_bytes); #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_ |