summaryrefslogtreecommitdiffstats
path: root/third_party
diff options
context:
space:
mode:
author: jcivelli@chromium.org <jcivelli@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-02 18:40:08 +0000
committer: jcivelli@chromium.org <jcivelli@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-02 18:40:08 +0000
commit: a955868874c52f54e5ad1431cb0d377865257377 (patch)
tree: a66f7cdaae9bb44714b8d16cf980b324b6069d15 /third_party
parent: 969a777eb646c394744e504f943778f34e624694 (diff)
download: chromium_src-a955868874c52f54e5ad1431cb0d377865257377.zip
chromium_src-a955868874c52f54e5ad1431cb0d377865257377.tar.gz
chromium_src-a955868874c52f54e5ad1431cb0d377865257377.tar.bz2
Fix language detection with short text pages.
The CLD wrapper function now returns the number of bytes used for language detection, and the render view uses that count to ignore detection for short texts. BUG=45156 TEST=Visit www.voila.fr, an infobar should show reporting the page is in French. Visit http://jdvhotels.com/calendar.html?datetime=1274857200000&id=0, no translate infobar should show. Review URL: http://codereview.chromium.org/3038018 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@54568 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party')
-rw-r--r-- third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc 8
-rw-r--r-- third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h 2
2 files changed, 6 insertions, 4 deletions
diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
index 0430cd4..9da01f7 100644
--- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
+++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
@@ -37,7 +37,7 @@ Language DetectLanguageOfUnicodeText(
const CompactLangDet::DetectionTables* detection_tables,
const UChar* text, bool is_plain_text,
bool* is_reliable, int* num_languages,
- int* error_code) {
+ int* error_code, int* text_bytes) {
if (!text || !num_languages)
return NUM_LANGUAGES;
// Normalize text to NFC, lowercase and convert to UTF-8.
@@ -50,7 +50,7 @@ Language DetectLanguageOfUnicodeText(
UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
};
int percent3[3] = { 0, 0, 0 };
- int text_bytes = 0;
+ int text_bytes_tmp = 0;
// We ignore return value here due to the problem described in bug 1800161.
// For example, translate.google.com was detected as Indonesian. It happened
// due to the heuristic in CLD, which ignores English as a top language
@@ -62,11 +62,13 @@ Language DetectLanguageOfUnicodeText(
utf8_encoded.c_str(),
utf8_encoded.length(),
is_plain_text, language3, percent3,
- &text_bytes, is_reliable);
+ &text_bytes_tmp, is_reliable);
// Calculate the number of languages detected in more than 20% of the text.
const int kMinTextPercentToCountLanguage = 20;
*num_languages = 0;
+ if (text_bytes)
+ *text_bytes = text_bytes_tmp;
COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
language3_and_percent3_should_be_of_the_same_size);
for (int i = 0; i < arraysize(language3); ++i) {
diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
index f79bad8..2bd4127 100644
--- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
+++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
@@ -34,7 +34,7 @@ Language DetectLanguageOfUnicodeText(
const CompactLangDet::DetectionTables* detection_tables,
const UChar* text, bool is_plain_text,
bool* is_reliable, int* num_languages,
- int* error_code);
+ int* error_code, int* text_bytes);
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_