diff options
-rw-r--r-- | chrome/renderer/render_view.cc | 13 | ||||
-rw-r--r-- | third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc | 8 | ||||
-rw-r--r-- | third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h | 2 |
3 files changed, 12 insertions, 11 deletions
diff --git a/chrome/renderer/render_view.cc b/chrome/renderer/render_view.cc index c2db5d5..5bdf140 100644 --- a/chrome/renderer/render_view.cc +++ b/chrome/renderer/render_view.cc @@ -371,18 +371,17 @@ static bool CrossesExtensionExtents(WebFrame* frame, const GURL& new_url) { // Note this only works on Windows at this time. It always returns 'unknown' // on other platforms. static std::string DetermineTextLanguage(const string16& text) { - // Text with less than 100 bytes will probably not provide good results. - // Report it as unknown language. - if (text.length() < 100) - return chrome::kUnknownLanguageCode; - std::string language = chrome::kUnknownLanguageCode; int num_languages = 0; + int text_bytes = 0; bool is_reliable = false; Language cld_language = DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, - &num_languages, NULL); - if (is_reliable && cld_language != NUM_LANGUAGES && + &num_languages, NULL, &text_bytes); + // We don't trust the result if the CLD reports that the detection is not + // reliable, or if the actual text used to detect the language was less than + // 100 bytes (short texts can often lead to wrong results). + if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES && cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { // We should not use LanguageCode_ISO_639_1 because it does not cover all // the languages CLD can detect. As a result, it'll return the invalid diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc index 0430cd4..9da01f7 100644 --- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc +++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc @@ -37,7 +37,7 @@ Language DetectLanguageOfUnicodeText( const CompactLangDet::DetectionTables* detection_tables, const UChar* text, bool is_plain_text, bool* is_reliable, int* num_languages, - int* error_code) { + int* error_code, int* text_bytes) { if (!text || !num_languages) return NUM_LANGUAGES; // Normalize text to NFC, lowercase and convert to UTF-8. @@ -50,7 +50,7 @@ Language DetectLanguageOfUnicodeText( UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE }; int percent3[3] = { 0, 0, 0 }; - int text_bytes = 0; + int text_bytes_tmp = 0; // We ignore return value here due to the problem described in bug 1800161. // For example, translate.google.com was detected as Indonesian. It happened // due to the heuristic in CLD, which ignores English as a top language @@ -62,11 +62,13 @@ Language DetectLanguageOfUnicodeText( utf8_encoded.c_str(), utf8_encoded.length(), is_plain_text, language3, percent3, - &text_bytes, is_reliable); + &text_bytes_tmp, is_reliable); // Calcualte a number of languages detected in more than 20% of the text. const int kMinTextPercentToCountLanguage = 20; *num_languages = 0; + if (text_bytes) + *text_bytes = text_bytes_tmp; COMPILE_ASSERT(arraysize(language3) == arraysize(percent3), language3_and_percent3_should_be_of_the_same_size); for (int i = 0; i < arraysize(language3); ++i) { diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h index f79bad8..2bd4127 100644 --- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h +++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h @@ -34,7 +34,7 @@ Language DetectLanguageOfUnicodeText( const CompactLangDet::DetectionTables* detection_tables, const UChar* text, bool is_plain_text, bool* is_reliable, int* num_languages, - int* error_code); + int* error_code, int* text_bytes); #endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_ |