3 files changed, 12 insertions, 11 deletions
diff --git a/chrome/renderer/render_view.cc b/chrome/renderer/render_view.cc
index c2db5d5..5bdf140 100644
--- a/chrome/renderer/render_view.cc
+++ b/chrome/renderer/render_view.cc
@@ -371,18 +371,17 @@ static bool CrossesExtensionExtents(WebFrame* frame, const GURL& new_url) {
 // Note this only works on Windows at this time.  It always returns 'unknown'
 // on other platforms.
 static std::string DetermineTextLanguage(const string16& text) {
-  // Text with less than 100 bytes will probably not provide good results.
-  // Report it as unknown language.
-  if (text.length() < 100)
-    return chrome::kUnknownLanguageCode;
-
   std::string language = chrome::kUnknownLanguageCode;
   int num_languages = 0;
+  int text_bytes = 0;
   bool is_reliable = false;
   Language cld_language =
       DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
-                                  &num_languages, NULL);
-  if (is_reliable && cld_language != NUM_LANGUAGES &&
+                                  &num_languages, NULL, &text_bytes);
+  // We don't trust the result if the CLD reports that the detection is not
+  // reliable, or if the actual text used to detect the language was less than
+  // 100 bytes (short texts can often lead to wrong results).
+  if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES &&
       cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
     // We should not use LanguageCode_ISO_639_1 because it does not cover all
     // the languages CLD can detect. As a result, it'll return the invalid
diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
index 0430cd4..9da01f7 100644
--- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
+++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
@@ -37,7 +37,7 @@ Language DetectLanguageOfUnicodeText(
     const CompactLangDet::DetectionTables* detection_tables,
     const UChar* text, bool is_plain_text,
     bool* is_reliable, int* num_languages,
-    int* error_code) {
+    int* error_code, int* text_bytes) {
   if (!text || !num_languages)
     return NUM_LANGUAGES;
   // Normalize text to NFC, lowercase and convert to UTF-8.
@@ -50,7 +50,7 @@ Language DetectLanguageOfUnicodeText(
     UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
   };
   int percent3[3] = { 0, 0, 0 };
-  int text_bytes = 0;
+  int text_bytes_tmp = 0;
   // We ignore return value here due to the problem described in bug 1800161.
   // For example, translate.google.com was detected as Indonesian.  It happened
   // due to the heuristic in CLD, which ignores English as a top language
@@ -62,11 +62,13 @@ Language DetectLanguageOfUnicodeText(
                                         utf8_encoded.c_str(),
                                         utf8_encoded.length(),
                                         is_plain_text, language3, percent3,
-                                        &text_bytes, is_reliable);
+                                        &text_bytes_tmp, is_reliable);
 
   // Calcualte a number of languages detected in more than 20% of the text.
   const int kMinTextPercentToCountLanguage = 20;
   *num_languages = 0;
+  if (text_bytes)
+    *text_bytes = text_bytes_tmp;
   COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
                  language3_and_percent3_should_be_of_the_same_size);
   for (int i = 0; i < arraysize(language3); ++i) {
diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
index f79bad8..2bd4127 100644
--- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
+++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
@@ -34,7 +34,7 @@ Language DetectLanguageOfUnicodeText(
     const CompactLangDet::DetectionTables* detection_tables,
     const UChar* text, bool is_plain_text,
     bool* is_reliable, int* num_languages,
-    int* error_code);
+    int* error_code, int* text_bytes);
 
 
 #endif  // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_