Fix language detection with short text pages.

Changed the CLD wrapper function to return the number of bytes used for language detection, and used that in the render view to ignore detection results for short texts.

BUG=45156
TEST=Visit www.voila.fr; an infobar should show, reporting that the page is in French. Visit http://jdvhotels.com/calendar.html?datetime=1274857200000&id=0; no translate infobar should show.
Review URL: http://codereview.chromium.org/3038018
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@54568 0039d316-1c4b-4281-b951-d872f2087c98
author: jcivelli@chromium.org <jcivelli@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-02 18:40:08 +0000
committer: jcivelli@chromium.org <jcivelli@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-08-02 18:40:08 +0000
commit: a955868874c52f54e5ad1431cb0d377865257377 (patch)
tree: a66f7cdaae9bb44714b8d16cf980b324b6069d15 /chrome/renderer
parent: 969a777eb646c394744e504f943778f34e624694 (diff)
download: chromium_src-a955868874c52f54e5ad1431cb0d377865257377.zip
chromium_src-a955868874c52f54e5ad1431cb0d377865257377.tar.gz
chromium_src-a955868874c52f54e5ad1431cb0d377865257377.tar.bz2
1 files changed, 6 insertions, 7 deletions
diff --git a/chrome/renderer/render_view.cc b/chrome/renderer/render_view.cc
index c2db5d5..5bdf140 100644
--- a/chrome/renderer/render_view.cc
+++ b/chrome/renderer/render_view.cc
@@ -371,18 +371,17 @@ static bool CrossesExtensionExtents(WebFrame* frame, const GURL& new_url) {
 // Note this only works on Windows at this time.  It always returns 'unknown'
 // on other platforms.
 static std::string DetermineTextLanguage(const string16& text) {
-  // Text with less than 100 bytes will probably not provide good results.
-  // Report it as unknown language.
-  if (text.length() < 100)
-    return chrome::kUnknownLanguageCode;
-
   std::string language = chrome::kUnknownLanguageCode;
   int num_languages = 0;
+  int text_bytes = 0;
   bool is_reliable = false;
   Language cld_language =
       DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
-                                  &num_languages, NULL);
-  if (is_reliable && cld_language != NUM_LANGUAGES &&
+                                  &num_languages, NULL, &text_bytes);
+  // We don't trust the result if the CLD reports that the detection is not
+  // reliable, or if the actual text used to detect the language was less than
+  // 100 bytes (short texts can often lead to wrong results).
+  if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES &&
       cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
     // We should not use LanguageCode_ISO_639_1 because it does not cover all
     // the languages CLD can detect. As a result, it'll return the invalid
author	jcivelli@chromium.org <jcivelli@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-08-02 18:40:08 +0000
committer	jcivelli@chromium.org <jcivelli@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-08-02 18:40:08 +0000
commit	a955868874c52f54e5ad1431cb0d377865257377 (patch)
tree	a66f7cdaae9bb44714b8d16cf980b324b6069d15 /chrome/renderer
parent	969a777eb646c394744e504f943778f34e624694 (diff)
download	chromium_src-a955868874c52f54e5ad1431cb0d377865257377.zip chromium_src-a955868874c52f54e5ad1431cb0d377865257377.tar.gz chromium_src-a955868874c52f54e5ad1431cb0d377865257377.tar.bz2