summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- chrome/renderer/render_view.cc 13
-rw-r--r-- third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc 8
-rw-r--r-- third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h 2
3 files changed, 12 insertions, 11 deletions
diff --git a/chrome/renderer/render_view.cc b/chrome/renderer/render_view.cc
index c2db5d5..5bdf140 100644
--- a/chrome/renderer/render_view.cc
+++ b/chrome/renderer/render_view.cc
@@ -371,18 +371,17 @@ static bool CrossesExtensionExtents(WebFrame* frame, const GURL& new_url) {
// Note this only works on Windows at this time. It always returns 'unknown'
// on other platforms.
static std::string DetermineTextLanguage(const string16& text) {
- // Text with less than 100 bytes will probably not provide good results.
- // Report it as unknown language.
- if (text.length() < 100)
- return chrome::kUnknownLanguageCode;
-
std::string language = chrome::kUnknownLanguageCode;
int num_languages = 0;
+ int text_bytes = 0;
bool is_reliable = false;
Language cld_language =
DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
- &num_languages, NULL);
- if (is_reliable && cld_language != NUM_LANGUAGES &&
+ &num_languages, NULL, &text_bytes);
+ // We don't trust the result if the CLD reports that the detection is not
+ // reliable, or if the actual text used to detect the language was less than
+ // 100 bytes (short texts can often lead to wrong results).
+ if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES &&
cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
// We should not use LanguageCode_ISO_639_1 because it does not cover all
// the languages CLD can detect. As a result, it'll return the invalid
diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
index 0430cd4..9da01f7 100644
--- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
+++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.cc
@@ -37,7 +37,7 @@ Language DetectLanguageOfUnicodeText(
const CompactLangDet::DetectionTables* detection_tables,
const UChar* text, bool is_plain_text,
bool* is_reliable, int* num_languages,
- int* error_code) {
+ int* error_code, int* text_bytes) {
if (!text || !num_languages)
return NUM_LANGUAGES;
// Normalize text to NFC, lowercase and convert to UTF-8.
@@ -50,7 +50,7 @@ Language DetectLanguageOfUnicodeText(
UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
};
int percent3[3] = { 0, 0, 0 };
- int text_bytes = 0;
+ int text_bytes_tmp = 0;
// We ignore return value here due to the problem described in bug 1800161.
// For example, translate.google.com was detected as Indonesian. It happened
// due to the heuristic in CLD, which ignores English as a top language
@@ -62,11 +62,13 @@ Language DetectLanguageOfUnicodeText(
utf8_encoded.c_str(),
utf8_encoded.length(),
is_plain_text, language3, percent3,
- &text_bytes, is_reliable);
+ &text_bytes_tmp, is_reliable);
// Calcualte a number of languages detected in more than 20% of the text.
const int kMinTextPercentToCountLanguage = 20;
*num_languages = 0;
+ if (text_bytes)
+ *text_bytes = text_bytes_tmp;
COMPILE_ASSERT(arraysize(language3) == arraysize(percent3),
language3_and_percent3_should_be_of_the_same_size);
for (int i = 0; i < arraysize(language3); ++i) {
diff --git a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
index f79bad8..2bd4127 100644
--- a/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
+++ b/third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h
@@ -34,7 +34,7 @@ Language DetectLanguageOfUnicodeText(
const CompactLangDet::DetectionTables* detection_tables,
const UChar* text, bool is_plain_text,
bool* is_reliable, int* num_languages,
- int* error_code);
+ int* error_code, int* text_bytes);
#endif // ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_