summary refs log tree commit diff stats
path: root/chrome
diff options
context:
space:
mode:
author toyoshim@chromium.org <toyoshim@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-02-27 03:31:34 +0000
committer toyoshim@chromium.org <toyoshim@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-02-27 03:31:34 +0000
commit 4e189749a72c06c4802d8c8fbe32c8f5f7fd8905 (patch)
tree fe997113e688f333d09c7800caa29e4e8ada613e /chrome
parent 68c9dd44cdada6a7a5b9976296095fa876e8b1bb (diff)
download chromium_src-4e189749a72c06c4802d8c8fbe32c8f5f7fd8905.zip
chromium_src-4e189749a72c06c4802d8c8fbe32c8f5f7fd8905.tar.gz
chromium_src-4e189749a72c06c4802d8c8fbe32c8f5f7fd8905.tar.bz2
Translate: run CLD even though Content-Language is provided.
If CLD confidently disagrees with the Content-Language value, Chrome should trust neither the CLD result nor the Content-Language value, and should give up on suggesting a translation.

BUG=91205
Review URL: https://chromiumcodereview.appspot.com/12218074

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@184859 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
-rw-r--r-- chrome/renderer/translate_helper.cc 70
-rw-r--r-- chrome/renderer/translate_helper.h 6
-rw-r--r-- chrome/renderer/translate_helper_browsertest.cc 1
-rw-r--r-- chrome/renderer/translate_helper_unittest.cc 14
4 files changed, 68 insertions, 23 deletions
diff --git a/chrome/renderer/translate_helper.cc b/chrome/renderer/translate_helper.cc
index 33a5bb1..c4cece5 100644
--- a/chrome/renderer/translate_helper.cc
+++ b/chrome/renderer/translate_helper.cc
@@ -92,30 +92,10 @@ void TranslateHelper::PageCaptured(const string16& contents) {
// language of the intended audience (a distinction really only
// relevant for things like langauge textbooks). This distinction
// shouldn't affect translation.
- std::string language = document.contentLanguage().utf8();
- CorrectLanguageCodeTypo(&language);
-
- // Convert language code synonym firstly because sometime synonym code is in
- // invalid format, e.g. 'fil'. After the conversion, make invalid code empty
- // string.
- ConvertLanguageCodeSynonym(&language);
- ResetInvalidLanguageCode(&language);
-
-#if defined(ENABLE_LANGUAGE_DETECTION)
- if (language.empty()) {
- base::TimeTicks begin_time = base::TimeTicks::Now();
- language = DetermineTextLanguage(contents);
- UMA_HISTOGRAM_MEDIUM_TIMES("Renderer4.LanguageDetection",
- base::TimeTicks::Now() - begin_time);
- // Apply synonym conversion here because CLD may return 'fil'.
- ConvertLanguageCodeSynonym(&language);
- } else {
- VLOG(9) << "PageLanguageFromMetaTag: " << language;
- }
-#else
+ std::string content_language = document.contentLanguage().utf8();
+ std::string language = DeterminePageLanguage(content_language, contents);
if (language.empty())
return;
-#endif // defined(ENABLE_LANGUAGE_DETECTION)
Send(new ChromeViewHostMsg_TranslateLanguageDetermined(
routing_id(), language, IsPageTranslatable(&document)));
@@ -142,6 +122,9 @@ std::string TranslateHelper::DetermineTextLanguage(const string16& text) {
// We don't trust the result if the CLD reports that the detection is not
// reliable, or if the actual text used to detect the language was less than
// 100 bytes (short texts can often lead to wrong results).
+ // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
+ // the determined language code is correct with 50% confidence. Chrome should
+ // handle the real confidence value to judge.
if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES &&
cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) {
// We should not use LanguageCode_ISO_639_1 because it does not cover all
@@ -279,6 +262,49 @@ void TranslateHelper::ResetInvalidLanguageCode(std::string* code) {
}
// static
+std::string TranslateHelper::DeterminePageLanguage(const std::string& code,
+ const string16& contents) {
+#if defined(ENABLE_LANGUAGE_DETECTION)
+ base::TimeTicks begin_time = base::TimeTicks::Now();
+ std::string cld_language = DetermineTextLanguage(contents);
+ UMA_HISTOGRAM_MEDIUM_TIMES("Renderer4.LanguageDetection",
+ base::TimeTicks::Now() - begin_time);
+ ConvertLanguageCodeSynonym(&cld_language);
+ VLOG(9) << "CLD determined language code: " << cld_language;
+
+ // If |code| is empty, just use CLD result even though it might be
+ // chrome::kUnknownLanguageCode.
+ if (code.empty())
+ return cld_language;
+#endif // defined(ENABLE_LANGUAGE_DETECTION)
+
+ // Correct well-known format errors.
+ std::string language = code;
+ CorrectLanguageCodeTypo(&language);
+
+ // Convert language code synonym firstly because sometime synonym code is in
+ // invalid format, e.g. 'fil'. After validation, such a 3 characters language
+ // gets converted to an empty string.
+ ConvertLanguageCodeSynonym(&language);
+ ResetInvalidLanguageCode(&language);
+ VLOG(9) << "Content-Language based language code: " << language;
+
+#if defined(ENABLE_LANGUAGE_DETECTION)
+ if (cld_language != chrome::kUnknownLanguageCode &&
+ cld_language != language) {
+ // Content-Language value might be wrong because CLD says that this page
+ // is written in another language with confidence.
+ // In this case, Chrome doesn't rely on any of the language codes, and
+ // gives up suggesting a translation.
+ VLOG(9) << "CLD disagreed with the Content-Language value with confidence.";
+ return std::string(chrome::kUnknownLanguageCode);
+ }
+#endif // defined(ENABLE_LANGUAGE_DETECTION)
+
+ return language;
+}
+
+// static
bool TranslateHelper::IsPageTranslatable(WebDocument* document) {
std::vector<WebElement> meta_elements;
webkit_glue::GetMetaElementsWithAttribute(document,
diff --git a/chrome/renderer/translate_helper.h b/chrome/renderer/translate_helper.h
index 1279b07..22b8098 100644
--- a/chrome/renderer/translate_helper.h
+++ b/chrome/renderer/translate_helper.h
@@ -69,6 +69,8 @@ class TranslateHelper : public content::RenderViewObserver {
FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, LanguageCodeTypoCorrection);
FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, LanguageCodeSynonyms);
FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, ResetInvalidLanguageCode);
+ FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest,
+ CLDDisagreeWithWrongLanguageCode);
// Correct language code if it contains well-known mistakes.
static void CorrectLanguageCodeTypo(std::string* code);
@@ -79,6 +81,10 @@ class TranslateHelper : public content::RenderViewObserver {
// Reset language code if the specified string is apparently invalid.
static void ResetInvalidLanguageCode(std::string* code);
+ // Determine content page language from Content-Language code and contents.
+ static std::string DeterminePageLanguage(const std::string& code,
+ const string16& contents);
+
// Returns whether the page associated with |document| is a candidate for
// translation. Some pages can explictly specify (via a meta-tag) that they
// should not be translated.
diff --git a/chrome/renderer/translate_helper_browsertest.cc b/chrome/renderer/translate_helper_browsertest.cc
index 0ad7820..bee6b83 100644
--- a/chrome/renderer/translate_helper_browsertest.cc
+++ b/chrome/renderer/translate_helper_browsertest.cc
@@ -414,7 +414,6 @@ TEST_F(ChromeRenderViewTest, LanguageCommonMistakesAreCorrected) {
render_thread_->sink().ClearMessages();
}
-
// Tests that a back navigation gets a translate language message.
TEST_F(ChromeRenderViewTest, BackToTranslatablePage) {
SendContentStateImmediately();
diff --git a/chrome/renderer/translate_helper_unittest.cc b/chrome/renderer/translate_helper_unittest.cc
index 8e5bb82..c04e108 100644
--- a/chrome/renderer/translate_helper_unittest.cc
+++ b/chrome/renderer/translate_helper_unittest.cc
@@ -4,6 +4,8 @@
#include "chrome/renderer/translate_helper.h"
+#include "base/utf_string_conversions.h"
+#include "chrome/common/chrome_constants.h"
#include "testing/gtest/include/gtest/gtest.h"
typedef testing::Test TranslateHelperTest;
@@ -77,3 +79,15 @@ TEST_F(TranslateHelperTest, ResetInvalidLanguageCode) {
EXPECT_TRUE(language.empty());
}
+// Tests that the language meta tag providing wrong information is ignored by
+// TranslateHelper due to disagreement between meta tag and CLD.
+TEST_F(TranslateHelperTest, CLDDisagreeWithWrongLanguageCode) {
+ string16 contents = ASCIIToUTF16(
+ "<html><head><meta http-equiv='Content-Language' content='ja'></head>"
+ "<body>This is a page apparently written in English. Even though "
+ "content-language is provided, the value will be ignored if the value "
+ "is suspicious.</body></html>");
+ std::string language =
+ TranslateHelper::DeterminePageLanguage(std::string("ja"), contents);
+ EXPECT_EQ(chrome::kUnknownLanguageCode, language);
+}