diff options
author | toyoshim@chromium.org <toyoshim@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-02-27 03:31:34 +0000 |
---|---|---|
committer | toyoshim@chromium.org <toyoshim@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-02-27 03:31:34 +0000 |
commit | 4e189749a72c06c4802d8c8fbe32c8f5f7fd8905 (patch) | |
tree | fe997113e688f333d09c7800caa29e4e8ada613e /chrome | |
parent | 68c9dd44cdada6a7a5b9976296095fa876e8b1bb (diff) | |
download | chromium_src-4e189749a72c06c4802d8c8fbe32c8f5f7fd8905.zip chromium_src-4e189749a72c06c4802d8c8fbe32c8f5f7fd8905.tar.gz chromium_src-4e189749a72c06c4802d8c8fbe32c8f5f7fd8905.tar.bz2 |
Translate: run CLD even though Content-Language is provided.
If CLD confidently disagrees with the Content-Language value, Chrome should
rely on neither the CLD result nor the Content-Language value, and should
give up suggesting a translation.
BUG=91205
Review URL: https://chromiumcodereview.appspot.com/12218074
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@184859 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome')
-rw-r--r-- | chrome/renderer/translate_helper.cc | 70 | ||||
-rw-r--r-- | chrome/renderer/translate_helper.h | 6 | ||||
-rw-r--r-- | chrome/renderer/translate_helper_browsertest.cc | 1 | ||||
-rw-r--r-- | chrome/renderer/translate_helper_unittest.cc | 14 |
4 files changed, 68 insertions, 23 deletions
diff --git a/chrome/renderer/translate_helper.cc b/chrome/renderer/translate_helper.cc index 33a5bb1..c4cece5 100644 --- a/chrome/renderer/translate_helper.cc +++ b/chrome/renderer/translate_helper.cc @@ -92,30 +92,10 @@ void TranslateHelper::PageCaptured(const string16& contents) { // language of the intended audience (a distinction really only // relevant for things like langauge textbooks). This distinction // shouldn't affect translation. - std::string language = document.contentLanguage().utf8(); - CorrectLanguageCodeTypo(&language); - - // Convert language code synonym firstly because sometime synonym code is in - // invalid format, e.g. 'fil'. After the conversion, make invalid code empty - // string. - ConvertLanguageCodeSynonym(&language); - ResetInvalidLanguageCode(&language); - -#if defined(ENABLE_LANGUAGE_DETECTION) - if (language.empty()) { - base::TimeTicks begin_time = base::TimeTicks::Now(); - language = DetermineTextLanguage(contents); - UMA_HISTOGRAM_MEDIUM_TIMES("Renderer4.LanguageDetection", - base::TimeTicks::Now() - begin_time); - // Apply synonym conversion here because CLD may return 'fil'. - ConvertLanguageCodeSynonym(&language); - } else { - VLOG(9) << "PageLanguageFromMetaTag: " << language; - } -#else + std::string content_language = document.contentLanguage().utf8(); + std::string language = DeterminePageLanguage(content_language, contents); if (language.empty()) return; -#endif // defined(ENABLE_LANGUAGE_DETECTION) Send(new ChromeViewHostMsg_TranslateLanguageDetermined( routing_id(), language, IsPageTranslatable(&document))); @@ -142,6 +122,9 @@ std::string TranslateHelper::DetermineTextLanguage(const string16& text) { // We don't trust the result if the CLD reports that the detection is not // reliable, or if the actual text used to detect the language was less than // 100 bytes (short texts can often lead to wrong results). + // TODO(toyoshim): CLD provides |is_reliable| flag. 
But, it just says that + // the determined language code is correct with 50% confidence. Chrome should + // handle the real confidence value to judge. if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES && cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { // We should not use LanguageCode_ISO_639_1 because it does not cover all @@ -279,6 +262,49 @@ void TranslateHelper::ResetInvalidLanguageCode(std::string* code) { } // static +std::string TranslateHelper::DeterminePageLanguage(const std::string& code, + const string16& contents) { +#if defined(ENABLE_LANGUAGE_DETECTION) + base::TimeTicks begin_time = base::TimeTicks::Now(); + std::string cld_language = DetermineTextLanguage(contents); + UMA_HISTOGRAM_MEDIUM_TIMES("Renderer4.LanguageDetection", + base::TimeTicks::Now() - begin_time); + ConvertLanguageCodeSynonym(&cld_language); + VLOG(9) << "CLD determined language code: " << cld_language; + + // If |code| is empty, just use CLD result even though it might be + // chrome::kUnknownLanguageCode. + if (code.empty()) + return cld_language; +#endif // defined(ENABLE_LANGUAGE_DETECTION) + + // Correct well-known format errors. + std::string language = code; + CorrectLanguageCodeTypo(&language); + + // Convert language code synonym firstly because sometime synonym code is in + // invalid format, e.g. 'fil'. After validation, such a 3 characters language + // gets converted to an empty string. + ConvertLanguageCodeSynonym(&language); + ResetInvalidLanguageCode(&language); + VLOG(9) << "Content-Language based language code: " << language; + +#if defined(ENABLE_LANGUAGE_DETECTION) + if (cld_language != chrome::kUnknownLanguageCode && + cld_language != language) { + // Content-Language value might be wrong because CLD says that this page + // is written in another language with confidence. + // In this case, Chrome doesn't rely on any of the language codes, and + // gives up suggesting a translation. 
+ VLOG(9) << "CLD disagreed with the Content-Language value with confidence."; + return std::string(chrome::kUnknownLanguageCode); + } +#endif // defined(ENABLE_LANGUAGE_DETECTION) + + return language; +} + +// static bool TranslateHelper::IsPageTranslatable(WebDocument* document) { std::vector<WebElement> meta_elements; webkit_glue::GetMetaElementsWithAttribute(document, diff --git a/chrome/renderer/translate_helper.h b/chrome/renderer/translate_helper.h index 1279b07..22b8098 100644 --- a/chrome/renderer/translate_helper.h +++ b/chrome/renderer/translate_helper.h @@ -69,6 +69,8 @@ class TranslateHelper : public content::RenderViewObserver { FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, LanguageCodeTypoCorrection); FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, LanguageCodeSynonyms); FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, ResetInvalidLanguageCode); + FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, + CLDDisagreeWithWrongLanguageCode); // Correct language code if it contains well-known mistakes. static void CorrectLanguageCodeTypo(std::string* code); @@ -79,6 +81,10 @@ class TranslateHelper : public content::RenderViewObserver { // Reset language code if the specified string is apparently invalid. static void ResetInvalidLanguageCode(std::string* code); + // Determine content page language from Content-Language code and contents. + static std::string DeterminePageLanguage(const std::string& code, + const string16& contents); + // Returns whether the page associated with |document| is a candidate for // translation. Some pages can explictly specify (via a meta-tag) that they // should not be translated. 
diff --git a/chrome/renderer/translate_helper_browsertest.cc b/chrome/renderer/translate_helper_browsertest.cc index 0ad7820..bee6b83 100644 --- a/chrome/renderer/translate_helper_browsertest.cc +++ b/chrome/renderer/translate_helper_browsertest.cc @@ -414,7 +414,6 @@ TEST_F(ChromeRenderViewTest, LanguageCommonMistakesAreCorrected) { render_thread_->sink().ClearMessages(); } - // Tests that a back navigation gets a translate language message. TEST_F(ChromeRenderViewTest, BackToTranslatablePage) { SendContentStateImmediately(); diff --git a/chrome/renderer/translate_helper_unittest.cc b/chrome/renderer/translate_helper_unittest.cc index 8e5bb82..c04e108 100644 --- a/chrome/renderer/translate_helper_unittest.cc +++ b/chrome/renderer/translate_helper_unittest.cc @@ -4,6 +4,8 @@ #include "chrome/renderer/translate_helper.h" +#include "base/utf_string_conversions.h" +#include "chrome/common/chrome_constants.h" #include "testing/gtest/include/gtest/gtest.h" typedef testing::Test TranslateHelperTest; @@ -77,3 +79,15 @@ TEST_F(TranslateHelperTest, ResetInvalidLanguageCode) { EXPECT_TRUE(language.empty()); } +// Tests that the language meta tag providing wrong information is ignored by +// TranslateHelper due to disagreement between meta tag and CLD. +TEST_F(TranslateHelperTest, CLDDisagreeWithWrongLanguageCode) { + string16 contents = ASCIIToUTF16( + "<html><head><meta http-equiv='Content-Language' content='ja'></head>" + "<body>This is a page apparently written in English. Even though " + "content-language is provided, the value will be ignored if the value " + "is suspicious.</body></html>"); + std::string language = + TranslateHelper::DeterminePageLanguage(std::string("ja"), contents); + EXPECT_EQ(chrome::kUnknownLanguageCode, language); +} |