diff options
author | toyoshim@chromium.org <toyoshim@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-27 15:00:08 +0000 |
---|---|---|
committer | toyoshim@chromium.org <toyoshim@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-05-27 15:00:08 +0000 |
commit | 8d74bb4c1997f0cf3db888ed161917c18605261c (patch) | |
tree | 25e961cb937c38308427a575f48d83a63f541b0e /chrome/renderer | |
parent | 54b362c8539bdc59e1dc0df75220e4789ff5221c (diff) | |
download | chromium_src-8d74bb4c1997f0cf3db888ed161917c18605261c.zip chromium_src-8d74bb4c1997f0cf3db888ed161917c18605261c.tar.gz chromium_src-8d74bb4c1997f0cf3db888ed161917c18605261c.tar.bz2 |
Translate: adopt html lang attribute if valid value is provided
The page language is decided by heuristics that use the Content-Language
value and the CLD-detected language. This change adds the html lang attribute,
e.g., <html lang="ja">, as a third hint for the language decision.
Content-Language is often wrong because of server misconfiguration.
On the other hand, the lang attribute is provided by the page itself, i.e.,
by authoring tools or by the authors themselves. As a result, it is more
reliable than Content-Language. With this change, the language decision uses
the html lang attribute instead of Content-Language if a valid value is provided.
The rest of the decision process is not changed at all.
BUG=222073
TEST=unit_tests, browser_tests
Committed: https://src.chromium.org/viewvc/chrome?view=rev&revision=201856
Review URL: https://chromiumcodereview.appspot.com/15728002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@202427 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer')
6 files changed, 148 insertions, 64 deletions
diff --git a/chrome/renderer/translate/translate_helper.cc b/chrome/renderer/translate/translate_helper.cc index 84e68fe..1709fa6 100644 --- a/chrome/renderer/translate/translate_helper.cc +++ b/chrome/renderer/translate/translate_helper.cc @@ -93,18 +93,24 @@ void TranslateHelper::PageCaptured(const string16& contents) { // language of the intended audience (a distinction really only // relevant for things like langauge textbooks). This distinction // shouldn't affect translation. - WebDocument document = GetMainFrame()->document(); + WebFrame* main_frame = GetMainFrame(); + if (!main_frame) + return; + WebDocument document = main_frame->document(); std::string content_language = document.contentLanguage().utf8(); + std::string html_lang = + document.documentElement().getAttribute("lang").utf8(); std::string cld_language; bool is_cld_reliable; std::string language = DeterminePageLanguage( - content_language, contents, &cld_language, &is_cld_reliable); + content_language, html_lang, contents, &cld_language, &is_cld_reliable); if (language.empty()) return; language_determined_time_ = base::TimeTicks::Now(); + // TODO(toyoshim): Add |html_lang| to LanguageDetectionDetails. GURL url(document.url()); LanguageDetectionDetails details; details.time = base::Time::Now(); @@ -319,7 +325,20 @@ void TranslateHelper::ResetInvalidLanguageCode(std::string* code) { } // static +void TranslateHelper::ApplyLanguageCodeCorrection(std::string* code) { + // Correct well-known format errors. + CorrectLanguageCodeTypo(code); + + // Convert language code synonym firstly because sometime synonym code is in + // invalid format, e.g. 'fil'. After validation, such a 3 characters language + // gets converted to an empty string. 
+ ConvertLanguageCodeSynonym(code); + ResetInvalidLanguageCode(code); +} + +// static std::string TranslateHelper::DeterminePageLanguage(const std::string& code, + const std::string& html_lang, const string16& contents, std::string* cld_language_p, bool* is_cld_reliable_p) { @@ -337,17 +356,27 @@ std::string TranslateHelper::DeterminePageLanguage(const std::string& code, ConvertLanguageCodeSynonym(&cld_language); #endif // defined(ENABLE_LANGUAGE_DETECTION) - // Correct well-known format errors. - std::string language = code; - CorrectLanguageCodeTypo(&language); + // Check if html lang attribute is valid. + std::string modified_html_lang; + if (!html_lang.empty()) { + modified_html_lang = html_lang; + ApplyLanguageCodeCorrection(&modified_html_lang); + TranslateHelperMetrics::ReportHtmlLang(html_lang, modified_html_lang); + VLOG(9) << "html lang based language code: " << modified_html_lang; + } - // Convert language code synonym firstly because sometime synonym code is in - // invalid format, e.g. 'fil'. After validation, such a 3 characters language - // gets converted to an empty string. - ConvertLanguageCodeSynonym(&language); - ResetInvalidLanguageCode(&language); + // Check if Content-Language is valid. + std::string modified_code; + if (!code.empty()) { + modified_code = code; + ApplyLanguageCodeCorrection(&modified_code); + TranslateHelperMetrics::ReportContentLanguage(code, modified_code); + } - TranslateHelperMetrics::ReportContentLanguage(code, language); + // Adopt |modified_html_lang| if it is valid. Otherwise, adopt + // |modified_code|. + std::string language = modified_html_lang.empty() ? 
modified_code : + modified_html_lang; #if defined(ENABLE_LANGUAGE_DETECTION) // If |language| is empty, just use CLD result even though it might be @@ -433,7 +462,8 @@ void TranslateHelper::OnTranslatePage(int page_id, const std::string& translate_script, const std::string& source_lang, const std::string& target_lang) { - if (render_view()->GetPageId() != page_id) + WebFrame* main_frame = GetMainFrame(); + if (!main_frame || render_view()->GetPageId() != page_id) return; // We navigated away, nothing to do. if (translation_pending_ && page_id == page_id_ && @@ -457,7 +487,7 @@ void TranslateHelper::OnTranslatePage(int page_id, TranslateHelperMetrics::ReportUserActionDuration(language_determined_time_, base::TimeTicks::Now()); - GURL url(GetMainFrame()->document().url()); + GURL url(main_frame->document().url()); TranslateHelperMetrics::ReportPageScheme(url.scheme()); if (!IsTranslateLibAvailable()) { @@ -590,12 +620,10 @@ void TranslateHelper::NotifyBrowserTranslationFailed( WebFrame* TranslateHelper::GetMainFrame() { WebView* web_view = render_view()->GetWebView(); - if (!web_view) { - // When the WebView is going away, the render view should have called - // CancelPendingTranslation() which should have stopped any pending work, so - // that case should not happen. - NOTREACHED(); + + // When the tab is going to be closed, the web_view can be NULL. 
+ if (!web_view) return NULL; - } + return web_view->mainFrame(); } diff --git a/chrome/renderer/translate/translate_helper.h b/chrome/renderer/translate/translate_helper.h index eebc9ff..6c6662c 100644 --- a/chrome/renderer/translate/translate_helper.h +++ b/chrome/renderer/translate/translate_helper.h @@ -96,18 +96,23 @@ class TranslateHelper : public content::RenderViewObserver { CLDAgreeWithLanguageCodeHavingCountryCode); FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, InvalidLanguageMetaTagProviding); + FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, AdoptHtmlLang); - // Correct language code if it contains well-known mistakes. + // Corrects language code if it contains well-known mistakes. static void CorrectLanguageCodeTypo(std::string* code); - // Convert language code to the one used in server supporting list. + // Converts language code to the one used in server supporting list. static void ConvertLanguageCodeSynonym(std::string* code); - // Reset language code if the specified string is apparently invalid. + // Resets language code if the specified string is apparently invalid. static void ResetInvalidLanguageCode(std::string* code); - // Determine content page language from Content-Language code and contents. + // Applies a series of language code modification in proper order. + static void ApplyLanguageCodeCorrection(std::string* code); + + // Determines content page language from Content-Language code and contents. 
static std::string DeterminePageLanguage(const std::string& code, + const std::string& html_lang, const string16& contents, std::string* cld_language, bool* is_cld_reliable); diff --git a/chrome/renderer/translate/translate_helper_metrics.cc b/chrome/renderer/translate/translate_helper_metrics.cc index 91626b1..cf28339 100644 --- a/chrome/renderer/translate/translate_helper_metrics.cc +++ b/chrome/renderer/translate/translate_helper_metrics.cc @@ -13,6 +13,7 @@ namespace { // a corresponding index in MetricsNameIndex and an entry in |kMetricsEntries|. const char kRenderer4LanguageDetection[] = "Renderer4.LanguageDetection"; const char kTranslateContentLanguage[] = "Translate.ContentLanguage"; +const char kTranslateHtmlLang[] = "Translate.HtmlLang"; const char kTranslateLanguageVerification[] = "Translate.LanguageVerification"; const char kTranslateTimeToBeReady[] = "Translate.TimeToBeReady"; const char kTranslateTimeToLoad[] = "Translate.TimeToLoad"; @@ -34,6 +35,8 @@ const MetricsEntry kMetricsEntries[] = { kRenderer4LanguageDetection }, { TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, kTranslateContentLanguage }, + { TranslateHelperMetrics::UMA_HTML_LANG, + kTranslateHtmlLang }, { TranslateHelperMetrics::UMA_LANGUAGE_VERIFICATION, kTranslateLanguageVerification }, { TranslateHelperMetrics::UMA_TIME_TO_BE_READY, @@ -51,25 +54,32 @@ const MetricsEntry kMetricsEntries[] = { COMPILE_ASSERT(arraysize(kMetricsEntries) == TranslateHelperMetrics::UMA_MAX, arraysize_of_kMetricsEntries_should_be_UMA_MAX); +TranslateHelperMetrics::LanguageCheckType GetLanguageCheckMetric( + const std::string& provided_code, + const std::string& revised_code) { + if (provided_code.empty()) + return TranslateHelperMetrics::LANGUAGE_NOT_PROVIDED; + else if (provided_code == revised_code) + return TranslateHelperMetrics::LANGUAGE_VALID; + return TranslateHelperMetrics::LANGUAGE_INVALID; +} + } // namespace namespace TranslateHelperMetrics { void ReportContentLanguage(const std::string& 
provided_code, const std::string& revised_code) { - if (provided_code.empty()) { - UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage, - CONTENT_LANGUAGE_NOT_PROVIDED, - CONTENT_LANGUAGE_MAX); - } else if (provided_code == revised_code) { - UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage, - CONTENT_LANGUAGE_VALID, - CONTENT_LANGUAGE_MAX); - } else { - UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage, - CONTENT_LANGUAGE_INVALID, - CONTENT_LANGUAGE_MAX); - } + UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage, + GetLanguageCheckMetric(provided_code, revised_code), + TranslateHelperMetrics::LANGUAGE_MAX); +} + +void ReportHtmlLang(const std::string& provided_code, + const std::string& revised_code) { + UMA_HISTOGRAM_ENUMERATION(kTranslateHtmlLang, + GetLanguageCheckMetric(provided_code, revised_code), + TranslateHelperMetrics::LANGUAGE_MAX); } void ReportLanguageVerification(LanguageVerificationType type) { diff --git a/chrome/renderer/translate/translate_helper_metrics.h b/chrome/renderer/translate/translate_helper_metrics.h index a7fb55e..74caf38 100644 --- a/chrome/renderer/translate/translate_helper_metrics.h +++ b/chrome/renderer/translate/translate_helper_metrics.h @@ -16,6 +16,7 @@ namespace TranslateHelperMetrics { enum MetricsNameIndex { UMA_LANGUAGE_DETECTION, UMA_CONTENT_LANGUAGE, + UMA_HTML_LANG, UMA_LANGUAGE_VERIFICATION, UMA_TIME_TO_BE_READY, UMA_TIME_TO_LOAD, @@ -27,11 +28,11 @@ enum MetricsNameIndex { // A page may provide a Content-Language HTTP header or a META tag. // TranslateHelper checks if a server provides a valid Content-Language. 
-enum ContentLanguageType { - CONTENT_LANGUAGE_NOT_PROVIDED, - CONTENT_LANGUAGE_VALID, - CONTENT_LANGUAGE_INVALID, - CONTENT_LANGUAGE_MAX, +enum LanguageCheckType { + LANGUAGE_NOT_PROVIDED, + LANGUAGE_VALID, + LANGUAGE_INVALID, + LANGUAGE_MAX, }; // When a valid Content-Language is provided, TranslateHelper checks if a @@ -54,12 +55,19 @@ enum SchemeType { }; // Called after TranslateHelper verifies a server providing Content-Language -// header. |provided_code| contains a Content-Language header value which +// header. |provided_code| contains a Content-Language header value which a // server provides. It can be empty string when a server doesn't provide it. // |revised_code| is a value modified by format error corrector. void ReportContentLanguage(const std::string& provided_code, const std::string& revised_code); +// Called after TranslateHelper verifies a page providing html lang attribute. +// |provided_code| contains a html lang attribute which a page provides. It can +// be empty string when a page doesn't provide it. |revised_code| is a value +// modified by format error corrector. +void ReportHtmlLang(const std::string& provided_code, + const std::string& revised_code); + // Called when CLD verifies Content-Language header. 
void ReportLanguageVerification(LanguageVerificationType type); diff --git a/chrome/renderer/translate/translate_helper_metrics_unittest.cc b/chrome/renderer/translate/translate_helper_metrics_unittest.cc index 50a0b30..5a503ecd 100644 --- a/chrome/renderer/translate/translate_helper_metrics_unittest.cc +++ b/chrome/renderer/translate/translate_helper_metrics_unittest.cc @@ -33,23 +33,20 @@ class MetricsRecorder { base_samples_ = histogram->SnapshotSamples(); } - void CheckContentLanguage(int expected_not_provided, - int expected_valid, - int expected_invalid) { - ASSERT_EQ(TranslateHelperMetrics::GetMetricsName( - TranslateHelperMetrics::UMA_CONTENT_LANGUAGE), key_); + void CheckLanguage(TranslateHelperMetrics::MetricsNameIndex index, + int expected_not_provided, + int expected_valid, + int expected_invalid) { + ASSERT_EQ(TranslateHelperMetrics::GetMetricsName(index), key_); Snapshot(); - EXPECT_EQ( - expected_not_provided, - GetCount(TranslateHelperMetrics::CONTENT_LANGUAGE_NOT_PROVIDED)); - EXPECT_EQ( - expected_valid, - GetCount(TranslateHelperMetrics::CONTENT_LANGUAGE_VALID)); - EXPECT_EQ( - expected_invalid, - GetCount(TranslateHelperMetrics::CONTENT_LANGUAGE_INVALID)); + EXPECT_EQ(expected_not_provided, + GetCount(TranslateHelperMetrics::LANGUAGE_NOT_PROVIDED)); + EXPECT_EQ(expected_valid, + GetCount(TranslateHelperMetrics::LANGUAGE_VALID)); + EXPECT_EQ(expected_invalid, + GetCount(TranslateHelperMetrics::LANGUAGE_INVALID)); } void CheckLanguageVerification(int expected_cld_disabled, @@ -150,13 +147,26 @@ TEST(TranslateHelperMetricsTest, ReportContentLanguage) { MetricsRecorder recorder(TranslateHelperMetrics::GetMetricsName( TranslateHelperMetrics::UMA_CONTENT_LANGUAGE)); - recorder.CheckContentLanguage(0, 0, 0); + recorder.CheckLanguage(TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, 0, 0, 0); TranslateHelperMetrics::ReportContentLanguage(std::string(), std::string()); - recorder.CheckContentLanguage(1, 0, 0); + 
recorder.CheckLanguage(TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, 1, 0, 0); TranslateHelperMetrics::ReportContentLanguage("ja_JP", "ja-JP"); - recorder.CheckContentLanguage(1, 0, 1); + recorder.CheckLanguage(TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, 1, 0, 1); TranslateHelperMetrics::ReportContentLanguage("en", "en"); - recorder.CheckContentLanguage(1, 1, 1); + recorder.CheckLanguage(TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, 1, 1, 1); +} + +TEST(TranslateHelperMetricsTest, ReportHtmlLang) { + MetricsRecorder recorder(TranslateHelperMetrics::GetMetricsName( + TranslateHelperMetrics::UMA_HTML_LANG)); + + recorder.CheckLanguage(TranslateHelperMetrics::UMA_HTML_LANG, 0, 0, 0); + TranslateHelperMetrics::ReportHtmlLang(std::string(), std::string()); + recorder.CheckLanguage(TranslateHelperMetrics::UMA_HTML_LANG, 1, 0, 0); + TranslateHelperMetrics::ReportHtmlLang("ja_JP", "ja-JP"); + recorder.CheckLanguage(TranslateHelperMetrics::UMA_HTML_LANG, 1, 0, 1); + TranslateHelperMetrics::ReportHtmlLang("en", "en"); + recorder.CheckLanguage(TranslateHelperMetrics::UMA_HTML_LANG, 1, 1, 1); } TEST(TranslateHelperMetricsTest, ReportLanguageVerification) { diff --git a/chrome/renderer/translate/translate_helper_unittest.cc b/chrome/renderer/translate/translate_helper_unittest.cc index e1d1175..f92a815 100644 --- a/chrome/renderer/translate/translate_helper_unittest.cc +++ b/chrome/renderer/translate/translate_helper_unittest.cc @@ -90,8 +90,9 @@ TEST_F(TranslateHelperTest, CLDDisagreeWithWrongLanguageCode) { std::string cld_language; bool is_cld_reliable; std::string language = - TranslateHelper::DeterminePageLanguage(std::string("ja"), contents, - &cld_language, &is_cld_reliable); + TranslateHelper::DeterminePageLanguage(std::string("ja"), std::string(), + contents, &cld_language, + &is_cld_reliable); EXPECT_EQ(chrome::kUnknownLanguageCode, language); EXPECT_EQ("en", cld_language); EXPECT_TRUE(is_cld_reliable); @@ -108,7 +109,8 @@ TEST_F(TranslateHelperTest, 
CLDAgreeWithLanguageCodeHavingCountryCode) { std::string cld_language; bool is_cld_reliable; std::string language = - TranslateHelper::DeterminePageLanguage(std::string("en-US"), contents, + TranslateHelper::DeterminePageLanguage(std::string("en-US"), + std::string(), contents, &cld_language, &is_cld_reliable); EXPECT_EQ("en-US", language); EXPECT_EQ("en", cld_language); @@ -126,9 +128,30 @@ TEST_F(TranslateHelperTest, InvalidLanguageMetaTagProviding) { std::string cld_language; bool is_cld_reliable; std::string language = - TranslateHelper::DeterminePageLanguage(std::string("utf-8"), contents, + TranslateHelper::DeterminePageLanguage(std::string("utf-8"), + std::string(), contents, &cld_language, &is_cld_reliable); EXPECT_EQ("en", language); EXPECT_EQ("en", cld_language); EXPECT_TRUE(is_cld_reliable); } + +// Tests that the language meta tag providing wrong information is ignored +// because of valid html lang attribute. +TEST_F(TranslateHelperTest, AdoptHtmlLang) { + string16 contents = ASCIIToUTF16( + "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>" + "</head><body>This is a page apparently written in English. Even though " + "content-language is provided, the value will be ignored if the value " + "is suspicious.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = + TranslateHelper::DeterminePageLanguage(std::string("ja"), + std::string("en"), + contents, &cld_language, + &is_cld_reliable); + EXPECT_EQ("en", language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} |