summaryrefslogtreecommitdiffstats
path: root/chrome/renderer
diff options
context:
space:
mode:
authortoyoshim@chromium.org <toyoshim@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-05-27 15:00:08 +0000
committertoyoshim@chromium.org <toyoshim@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-05-27 15:00:08 +0000
commit8d74bb4c1997f0cf3db888ed161917c18605261c (patch)
tree25e961cb937c38308427a575f48d83a63f541b0e /chrome/renderer
parent54b362c8539bdc59e1dc0df75220e4789ff5221c (diff)
downloadchromium_src-8d74bb4c1997f0cf3db888ed161917c18605261c.zip
chromium_src-8d74bb4c1997f0cf3db888ed161917c18605261c.tar.gz
chromium_src-8d74bb4c1997f0cf3db888ed161917c18605261c.tar.bz2
Translate: adopt html lang attribute if valid value is provided
The language decision is made by heuristics using Content-Language and the CLD-determined language. This change adds the html lang attribute, e.g., <html lang="ja">, as a third hint for the language decision. Content-Language is often invalid due to invalid server configuration. On the other hand, the lang attribute is provided by pages, authoring tools, or the authors themselves. As a result, it is more reliable than Content-Language. With this change, the language decision uses the html lang attribute instead of Content-Language if a valid value is provided. The rest of the decision process is not changed at all. BUG=222073 TEST=unit_tests, browser_tests Committed: https://src.chromium.org/viewvc/chrome?view=rev&revision=201856 Review URL: https://chromiumcodereview.appspot.com/15728002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@202427 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/renderer')
-rw-r--r--chrome/renderer/translate/translate_helper.cc66
-rw-r--r--chrome/renderer/translate/translate_helper.h13
-rw-r--r--chrome/renderer/translate/translate_helper_metrics.cc36
-rw-r--r--chrome/renderer/translate/translate_helper_metrics.h20
-rw-r--r--chrome/renderer/translate/translate_helper_metrics_unittest.cc46
-rw-r--r--chrome/renderer/translate/translate_helper_unittest.cc31
6 files changed, 148 insertions, 64 deletions
diff --git a/chrome/renderer/translate/translate_helper.cc b/chrome/renderer/translate/translate_helper.cc
index 84e68fe..1709fa6 100644
--- a/chrome/renderer/translate/translate_helper.cc
+++ b/chrome/renderer/translate/translate_helper.cc
@@ -93,18 +93,24 @@ void TranslateHelper::PageCaptured(const string16& contents) {
// language of the intended audience (a distinction really only
// relevant for things like language textbooks). This distinction
// shouldn't affect translation.
- WebDocument document = GetMainFrame()->document();
+ WebFrame* main_frame = GetMainFrame();
+ if (!main_frame)
+ return;
+ WebDocument document = main_frame->document();
std::string content_language = document.contentLanguage().utf8();
+ std::string html_lang =
+ document.documentElement().getAttribute("lang").utf8();
std::string cld_language;
bool is_cld_reliable;
std::string language = DeterminePageLanguage(
- content_language, contents, &cld_language, &is_cld_reliable);
+ content_language, html_lang, contents, &cld_language, &is_cld_reliable);
if (language.empty())
return;
language_determined_time_ = base::TimeTicks::Now();
+ // TODO(toyoshim): Add |html_lang| to LanguageDetectionDetails.
GURL url(document.url());
LanguageDetectionDetails details;
details.time = base::Time::Now();
@@ -319,7 +325,20 @@ void TranslateHelper::ResetInvalidLanguageCode(std::string* code) {
}
// static
+void TranslateHelper::ApplyLanguageCodeCorrection(std::string* code) {
+ // Correct well-known format errors.
+ CorrectLanguageCodeTypo(code);
+
+ // Convert language code synonyms first, because a synonym code is sometimes
+ // in an invalid format, e.g. 'fil'. After validation, such a 3-character
+ // language code gets converted to an empty string.
+ ConvertLanguageCodeSynonym(code);
+ ResetInvalidLanguageCode(code);
+}
+
+// static
std::string TranslateHelper::DeterminePageLanguage(const std::string& code,
+ const std::string& html_lang,
const string16& contents,
std::string* cld_language_p,
bool* is_cld_reliable_p) {
@@ -337,17 +356,27 @@ std::string TranslateHelper::DeterminePageLanguage(const std::string& code,
ConvertLanguageCodeSynonym(&cld_language);
#endif // defined(ENABLE_LANGUAGE_DETECTION)
- // Correct well-known format errors.
- std::string language = code;
- CorrectLanguageCodeTypo(&language);
+ // Check if html lang attribute is valid.
+ std::string modified_html_lang;
+ if (!html_lang.empty()) {
+ modified_html_lang = html_lang;
+ ApplyLanguageCodeCorrection(&modified_html_lang);
+ TranslateHelperMetrics::ReportHtmlLang(html_lang, modified_html_lang);
+ VLOG(9) << "html lang based language code: " << modified_html_lang;
+ }
- // Convert language code synonym firstly because sometime synonym code is in
- // invalid format, e.g. 'fil'. After validation, such a 3 characters language
- // gets converted to an empty string.
- ConvertLanguageCodeSynonym(&language);
- ResetInvalidLanguageCode(&language);
+ // Check if Content-Language is valid.
+ std::string modified_code;
+ if (!code.empty()) {
+ modified_code = code;
+ ApplyLanguageCodeCorrection(&modified_code);
+ TranslateHelperMetrics::ReportContentLanguage(code, modified_code);
+ }
- TranslateHelperMetrics::ReportContentLanguage(code, language);
+ // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
+ // |modified_code|.
+ std::string language = modified_html_lang.empty() ? modified_code :
+ modified_html_lang;
#if defined(ENABLE_LANGUAGE_DETECTION)
// If |language| is empty, just use CLD result even though it might be
@@ -433,7 +462,8 @@ void TranslateHelper::OnTranslatePage(int page_id,
const std::string& translate_script,
const std::string& source_lang,
const std::string& target_lang) {
- if (render_view()->GetPageId() != page_id)
+ WebFrame* main_frame = GetMainFrame();
+ if (!main_frame || render_view()->GetPageId() != page_id)
return; // We navigated away, nothing to do.
if (translation_pending_ && page_id == page_id_ &&
@@ -457,7 +487,7 @@ void TranslateHelper::OnTranslatePage(int page_id,
TranslateHelperMetrics::ReportUserActionDuration(language_determined_time_,
base::TimeTicks::Now());
- GURL url(GetMainFrame()->document().url());
+ GURL url(main_frame->document().url());
TranslateHelperMetrics::ReportPageScheme(url.scheme());
if (!IsTranslateLibAvailable()) {
@@ -590,12 +620,10 @@ void TranslateHelper::NotifyBrowserTranslationFailed(
WebFrame* TranslateHelper::GetMainFrame() {
WebView* web_view = render_view()->GetWebView();
- if (!web_view) {
- // When the WebView is going away, the render view should have called
- // CancelPendingTranslation() which should have stopped any pending work, so
- // that case should not happen.
- NOTREACHED();
+
+ // When the tab is going to be closed, the web_view can be NULL.
+ if (!web_view)
return NULL;
- }
+
return web_view->mainFrame();
}
diff --git a/chrome/renderer/translate/translate_helper.h b/chrome/renderer/translate/translate_helper.h
index eebc9ff..6c6662c 100644
--- a/chrome/renderer/translate/translate_helper.h
+++ b/chrome/renderer/translate/translate_helper.h
@@ -96,18 +96,23 @@ class TranslateHelper : public content::RenderViewObserver {
CLDAgreeWithLanguageCodeHavingCountryCode);
FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest,
InvalidLanguageMetaTagProviding);
+ FRIEND_TEST_ALL_PREFIXES(TranslateHelperTest, AdoptHtmlLang);
- // Correct language code if it contains well-known mistakes.
+ // Corrects language code if it contains well-known mistakes.
static void CorrectLanguageCodeTypo(std::string* code);
- // Convert language code to the one used in server supporting list.
+ // Converts language code to the one used in server supporting list.
static void ConvertLanguageCodeSynonym(std::string* code);
- // Reset language code if the specified string is apparently invalid.
+ // Resets language code if the specified string is apparently invalid.
static void ResetInvalidLanguageCode(std::string* code);
- // Determine content page language from Content-Language code and contents.
+ // Applies a series of language code modifications in the proper order.
+ static void ApplyLanguageCodeCorrection(std::string* code);
+
+ // Determines content page language from Content-Language code and contents.
static std::string DeterminePageLanguage(const std::string& code,
+ const std::string& html_lang,
const string16& contents,
std::string* cld_language,
bool* is_cld_reliable);
diff --git a/chrome/renderer/translate/translate_helper_metrics.cc b/chrome/renderer/translate/translate_helper_metrics.cc
index 91626b1..cf28339 100644
--- a/chrome/renderer/translate/translate_helper_metrics.cc
+++ b/chrome/renderer/translate/translate_helper_metrics.cc
@@ -13,6 +13,7 @@ namespace {
// a corresponding index in MetricsNameIndex and an entry in |kMetricsEntries|.
const char kRenderer4LanguageDetection[] = "Renderer4.LanguageDetection";
const char kTranslateContentLanguage[] = "Translate.ContentLanguage";
+const char kTranslateHtmlLang[] = "Translate.HtmlLang";
const char kTranslateLanguageVerification[] = "Translate.LanguageVerification";
const char kTranslateTimeToBeReady[] = "Translate.TimeToBeReady";
const char kTranslateTimeToLoad[] = "Translate.TimeToLoad";
@@ -34,6 +35,8 @@ const MetricsEntry kMetricsEntries[] = {
kRenderer4LanguageDetection },
{ TranslateHelperMetrics::UMA_CONTENT_LANGUAGE,
kTranslateContentLanguage },
+ { TranslateHelperMetrics::UMA_HTML_LANG,
+ kTranslateHtmlLang },
{ TranslateHelperMetrics::UMA_LANGUAGE_VERIFICATION,
kTranslateLanguageVerification },
{ TranslateHelperMetrics::UMA_TIME_TO_BE_READY,
@@ -51,25 +54,32 @@ const MetricsEntry kMetricsEntries[] = {
COMPILE_ASSERT(arraysize(kMetricsEntries) == TranslateHelperMetrics::UMA_MAX,
arraysize_of_kMetricsEntries_should_be_UMA_MAX);
+TranslateHelperMetrics::LanguageCheckType GetLanguageCheckMetric(
+ const std::string& provided_code,
+ const std::string& revised_code) {
+ if (provided_code.empty())
+ return TranslateHelperMetrics::LANGUAGE_NOT_PROVIDED;
+ else if (provided_code == revised_code)
+ return TranslateHelperMetrics::LANGUAGE_VALID;
+ return TranslateHelperMetrics::LANGUAGE_INVALID;
+}
+
} // namespace
namespace TranslateHelperMetrics {
void ReportContentLanguage(const std::string& provided_code,
const std::string& revised_code) {
- if (provided_code.empty()) {
- UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage,
- CONTENT_LANGUAGE_NOT_PROVIDED,
- CONTENT_LANGUAGE_MAX);
- } else if (provided_code == revised_code) {
- UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage,
- CONTENT_LANGUAGE_VALID,
- CONTENT_LANGUAGE_MAX);
- } else {
- UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage,
- CONTENT_LANGUAGE_INVALID,
- CONTENT_LANGUAGE_MAX);
- }
+ UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage,
+ GetLanguageCheckMetric(provided_code, revised_code),
+ TranslateHelperMetrics::LANGUAGE_MAX);
+}
+
+void ReportHtmlLang(const std::string& provided_code,
+ const std::string& revised_code) {
+ UMA_HISTOGRAM_ENUMERATION(kTranslateHtmlLang,
+ GetLanguageCheckMetric(provided_code, revised_code),
+ TranslateHelperMetrics::LANGUAGE_MAX);
}
void ReportLanguageVerification(LanguageVerificationType type) {
diff --git a/chrome/renderer/translate/translate_helper_metrics.h b/chrome/renderer/translate/translate_helper_metrics.h
index a7fb55e..74caf38 100644
--- a/chrome/renderer/translate/translate_helper_metrics.h
+++ b/chrome/renderer/translate/translate_helper_metrics.h
@@ -16,6 +16,7 @@ namespace TranslateHelperMetrics {
enum MetricsNameIndex {
UMA_LANGUAGE_DETECTION,
UMA_CONTENT_LANGUAGE,
+ UMA_HTML_LANG,
UMA_LANGUAGE_VERIFICATION,
UMA_TIME_TO_BE_READY,
UMA_TIME_TO_LOAD,
@@ -27,11 +28,11 @@ enum MetricsNameIndex {
// A page may provide a Content-Language HTTP header or a META tag.
// TranslateHelper checks if a server provides a valid Content-Language.
-enum ContentLanguageType {
- CONTENT_LANGUAGE_NOT_PROVIDED,
- CONTENT_LANGUAGE_VALID,
- CONTENT_LANGUAGE_INVALID,
- CONTENT_LANGUAGE_MAX,
+enum LanguageCheckType {
+ LANGUAGE_NOT_PROVIDED,
+ LANGUAGE_VALID,
+ LANGUAGE_INVALID,
+ LANGUAGE_MAX,
};
// When a valid Content-Language is provided, TranslateHelper checks if a
@@ -54,12 +55,19 @@ enum SchemeType {
};
// Called after TranslateHelper verifies a server providing Content-Language
-// header. |provided_code| contains a Content-Language header value which
+// header. |provided_code| contains a Content-Language header value which a
// server provides. It can be empty string when a server doesn't provide it.
// |revised_code| is a value modified by format error corrector.
void ReportContentLanguage(const std::string& provided_code,
const std::string& revised_code);
+// Called after TranslateHelper verifies the html lang attribute provided by a
+// page. |provided_code| contains the html lang attribute value which the page
+// provides. It can be an empty string when the page doesn't provide one.
+// |revised_code| is the value modified by the format error corrector.
+void ReportHtmlLang(const std::string& provided_code,
+ const std::string& revised_code);
+
// Called when CLD verifies Content-Language header.
void ReportLanguageVerification(LanguageVerificationType type);
diff --git a/chrome/renderer/translate/translate_helper_metrics_unittest.cc b/chrome/renderer/translate/translate_helper_metrics_unittest.cc
index 50a0b30..5a503ecd 100644
--- a/chrome/renderer/translate/translate_helper_metrics_unittest.cc
+++ b/chrome/renderer/translate/translate_helper_metrics_unittest.cc
@@ -33,23 +33,20 @@ class MetricsRecorder {
base_samples_ = histogram->SnapshotSamples();
}
- void CheckContentLanguage(int expected_not_provided,
- int expected_valid,
- int expected_invalid) {
- ASSERT_EQ(TranslateHelperMetrics::GetMetricsName(
- TranslateHelperMetrics::UMA_CONTENT_LANGUAGE), key_);
+ void CheckLanguage(TranslateHelperMetrics::MetricsNameIndex index,
+ int expected_not_provided,
+ int expected_valid,
+ int expected_invalid) {
+ ASSERT_EQ(TranslateHelperMetrics::GetMetricsName(index), key_);
Snapshot();
- EXPECT_EQ(
- expected_not_provided,
- GetCount(TranslateHelperMetrics::CONTENT_LANGUAGE_NOT_PROVIDED));
- EXPECT_EQ(
- expected_valid,
- GetCount(TranslateHelperMetrics::CONTENT_LANGUAGE_VALID));
- EXPECT_EQ(
- expected_invalid,
- GetCount(TranslateHelperMetrics::CONTENT_LANGUAGE_INVALID));
+ EXPECT_EQ(expected_not_provided,
+ GetCount(TranslateHelperMetrics::LANGUAGE_NOT_PROVIDED));
+ EXPECT_EQ(expected_valid,
+ GetCount(TranslateHelperMetrics::LANGUAGE_VALID));
+ EXPECT_EQ(expected_invalid,
+ GetCount(TranslateHelperMetrics::LANGUAGE_INVALID));
}
void CheckLanguageVerification(int expected_cld_disabled,
@@ -150,13 +147,26 @@ TEST(TranslateHelperMetricsTest, ReportContentLanguage) {
MetricsRecorder recorder(TranslateHelperMetrics::GetMetricsName(
TranslateHelperMetrics::UMA_CONTENT_LANGUAGE));
- recorder.CheckContentLanguage(0, 0, 0);
+ recorder.CheckLanguage(TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, 0, 0, 0);
TranslateHelperMetrics::ReportContentLanguage(std::string(), std::string());
- recorder.CheckContentLanguage(1, 0, 0);
+ recorder.CheckLanguage(TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, 1, 0, 0);
TranslateHelperMetrics::ReportContentLanguage("ja_JP", "ja-JP");
- recorder.CheckContentLanguage(1, 0, 1);
+ recorder.CheckLanguage(TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, 1, 0, 1);
TranslateHelperMetrics::ReportContentLanguage("en", "en");
- recorder.CheckContentLanguage(1, 1, 1);
+ recorder.CheckLanguage(TranslateHelperMetrics::UMA_CONTENT_LANGUAGE, 1, 1, 1);
+}
+
+TEST(TranslateHelperMetricsTest, ReportHtmlLang) {
+ MetricsRecorder recorder(TranslateHelperMetrics::GetMetricsName(
+ TranslateHelperMetrics::UMA_HTML_LANG));
+
+ recorder.CheckLanguage(TranslateHelperMetrics::UMA_HTML_LANG, 0, 0, 0);
+ TranslateHelperMetrics::ReportHtmlLang(std::string(), std::string());
+ recorder.CheckLanguage(TranslateHelperMetrics::UMA_HTML_LANG, 1, 0, 0);
+ TranslateHelperMetrics::ReportHtmlLang("ja_JP", "ja-JP");
+ recorder.CheckLanguage(TranslateHelperMetrics::UMA_HTML_LANG, 1, 0, 1);
+ TranslateHelperMetrics::ReportHtmlLang("en", "en");
+ recorder.CheckLanguage(TranslateHelperMetrics::UMA_HTML_LANG, 1, 1, 1);
}
TEST(TranslateHelperMetricsTest, ReportLanguageVerification) {
diff --git a/chrome/renderer/translate/translate_helper_unittest.cc b/chrome/renderer/translate/translate_helper_unittest.cc
index e1d1175..f92a815 100644
--- a/chrome/renderer/translate/translate_helper_unittest.cc
+++ b/chrome/renderer/translate/translate_helper_unittest.cc
@@ -90,8 +90,9 @@ TEST_F(TranslateHelperTest, CLDDisagreeWithWrongLanguageCode) {
std::string cld_language;
bool is_cld_reliable;
std::string language =
- TranslateHelper::DeterminePageLanguage(std::string("ja"), contents,
- &cld_language, &is_cld_reliable);
+ TranslateHelper::DeterminePageLanguage(std::string("ja"), std::string(),
+ contents, &cld_language,
+ &is_cld_reliable);
EXPECT_EQ(chrome::kUnknownLanguageCode, language);
EXPECT_EQ("en", cld_language);
EXPECT_TRUE(is_cld_reliable);
@@ -108,7 +109,8 @@ TEST_F(TranslateHelperTest, CLDAgreeWithLanguageCodeHavingCountryCode) {
std::string cld_language;
bool is_cld_reliable;
std::string language =
- TranslateHelper::DeterminePageLanguage(std::string("en-US"), contents,
+ TranslateHelper::DeterminePageLanguage(std::string("en-US"),
+ std::string(), contents,
&cld_language, &is_cld_reliable);
EXPECT_EQ("en-US", language);
EXPECT_EQ("en", cld_language);
@@ -126,9 +128,30 @@ TEST_F(TranslateHelperTest, InvalidLanguageMetaTagProviding) {
std::string cld_language;
bool is_cld_reliable;
std::string language =
- TranslateHelper::DeterminePageLanguage(std::string("utf-8"), contents,
+ TranslateHelper::DeterminePageLanguage(std::string("utf-8"),
+ std::string(), contents,
&cld_language, &is_cld_reliable);
EXPECT_EQ("en", language);
EXPECT_EQ("en", cld_language);
EXPECT_TRUE(is_cld_reliable);
}
+
+// Tests that the language meta tag providing wrong information is ignored
+// because of valid html lang attribute.
+TEST_F(TranslateHelperTest, AdoptHtmlLang) {
+ string16 contents = ASCIIToUTF16(
+ "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>"
+ "</head><body>This is a page apparently written in English. Even though "
+ "content-language is provided, the value will be ignored if the value "
+ "is suspicious.</body></html>");
+ std::string cld_language;
+ bool is_cld_reliable;
+ std::string language =
+ TranslateHelper::DeterminePageLanguage(std::string("ja"),
+ std::string("en"),
+ contents, &cld_language,
+ &is_cld_reliable);
+ EXPECT_EQ("en", language);
+ EXPECT_EQ("en", cld_language);
+ EXPECT_TRUE(is_cld_reliable);
+}