diff options
author | droger@chromium.org <droger@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-07-11 13:54:22 +0000 |
---|---|---|
committer | droger@chromium.org <droger@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-07-11 13:54:22 +0000 |
commit | d7575c2d6ed0a3459a539ba9c63c26e39e335f5b (patch) | |
tree | f405169b772c4a73e3081b385bbfdd583c324e82 /chrome/common | |
parent | 2dda4b49e79423b7d9ba22e7b6b5a300adb11857 (diff) | |
download | chromium_src-d7575c2d6ed0a3459a539ba9c63c26e39e335f5b.zip chromium_src-d7575c2d6ed0a3459a539ba9c63c26e39e335f5b.tar.gz chromium_src-d7575c2d6ed0a3459a539ba9c63c26e39e335f5b.tar.bz2 |
Move language detection to chrome/common/.
This CL moves the language detection code from chrome/renderer/translate
to chrome/common/translate, in order to be able to use it on iOS.
This CL also enables the related unittests on iOS.
BUG=
Review URL: https://chromiumcodereview.appspot.com/18911002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@211108 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/common')
-rw-r--r-- | chrome/common/DEPS | 1 | ||||
-rw-r--r-- | chrome/common/translate/language_detection_util.cc | 304 | ||||
-rw-r--r-- | chrome/common/translate/language_detection_util.h | 44 | ||||
-rw-r--r-- | chrome/common/translate/language_detection_util_unittest.cc | 158 | ||||
-rw-r--r-- | chrome/common/translate/translate_common_metrics.cc | 146 | ||||
-rw-r--r-- | chrome/common/translate/translate_common_metrics.h | 108 | ||||
-rw-r--r-- | chrome/common/translate/translate_common_metrics_unittest.cc | 304 |
7 files changed, 1065 insertions, 0 deletions
diff --git a/chrome/common/DEPS b/chrome/common/DEPS index a066e70..52aad28 100644 --- a/chrome/common/DEPS +++ b/chrome/common/DEPS @@ -30,6 +30,7 @@ include_rules = [ # Other libraries. "+chrome/third_party/xdg_user_dirs", "+third_party/bzip2", + "+third_party/cld/encodings/compact_lang_det/win", "+third_party/mt19937ar", "+third_party/npapi", "+third_party/re2", diff --git a/chrome/common/translate/language_detection_util.cc b/chrome/common/translate/language_detection_util.cc new file mode 100644 index 0000000..9710614 --- /dev/null +++ b/chrome/common/translate/language_detection_util.cc @@ -0,0 +1,304 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/common/translate/language_detection_util.h" + +#include "base/logging.h" +#include "base/strings/string_split.h" +#include "base/strings/string_util.h" +#include "base/time/time.h" +#include "chrome/common/chrome_constants.h" +#include "chrome/common/translate/translate_common_metrics.h" +#include "chrome/common/translate/translate_util.h" + +#if defined(ENABLE_LANGUAGE_DETECTION) +#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" +#endif + +namespace { + +// Similar language code list. Some languages are very similar and difficult +// for CLD to distinguish. +struct SimilarLanguageCode { + const char* const code; + int group; +}; + +const SimilarLanguageCode kSimilarLanguageCodes[] = { + {"bs", 1}, + {"hr", 1}, + {"hi", 2}, + {"ne", 2}, +}; + +// Checks |kSimilarLanguageCodes| and returns group code. 
+int GetSimilarLanguageGroupCode(const std::string& language) { + for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) { + if (language.find(kSimilarLanguageCodes[i].code) != 0) + continue; + return kSimilarLanguageCodes[i].group; + } + return 0; +} + +// Well-known languages which often have wrong server configuration of +// Content-Language: en. +// TODO(toyoshim): Remove these static tables and caller functions to +// chrome/common/translate, and implement them as std::set<>. +const char* kWellKnownCodesOnWrongConfiguration[] = { + "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" +}; + +// Applies a series of language code modification in proper order. +void ApplyLanguageCodeCorrection(std::string* code) { + // Correct well-known format errors. + LanguageDetectionUtil::CorrectLanguageCodeTypo(code); + + if (!LanguageDetectionUtil::IsValidLanguageCode(*code)) { + *code = std::string(); + return; + } + + TranslateUtil::ToTranslateLanguageSynonym(code); +} + +#if defined(ENABLE_LANGUAGE_DETECTION) +// Returns the ISO 639 language code of the specified |text|, or 'unknown' if it +// failed. +// |is_cld_reliable| will be set as true if CLD says the detection is reliable. +std::string DetermineTextLanguage(const base::string16& text, + bool* is_cld_reliable) { + std::string language = chrome::kUnknownLanguageCode; + int num_languages = 0; + int text_bytes = 0; + bool is_reliable = false; + Language cld_language = + DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, + &num_languages, NULL, &text_bytes); + if (is_cld_reliable != NULL) + *is_cld_reliable = is_reliable; + + // We don't trust the result if the CLD reports that the detection is not + // reliable, or if the actual text used to detect the language was less than + // 100 bytes (short texts can often lead to wrong results). + // TODO(toyoshim): CLD provides |is_reliable| flag. 
But, it just says that + // the determined language code is correct with 50% confidence. Chrome should + // handle the real confidence value to judge. + if (is_reliable && text_bytes >= 100 && cld_language != NUM_LANGUAGES && + cld_language != UNKNOWN_LANGUAGE && cld_language != TG_UNKNOWN_LANGUAGE) { + // We should not use LanguageCode_ISO_639_1 because it does not cover all + // the languages CLD can detect. As a result, it'll return the invalid + // language code for traditional Chinese among others. + // |LanguageCodeWithDialects| will go through ISO 639-1, ISO-639-2 and + // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN + // for Simplified Chinese. + language = LanguageCodeWithDialects(cld_language); + } + VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text + << "\n*************************************\n"; + return language; +} +#endif // defined(ENABLE_LANGUAGE_DETECTION) + +// Checks if CLD can complement a sub code when the page language doesn't know +// the sub code. +bool CanCLDComplementSubCode( + const std::string& page_language, const std::string& cld_language) { + // Translate server cannot treat general Chinese. If Content-Language and + // CLD agree that the language is Chinese and Content-Language doesn't know + // which dialect is used, CLD language has priority. + // TODO(hajimehoshi): How about the other dialects like zh-MO? 
+ return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); +} + +} // namespace + +namespace LanguageDetectionUtil { + +std::string DeterminePageLanguage(const std::string& code, + const std::string& html_lang, + const base::string16& contents, + std::string* cld_language_p, + bool* is_cld_reliable_p) { +#if defined(ENABLE_LANGUAGE_DETECTION) + base::TimeTicks begin_time = base::TimeTicks::Now(); + bool is_cld_reliable; + std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); + TranslateCommonMetrics::ReportLanguageDetectionTime(begin_time, + base::TimeTicks::Now()); + + if (cld_language_p != NULL) + *cld_language_p = cld_language; + if (is_cld_reliable_p != NULL) + *is_cld_reliable_p = is_cld_reliable; + TranslateUtil::ToTranslateLanguageSynonym(&cld_language); +#endif // defined(ENABLE_LANGUAGE_DETECTION) + + // Check if html lang attribute is valid. + std::string modified_html_lang; + if (!html_lang.empty()) { + modified_html_lang = html_lang; + ApplyLanguageCodeCorrection(&modified_html_lang); + TranslateCommonMetrics::ReportHtmlLang(html_lang, modified_html_lang); + VLOG(9) << "html lang based language code: " << modified_html_lang; + } + + // Check if Content-Language is valid. + std::string modified_code; + if (!code.empty()) { + modified_code = code; + ApplyLanguageCodeCorrection(&modified_code); + TranslateCommonMetrics::ReportContentLanguage(code, modified_code); + } + + // Adopt |modified_html_lang| if it is valid. Otherwise, adopt + // |modified_code|. + std::string language = modified_html_lang.empty() ? modified_code : + modified_html_lang; + +#if defined(ENABLE_LANGUAGE_DETECTION) + // If |language| is empty, just use CLD result even though it might be + // chrome::kUnknownLanguageCode. 
+ if (language.empty()) { + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); + return cld_language; + } + + if (cld_language == chrome::kUnknownLanguageCode) { + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN); + return language; + } else if (IsSameOrSimilarLanguages(language, cld_language)) { + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE); + return language; + } else if (MaybeServerWrongConfiguration(language, cld_language)) { + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD); + return cld_language; + } else if (CanCLDComplementSubCode(language, cld_language)) { + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); + return cld_language; + } else { + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); + // Content-Language value might be wrong because CLD says that this page + // is written in another language with confidence. + // In this case, Chrome doesn't rely on any of the language codes, and + // gives up suggesting a translation. + return std::string(chrome::kUnknownLanguageCode); + } +#else // defined(ENABLE_LANGUAGE_DETECTION) + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISABLED); +#endif // defined(ENABLE_LANGUAGE_DETECTION) + + return language; +} + +void CorrectLanguageCodeTypo(std::string* code) { + DCHECK(code); + + size_t coma_index = code->find(','); + if (coma_index != std::string::npos) { + // There are more than 1 language specified, just keep the first one. 
+ *code = code->substr(0, coma_index); + } + TrimWhitespaceASCII(*code, TRIM_ALL, code); + + // An underscore instead of a dash is a frequent mistake. + size_t underscore_index = code->find('_'); + if (underscore_index != std::string::npos) + (*code)[underscore_index] = '-'; + + // Change everything up to a dash to lower-case and everything after to upper. + size_t dash_index = code->find('-'); + if (dash_index != std::string::npos) { + *code = StringToLowerASCII(code->substr(0, dash_index)) + + StringToUpperASCII(code->substr(dash_index)); + } else { + *code = StringToLowerASCII(*code); + } +} + +bool IsValidLanguageCode(const std::string& code) { + // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/. + // TODO(hajimehoshi): How about es-419, which is used as an Accept language? + std::vector<std::string> chunks; + base::SplitString(code, '-', &chunks); + + if (chunks.size() < 1 || 2 < chunks.size()) + return false; + + const std::string& main_code = chunks[0]; + + if (main_code.size() < 1 || 3 < main_code.size()) + return false; + + for (std::string::const_iterator it = main_code.begin(); + it != main_code.end(); ++it) { + if (!IsAsciiAlpha(*it)) + return false; + } + + if (chunks.size() == 1) + return true; + + const std::string& sub_code = chunks[1]; + + if (sub_code.size() != 2) + return false; + + for (std::string::const_iterator it = sub_code.begin(); + it != sub_code.end(); ++it) { + if (!IsAsciiAlpha(*it)) + return false; + } + + return true; +} + +bool IsSameOrSimilarLanguages(const std::string& page_language, + const std::string& cld_language) { + // Language code part of |page_language| is matched to one of |cld_language|. + // Country code is ignored here. + if (page_language.size() >= 2 && + cld_language.find(page_language.c_str(), 0, 2) == 0) { + // Languages are matched strictly. Reports false to metrics, but returns + // true. 
+ TranslateCommonMetrics::ReportSimilarLanguageMatch(false); + return true; + } + + // Check if |page_language| and |cld_language| are in the similar language + // list and belong to the same language group. + int page_code = GetSimilarLanguageGroupCode(page_language); + bool match = page_code != 0 && + page_code == GetSimilarLanguageGroupCode(cld_language); + + TranslateCommonMetrics::ReportSimilarLanguageMatch(match); + return match; +} + +bool MaybeServerWrongConfiguration(const std::string& page_language, + const std::string& cld_language) { + // If |page_language| is not "en-*", respect it and just return false here. + if (!StartsWithASCII(page_language, "en", false)) + return false; + + // A server provides a language meta information representing "en-*". But it + // might be just a default value due to missing user configuration. + // Let's trust |cld_language| if the determined language is not difficult to + // distinguish from English, and the language is one of well-known languages + // which often provide "en-*" meta information mistakenly. + for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { + if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) + return true; + } + return false; +} + +} // namespace LanguageDetectionUtil diff --git a/chrome/common/translate/language_detection_util.h b/chrome/common/translate/language_detection_util.h new file mode 100644 index 0000000..787c0781 --- /dev/null +++ b/chrome/common/translate/language_detection_util.h @@ -0,0 +1,44 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef CHROME_COMMON_TRANSLATE_LANGUAGE_DETECTION_UTIL_H_ +#define CHROME_COMMON_TRANSLATE_LANGUAGE_DETECTION_UTIL_H_ + +#include <string> + +#include "base/strings/string16.h" + +namespace LanguageDetectionUtil { + +// Determines content page language from Content-Language code and contents. +std::string DeterminePageLanguage(const std::string& code, + const std::string& html_lang, + const base::string16& contents, + std::string* cld_language, + bool* is_cld_reliable); + +// Corrects language code if it contains well-known mistakes. +// Called only by tests. +void CorrectLanguageCodeTypo(std::string* code); + +// Checks if the language code's format is valid. +// Called only by tests. +bool IsValidLanguageCode(const std::string& code); + +// Checks if languages are matched, or similar. This function returns true +// against a language pair containing a language which is difficult for CLD to +// distinguish. +// Called only by tests. +bool IsSameOrSimilarLanguages(const std::string& page_language, + const std::string& cld_language); + +// Checks if languages pair is one of well-known pairs of wrong server +// configuration. +// Called only by tests. +bool MaybeServerWrongConfiguration(const std::string& page_language, + const std::string& cld_language); + +} // namespace LanguageDetectionUtil + +#endif // CHROME_COMMON_TRANSLATE_LANGUAGE_DETECTION_UTIL_H_ diff --git a/chrome/common/translate/language_detection_util_unittest.cc b/chrome/common/translate/language_detection_util_unittest.cc new file mode 100644 index 0000000..4f8dbf3 --- /dev/null +++ b/chrome/common/translate/language_detection_util_unittest.cc @@ -0,0 +1,158 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "chrome/common/translate/language_detection_util.h" + +#include "base/strings/string16.h" +#include "base/strings/utf_string_conversions.h" +#include "chrome/common/chrome_constants.h" +#include "testing/gtest/include/gtest/gtest.h" + +typedef testing::Test LanguageDetectionUtilTest; + +// Tests that well-known language code typos are fixed. +TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) { + std::string language; + + // Strip the second and later codes. + language = std::string("ja,en"); + LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); + EXPECT_EQ("ja", language); + + // Replace underscore with hyphen. + language = std::string("ja_JP"); + LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); + EXPECT_EQ("ja-JP", language); + + // Correct wrong cases. + language = std::string("JA-jp"); + LanguageDetectionUtil::CorrectLanguageCodeTypo(&language); + EXPECT_EQ("ja-JP", language); +} + +// Tests that the format of language codes is validated. +TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) { + std::string language; + + language = std::string("ja"); + EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); + + language = std::string("ja-JP"); + EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); + + language = std::string("ceb"); + EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); + + language = std::string("ceb-XX"); + EXPECT_TRUE(LanguageDetectionUtil::IsValidLanguageCode(language)); + + // Invalid because the sub code consists of a number. + language = std::string("utf-8"); + EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); + + // Invalid because of six characters after hyphen. + language = std::string("ja-YUKARI"); + EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); + + // Invalid because of four characters. 
+ language = std::string("DHMO"); + EXPECT_FALSE(LanguageDetectionUtil::IsValidLanguageCode(language)); +} + +// Tests that the similar language table works. +TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) { + EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "en")); + EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("en", "ja")); + EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hr")); + EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("sr-ME", "sr")); + EXPECT_TRUE(LanguageDetectionUtil::IsSameOrSimilarLanguages("ne", "hi")); + EXPECT_FALSE(LanguageDetectionUtil::IsSameOrSimilarLanguages("bs", "hi")); +} + +// Tests that well-known languages which often have wrong server configuration +// are handled. +TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) { + EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", "ja")); + EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en-US", + "ja")); + EXPECT_TRUE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", + "zh-CN")); + EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("ja", + "en")); + EXPECT_FALSE(LanguageDetectionUtil::MaybeServerWrongConfiguration("en", + "he")); +} + +// Tests that the language meta tag providing wrong information is ignored by +// LanguageDetectionUtil due to disagreement between meta tag and CLD. +TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) { + base::string16 contents = ASCIIToUTF16( + "<html><head><meta http-equiv='Content-Language' content='ja'></head>" + "<body>This is a page apparently written in English. 
Even though " + "content-language is provided, the value will be ignored if the value " + "is suspicious.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = LanguageDetectionUtil::DeterminePageLanguage( + std::string("ja"), std::string(), contents, &cld_language, + &is_cld_reliable); + EXPECT_EQ(chrome::kUnknownLanguageCode, language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} + +// Tests that the language meta tag providing "en-US" style information is +// agreed by CLD. +TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) { + base::string16 contents = ASCIIToUTF16( + "<html><head><meta http-equiv='Content-Language' content='en-US'></head>" + "<body>This is a page apparently written in English. Even though " + "content-language is provided, the value will be ignored if the value " + "is suspicious.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = LanguageDetectionUtil::DeterminePageLanguage( + std::string("en-US"), std::string(), contents, &cld_language, + &is_cld_reliable); + EXPECT_EQ("en-US", language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} + +// Tests that the language meta tag providing wrong information is ignored and +// CLD's language will be adopted by LanguageDetectionUtil due to an invalid +// meta tag. +TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) { + base::string16 contents = ASCIIToUTF16( + "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>" + "<body>This is a page apparently written in English. 
Even though " + "content-language is provided, the value will be ignored and CLD's" + " language will be adopted if the value is invalid.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = LanguageDetectionUtil::DeterminePageLanguage( + std::string("utf-8"), std::string(), contents, &cld_language, + &is_cld_reliable); + EXPECT_EQ("en", language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} + +// Tests that the language meta tag providing wrong information is ignored +// because of valid html lang attribute. +TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) { + base::string16 contents = ASCIIToUTF16( + "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>" + "</head><body>This is a page apparently written in English. Even though " + "content-language is provided, the value will be ignored if the value " + "is suspicious.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = LanguageDetectionUtil::DeterminePageLanguage( + std::string("ja"), std::string("en"), contents, &cld_language, + &is_cld_reliable); + EXPECT_EQ("en", language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} diff --git a/chrome/common/translate/translate_common_metrics.cc b/chrome/common/translate/translate_common_metrics.cc new file mode 100644 index 0000000..028be6d --- /dev/null +++ b/chrome/common/translate/translate_common_metrics.cc @@ -0,0 +1,146 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "chrome/common/translate/translate_common_metrics.h" + +#include "base/basictypes.h" +#include "base/metrics/histogram.h" + +namespace { + +// Constant string values to indicate UMA names. All entries should have +// a corresponding index in MetricsNameIndex and an entry in |kMetricsEntries|. 
+const char kRenderer4LanguageDetection[] = "Renderer4.LanguageDetection"; +const char kTranslateContentLanguage[] = "Translate.ContentLanguage"; +const char kTranslateHtmlLang[] = "Translate.HtmlLang"; +const char kTranslateLanguageVerification[] = "Translate.LanguageVerification"; +const char kTranslateTimeToBeReady[] = "Translate.TimeToBeReady"; +const char kTranslateTimeToLoad[] = "Translate.TimeToLoad"; +const char kTranslateTimeToTranslate[] = "Translate.TimeToTranslate"; +const char kTranslateUserActionDuration[] = "Translate.UserActionDuration"; +const char kTranslatePageScheme[] = "Translate.PageScheme"; +const char kTranslateSimilarLanguageMatch[] = "Translate.SimilarLanguageMatch"; + +const char kSchemeHttp[] = "http"; +const char kSchemeHttps[] = "https"; + +struct MetricsEntry { + TranslateCommonMetrics::MetricsNameIndex index; + const char* const name; +}; + +// This entry table should be updated when new UMA items are added. +const MetricsEntry kMetricsEntries[] = { + { TranslateCommonMetrics::UMA_LANGUAGE_DETECTION, + kRenderer4LanguageDetection }, + { TranslateCommonMetrics::UMA_CONTENT_LANGUAGE, + kTranslateContentLanguage }, + { TranslateCommonMetrics::UMA_HTML_LANG, + kTranslateHtmlLang }, + { TranslateCommonMetrics::UMA_LANGUAGE_VERIFICATION, + kTranslateLanguageVerification }, + { TranslateCommonMetrics::UMA_TIME_TO_BE_READY, + kTranslateTimeToBeReady }, + { TranslateCommonMetrics::UMA_TIME_TO_LOAD, + kTranslateTimeToLoad }, + { TranslateCommonMetrics::UMA_TIME_TO_TRANSLATE, + kTranslateTimeToTranslate }, + { TranslateCommonMetrics::UMA_USER_ACTION_DURATION, + kTranslateUserActionDuration }, + { TranslateCommonMetrics::UMA_PAGE_SCHEME, + kTranslatePageScheme }, + { TranslateCommonMetrics::UMA_SIMILAR_LANGUAGE_MATCH, + kTranslateSimilarLanguageMatch }, +}; + +COMPILE_ASSERT(arraysize(kMetricsEntries) == TranslateCommonMetrics::UMA_MAX, + arraysize_of_kMetricsEntries_should_be_UMA_MAX); + +TranslateCommonMetrics::LanguageCheckType 
GetLanguageCheckMetric( + const std::string& provided_code, + const std::string& revised_code) { + if (provided_code.empty()) + return TranslateCommonMetrics::LANGUAGE_NOT_PROVIDED; + else if (provided_code == revised_code) + return TranslateCommonMetrics::LANGUAGE_VALID; + return TranslateCommonMetrics::LANGUAGE_INVALID; +} + +} // namespace + +namespace TranslateCommonMetrics { + +void ReportContentLanguage(const std::string& provided_code, + const std::string& revised_code) { + UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage, + GetLanguageCheckMetric(provided_code, revised_code), + TranslateCommonMetrics::LANGUAGE_MAX); +} + +void ReportHtmlLang(const std::string& provided_code, + const std::string& revised_code) { + UMA_HISTOGRAM_ENUMERATION(kTranslateHtmlLang, + GetLanguageCheckMetric(provided_code, revised_code), + TranslateCommonMetrics::LANGUAGE_MAX); +} + +void ReportLanguageVerification(LanguageVerificationType type) { + UMA_HISTOGRAM_ENUMERATION(kTranslateLanguageVerification, + type, + LANGUAGE_VERIFICATION_MAX); +} + +void ReportTimeToBeReady(double time_in_msec) { + UMA_HISTOGRAM_MEDIUM_TIMES( + kTranslateTimeToBeReady, + base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0)); +} + +void ReportTimeToLoad(double time_in_msec) { + UMA_HISTOGRAM_MEDIUM_TIMES( + kTranslateTimeToLoad, + base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0)); +} + +void ReportTimeToTranslate(double time_in_msec) { + UMA_HISTOGRAM_MEDIUM_TIMES( + kTranslateTimeToTranslate, + base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0)); +} + +void ReportUserActionDuration(base::TimeTicks begin, base::TimeTicks end) { + UMA_HISTOGRAM_LONG_TIMES(kTranslateUserActionDuration, end - begin); +} + +void ReportPageScheme(const std::string& scheme) { + SchemeType type = SCHEME_OTHERS; + if (scheme == kSchemeHttp) + type = SCHEME_HTTP; + else if (scheme == kSchemeHttps) + type = SCHEME_HTTPS; + UMA_HISTOGRAM_ENUMERATION(kTranslatePageScheme, type, SCHEME_MAX); +} + +#if 
defined(ENABLE_LANGUAGE_DETECTION) + +void ReportLanguageDetectionTime(base::TimeTicks begin, base::TimeTicks end) { + UMA_HISTOGRAM_MEDIUM_TIMES(kRenderer4LanguageDetection, end - begin); +} + +#endif // defined(ENABLE_LANGUAGE_DETECTION) + +void ReportSimilarLanguageMatch(bool match) { + UMA_HISTOGRAM_BOOLEAN(kTranslateSimilarLanguageMatch, match); +} + +const char* GetMetricsName(MetricsNameIndex index) { + for (size_t i = 0; i < arraysize(kMetricsEntries); ++i) { + if (kMetricsEntries[i].index == index) + return kMetricsEntries[i].name; + } + NOTREACHED(); + return NULL; +} + +} // namespace TranslateCommonMetrics diff --git a/chrome/common/translate/translate_common_metrics.h b/chrome/common/translate/translate_common_metrics.h new file mode 100644 index 0000000..cfd6b14 --- /dev/null +++ b/chrome/common/translate/translate_common_metrics.h @@ -0,0 +1,108 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef CHROME_COMMON_TRANSLATE_TRANSLATE_COMMON_METRICS_H_ +#define CHROME_COMMON_TRANSLATE_TRANSLATE_COMMON_METRICS_H_ + +#include <string> + +#include "base/time/time.h" + +namespace TranslateCommonMetrics { + +// An indexing type to query each UMA entry name via GetMetricsName() function. +// Note: |kMetricsEntries| should be updated when a new entry is added here. +enum MetricsNameIndex { + UMA_LANGUAGE_DETECTION, + UMA_CONTENT_LANGUAGE, + UMA_HTML_LANG, + UMA_LANGUAGE_VERIFICATION, + UMA_TIME_TO_BE_READY, + UMA_TIME_TO_LOAD, + UMA_TIME_TO_TRANSLATE, + UMA_USER_ACTION_DURATION, + UMA_PAGE_SCHEME, + UMA_SIMILAR_LANGUAGE_MATCH, + UMA_MAX, +}; + +// A page may provide a Content-Language HTTP header or a META tag. +// TranslateHelper checks if a server provides a valid Content-Language. 
+enum LanguageCheckType { + LANGUAGE_NOT_PROVIDED, + LANGUAGE_VALID, + LANGUAGE_INVALID, + LANGUAGE_MAX, +}; + +// When a valid Content-Language is provided, TranslateHelper checks if a +// server provided Content-Language matches to a language CLD determined. +enum LanguageVerificationType { + LANGUAGE_VERIFICATION_CLD_DISABLED, + LANGUAGE_VERIFICATION_CLD_ONLY, + LANGUAGE_VERIFICATION_UNKNOWN, + LANGUAGE_VERIFICATION_CLD_AGREE, + LANGUAGE_VERIFICATION_CLD_DISAGREE, + LANGUAGE_VERIFICATION_TRUST_CLD, + LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE, + LANGUAGE_VERIFICATION_MAX, +}; + +// Scheme type of pages Chrome is going to translate. +enum SchemeType { + SCHEME_HTTP, + SCHEME_HTTPS, + SCHEME_OTHERS, + SCHEME_MAX, +}; + +// Called after TranslateHelper verifies a server providing Content-Language +// header. |provided_code| contains a Content-Language header value which a +// server provides. It can be empty string when a server doesn't provide it. +// |revised_code| is a value modified by format error corrector. +void ReportContentLanguage(const std::string& provided_code, + const std::string& revised_code); + +// Called after TranslateHelper verifies a page providing html lang attribute. +// |provided_code| contains a html lang attribute which a page provides. It can +// be empty string when a page doesn't provide it. |revised_code| is a value +// modified by format error corrector. +void ReportHtmlLang(const std::string& provided_code, + const std::string& revised_code); + +// Called when CLD verifies Content-Language header. +void ReportLanguageVerification(LanguageVerificationType type); + +// Called when the Translate Element library is ready. +void ReportTimeToBeReady(double time_in_msec); + +// Called when the Translate Element library is loaded. +void ReportTimeToLoad(double time_in_msec); + +// Called when a page translation is finished. +void ReportTimeToTranslate(double time_in_msec); + +// Called when a translation is triggered. 
+void ReportUserActionDuration(base::TimeTicks begin, base::TimeTicks end); + +// Called when a translation is triggered. +void ReportPageScheme(const std::string& scheme); + +#if defined(ENABLE_LANGUAGE_DETECTION) + +// Called when CLD detects page language. +void ReportLanguageDetectionTime(base::TimeTicks begin, base::TimeTicks end); + +#endif // defined(ENABLE_LANGUAGE_DETECTION) + +// Called when CLD agreed on a language which is different, but in the similar +// language list. +void ReportSimilarLanguageMatch(bool match); + +// Gets UMA name for an entry specified by |index|. +const char* GetMetricsName(MetricsNameIndex index); + +} // namespace TranslateCommonMetrics + +#endif // CHROME_COMMON_TRANSLATE_TRANSLATE_COMMON_METRICS_H_ diff --git a/chrome/common/translate/translate_common_metrics_unittest.cc b/chrome/common/translate/translate_common_metrics_unittest.cc new file mode 100644 index 0000000..4815cb1 --- /dev/null +++ b/chrome/common/translate/translate_common_metrics_unittest.cc @@ -0,0 +1,304 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "chrome/common/translate/translate_common_metrics.h" + +#include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "base/metrics/histogram.h" +#include "base/metrics/histogram_samples.h" +#include "base/metrics/statistics_recorder.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "testing/platform_test.h" + +using base::HistogramBase; +using base::HistogramSamples; +using base::SampleCountIterator; +using base::StatisticsRecorder; +using base::TimeTicks; + +namespace { + +const int kTrue = 1; +const int kFalse = 0; + +class MetricsRecorder { + public: + explicit MetricsRecorder(const char* key) : key_(key) { + StatisticsRecorder::Initialize(); + + HistogramBase* histogram = StatisticsRecorder::FindHistogram(key_); + if (histogram) + base_samples_ = histogram->SnapshotSamples(); + } + + void CheckLanguage(TranslateCommonMetrics::MetricsNameIndex index, + int expected_not_provided, + int expected_valid, + int expected_invalid) { + ASSERT_EQ(TranslateCommonMetrics::GetMetricsName(index), key_); + + Snapshot(); + + EXPECT_EQ(expected_not_provided, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_NOT_PROVIDED)); + EXPECT_EQ(expected_valid, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_VALID)); + EXPECT_EQ(expected_invalid, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_INVALID)); + } + + void CheckLanguageVerification(int expected_cld_disabled, + int expected_cld_only, + int expected_unknown, + int expected_cld_agree, + int expected_cld_disagree, + int expected_trust_cld, + int expected_cld_complement_sub_code) { + ASSERT_EQ(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_LANGUAGE_VERIFICATION), key_); + + Snapshot(); + + EXPECT_EQ( + expected_cld_disabled, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISABLED)); + EXPECT_EQ( + expected_cld_only, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY)); 
+ EXPECT_EQ( + expected_unknown, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN)); + EXPECT_EQ( + expected_cld_agree, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE)); + EXPECT_EQ( + expected_cld_disagree, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE)); + EXPECT_EQ( + expected_trust_cld, + GetCountWithoutSnapshot( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD)); + EXPECT_EQ( + expected_cld_complement_sub_code, + GetCountWithoutSnapshot( + TranslateCommonMetrics:: + LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE)); + } + + void CheckScheme(int expected_http, int expected_https, int expected_others) { + ASSERT_EQ(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_PAGE_SCHEME), key_); + + Snapshot(); + + EXPECT_EQ(expected_http, + GetCountWithoutSnapshot(TranslateCommonMetrics::SCHEME_HTTP)); + EXPECT_EQ(expected_https, + GetCountWithoutSnapshot(TranslateCommonMetrics::SCHEME_HTTPS)); + EXPECT_EQ(expected_others, + GetCountWithoutSnapshot(TranslateCommonMetrics::SCHEME_OTHERS)); + } + + void CheckTotalCount(int count) { + Snapshot(); + EXPECT_EQ(count, GetTotalCount()); + } + + void CheckValueInLogs(double value) { + Snapshot(); + ASSERT_TRUE(samples_.get()); + for (scoped_ptr<SampleCountIterator> i = samples_->Iterator(); + !i->Done(); + i->Next()) { + HistogramBase::Sample min; + HistogramBase::Sample max; + HistogramBase::Count count; + i->Get(&min, &max, &count); + if (min <= value && value <= max && count >= 1) + return; + } + EXPECT_FALSE(true); + } + + HistogramBase::Count GetCount(HistogramBase::Sample value) { + Snapshot(); + return GetCountWithoutSnapshot(value); + } + + private: + void Snapshot() { + HistogramBase* histogram = StatisticsRecorder::FindHistogram(key_); + if (!histogram) + return; + samples_ = histogram->SnapshotSamples(); + } + + HistogramBase::Count GetCountWithoutSnapshot(HistogramBase::Sample 
value) { + if (!samples_.get()) + return 0; + HistogramBase::Count count = samples_->GetCount(value); + if (!base_samples_.get()) + return count; + return count - base_samples_->GetCount(value); + } + + HistogramBase::Count GetTotalCount() { + if (!samples_.get()) + return 0; + HistogramBase::Count count = samples_->TotalCount(); + if (!base_samples_.get()) + return count; + return count - base_samples_->TotalCount(); + } + + std::string key_; + scoped_ptr<HistogramSamples> base_samples_; + scoped_ptr<HistogramSamples> samples_; + + DISALLOW_COPY_AND_ASSIGN(MetricsRecorder); +}; + +} // namespace + +TEST(TranslateCommonMetricsTest, ReportContentLanguage) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_CONTENT_LANGUAGE)); + + recorder.CheckLanguage(TranslateCommonMetrics::UMA_CONTENT_LANGUAGE, 0, 0, 0); + TranslateCommonMetrics::ReportContentLanguage(std::string(), std::string()); + recorder.CheckLanguage(TranslateCommonMetrics::UMA_CONTENT_LANGUAGE, 1, 0, 0); + TranslateCommonMetrics::ReportContentLanguage("ja_JP", "ja-JP"); + recorder.CheckLanguage(TranslateCommonMetrics::UMA_CONTENT_LANGUAGE, 1, 0, 1); + TranslateCommonMetrics::ReportContentLanguage("en", "en"); + recorder.CheckLanguage(TranslateCommonMetrics::UMA_CONTENT_LANGUAGE, 1, 1, 1); +} + +TEST(TranslateCommonMetricsTest, ReportHtmlLang) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_HTML_LANG)); + + recorder.CheckLanguage(TranslateCommonMetrics::UMA_HTML_LANG, 0, 0, 0); + TranslateCommonMetrics::ReportHtmlLang(std::string(), std::string()); + recorder.CheckLanguage(TranslateCommonMetrics::UMA_HTML_LANG, 1, 0, 0); + TranslateCommonMetrics::ReportHtmlLang("ja_JP", "ja-JP"); + recorder.CheckLanguage(TranslateCommonMetrics::UMA_HTML_LANG, 1, 0, 1); + TranslateCommonMetrics::ReportHtmlLang("en", "en"); + recorder.CheckLanguage(TranslateCommonMetrics::UMA_HTML_LANG, 1, 1, 1); +} + 
+TEST(TranslateCommonMetricsTest, ReportLanguageVerification) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_LANGUAGE_VERIFICATION)); + + recorder.CheckLanguageVerification(0, 0, 0, 0, 0, 0, 0); + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISABLED); + recorder.CheckLanguageVerification(1, 0, 0, 0, 0, 0, 0); + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_ONLY); + recorder.CheckLanguageVerification(1, 1, 0, 0, 0, 0, 0); + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_UNKNOWN); + recorder.CheckLanguageVerification(1, 1, 1, 0, 0, 0, 0); + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_AGREE); + recorder.CheckLanguageVerification(1, 1, 1, 1, 0, 0, 0); + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_DISAGREE); + recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 0, 0); + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_TRUST_CLD); + recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 1, 0); + TranslateCommonMetrics::ReportLanguageVerification( + TranslateCommonMetrics::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); + recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 1, 1); +} + +TEST(TranslateCommonMetricsTest, ReportTimeToBeReady) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_TIME_TO_BE_READY)); + recorder.CheckTotalCount(0); + TranslateCommonMetrics::ReportTimeToBeReady(3.14); + recorder.CheckValueInLogs(3.14); + recorder.CheckTotalCount(1); +} + +TEST(TranslateCommonMetricsTest, ReportTimeToLoad) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_TIME_TO_LOAD)); + 
recorder.CheckTotalCount(0); + TranslateCommonMetrics::ReportTimeToLoad(573.0); + recorder.CheckValueInLogs(573.0); + recorder.CheckTotalCount(1); +} + +TEST(TranslateCommonMetricsTest, ReportTimeToTranslate) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_TIME_TO_TRANSLATE)); + recorder.CheckTotalCount(0); + TranslateCommonMetrics::ReportTimeToTranslate(4649.0); + recorder.CheckValueInLogs(4649.0); + recorder.CheckTotalCount(1); +} + +TEST(TranslateCommonMetricsTest, ReportUserActionDuration) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_USER_ACTION_DURATION)); + recorder.CheckTotalCount(0); + TimeTicks begin = TimeTicks::Now(); + TimeTicks end = begin + base::TimeDelta::FromSeconds(3776); + TranslateCommonMetrics::ReportUserActionDuration(begin, end); + recorder.CheckValueInLogs(3776000.0); + recorder.CheckTotalCount(1); +} + +TEST(TranslateCommonMetricsTest, ReportPageScheme) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_PAGE_SCHEME)); + recorder.CheckScheme(0, 0, 0); + TranslateCommonMetrics::ReportPageScheme("http"); + recorder.CheckScheme(1, 0, 0); + TranslateCommonMetrics::ReportPageScheme("https"); + recorder.CheckScheme(1, 1, 0); + TranslateCommonMetrics::ReportPageScheme("ftp"); + recorder.CheckScheme(1, 1, 1); +} + +TEST(TranslateCommonMetricsTest, ReportSimilarLanguageMatch) { + MetricsRecorder recorder(TranslateCommonMetrics::GetMetricsName( + TranslateCommonMetrics::UMA_SIMILAR_LANGUAGE_MATCH)); + recorder.CheckTotalCount(0); + EXPECT_EQ(0, recorder.GetCount(kTrue)); + EXPECT_EQ(0, recorder.GetCount(kFalse)); + TranslateCommonMetrics::ReportSimilarLanguageMatch(true); + EXPECT_EQ(1, recorder.GetCount(kTrue)); + EXPECT_EQ(0, recorder.GetCount(kFalse)); + TranslateCommonMetrics::ReportSimilarLanguageMatch(false); + EXPECT_EQ(1, recorder.GetCount(kTrue)); + EXPECT_EQ(1, recorder.GetCount(kFalse)); +} 
#if defined(ENABLE_LANGUAGE_DETECTION)

// A 9009 microsecond interval is expected to be logged in a bucket covering
// 9.009, as the only new sample.
TEST(TranslateCommonMetricsTest, ReportLanguageDetectionTime) {
  MetricsRecorder metrics(TranslateCommonMetrics::GetMetricsName(
      TranslateCommonMetrics::UMA_LANGUAGE_DETECTION));
  metrics.CheckTotalCount(0);

  TimeTicks start = TimeTicks::Now();
  TimeTicks finish = start + base::TimeDelta::FromMicroseconds(9009);
  TranslateCommonMetrics::ReportLanguageDetectionTime(start, finish);

  metrics.CheckValueInLogs(9.009);
  metrics.CheckTotalCount(1);
}

#endif  // defined(ENABLE_LANGUAGE_DETECTION)