diff options
author | droger@chromium.org <droger@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-10-04 16:03:09 +0000 |
---|---|---|
committer | droger@chromium.org <droger@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-10-04 16:03:09 +0000 |
commit | 499e7c55f53e2a118c31fddd079456a3dbe8b35c (patch) | |
tree | 185e0a4be375f4b45276ea0472a0b472665ca1ba /components | |
parent | b38806a8ad4ecb73bfffaecfaab28a1dfe3a5402 (diff) | |
download | chromium_src-499e7c55f53e2a118c31fddd079456a3dbe8b35c.zip chromium_src-499e7c55f53e2a118c31fddd079456a3dbe8b35c.tar.gz chromium_src-499e7c55f53e2a118c31fddd079456a3dbe8b35c.tar.bz2 |
Move language detection to a component
Language detection is used from the renderer on most platforms, but from the
browser on iOS. This CL moves it from chrome/common/ to a new "translate"
component, which makes it easier to track and cleanly address dependency issues.
BUG=297777
Review URL: https://codereview.chromium.org/25531002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@227015 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'components')
21 files changed, 1536 insertions, 0 deletions
diff --git a/components/OWNERS b/components/OWNERS index 4c134eb..bcdeb28 100644 --- a/components/OWNERS +++ b/components/OWNERS @@ -41,6 +41,10 @@ per-file sessions.gypi=sky@chromium.org per-file tracing*=jbauman@chromium.org per-file tracing*=nduca@chromium.org +per-file translate.gypi=hajimehoshi@chromium.org +per-file translate.gypi=mad@chromium.org +per-file translate.gypi=toyoshim@chromium.org + per-file startup_metric_utils.gypi=jeremy@chromium.org per-file user_prefs.gypi=battre@chromium.org diff --git a/components/components.gyp b/components/components.gyp index 0bfe536..ea5af28 100644 --- a/components/components.gyp +++ b/components/components.gyp @@ -21,6 +21,7 @@ 'policy.gypi', 'sessions.gypi', 'startup_metric_utils.gypi', + 'translate.gypi', 'user_prefs.gypi', 'variations.gypi', 'visitedlink.gypi', diff --git a/components/components_tests.gypi b/components/components_tests.gypi index e5c1f59..ba6b5e2 100644 --- a/components/components_tests.gypi +++ b/components/components_tests.gypi @@ -24,6 +24,9 @@ 'navigation_interception/intercept_navigation_resource_throttle_unittest.cc', 'sessions/serialized_navigation_entry_unittest.cc', 'test/run_all_unittests.cc', + 'translate/common/translate_metrics_unittest.cc', + 'translate/common/translate_util_unittest.cc', + 'translate/language_detection/language_detection_util_unittest.cc', # TODO(asvitkine): These should be tested on iOS too. 'variations/entropy_provider_unittest.cc', 'variations/metrics_util_unittest.cc', @@ -74,6 +77,10 @@ 'sessions', 'sessions_test_support', + # Dependencies of translate. + 'translate_common', + 'translate_language_detection', + # Dependencies of variations 'variations', @@ -93,6 +100,7 @@ # http://crbug.com/303011. # TODO(asvitkine): Bring up varations/ unittests on iOS. 
['include', '^dom_distiller'], + ['include', '^translate'], ], 'dependencies!': [ 'autofill_core_common', diff --git a/components/translate.gypi b/components/translate.gypi new file mode 100644 index 0000000..7c19fe8 --- /dev/null +++ b/components/translate.gypi @@ -0,0 +1,57 @@ +# Copyright 2013 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +{ + 'targets': [ + { + 'target_name': 'translate_common', + 'type': 'static_library', + 'dependencies': [ + '../base/base.gyp:base', + '../url/url.gyp:url_lib', + ], + 'include_dirs': [ + '..', + ], + 'sources': [ + 'translate/common/translate_constants.cc', + 'translate/common/translate_constants.h', + 'translate/common/translate_metrics.cc', + 'translate/common/translate_metrics.h', + 'translate/common/translate_switches.cc', + 'translate/common/translate_switches.h', + 'translate/common/translate_util.cc', + 'translate/common/translate_util.h', + ], + }, + { + 'target_name': 'translate_language_detection', + 'type': 'static_library', + 'dependencies': [ + 'translate_common', + '../base/base.gyp:base', + '../url/url.gyp:url_lib', + ], + 'include_dirs': [ + '..', + ], + 'sources': [ + 'translate/language_detection/language_detection_util.cc', + 'translate/language_detection/language_detection_util.h', + ], + 'conditions': [ + ['cld_version==0 or cld_version==1', { + 'dependencies': [ + '<(DEPTH)/third_party/cld/cld.gyp:cld', + ], + }], + ['cld_version==0 or cld_version==2', { + 'dependencies': [ + '<(DEPTH)/third_party/cld_2/cld_2.gyp:cld_2', + ], + }], + ], + }, + ], +} diff --git a/components/translate/DEPS b/components/translate/DEPS new file mode 100644 index 0000000..8f4bfc0 --- /dev/null +++ b/components/translate/DEPS @@ -0,0 +1,4 @@ +include_rules = [ + # translate is used on iOS, which cannot depend on content. 
+ "-content", +] diff --git a/components/translate/OWNERS b/components/translate/OWNERS new file mode 100644 index 0000000..a214ab1 --- /dev/null +++ b/components/translate/OWNERS @@ -0,0 +1,3 @@ +hajimehoshi@chromium.org +mad@chromium.org +toyoshim@chromium.org diff --git a/components/translate/README b/components/translate/README new file mode 100644 index 0000000..076afe9 --- /dev/null +++ b/components/translate/README @@ -0,0 +1,8 @@ +- translate/language detection depends on the CLD library and should only be +used from the renderer to avoid bloating the DLLs on Windows. + +- Translate is not allowed to depend on content/, because it is used by iOS. +If dependences on content/ need to be added to Translate, it will have to be +made into a layered component: see +http://www.chromium.org/developers/design-documents/layered-components-design +for more information. diff --git a/components/translate/common/translate_constants.cc b/components/translate/common/translate_constants.cc new file mode 100644 index 0000000..a51bb93 --- /dev/null +++ b/components/translate/common/translate_constants.cc @@ -0,0 +1,11 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/translate/common/translate_constants.h" + +namespace translate { + +const char* const kUnknownLanguageCode = "und"; + +} // namespace translate diff --git a/components/translate/common/translate_constants.h b/components/translate/common/translate_constants.h new file mode 100644 index 0000000..81a6369 --- /dev/null +++ b/components/translate/common/translate_constants.h @@ -0,0 +1,16 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#ifndef COMPONENTS_TRANSLATE_COMMON_TRANSLATE_CONSTANTS_H_ +#define COMPONENTS_TRANSLATE_COMMON_TRANSLATE_CONSTANTS_H_ + +namespace translate { + +// The language code used when the language of a page could not be detected. +// (Matches what the CLD -Compact Language Detection- library reports.) +extern const char* const kUnknownLanguageCode; + +} // namespace translate + +#endif // COMPONENTS_TRANSLATE_COMMON_TRANSLATE_CONSTANTS_H_ diff --git a/components/translate/common/translate_metrics.cc b/components/translate/common/translate_metrics.cc new file mode 100644 index 0000000..b095084 --- /dev/null +++ b/components/translate/common/translate_metrics.cc @@ -0,0 +1,130 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/translate/common/translate_metrics.h" + +#include "base/basictypes.h" +#include "base/metrics/histogram.h" + +namespace translate { + +namespace { + +// Constant string values to indicate UMA names. All entries should have +// a corresponding index in MetricsNameIndex and an entry in |kMetricsEntries|. 
+const char kRenderer4LanguageDetection[] = "Renderer4.LanguageDetection"; +const char kTranslateContentLanguage[] = "Translate.ContentLanguage"; +const char kTranslateHtmlLang[] = "Translate.HtmlLang"; +const char kTranslateLanguageVerification[] = "Translate.LanguageVerification"; +const char kTranslateTimeToBeReady[] = "Translate.TimeToBeReady"; +const char kTranslateTimeToLoad[] = "Translate.TimeToLoad"; +const char kTranslateTimeToTranslate[] = "Translate.TimeToTranslate"; +const char kTranslateUserActionDuration[] = "Translate.UserActionDuration"; +const char kTranslatePageScheme[] = "Translate.PageScheme"; +const char kTranslateSimilarLanguageMatch[] = "Translate.SimilarLanguageMatch"; + +const char kSchemeHttp[] = "http"; +const char kSchemeHttps[] = "https"; + +struct MetricsEntry { + MetricsNameIndex index; + const char* const name; +}; + +// This entry table should be updated when new UMA items are added. +const MetricsEntry kMetricsEntries[] = { + {UMA_LANGUAGE_DETECTION, kRenderer4LanguageDetection}, + {UMA_CONTENT_LANGUAGE, kTranslateContentLanguage}, + {UMA_HTML_LANG, kTranslateHtmlLang}, + {UMA_LANGUAGE_VERIFICATION, kTranslateLanguageVerification}, + {UMA_TIME_TO_BE_READY, kTranslateTimeToBeReady}, + {UMA_TIME_TO_LOAD, kTranslateTimeToLoad}, + {UMA_TIME_TO_TRANSLATE, kTranslateTimeToTranslate}, + {UMA_USER_ACTION_DURATION, kTranslateUserActionDuration}, + {UMA_PAGE_SCHEME, kTranslatePageScheme}, + {UMA_SIMILAR_LANGUAGE_MATCH, kTranslateSimilarLanguageMatch}, }; + +COMPILE_ASSERT(arraysize(kMetricsEntries) == UMA_MAX, + arraysize_of_kMetricsEntries_should_be_UMA_MAX); + +LanguageCheckType GetLanguageCheckMetric(const std::string& provided_code, + const std::string& revised_code) { + if (provided_code.empty()) + return LANGUAGE_NOT_PROVIDED; + else if (provided_code == revised_code) + return LANGUAGE_VALID; + return LANGUAGE_INVALID; +} + +} // namespace + +void ReportContentLanguage(const std::string& provided_code, + const std::string& 
revised_code) { + UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage, + GetLanguageCheckMetric(provided_code, revised_code), + LANGUAGE_MAX); +} + +void ReportHtmlLang(const std::string& provided_code, + const std::string& revised_code) { + UMA_HISTOGRAM_ENUMERATION(kTranslateHtmlLang, + GetLanguageCheckMetric(provided_code, revised_code), + LANGUAGE_MAX); +} + +void ReportLanguageVerification(LanguageVerificationType type) { + UMA_HISTOGRAM_ENUMERATION(kTranslateLanguageVerification, + type, + LANGUAGE_VERIFICATION_MAX); +} + +void ReportTimeToBeReady(double time_in_msec) { + UMA_HISTOGRAM_MEDIUM_TIMES( + kTranslateTimeToBeReady, + base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0)); +} + +void ReportTimeToLoad(double time_in_msec) { + UMA_HISTOGRAM_MEDIUM_TIMES( + kTranslateTimeToLoad, + base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0)); +} + +void ReportTimeToTranslate(double time_in_msec) { + UMA_HISTOGRAM_MEDIUM_TIMES( + kTranslateTimeToTranslate, + base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0)); +} + +void ReportUserActionDuration(base::TimeTicks begin, base::TimeTicks end) { + UMA_HISTOGRAM_LONG_TIMES(kTranslateUserActionDuration, end - begin); +} + +void ReportPageScheme(const std::string& scheme) { + SchemeType type = SCHEME_OTHERS; + if (scheme == kSchemeHttp) + type = SCHEME_HTTP; + else if (scheme == kSchemeHttps) + type = SCHEME_HTTPS; + UMA_HISTOGRAM_ENUMERATION(kTranslatePageScheme, type, SCHEME_MAX); +} + +void ReportLanguageDetectionTime(base::TimeTicks begin, base::TimeTicks end) { + UMA_HISTOGRAM_MEDIUM_TIMES(kRenderer4LanguageDetection, end - begin); +} + +void ReportSimilarLanguageMatch(bool match) { + UMA_HISTOGRAM_BOOLEAN(kTranslateSimilarLanguageMatch, match); +} + +const char* GetMetricsName(MetricsNameIndex index) { + for (size_t i = 0; i < arraysize(kMetricsEntries); ++i) { + if (kMetricsEntries[i].index == index) + return kMetricsEntries[i].name; + } + NOTREACHED(); + return NULL; +} + +} // namespace translate 
diff --git a/components/translate/common/translate_metrics.h b/components/translate/common/translate_metrics.h new file mode 100644 index 0000000..9baa268 --- /dev/null +++ b/components/translate/common/translate_metrics.h @@ -0,0 +1,104 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_TRANSLATE_COMMON_TRANSLATE_METRICS_H_ +#define COMPONENTS_TRANSLATE_COMMON_TRANSLATE_METRICS_H_ + +#include <string> + +#include "base/time/time.h" + +namespace translate { + +// An indexing type to query each UMA entry name via GetMetricsName() function. +// Note: |kMetricsEntries| should be updated when a new entry is added here. +enum MetricsNameIndex { + UMA_LANGUAGE_DETECTION, + UMA_CONTENT_LANGUAGE, + UMA_HTML_LANG, + UMA_LANGUAGE_VERIFICATION, + UMA_TIME_TO_BE_READY, + UMA_TIME_TO_LOAD, + UMA_TIME_TO_TRANSLATE, + UMA_USER_ACTION_DURATION, + UMA_PAGE_SCHEME, + UMA_SIMILAR_LANGUAGE_MATCH, + UMA_MAX, +}; + +// A page may provide a Content-Language HTTP header or a META tag. +// TranslateHelper checks if a server provides a valid Content-Language. +enum LanguageCheckType { + LANGUAGE_NOT_PROVIDED, + LANGUAGE_VALID, + LANGUAGE_INVALID, + LANGUAGE_MAX, +}; + +// When a valid Content-Language is provided, TranslateHelper checks if a +// server provided Content-Language matches to a language CLD determined. +enum LanguageVerificationType { + LANGUAGE_VERIFICATION_CLD_DISABLED, // obsolete + LANGUAGE_VERIFICATION_CLD_ONLY, + LANGUAGE_VERIFICATION_UNKNOWN, + LANGUAGE_VERIFICATION_CLD_AGREE, + LANGUAGE_VERIFICATION_CLD_DISAGREE, + LANGUAGE_VERIFICATION_TRUST_CLD, + LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE, + LANGUAGE_VERIFICATION_MAX, +}; + +// Scheme type of pages Chrome is going to translate. 
+enum SchemeType { + SCHEME_HTTP, + SCHEME_HTTPS, + SCHEME_OTHERS, + SCHEME_MAX, +}; + +// Called after TranslateHelper verifies a server providing Content-Language +// header. |provided_code| contains a Content-Language header value which a +// server provides. It can be empty string when a server doesn't provide it. +// |revised_code| is a value modified by format error corrector. +void ReportContentLanguage(const std::string& provided_code, + const std::string& revised_code); + +// Called after TranslateHelper verifies a page providing html lang attribute. +// |provided_code| contains a html lang attribute which a page provides. It can +// be empty string when a page doesn't provide it. |revised_code| is a value +// modified by format error corrector. +void ReportHtmlLang(const std::string& provided_code, + const std::string& revised_code); + +// Called when CLD verifies Content-Language header. +void ReportLanguageVerification(LanguageVerificationType type); + +// Called when the Translate Element library is ready. +void ReportTimeToBeReady(double time_in_msec); + +// Called when the Translate Element library is loaded. +void ReportTimeToLoad(double time_in_msec); + +// Called when a page translation is finished. +void ReportTimeToTranslate(double time_in_msec); + +// Called when a translation is triggered. +void ReportUserActionDuration(base::TimeTicks begin, base::TimeTicks end); + +// Called when a translation is triggered. +void ReportPageScheme(const std::string& scheme); + +// Called when CLD detects page language. +void ReportLanguageDetectionTime(base::TimeTicks begin, base::TimeTicks end); + +// Called when CLD agreed on a language which is different, but in the similar +// language list. +void ReportSimilarLanguageMatch(bool match); + +// Gets UMA name for an entry specified by |index|. 
+const char* GetMetricsName(MetricsNameIndex index); + +} // namespace translate + +#endif // COMPONENTS_TRANSLATE_COMMON_TRANSLATE_METRICS_H_ diff --git a/components/translate/common/translate_metrics_unittest.cc b/components/translate/common/translate_metrics_unittest.cc new file mode 100644 index 0000000..dac4b09 --- /dev/null +++ b/components/translate/common/translate_metrics_unittest.cc @@ -0,0 +1,284 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/translate/common/translate_metrics.h" + +#include "base/basictypes.h" +#include "base/memory/scoped_ptr.h" +#include "base/metrics/histogram.h" +#include "base/metrics/histogram_samples.h" +#include "base/metrics/statistics_recorder.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "testing/platform_test.h" + +using base::HistogramBase; +using base::HistogramSamples; +using base::SampleCountIterator; +using base::StatisticsRecorder; +using base::TimeTicks; + +namespace { + +const int kTrue = 1; +const int kFalse = 0; + +class MetricsRecorder { + public: + explicit MetricsRecorder(const char* key) : key_(key) { + StatisticsRecorder::Initialize(); + + HistogramBase* histogram = StatisticsRecorder::FindHistogram(key_); + if (histogram) + base_samples_ = histogram->SnapshotSamples(); + } + + void CheckLanguage(translate::MetricsNameIndex index, + int expected_not_provided, + int expected_valid, + int expected_invalid) { + ASSERT_EQ(translate::GetMetricsName(index), key_); + + Snapshot(); + + EXPECT_EQ(expected_not_provided, + GetCountWithoutSnapshot(translate::LANGUAGE_NOT_PROVIDED)); + EXPECT_EQ(expected_valid, + GetCountWithoutSnapshot(translate::LANGUAGE_VALID)); + EXPECT_EQ(expected_invalid, + GetCountWithoutSnapshot(translate::LANGUAGE_INVALID)); + } + + void CheckLanguageVerification(int expected_cld_disabled, + int expected_cld_only, + int expected_unknown, 
+ int expected_cld_agree, + int expected_cld_disagree, + int expected_trust_cld, + int expected_cld_complement_sub_code) { + ASSERT_EQ(translate::GetMetricsName(translate::UMA_LANGUAGE_VERIFICATION), + key_); + + Snapshot(); + + EXPECT_EQ( + expected_cld_disabled, + GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_CLD_DISABLED)); + EXPECT_EQ( + expected_cld_only, + GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_CLD_ONLY)); + EXPECT_EQ( + expected_unknown, + GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_UNKNOWN)); + EXPECT_EQ( + expected_cld_agree, + GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_CLD_AGREE)); + EXPECT_EQ( + expected_cld_disagree, + GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_CLD_DISAGREE)); + EXPECT_EQ( + expected_trust_cld, + GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_TRUST_CLD)); + EXPECT_EQ(expected_cld_complement_sub_code, + GetCountWithoutSnapshot( + translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE)); + } + + void CheckScheme(int expected_http, int expected_https, int expected_others) { + ASSERT_EQ(translate::GetMetricsName(translate::UMA_PAGE_SCHEME), key_); + + Snapshot(); + + EXPECT_EQ(expected_http, GetCountWithoutSnapshot(translate::SCHEME_HTTP)); + EXPECT_EQ(expected_https, GetCountWithoutSnapshot(translate::SCHEME_HTTPS)); + EXPECT_EQ(expected_others, + GetCountWithoutSnapshot(translate::SCHEME_OTHERS)); + } + + void CheckTotalCount(int count) { + Snapshot(); + EXPECT_EQ(count, GetTotalCount()); + } + + void CheckValueInLogs(double value) { + Snapshot(); + ASSERT_TRUE(samples_.get()); + for (scoped_ptr<SampleCountIterator> i = samples_->Iterator(); !i->Done(); + i->Next()) { + HistogramBase::Sample min; + HistogramBase::Sample max; + HistogramBase::Count count; + i->Get(&min, &max, &count); + if (min <= value && value <= max && count >= 1) + return; + } + EXPECT_FALSE(true); + } + + HistogramBase::Count GetCount(HistogramBase::Sample value) { + Snapshot(); + return 
GetCountWithoutSnapshot(value); + } + + private: + void Snapshot() { + HistogramBase* histogram = StatisticsRecorder::FindHistogram(key_); + if (!histogram) + return; + samples_ = histogram->SnapshotSamples(); + } + + HistogramBase::Count GetCountWithoutSnapshot(HistogramBase::Sample value) { + if (!samples_.get()) + return 0; + HistogramBase::Count count = samples_->GetCount(value); + if (!base_samples_.get()) + return count; + return count - base_samples_->GetCount(value); + } + + HistogramBase::Count GetTotalCount() { + if (!samples_.get()) + return 0; + HistogramBase::Count count = samples_->TotalCount(); + if (!base_samples_.get()) + return count; + return count - base_samples_->TotalCount(); + } + + std::string key_; + scoped_ptr<HistogramSamples> base_samples_; + scoped_ptr<HistogramSamples> samples_; + + DISALLOW_COPY_AND_ASSIGN(MetricsRecorder); +}; + +} // namespace + +TEST(TranslateMetricsTest, ReportContentLanguage) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_CONTENT_LANGUAGE)); + + recorder.CheckLanguage(translate::UMA_CONTENT_LANGUAGE, 0, 0, 0); + translate::ReportContentLanguage(std::string(), std::string()); + recorder.CheckLanguage(translate::UMA_CONTENT_LANGUAGE, 1, 0, 0); + translate::ReportContentLanguage("ja_JP", "ja-JP"); + recorder.CheckLanguage(translate::UMA_CONTENT_LANGUAGE, 1, 0, 1); + translate::ReportContentLanguage("en", "en"); + recorder.CheckLanguage(translate::UMA_CONTENT_LANGUAGE, 1, 1, 1); +} + +TEST(TranslateMetricsTest, ReportHtmlLang) { + MetricsRecorder recorder(translate::GetMetricsName(translate::UMA_HTML_LANG)); + + recorder.CheckLanguage(translate::UMA_HTML_LANG, 0, 0, 0); + translate::ReportHtmlLang(std::string(), std::string()); + recorder.CheckLanguage(translate::UMA_HTML_LANG, 1, 0, 0); + translate::ReportHtmlLang("ja_JP", "ja-JP"); + recorder.CheckLanguage(translate::UMA_HTML_LANG, 1, 0, 1); + translate::ReportHtmlLang("en", "en"); + recorder.CheckLanguage(translate::UMA_HTML_LANG, 1, 1, 
1); +} + +TEST(TranslateMetricsTest, ReportLanguageVerification) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_LANGUAGE_VERIFICATION)); + + recorder.CheckLanguageVerification(0, 0, 0, 0, 0, 0, 0); + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_DISABLED); + recorder.CheckLanguageVerification(1, 0, 0, 0, 0, 0, 0); + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_ONLY); + recorder.CheckLanguageVerification(1, 1, 0, 0, 0, 0, 0); + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_UNKNOWN); + recorder.CheckLanguageVerification(1, 1, 1, 0, 0, 0, 0); + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_AGREE); + recorder.CheckLanguageVerification(1, 1, 1, 1, 0, 0, 0); + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_DISAGREE); + recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 0, 0); + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_TRUST_CLD); + recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 1, 0); + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); + recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 1, 1); +} + +TEST(TranslateMetricsTest, ReportTimeToBeReady) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_TIME_TO_BE_READY)); + recorder.CheckTotalCount(0); + translate::ReportTimeToBeReady(3.14); + recorder.CheckValueInLogs(3.14); + recorder.CheckTotalCount(1); +} + +TEST(TranslateMetricsTest, ReportTimeToLoad) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_TIME_TO_LOAD)); + recorder.CheckTotalCount(0); + translate::ReportTimeToLoad(573.0); + recorder.CheckValueInLogs(573.0); + recorder.CheckTotalCount(1); +} + +TEST(TranslateMetricsTest, ReportTimeToTranslate) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_TIME_TO_TRANSLATE)); + 
recorder.CheckTotalCount(0); + translate::ReportTimeToTranslate(4649.0); + recorder.CheckValueInLogs(4649.0); + recorder.CheckTotalCount(1); +} + +TEST(TranslateMetricsTest, ReportUserActionDuration) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_USER_ACTION_DURATION)); + recorder.CheckTotalCount(0); + TimeTicks begin = TimeTicks::Now(); + TimeTicks end = begin + base::TimeDelta::FromSeconds(3776); + translate::ReportUserActionDuration(begin, end); + recorder.CheckValueInLogs(3776000.0); + recorder.CheckTotalCount(1); +} + +TEST(TranslateMetricsTest, ReportPageScheme) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_PAGE_SCHEME)); + recorder.CheckScheme(0, 0, 0); + translate::ReportPageScheme("http"); + recorder.CheckScheme(1, 0, 0); + translate::ReportPageScheme("https"); + recorder.CheckScheme(1, 1, 0); + translate::ReportPageScheme("ftp"); + recorder.CheckScheme(1, 1, 1); +} + +TEST(TranslateMetricsTest, ReportSimilarLanguageMatch) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_SIMILAR_LANGUAGE_MATCH)); + recorder.CheckTotalCount(0); + EXPECT_EQ(0, recorder.GetCount(kTrue)); + EXPECT_EQ(0, recorder.GetCount(kFalse)); + translate::ReportSimilarLanguageMatch(true); + EXPECT_EQ(1, recorder.GetCount(kTrue)); + EXPECT_EQ(0, recorder.GetCount(kFalse)); + translate::ReportSimilarLanguageMatch(false); + EXPECT_EQ(1, recorder.GetCount(kTrue)); + EXPECT_EQ(1, recorder.GetCount(kFalse)); +} + +TEST(TranslateMetricsTest, ReportLanguageDetectionTime) { + MetricsRecorder recorder( + translate::GetMetricsName(translate::UMA_LANGUAGE_DETECTION)); + recorder.CheckTotalCount(0); + TimeTicks begin = TimeTicks::Now(); + TimeTicks end = begin + base::TimeDelta::FromMicroseconds(9009); + translate::ReportLanguageDetectionTime(begin, end); + recorder.CheckValueInLogs(9.009); + recorder.CheckTotalCount(1); +} diff --git a/components/translate/common/translate_switches.cc 
b/components/translate/common/translate_switches.cc new file mode 100644 index 0000000..714bbfb --- /dev/null +++ b/components/translate/common/translate_switches.cc @@ -0,0 +1,14 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/translate/common/translate_switches.h" + +namespace translate { +namespace switches { + +// Overrides security-origin with which Translate runs in an isolated world. +const char kTranslateSecurityOrigin[] = "translate-security-origin"; + +} // namespace switches +} // namespace translate diff --git a/components/translate/common/translate_switches.h b/components/translate/common/translate_switches.h new file mode 100644 index 0000000..2b42bb5 --- /dev/null +++ b/components/translate/common/translate_switches.h @@ -0,0 +1,16 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_TRANSLATE_COMMON_TRANSLATE_SWITCHES_H_ +#define COMPONENTS_TRANSLATE_COMMON_TRANSLATE_SWITCHES_H_ + +namespace translate { +namespace switches { + +extern const char kTranslateSecurityOrigin[]; + +} // namespace switches +} // namespace translate + +#endif // COMPONENTS_TRANSLATE_COMMON_TRANSLATE_SWITCHES_H_ diff --git a/components/translate/common/translate_util.cc b/components/translate/common/translate_util.cc new file mode 100644 index 0000000..4e151d3 --- /dev/null +++ b/components/translate/common/translate_util.cc @@ -0,0 +1,125 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "components/translate/common/translate_util.h" + +#include "base/basictypes.h" +#include "base/command_line.h" +#include "base/logging.h" +#include "base/strings/string_split.h" +#include "components/translate/common/translate_switches.h" +#include "url/gurl.h" + +namespace { + +// Split the |language| into two parts. For example, if |language| is 'en-US', +// this will be split into the main part 'en' and the tail part '-US'. +void SplitIntoMainAndTail(const std::string& language, + std::string* main_part, + std::string* tail_part) { + DCHECK(main_part); + DCHECK(tail_part); + + std::vector<std::string> chunks; + base::SplitString(language, '-', &chunks); + if (chunks.size() == 0u) + return; + + *main_part = chunks[0]; + *tail_part = language.substr(main_part->size()); +} + +} // namespace + +namespace translate { + +struct LanguageCodePair { + // Code used in supporting list of Translate. + const char* const translate_language; + + // Code used in Chrome internal. + const char* const chrome_language; +}; + +// Some languages are treated as same languages in Translate even though they +// are different to be exact. +// +// If this table is updated, please sync this with the synonym table in +// chrome/browser/resources/options/language_options.js +const LanguageCodePair kLanguageCodeSimilitudes[] = { + {"no", "nb"}, + {"tl", "fil"}, +}; + +// Some languages have changed codes over the years and sometimes the older +// codes are used, so we must see them as synonyms. 
+// +// If this table is updated, please sync this with the synonym table in +// chrome/browser/resources/options/language_options.js +const LanguageCodePair kLanguageCodeSynonyms[] = { + {"iw", "he"}, + {"jw", "jv"}, +}; + +const char kSecurityOrigin[] = "https://translate.googleapis.com/"; + +void ToTranslateLanguageSynonym(std::string* language) { + for (size_t i = 0; i < arraysize(kLanguageCodeSimilitudes); ++i) { + if (*language == kLanguageCodeSimilitudes[i].chrome_language) { + *language = kLanguageCodeSimilitudes[i].translate_language; + return; + } + } + + std::string main_part, tail_part; + SplitIntoMainAndTail(*language, &main_part, &tail_part); + if (main_part.empty()) + return; + + // Apply liner search here because number of items in the list is just four. + for (size_t i = 0; i < arraysize(kLanguageCodeSynonyms); ++i) { + if (main_part == kLanguageCodeSynonyms[i].chrome_language) { + main_part = std::string(kLanguageCodeSynonyms[i].translate_language); + break; + } + } + + *language = main_part + tail_part; +} + +void ToChromeLanguageSynonym(std::string* language) { + for (size_t i = 0; i < arraysize(kLanguageCodeSimilitudes); ++i) { + if (*language == kLanguageCodeSimilitudes[i].translate_language) { + *language = kLanguageCodeSimilitudes[i].chrome_language; + return; + } + } + + std::string main_part, tail_part; + SplitIntoMainAndTail(*language, &main_part, &tail_part); + if (main_part.empty()) + return; + + // Apply liner search here because number of items in the list is just four. 
+ for (size_t i = 0; i < arraysize(kLanguageCodeSynonyms); ++i) { + if (main_part == kLanguageCodeSynonyms[i].translate_language) { + main_part = std::string(kLanguageCodeSynonyms[i].chrome_language); + break; + } + } + + *language = main_part + tail_part; +} + +GURL GetTranslateSecurityOrigin() { + std::string security_origin(kSecurityOrigin); + CommandLine* command_line = CommandLine::ForCurrentProcess(); + if (command_line->HasSwitch(switches::kTranslateSecurityOrigin)) { + security_origin = + command_line->GetSwitchValueASCII(switches::kTranslateSecurityOrigin); + } + return GURL(security_origin); +} + +} // namespace translate diff --git a/components/translate/common/translate_util.h b/components/translate/common/translate_util.h new file mode 100644 index 0000000..41b3496 --- /dev/null +++ b/components/translate/common/translate_util.h @@ -0,0 +1,28 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_TRANSLATE_COMMON_TRANSLATE_UTIL_H_ +#define COMPONENTS_TRANSLATE_COMMON_TRANSLATE_UTIL_H_ + +#include <string> + +class GURL; + +namespace translate { + +// Isolated world sets following security-origin by default. +extern const char kSecurityOrigin[]; + +// Converts language code synonym to use at Translate server. +void ToTranslateLanguageSynonym(std::string* language); + +// Converts language code synonym to use at Chrome internal. +void ToChromeLanguageSynonym(std::string* language); + +// Get Security origin with which Translate runs. 
+GURL GetTranslateSecurityOrigin(); + +} // namespace translate + +#endif // COMPONENTS_TRANSLATE_COMMON_TRANSLATE_UTIL_H_ diff --git a/components/translate/common/translate_util_unittest.cc b/components/translate/common/translate_util_unittest.cc new file mode 100644 index 0000000..f53996b --- /dev/null +++ b/components/translate/common/translate_util_unittest.cc @@ -0,0 +1,96 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/translate/common/translate_util.h" + +#include "base/command_line.h" +#include "components/translate/common/translate_switches.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "url/gurl.h" + +typedef testing::Test TranslateUtilTest; + +// Tests that synonym language code is converted to one used in supporting list. +TEST_F(TranslateUtilTest, ToTranslateLanguageSynonym) { + std::string language; + + language = std::string("nb"); + translate::ToTranslateLanguageSynonym(&language); + EXPECT_EQ("no", language); + + language = std::string("he"); + translate::ToTranslateLanguageSynonym(&language); + EXPECT_EQ("iw", language); + + language = std::string("jv"); + translate::ToTranslateLanguageSynonym(&language); + EXPECT_EQ("jw", language); + + language = std::string("fil"); + translate::ToTranslateLanguageSynonym(&language); + EXPECT_EQ("tl", language); + + // Preserve a sub code if the language has a synonym. + language = std::string("he-IL"); + translate::ToTranslateLanguageSynonym(&language); + EXPECT_EQ("iw-IL", language); + + // Don't preserve a sub code if the language has just a similitude. + language = std::string("nb-NO"); + translate::ToTranslateLanguageSynonym(&language); + EXPECT_EQ("nb-NO", language); + + // Preserve the argument if it doesn't have its synonym. 
+ language = std::string("en"); + translate::ToTranslateLanguageSynonym(&language); + EXPECT_EQ("en", language); +} + +// Tests that synonym language code is converted to one used in Chrome internal. +TEST_F(TranslateUtilTest, ToChromeLanguageSynonym) { + std::string language; + + language = std::string("no"); + translate::ToChromeLanguageSynonym(&language); + EXPECT_EQ("nb", language); + + language = std::string("iw"); + translate::ToChromeLanguageSynonym(&language); + EXPECT_EQ("he", language); + + language = std::string("jw"); + translate::ToChromeLanguageSynonym(&language); + EXPECT_EQ("jv", language); + + language = std::string("tl"); + translate::ToChromeLanguageSynonym(&language); + EXPECT_EQ("fil", language); + + // Preserve a sub code if the language has a synonym. + language = std::string("iw-IL"); + translate::ToChromeLanguageSynonym(&language); + EXPECT_EQ("he-IL", language); + + // Don't preserve a sub code if the language has just a similitude. + language = std::string("no-NO"); + translate::ToChromeLanguageSynonym(&language); + EXPECT_EQ("no-NO", language); + + // Preserve the argument if it doesn't have its synonym. 
+ language = std::string("en"); + translate::ToChromeLanguageSynonym(&language); + EXPECT_EQ("en", language); +} + +TEST_F(TranslateUtilTest, SecurityOrigin) { + GURL origin = translate::GetTranslateSecurityOrigin(); + EXPECT_EQ(std::string(translate::kSecurityOrigin), origin.spec()); + + const std::string running_origin("http://www.tamurayukari.com/"); + CommandLine* command_line = CommandLine::ForCurrentProcess(); + command_line->AppendSwitchASCII(translate::switches::kTranslateSecurityOrigin, + running_origin); + GURL modified_origin = translate::GetTranslateSecurityOrigin(); + EXPECT_EQ(running_origin, modified_origin.spec()); +} diff --git a/components/translate/language_detection/DEPS b/components/translate/language_detection/DEPS new file mode 100644 index 0000000..a8cdc15 --- /dev/null +++ b/components/translate/language_detection/DEPS @@ -0,0 +1,6 @@ +include_rules = [ + # CLD library. + "+third_party/cld", + "+third_party/cld/encodings/compact_lang_det/win", + "+third_party/cld_2/src", +] diff --git a/components/translate/language_detection/language_detection_util.cc b/components/translate/language_detection/language_detection_util.cc new file mode 100644 index 0000000..01910a4 --- /dev/null +++ b/components/translate/language_detection/language_detection_util.cc @@ -0,0 +1,401 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "components/translate/language_detection/language_detection_util.h" + +#include "base/logging.h" +#include "base/metrics/field_trial.h" +#include "base/strings/string_split.h" +#include "base/strings/string_util.h" +#include "base/strings/utf_string_conversions.h" +#include "base/time/time.h" +#include "components/translate/common/translate_constants.h" +#include "components/translate/common/translate_metrics.h" +#include "components/translate/common/translate_util.h" + +#if !defined(CLD_VERSION) || CLD_VERSION==1 +#include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h" +#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h" +#endif + +#if !defined(CLD_VERSION) || CLD_VERSION==2 +#include "third_party/cld_2/src/public/compact_lang_det.h" +#endif + +namespace { + +// Similar language code list. Some languages are very similar and difficult +// for CLD to distinguish. +struct SimilarLanguageCode { + const char* const code; + int group; +}; + +const SimilarLanguageCode kSimilarLanguageCodes[] = { + {"bs", 1}, + {"hr", 1}, + {"hi", 2}, + {"ne", 2}, +}; + +// Checks |kSimilarLanguageCodes| and returns group code. +int GetSimilarLanguageGroupCode(const std::string& language) { + for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) { + if (language.find(kSimilarLanguageCodes[i].code) != 0) + continue; + return kSimilarLanguageCodes[i].group; + } + return 0; +} + +// Well-known languages which often have wrong server configuration of +// Content-Language: en. +// TODO(toyoshim): Remove these static tables and caller functions to +// translate/common, and implement them as std::set<>. +const char* kWellKnownCodesOnWrongConfiguration[] = { + "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th" +}; + +// Applies a series of language code modification in proper order. +void ApplyLanguageCodeCorrection(std::string* code) { + // Correct well-known format errors. 
+ translate::CorrectLanguageCodeTypo(code); + + if (!translate::IsValidLanguageCode(*code)) { + *code = std::string(); + return; + } + + translate::ToTranslateLanguageSynonym(code); +} + +int GetCLDMajorVersion() { +#if !defined(CLD_VERSION) + std::string group_name = base::FieldTrialList::FindFullName("CLD1VsCLD2"); + if (group_name == "CLD2") + return 2; + else + return 1; +#else + return CLD_VERSION; +#endif +} + +// Returns the ISO 639 language code of the specified |text|, or 'unknown' if it +// failed. +// |is_cld_reliable| will be set as true if CLD says the detection is reliable. +std::string DetermineTextLanguage(const base::string16& text, + bool* is_cld_reliable) { + std::string language = translate::kUnknownLanguageCode; + int text_bytes = 0; + bool is_reliable = false; + + // Language or CLD2::Language + int cld_language = 0; + bool is_valid_language = false; + + switch (GetCLDMajorVersion()) { +#if !defined(CLD_VERSION) || CLD_VERSION==1 + case 1: { + int num_languages = 0; + cld_language = + DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable, + &num_languages, NULL, &text_bytes); + is_valid_language = cld_language != NUM_LANGUAGES && + cld_language != UNKNOWN_LANGUAGE && + cld_language != TG_UNKNOWN_LANGUAGE; + break; + } +#endif +#if !defined(CLD_VERSION) || CLD_VERSION==2 + case 2: { + std::string utf8_text(UTF16ToUTF8(text)); + CLD2::Language language3[3]; + int percent3[3]; + cld_language = CLD2::DetectLanguageSummary( + utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3, + &text_bytes, &is_reliable); + is_valid_language = cld_language != CLD2::NUM_LANGUAGES && + cld_language != CLD2::UNKNOWN_LANGUAGE && + cld_language != CLD2::TG_UNKNOWN_LANGUAGE; + break; + } +#endif + default: + NOTREACHED(); + } + + if (is_cld_reliable != NULL) + *is_cld_reliable = is_reliable; + + // We don't trust the result if the CLD reports that the detection is not + // reliable, or if the actual text used to detect the language was less 
than + // 100 bytes (short texts can often lead to wrong results). + // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that + // the determined language code is correct with 50% confidence. Chrome should + // handle the real confidence value to judge. + if (is_reliable && text_bytes >= 100 && is_valid_language) { + // We should not use LanguageCode_ISO_639_1 because it does not cover all + // the languages CLD can detect. As a result, it'll return the invalid + // language code for traditional Chinese among others. + // |LanguageCodeWithDialects| will go through ISO 639-1, ISO-639-2 and + // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN + // for Simplified Chinese. + switch (GetCLDMajorVersion()) { +#if !defined(CLD_VERSION) || CLD_VERSION==1 + case 1: + language = + LanguageCodeWithDialects(static_cast<Language>(cld_language)); + break; +#endif +#if !defined(CLD_VERSION) || CLD_VERSION==2 + case 2: + // (1) CLD2's LanguageCode returns general Chinese 'zh' for + // CLD2::CHINESE, but Translate server doesn't accept it. This is + // converted to 'zh-CN' in the same way as CLD1's + // LanguageCodeWithDialects. + // + // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for + // CLD2::CHINESE_T. This is technically more precise for the language + // code of traditional Chinese, while Translate server hasn't accepted + // zh-Hant yet. + if (cld_language == CLD2::CHINESE) { + language = "zh-CN"; + } else if (cld_language == CLD2::CHINESE_T) { + language = "zh-TW"; + } else { + language = + CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language)); + } + break; +#endif + default: + NOTREACHED(); + } + } + VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text + << "\n*************************************\n"; + return language; +} + +// Checks if CLD can complement a sub code when the page language doesn't know +// the sub code. 
+bool CanCLDComplementSubCode( + const std::string& page_language, const std::string& cld_language) { + // Translate server cannot treat general Chinese. If Content-Language and + // CLD agree that the language is Chinese and Content-Language doesn't know + // which dialect is used, CLD language has priority. + // TODO(hajimehoshi): How about the other dialects like zh-MO? + return page_language == "zh" && StartsWithASCII(cld_language, "zh-", false); +} + +} // namespace + +namespace translate { + +std::string DeterminePageLanguage(const std::string& code, + const std::string& html_lang, + const base::string16& contents, + std::string* cld_language_p, + bool* is_cld_reliable_p) { + base::TimeTicks begin_time = base::TimeTicks::Now(); + bool is_cld_reliable; + std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable); + translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now()); + + if (cld_language_p != NULL) + *cld_language_p = cld_language; + if (is_cld_reliable_p != NULL) + *is_cld_reliable_p = is_cld_reliable; + translate::ToTranslateLanguageSynonym(&cld_language); + + // Check if html lang attribute is valid. + std::string modified_html_lang; + if (!html_lang.empty()) { + modified_html_lang = html_lang; + ApplyLanguageCodeCorrection(&modified_html_lang); + translate::ReportHtmlLang(html_lang, modified_html_lang); + VLOG(9) << "html lang based language code: " << modified_html_lang; + } + + // Check if Content-Language is valid. + std::string modified_code; + if (!code.empty()) { + modified_code = code; + ApplyLanguageCodeCorrection(&modified_code); + translate::ReportContentLanguage(code, modified_code); + } + + // Adopt |modified_html_lang| if it is valid. Otherwise, adopt + // |modified_code|. + std::string language = modified_html_lang.empty() ? modified_code : + modified_html_lang; + + // If |language| is empty, just use CLD result even though it might be + // translate::kUnknownLanguageCode. 
+ if (language.empty()) { + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_ONLY); + return cld_language; + } + + if (cld_language == kUnknownLanguageCode) { + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_UNKNOWN); + return language; + } else if (CanCLDComplementSubCode(language, cld_language)) { + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE); + return cld_language; + } else if (IsSameOrSimilarLanguages(language, cld_language)) { + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_AGREE); + return language; + } else if (MaybeServerWrongConfiguration(language, cld_language)) { + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_TRUST_CLD); + return cld_language; + } else { + translate::ReportLanguageVerification( + translate::LANGUAGE_VERIFICATION_CLD_DISAGREE); + // Content-Language value might be wrong because CLD says that this page + // is written in another language with confidence. + // In this case, Chrome doesn't rely on any of the language codes, and + // gives up suggesting a translation. + return std::string(kUnknownLanguageCode); + } + + return language; +} + +void CorrectLanguageCodeTypo(std::string* code) { + DCHECK(code); + + size_t coma_index = code->find(','); + if (coma_index != std::string::npos) { + // There are more than 1 language specified, just keep the first one. + *code = code->substr(0, coma_index); + } + TrimWhitespaceASCII(*code, TRIM_ALL, code); + + // An underscore instead of a dash is a frequent mistake. + size_t underscore_index = code->find('_'); + if (underscore_index != std::string::npos) + (*code)[underscore_index] = '-'; + + // Change everything up to a dash to lower-case and everything after to upper. 
+ size_t dash_index = code->find('-'); + if (dash_index != std::string::npos) { + *code = StringToLowerASCII(code->substr(0, dash_index)) + + StringToUpperASCII(code->substr(dash_index)); + } else { + *code = StringToLowerASCII(*code); + } +} + +bool IsValidLanguageCode(const std::string& code) { + // Roughly check if the language code follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/. + // TODO(hajimehoshi): How about es-419, which is used as an Accept language? + std::vector<std::string> chunks; + base::SplitString(code, '-', &chunks); + + if (chunks.size() < 1 || 2 < chunks.size()) + return false; + + const std::string& main_code = chunks[0]; + + if (main_code.size() < 1 || 3 < main_code.size()) + return false; + + for (std::string::const_iterator it = main_code.begin(); + it != main_code.end(); ++it) { + if (!IsAsciiAlpha(*it)) + return false; + } + + if (chunks.size() == 1) + return true; + + const std::string& sub_code = chunks[1]; + + if (sub_code.size() != 2) + return false; + + for (std::string::const_iterator it = sub_code.begin(); + it != sub_code.end(); ++it) { + if (!IsAsciiAlpha(*it)) + return false; + } + + return true; +} + +bool IsSameOrSimilarLanguages(const std::string& page_language, + const std::string& cld_language) { + std::vector<std::string> chunks; + + base::SplitString(page_language, '-', &chunks); + if (chunks.size() == 0) + return false; + std::string page_language_main_part = chunks[0]; + + base::SplitString(cld_language, '-', &chunks); + if (chunks.size() == 0) + return false; + std::string cld_language_main_part = chunks[0]; + + // Language code part of |page_language| is matched to one of |cld_language|. + // Country code is ignored here. + if (page_language_main_part == cld_language_main_part) { + // Languages are matched strictly. Reports false to metrics, but returns + // true. 
+ translate::ReportSimilarLanguageMatch(false); + return true; + } + + // Check if |page_language| and |cld_language| are in the similar language + // list and belong to the same language group. + int page_code = GetSimilarLanguageGroupCode(page_language); + bool match = page_code != 0 && + page_code == GetSimilarLanguageGroupCode(cld_language); + + translate::ReportSimilarLanguageMatch(match); + return match; +} + +bool MaybeServerWrongConfiguration(const std::string& page_language, + const std::string& cld_language) { + // If |page_language| is not "en-*", respect it and just return false here. + if (!StartsWithASCII(page_language, "en", false)) + return false; + + // A server provides a language meta information representing "en-*". But it + // might be just a default value due to missing user configuration. + // Let's trust |cld_language| if the determined language is not difficult to + // distinguish from English, and the language is one of well-known languages + // which often provide "en-*" meta information mistakenly. + for (size_t i = 0; i < arraysize(kWellKnownCodesOnWrongConfiguration); ++i) { + if (cld_language == kWellKnownCodesOnWrongConfiguration[i]) + return true; + } + return false; +} + +std::string GetCLDVersion() { + switch (GetCLDMajorVersion()) { +#if !defined(CLD_VERSION) || CLD_VERSION==1 + case 1: + return CompactLangDet::DetectLanguageVersion(); +#endif +#if !defined(CLD_VERSION) || CLD_VERSION==2 + case 2: + return CLD2::DetectLanguageVersion(); +#endif + default: + NOTREACHED(); + } + return ""; +} + +} // namespace translate diff --git a/components/translate/language_detection/language_detection_util.h b/components/translate/language_detection/language_detection_util.h new file mode 100644 index 0000000..8af9fce --- /dev/null +++ b/components/translate/language_detection/language_detection_util.h @@ -0,0 +1,47 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_TRANSLATE_LANGUAGE_DETECTION_LANGUAGE_DETECTION_UTIL_H_ +#define COMPONENTS_TRANSLATE_LANGUAGE_DETECTION_LANGUAGE_DETECTION_UTIL_H_ + +#include <string> + +#include "base/strings/string16.h" + +namespace translate { + +// Determines content page language from Content-Language code and contents. +std::string DeterminePageLanguage(const std::string& code, + const std::string& html_lang, + const base::string16& contents, + std::string* cld_language, + bool* is_cld_reliable); + +// Corrects language code if it contains well-known mistakes. +// Called only by tests. +void CorrectLanguageCodeTypo(std::string* code); + +// Checks if the language code's format is valid. +// Called only by tests. +bool IsValidLanguageCode(const std::string& code); + +// Checks if languages are matched, or similar. This function returns true +// against a language pair containing a language which is difficult for CLD to +// distinguish. +// Called only by tests. +bool IsSameOrSimilarLanguages(const std::string& page_language, + const std::string& cld_language); + +// Checks if languages pair is one of well-known pairs of wrong server +// configuration. +// Called only by tests. +bool MaybeServerWrongConfiguration(const std::string& page_language, + const std::string& cld_language); + +// Returns the version string of CLD. +std::string GetCLDVersion(); + +} // namespace translate + +#endif // COMPONENTS_TRANSLATE_LANGUAGE_DETECTION_LANGUAGE_DETECTION_UTIL_H_ diff --git a/components/translate/language_detection/language_detection_util_unittest.cc b/components/translate/language_detection/language_detection_util_unittest.cc new file mode 100644 index 0000000..c3bf6c1 --- /dev/null +++ b/components/translate/language_detection/language_detection_util_unittest.cc @@ -0,0 +1,173 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/translate/language_detection/language_detection_util.h" + +#include "base/strings/string16.h" +#include "base/strings/utf_string_conversions.h" +#include "components/translate/common/translate_constants.h" +#include "testing/gtest/include/gtest/gtest.h" + +typedef testing::Test LanguageDetectionUtilTest; + +// Tests that well-known language code typos are fixed. +TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) { + std::string language; + + // Strip the second and later codes. + language = std::string("ja,en"); + translate::CorrectLanguageCodeTypo(&language); + EXPECT_EQ("ja", language); + + // Replace underscore with hyphen. + language = std::string("ja_JP"); + translate::CorrectLanguageCodeTypo(&language); + EXPECT_EQ("ja-JP", language); + + // Correct wrong cases. + language = std::string("JA-jp"); + translate::CorrectLanguageCodeTypo(&language); + EXPECT_EQ("ja-JP", language); +} + +// Tests that valid and invalid language code formats are distinguished. +TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) { + std::string language; + + language = std::string("ja"); + EXPECT_TRUE(translate::IsValidLanguageCode(language)); + + language = std::string("ja-JP"); + EXPECT_TRUE(translate::IsValidLanguageCode(language)); + + language = std::string("ceb"); + EXPECT_TRUE(translate::IsValidLanguageCode(language)); + + language = std::string("ceb-XX"); + EXPECT_TRUE(translate::IsValidLanguageCode(language)); + + // Invalid because the sub code consists of a number. + language = std::string("utf-8"); + EXPECT_FALSE(translate::IsValidLanguageCode(language)); + + // Invalid because of six characters after hyphen. + language = std::string("ja-YUKARI"); + EXPECT_FALSE(translate::IsValidLanguageCode(language)); + + // Invalid because of four characters. 
+ language = std::string("DHMO"); + EXPECT_FALSE(translate::IsValidLanguageCode(language)); +} + +// Tests that the similar language table works. +TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) { + EXPECT_TRUE(translate::IsSameOrSimilarLanguages("en", "en")); + EXPECT_FALSE(translate::IsSameOrSimilarLanguages("en", "ja")); + + // Language codes are the same if the main parts are the same. The synonyms + // should be taken into account (ex: 'iw' and 'he'). + EXPECT_TRUE(translate::IsSameOrSimilarLanguages("sr-ME", "sr")); + EXPECT_TRUE(translate::IsSameOrSimilarLanguages("sr", "sr-ME")); + EXPECT_TRUE(translate::IsSameOrSimilarLanguages("he", "he-IL")); + EXPECT_TRUE(translate::IsSameOrSimilarLanguages("eng", "eng-US")); + EXPECT_TRUE(translate::IsSameOrSimilarLanguages("eng-US", "eng")); + EXPECT_FALSE(translate::IsSameOrSimilarLanguages("eng", "enm")); + + // Even though the main parts are different, some special language pairs are + // recognized as the same languages. + EXPECT_TRUE(translate::IsSameOrSimilarLanguages("bs", "hr")); + EXPECT_TRUE(translate::IsSameOrSimilarLanguages("ne", "hi")); + EXPECT_FALSE(translate::IsSameOrSimilarLanguages("bs", "hi")); +} + +// Tests that well-known languages which often have wrong server configuration +// are handled. +TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) { + EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en", "ja")); + EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en-US", "ja")); + EXPECT_TRUE(translate::MaybeServerWrongConfiguration("en", "zh-CN")); + EXPECT_FALSE(translate::MaybeServerWrongConfiguration("ja", "en")); + EXPECT_FALSE(translate::MaybeServerWrongConfiguration("en", "he")); +} + +// Tests that the language meta tag providing wrong information is ignored by +// LanguageDetectionUtil due to disagreement between meta tag and CLD. 
+TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) { + base::string16 contents = ASCIIToUTF16( + "<html><head><meta http-equiv='Content-Language' content='ja'></head>" + "<body>This is a page apparently written in English. Even though " + "content-language is provided, the value will be ignored if the value " + "is suspicious.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = translate::DeterminePageLanguage(std::string("ja"), + std::string(), + contents, + &cld_language, + &is_cld_reliable); + EXPECT_EQ(translate::kUnknownLanguageCode, language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} + +// Tests that the language meta tag providing "en-US" style information is +// agreed by CLD. +TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) { + base::string16 contents = ASCIIToUTF16( + "<html><head><meta http-equiv='Content-Language' content='en-US'></head>" + "<body>This is a page apparently written in English. Even though " + "content-language is provided, the value will be ignored if the value " + "is suspicious.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = translate::DeterminePageLanguage(std::string("en-US"), + std::string(), + contents, + &cld_language, + &is_cld_reliable); + EXPECT_EQ("en-US", language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} + +// Tests that the language meta tag providing wrong information is ignored and +// CLD's language will be adopted by LanguageDetectionUtil due to an invalid +// meta tag. +TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) { + base::string16 contents = ASCIIToUTF16( + "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>" + "<body>This is a page apparently written in English. 
Even though " + "content-language is provided, the value will be ignored and CLD's" + " language will be adopted if the value is invalid.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = translate::DeterminePageLanguage(std::string("utf-8"), + std::string(), + contents, + &cld_language, + &is_cld_reliable); + EXPECT_EQ("en", language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} + +// Tests that the language meta tag providing wrong information is ignored +// because of valid html lang attribute. +TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) { + base::string16 contents = ASCIIToUTF16( + "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>" + "</head><body>This is a page apparently written in English. Even though " + "content-language is provided, the value will be ignored if the value " + "is suspicious.</body></html>"); + std::string cld_language; + bool is_cld_reliable; + std::string language = translate::DeterminePageLanguage(std::string("ja"), + std::string("en"), + contents, + &cld_language, + &is_cld_reliable); + EXPECT_EQ("en", language); + EXPECT_EQ("en", cld_language); + EXPECT_TRUE(is_cld_reliable); +} |