summaryrefslogtreecommitdiffstats
path: root/components
diff options
context:
space:
mode:
authordroger@chromium.org <droger@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-10-04 16:03:09 +0000
committerdroger@chromium.org <droger@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-10-04 16:03:09 +0000
commit499e7c55f53e2a118c31fddd079456a3dbe8b35c (patch)
tree185e0a4be375f4b45276ea0472a0b472665ca1ba /components
parentb38806a8ad4ecb73bfffaecfaab28a1dfe3a5402 (diff)
downloadchromium_src-499e7c55f53e2a118c31fddd079456a3dbe8b35c.zip
chromium_src-499e7c55f53e2a118c31fddd079456a3dbe8b35c.tar.gz
chromium_src-499e7c55f53e2a118c31fddd079456a3dbe8b35c.tar.bz2
Move language detection to a component
Language detection is used from the renderer on most platforms, but from the browser on iOS. This CL moves it from chrome/common/ to a new "translate" component, which makes it possible to track and address dependency issues more cleanly. BUG=297777 Review URL: https://codereview.chromium.org/25531002 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@227015 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'components')
-rw-r--r--components/OWNERS4
-rw-r--r--components/components.gyp1
-rw-r--r--components/components_tests.gypi8
-rw-r--r--components/translate.gypi57
-rw-r--r--components/translate/DEPS4
-rw-r--r--components/translate/OWNERS3
-rw-r--r--components/translate/README8
-rw-r--r--components/translate/common/translate_constants.cc11
-rw-r--r--components/translate/common/translate_constants.h16
-rw-r--r--components/translate/common/translate_metrics.cc130
-rw-r--r--components/translate/common/translate_metrics.h104
-rw-r--r--components/translate/common/translate_metrics_unittest.cc284
-rw-r--r--components/translate/common/translate_switches.cc14
-rw-r--r--components/translate/common/translate_switches.h16
-rw-r--r--components/translate/common/translate_util.cc125
-rw-r--r--components/translate/common/translate_util.h28
-rw-r--r--components/translate/common/translate_util_unittest.cc96
-rw-r--r--components/translate/language_detection/DEPS6
-rw-r--r--components/translate/language_detection/language_detection_util.cc401
-rw-r--r--components/translate/language_detection/language_detection_util.h47
-rw-r--r--components/translate/language_detection/language_detection_util_unittest.cc173
21 files changed, 1536 insertions, 0 deletions
diff --git a/components/OWNERS b/components/OWNERS
index 4c134eb..bcdeb28 100644
--- a/components/OWNERS
+++ b/components/OWNERS
@@ -41,6 +41,10 @@ per-file sessions.gypi=sky@chromium.org
per-file tracing*=jbauman@chromium.org
per-file tracing*=nduca@chromium.org
+per-file translate.gypi=hajimehoshi@chromium.org
+per-file translate.gypi=mad@chromium.org
+per-file translate.gypi=toyoshim@chromium.org
+
per-file startup_metric_utils.gypi=jeremy@chromium.org
per-file user_prefs.gypi=battre@chromium.org
diff --git a/components/components.gyp b/components/components.gyp
index 0bfe536..ea5af28 100644
--- a/components/components.gyp
+++ b/components/components.gyp
@@ -21,6 +21,7 @@
'policy.gypi',
'sessions.gypi',
'startup_metric_utils.gypi',
+ 'translate.gypi',
'user_prefs.gypi',
'variations.gypi',
'visitedlink.gypi',
diff --git a/components/components_tests.gypi b/components/components_tests.gypi
index e5c1f59..ba6b5e2 100644
--- a/components/components_tests.gypi
+++ b/components/components_tests.gypi
@@ -24,6 +24,9 @@
'navigation_interception/intercept_navigation_resource_throttle_unittest.cc',
'sessions/serialized_navigation_entry_unittest.cc',
'test/run_all_unittests.cc',
+ 'translate/common/translate_metrics_unittest.cc',
+ 'translate/common/translate_util_unittest.cc',
+ 'translate/language_detection/language_detection_util_unittest.cc',
# TODO(asvitkine): These should be tested on iOS too.
'variations/entropy_provider_unittest.cc',
'variations/metrics_util_unittest.cc',
@@ -74,6 +77,10 @@
'sessions',
'sessions_test_support',
+ # Dependencies of translate.
+ 'translate_common',
+ 'translate_language_detection',
+
# Dependencies of variations
'variations',
@@ -93,6 +100,7 @@
# http://crbug.com/303011.
# TODO(asvitkine): Bring up varations/ unittests on iOS.
['include', '^dom_distiller'],
+ ['include', '^translate'],
],
'dependencies!': [
'autofill_core_common',
diff --git a/components/translate.gypi b/components/translate.gypi
new file mode 100644
index 0000000..7c19fe8
--- /dev/null
+++ b/components/translate.gypi
@@ -0,0 +1,57 @@
+# Copyright 2013 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+{
+ 'targets': [
+ {
+ 'target_name': 'translate_common',
+ 'type': 'static_library',
+ 'dependencies': [
+ '../base/base.gyp:base',
+ '../url/url.gyp:url_lib',
+ ],
+ 'include_dirs': [
+ '..',
+ ],
+ 'sources': [
+ 'translate/common/translate_constants.cc',
+ 'translate/common/translate_constants.h',
+ 'translate/common/translate_metrics.cc',
+ 'translate/common/translate_metrics.h',
+ 'translate/common/translate_switches.cc',
+ 'translate/common/translate_switches.h',
+ 'translate/common/translate_util.cc',
+ 'translate/common/translate_util.h',
+ ],
+ },
+ {
+ 'target_name': 'translate_language_detection',
+ 'type': 'static_library',
+ 'dependencies': [
+ 'translate_common',
+ '../base/base.gyp:base',
+ '../url/url.gyp:url_lib',
+ ],
+ 'include_dirs': [
+ '..',
+ ],
+ 'sources': [
+ 'translate/language_detection/language_detection_util.cc',
+ 'translate/language_detection/language_detection_util.h',
+ ],
+ 'conditions': [
+ ['cld_version==0 or cld_version==1', {
+ 'dependencies': [
+ '<(DEPTH)/third_party/cld/cld.gyp:cld',
+ ],
+ }],
+ ['cld_version==0 or cld_version==2', {
+ 'dependencies': [
+ '<(DEPTH)/third_party/cld_2/cld_2.gyp:cld_2',
+ ],
+ }],
+ ],
+ },
+ ],
+}
diff --git a/components/translate/DEPS b/components/translate/DEPS
new file mode 100644
index 0000000..8f4bfc0
--- /dev/null
+++ b/components/translate/DEPS
@@ -0,0 +1,4 @@
+include_rules = [
+ # translate is used on iOS, which cannot depend on content.
+ "-content",
+]
diff --git a/components/translate/OWNERS b/components/translate/OWNERS
new file mode 100644
index 0000000..a214ab1
--- /dev/null
+++ b/components/translate/OWNERS
@@ -0,0 +1,3 @@
+hajimehoshi@chromium.org
+mad@chromium.org
+toyoshim@chromium.org
diff --git a/components/translate/README b/components/translate/README
new file mode 100644
index 0000000..076afe9
--- /dev/null
+++ b/components/translate/README
@@ -0,0 +1,8 @@
+- translate/language detection depends on the CLD library and should only be
+used from the renderer to avoid bloating the DLLs on Windows.
+
+- Translate is not allowed to depend on content/, because it is used by iOS.
+If dependencies on content/ need to be added to Translate, it will have to be
+made into a layered component: see
+http://www.chromium.org/developers/design-documents/layered-components-design
+for more information.
diff --git a/components/translate/common/translate_constants.cc b/components/translate/common/translate_constants.cc
new file mode 100644
index 0000000..a51bb93
--- /dev/null
+++ b/components/translate/common/translate_constants.cc
@@ -0,0 +1,11 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/common/translate_constants.h"
+
+namespace translate {
+
+const char* const kUnknownLanguageCode = "und";
+
+} // namespace translate
diff --git a/components/translate/common/translate_constants.h b/components/translate/common/translate_constants.h
new file mode 100644
index 0000000..81a6369
--- /dev/null
+++ b/components/translate/common/translate_constants.h
@@ -0,0 +1,16 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_TRANSLATE_COMMON_TRANSLATE_CONSTANTS_H_
+#define COMPONENTS_TRANSLATE_COMMON_TRANSLATE_CONSTANTS_H_
+
+namespace translate {
+
+// The language code used when the language of a page could not be detected.
+// (Matches what the CLD -Compact Language Detection- library reports.)
+extern const char* const kUnknownLanguageCode;
+
+} // namespace translate
+
+#endif // COMPONENTS_TRANSLATE_COMMON_TRANSLATE_CONSTANTS_H_
diff --git a/components/translate/common/translate_metrics.cc b/components/translate/common/translate_metrics.cc
new file mode 100644
index 0000000..b095084
--- /dev/null
+++ b/components/translate/common/translate_metrics.cc
@@ -0,0 +1,130 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/common/translate_metrics.h"
+
+#include "base/basictypes.h"
+#include "base/metrics/histogram.h"
+
+namespace translate {
+
+namespace {
+
+// Constant string values to indicate UMA names. All entries should have
+// a corresponding index in MetricsNameIndex and an entry in |kMetricsEntries|.
+const char kRenderer4LanguageDetection[] = "Renderer4.LanguageDetection";
+const char kTranslateContentLanguage[] = "Translate.ContentLanguage";
+const char kTranslateHtmlLang[] = "Translate.HtmlLang";
+const char kTranslateLanguageVerification[] = "Translate.LanguageVerification";
+const char kTranslateTimeToBeReady[] = "Translate.TimeToBeReady";
+const char kTranslateTimeToLoad[] = "Translate.TimeToLoad";
+const char kTranslateTimeToTranslate[] = "Translate.TimeToTranslate";
+const char kTranslateUserActionDuration[] = "Translate.UserActionDuration";
+const char kTranslatePageScheme[] = "Translate.PageScheme";
+const char kTranslateSimilarLanguageMatch[] = "Translate.SimilarLanguageMatch";
+
+// URL schemes that ReportPageScheme() distinguishes; any other scheme is
+// reported as SCHEME_OTHERS.
+const char kSchemeHttp[] = "http";
+const char kSchemeHttps[] = "https";
+
+// Associates a MetricsNameIndex value with the UMA histogram name it stands
+// for. Rows of this type are looked up by GetMetricsName().
+struct MetricsEntry {
+ MetricsNameIndex index;
+ const char* const name;
+};
+
+// This entry table should be updated when new UMA items are added.
+// GetMetricsName() searches it linearly by |index|.
+const MetricsEntry kMetricsEntries[] = {
+ {UMA_LANGUAGE_DETECTION, kRenderer4LanguageDetection},
+ {UMA_CONTENT_LANGUAGE, kTranslateContentLanguage},
+ {UMA_HTML_LANG, kTranslateHtmlLang},
+ {UMA_LANGUAGE_VERIFICATION, kTranslateLanguageVerification},
+ {UMA_TIME_TO_BE_READY, kTranslateTimeToBeReady},
+ {UMA_TIME_TO_LOAD, kTranslateTimeToLoad},
+ {UMA_TIME_TO_TRANSLATE, kTranslateTimeToTranslate},
+ {UMA_USER_ACTION_DURATION, kTranslateUserActionDuration},
+ {UMA_PAGE_SCHEME, kTranslatePageScheme},
+ {UMA_SIMILAR_LANGUAGE_MATCH, kTranslateSimilarLanguageMatch}, };
+
+// Fails the build if the number of rows above differs from UMA_MAX.
+COMPILE_ASSERT(arraysize(kMetricsEntries) == UMA_MAX,
+ arraysize_of_kMetricsEntries_should_be_UMA_MAX);
+
+// Classifies a language code supplied by a page against its corrected form:
+// LANGUAGE_NOT_PROVIDED if |provided_code| is empty, LANGUAGE_VALID if it is
+// identical to |revised_code|, and LANGUAGE_INVALID otherwise.
+LanguageCheckType GetLanguageCheckMetric(const std::string& provided_code,
+                                         const std::string& revised_code) {
+  if (provided_code.empty())
+    return LANGUAGE_NOT_PROVIDED;
+  return provided_code == revised_code ? LANGUAGE_VALID : LANGUAGE_INVALID;
+}
+
+} // namespace
+
+// Records in Translate.ContentLanguage whether |provided_code| was empty,
+// equal to |revised_code|, or different from it.
+void ReportContentLanguage(const std::string& provided_code,
+                           const std::string& revised_code) {
+  const LanguageCheckType check_result =
+      GetLanguageCheckMetric(provided_code, revised_code);
+  UMA_HISTOGRAM_ENUMERATION(kTranslateContentLanguage,
+                            check_result,
+                            LANGUAGE_MAX);
+}
+
+// Records in Translate.HtmlLang whether |provided_code| was empty, equal to
+// |revised_code|, or different from it.
+void ReportHtmlLang(const std::string& provided_code,
+                    const std::string& revised_code) {
+  const LanguageCheckType check_result =
+      GetLanguageCheckMetric(provided_code, revised_code);
+  UMA_HISTOGRAM_ENUMERATION(kTranslateHtmlLang,
+                            check_result,
+                            LANGUAGE_MAX);
+}
+
+// Adds |type| as one sample to the Translate.LanguageVerification enumeration
+// histogram, whose boundary is LANGUAGE_VERIFICATION_MAX.
+void ReportLanguageVerification(LanguageVerificationType type) {
+ UMA_HISTOGRAM_ENUMERATION(kTranslateLanguageVerification,
+ type,
+ LANGUAGE_VERIFICATION_MAX);
+}
+
+// Records |time_in_msec| (a duration in milliseconds, possibly fractional) in
+// the Translate.TimeToBeReady histogram.
+void ReportTimeToBeReady(double time_in_msec) {
+  // Scale to microseconds before building the TimeDelta.
+  const base::TimeDelta elapsed =
+      base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0);
+  UMA_HISTOGRAM_MEDIUM_TIMES(kTranslateTimeToBeReady, elapsed);
+}
+
+// Records |time_in_msec| (a duration in milliseconds, possibly fractional) in
+// the Translate.TimeToLoad histogram.
+void ReportTimeToLoad(double time_in_msec) {
+  // Scale to microseconds before building the TimeDelta.
+  const base::TimeDelta elapsed =
+      base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0);
+  UMA_HISTOGRAM_MEDIUM_TIMES(kTranslateTimeToLoad, elapsed);
+}
+
+// Records |time_in_msec| (a duration in milliseconds, possibly fractional) in
+// the Translate.TimeToTranslate histogram.
+void ReportTimeToTranslate(double time_in_msec) {
+  // Scale to microseconds before building the TimeDelta.
+  const base::TimeDelta elapsed =
+      base::TimeDelta::FromMicroseconds(time_in_msec * 1000.0);
+  UMA_HISTOGRAM_MEDIUM_TIMES(kTranslateTimeToTranslate, elapsed);
+}
+
+// Records the interval |end| - |begin| in the Translate.UserActionDuration
+// histogram.
+void ReportUserActionDuration(base::TimeTicks begin, base::TimeTicks end) {
+ UMA_HISTOGRAM_LONG_TIMES(kTranslateUserActionDuration, end - begin);
+}
+
+// Buckets |scheme| as SCHEME_HTTP, SCHEME_HTTPS or SCHEME_OTHERS using an
+// exact string comparison, then records the bucket in Translate.PageScheme.
+void ReportPageScheme(const std::string& scheme) {
+  SchemeType type;
+  if (scheme == kSchemeHttp) {
+    type = SCHEME_HTTP;
+  } else if (scheme == kSchemeHttps) {
+    type = SCHEME_HTTPS;
+  } else {
+    type = SCHEME_OTHERS;
+  }
+  UMA_HISTOGRAM_ENUMERATION(kTranslatePageScheme, type, SCHEME_MAX);
+}
+
+// Records the interval |end| - |begin| in the Renderer4.LanguageDetection
+// histogram.
+void ReportLanguageDetectionTime(base::TimeTicks begin, base::TimeTicks end) {
+ UMA_HISTOGRAM_MEDIUM_TIMES(kRenderer4LanguageDetection, end - begin);
+}
+
+// Records |match| as a boolean sample in Translate.SimilarLanguageMatch.
+void ReportSimilarLanguageMatch(bool match) {
+ UMA_HISTOGRAM_BOOLEAN(kTranslateSimilarLanguageMatch, match);
+}
+
+// Returns the UMA histogram name registered for |index| in |kMetricsEntries|.
+// If |index| has no row in the table, NOTREACHED() fires and NULL is returned.
+const char* GetMetricsName(MetricsNameIndex index) {
+  const MetricsEntry* const end = kMetricsEntries + arraysize(kMetricsEntries);
+  for (const MetricsEntry* entry = kMetricsEntries; entry != end; ++entry) {
+    if (entry->index == index)
+      return entry->name;
+  }
+  NOTREACHED();
+  return NULL;
+}
+
+} // namespace translate
diff --git a/components/translate/common/translate_metrics.h b/components/translate/common/translate_metrics.h
new file mode 100644
index 0000000..9baa268
--- /dev/null
+++ b/components/translate/common/translate_metrics.h
@@ -0,0 +1,104 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_TRANSLATE_COMMON_TRANSLATE_METRICS_H_
+#define COMPONENTS_TRANSLATE_COMMON_TRANSLATE_METRICS_H_
+
+#include <string>
+
+#include "base/time/time.h"
+
+namespace translate {
+
+// An indexing type to query each UMA entry name via GetMetricsName() function.
+// Note: |kMetricsEntries| should be updated when a new entry is added here.
+enum MetricsNameIndex {
+ UMA_LANGUAGE_DETECTION,
+ UMA_CONTENT_LANGUAGE,
+ UMA_HTML_LANG,
+ UMA_LANGUAGE_VERIFICATION,
+ UMA_TIME_TO_BE_READY,
+ UMA_TIME_TO_LOAD,
+ UMA_TIME_TO_TRANSLATE,
+ UMA_USER_ACTION_DURATION,
+ UMA_PAGE_SCHEME,
+ UMA_SIMILAR_LANGUAGE_MATCH,
+ // Number of entries above; must stay last. It is not a metric itself.
+ UMA_MAX,
+};
+
+// A page may provide a Content-Language HTTP header or a META tag.
+// TranslateHelper checks if a server provides a valid Content-Language.
+enum LanguageCheckType {
+ // The provided code was empty.
+ LANGUAGE_NOT_PROVIDED,
+ // The provided code is identical to the revised code.
+ LANGUAGE_VALID,
+ // The provided code differs from the revised code.
+ LANGUAGE_INVALID,
+ // Histogram boundary; not a real value.
+ LANGUAGE_MAX,
+};
+
+// When a valid Content-Language is provided, TranslateHelper checks whether
+// the server-provided Content-Language matches the language that CLD
+// determined.
+enum LanguageVerificationType {
+ LANGUAGE_VERIFICATION_CLD_DISABLED, // obsolete
+ LANGUAGE_VERIFICATION_CLD_ONLY,
+ LANGUAGE_VERIFICATION_UNKNOWN,
+ LANGUAGE_VERIFICATION_CLD_AGREE,
+ LANGUAGE_VERIFICATION_CLD_DISAGREE,
+ LANGUAGE_VERIFICATION_TRUST_CLD,
+ LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE,
+ // Histogram boundary; not a real value.
+ LANGUAGE_VERIFICATION_MAX,
+};
+
+// Scheme type of pages Chrome is going to translate.
+enum SchemeType {
+ // The scheme string is exactly "http".
+ SCHEME_HTTP,
+ // The scheme string is exactly "https".
+ SCHEME_HTTPS,
+ // Any other scheme string.
+ SCHEME_OTHERS,
+ // Histogram boundary; not a real value.
+ SCHEME_MAX,
+};
+
+// Called after TranslateHelper verifies the Content-Language header provided
+// by a server. |provided_code| contains the Content-Language header value that
+// the server provides. It can be an empty string when the server doesn't
+// provide one. |revised_code| is the value after the format error corrector
+// has modified it.
+void ReportContentLanguage(const std::string& provided_code,
+ const std::string& revised_code);
+
+// Called after TranslateHelper verifies the html lang attribute provided by a
+// page. |provided_code| contains the html lang attribute that the page
+// provides. It can be an empty string when the page doesn't provide one.
+// |revised_code| is the value after the format error corrector has modified
+// it.
+void ReportHtmlLang(const std::string& provided_code,
+ const std::string& revised_code);
+
+// Called when CLD verifies Content-Language header.
+void ReportLanguageVerification(LanguageVerificationType type);
+
+// Called when the Translate Element library is ready.
+void ReportTimeToBeReady(double time_in_msec);
+
+// Called when the Translate Element library is loaded.
+void ReportTimeToLoad(double time_in_msec);
+
+// Called when a page translation is finished.
+void ReportTimeToTranslate(double time_in_msec);
+
+// Called when a translation is triggered.
+void ReportUserActionDuration(base::TimeTicks begin, base::TimeTicks end);
+
+// Called when a translation is triggered.
+void ReportPageScheme(const std::string& scheme);
+
+// Called when CLD detects page language.
+void ReportLanguageDetectionTime(base::TimeTicks begin, base::TimeTicks end);
+
+// Called when CLD agreed on a language which is different, but in the similar
+// language list.
+void ReportSimilarLanguageMatch(bool match);
+
+// Gets UMA name for an entry specified by |index|.
+const char* GetMetricsName(MetricsNameIndex index);
+
+} // namespace translate
+
+#endif // COMPONENTS_TRANSLATE_COMMON_TRANSLATE_METRICS_H_
diff --git a/components/translate/common/translate_metrics_unittest.cc b/components/translate/common/translate_metrics_unittest.cc
new file mode 100644
index 0000000..dac4b09
--- /dev/null
+++ b/components/translate/common/translate_metrics_unittest.cc
@@ -0,0 +1,284 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/common/translate_metrics.h"
+
+#include "base/basictypes.h"
+#include "base/memory/scoped_ptr.h"
+#include "base/metrics/histogram.h"
+#include "base/metrics/histogram_samples.h"
+#include "base/metrics/statistics_recorder.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "testing/platform_test.h"
+
+using base::HistogramBase;
+using base::HistogramSamples;
+using base::SampleCountIterator;
+using base::StatisticsRecorder;
+using base::TimeTicks;
+
+namespace {
+
+const int kTrue = 1;
+const int kFalse = 0;
+
+// Test helper that tracks how a single UMA histogram changes during a test.
+// The constructor captures the histogram's samples as a baseline (if the
+// histogram already exists), and the count-based helpers report values
+// relative to that baseline.
+class MetricsRecorder {
+ public:
+  // |key| is the name of the histogram to observe.
+  explicit MetricsRecorder(const char* key) : key_(key) {
+    StatisticsRecorder::Initialize();
+
+    HistogramBase* histogram = StatisticsRecorder::FindHistogram(key_);
+    if (histogram)
+      base_samples_ = histogram->SnapshotSamples();
+  }
+
+  // Asserts that |index| names the observed histogram, then expects the given
+  // baseline-relative counts for the three LanguageCheckType buckets.
+  void CheckLanguage(translate::MetricsNameIndex index,
+                     int expected_not_provided,
+                     int expected_valid,
+                     int expected_invalid) {
+    ASSERT_EQ(translate::GetMetricsName(index), key_);
+
+    Snapshot();
+
+    EXPECT_EQ(expected_not_provided,
+              GetCountWithoutSnapshot(translate::LANGUAGE_NOT_PROVIDED));
+    EXPECT_EQ(expected_valid,
+              GetCountWithoutSnapshot(translate::LANGUAGE_VALID));
+    EXPECT_EQ(expected_invalid,
+              GetCountWithoutSnapshot(translate::LANGUAGE_INVALID));
+  }
+
+  // Asserts that the observed histogram is the language verification one,
+  // then expects the given baseline-relative count for each
+  // LanguageVerificationType bucket, in declaration order.
+  void CheckLanguageVerification(int expected_cld_disabled,
+                                 int expected_cld_only,
+                                 int expected_unknown,
+                                 int expected_cld_agree,
+                                 int expected_cld_disagree,
+                                 int expected_trust_cld,
+                                 int expected_cld_complement_sub_code) {
+    ASSERT_EQ(translate::GetMetricsName(translate::UMA_LANGUAGE_VERIFICATION),
+              key_);
+
+    Snapshot();
+
+    EXPECT_EQ(
+        expected_cld_disabled,
+        GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_CLD_DISABLED));
+    EXPECT_EQ(
+        expected_cld_only,
+        GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_CLD_ONLY));
+    EXPECT_EQ(
+        expected_unknown,
+        GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_UNKNOWN));
+    EXPECT_EQ(
+        expected_cld_agree,
+        GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_CLD_AGREE));
+    EXPECT_EQ(
+        expected_cld_disagree,
+        GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_CLD_DISAGREE));
+    EXPECT_EQ(
+        expected_trust_cld,
+        GetCountWithoutSnapshot(translate::LANGUAGE_VERIFICATION_TRUST_CLD));
+    EXPECT_EQ(expected_cld_complement_sub_code,
+              GetCountWithoutSnapshot(
+                  translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE));
+  }
+
+  // Asserts that the observed histogram is the page scheme one, then expects
+  // the given baseline-relative counts for the three SchemeType buckets.
+  void CheckScheme(int expected_http, int expected_https, int expected_others) {
+    ASSERT_EQ(translate::GetMetricsName(translate::UMA_PAGE_SCHEME), key_);
+
+    Snapshot();
+
+    EXPECT_EQ(expected_http, GetCountWithoutSnapshot(translate::SCHEME_HTTP));
+    EXPECT_EQ(expected_https, GetCountWithoutSnapshot(translate::SCHEME_HTTPS));
+    EXPECT_EQ(expected_others,
+              GetCountWithoutSnapshot(translate::SCHEME_OTHERS));
+  }
+
+  // Expects that |count| samples were added since construction.
+  void CheckTotalCount(int count) {
+    Snapshot();
+    EXPECT_EQ(count, GetTotalCount());
+  }
+
+  // Expects that some bucket whose [min, max] range contains |value| holds at
+  // least one sample. Unlike the count helpers, this looks at the raw
+  // snapshot and does not subtract the baseline.
+  void CheckValueInLogs(double value) {
+    Snapshot();
+    ASSERT_TRUE(samples_.get());
+    for (scoped_ptr<SampleCountIterator> i = samples_->Iterator(); !i->Done();
+         i->Next()) {
+      HistogramBase::Sample min;
+      HistogramBase::Sample max;
+      HistogramBase::Count count;
+      i->Get(&min, &max, &count);
+      if (min <= value && value <= max && count >= 1)
+        return;
+    }
+    // Say which histogram and value were missing instead of reporting a bare
+    // "expected false" failure.
+    ADD_FAILURE() << "No populated bucket of histogram \"" << key_
+                  << "\" contains " << value;
+  }
+
+  // Takes a fresh snapshot and returns the baseline-relative count of |value|.
+  HistogramBase::Count GetCount(HistogramBase::Sample value) {
+    Snapshot();
+    return GetCountWithoutSnapshot(value);
+  }
+
+ private:
+  // Refreshes |samples_| from the histogram. If the histogram does not exist,
+  // |samples_| keeps whatever it held before.
+  void Snapshot() {
+    HistogramBase* histogram = StatisticsRecorder::FindHistogram(key_);
+    if (!histogram)
+      return;
+    samples_ = histogram->SnapshotSamples();
+  }
+
+  // Returns the count of |value| in the latest snapshot minus its count in
+  // the baseline; 0 if no snapshot has been taken.
+  HistogramBase::Count GetCountWithoutSnapshot(HistogramBase::Sample value) {
+    if (!samples_.get())
+      return 0;
+    HistogramBase::Count count = samples_->GetCount(value);
+    if (!base_samples_.get())
+      return count;
+    return count - base_samples_->GetCount(value);
+  }
+
+  // Returns the total sample count of the latest snapshot minus that of the
+  // baseline; 0 if no snapshot has been taken.
+  HistogramBase::Count GetTotalCount() {
+    if (!samples_.get())
+      return 0;
+    HistogramBase::Count count = samples_->TotalCount();
+    if (!base_samples_.get())
+      return count;
+    return count - base_samples_->TotalCount();
+  }
+
+  // Name of the observed histogram.
+  std::string key_;
+  // Samples present at construction time; NULL if the histogram did not exist.
+  scoped_ptr<HistogramSamples> base_samples_;
+  // Most recent snapshot taken by Snapshot(); NULL until one succeeds.
+  scoped_ptr<HistogramSamples> samples_;
+
+  DISALLOW_COPY_AND_ASSIGN(MetricsRecorder);
+};
+
+} // namespace
+
+// An empty provided code counts as "not provided", a code that differs from
+// its revised form ("ja_JP" vs "ja-JP") as "invalid", and an unchanged code as
+// "valid".
+TEST(TranslateMetricsTest, ReportContentLanguage) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_CONTENT_LANGUAGE));
+
+ recorder.CheckLanguage(translate::UMA_CONTENT_LANGUAGE, 0, 0, 0);
+ translate::ReportContentLanguage(std::string(), std::string());
+ recorder.CheckLanguage(translate::UMA_CONTENT_LANGUAGE, 1, 0, 0);
+ translate::ReportContentLanguage("ja_JP", "ja-JP");
+ recorder.CheckLanguage(translate::UMA_CONTENT_LANGUAGE, 1, 0, 1);
+ translate::ReportContentLanguage("en", "en");
+ recorder.CheckLanguage(translate::UMA_CONTENT_LANGUAGE, 1, 1, 1);
+}
+
+// Same three cases as the Content-Language test, applied to the html lang
+// histogram: empty, changed by revision, and unchanged.
+TEST(TranslateMetricsTest, ReportHtmlLang) {
+ MetricsRecorder recorder(translate::GetMetricsName(translate::UMA_HTML_LANG));
+
+ recorder.CheckLanguage(translate::UMA_HTML_LANG, 0, 0, 0);
+ translate::ReportHtmlLang(std::string(), std::string());
+ recorder.CheckLanguage(translate::UMA_HTML_LANG, 1, 0, 0);
+ translate::ReportHtmlLang("ja_JP", "ja-JP");
+ recorder.CheckLanguage(translate::UMA_HTML_LANG, 1, 0, 1);
+ translate::ReportHtmlLang("en", "en");
+ recorder.CheckLanguage(translate::UMA_HTML_LANG, 1, 1, 1);
+}
+
+// Reports each LanguageVerificationType value once, in declaration order, and
+// checks after every report that only the matching bucket grew.
+TEST(TranslateMetricsTest, ReportLanguageVerification) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_LANGUAGE_VERIFICATION));
+
+ recorder.CheckLanguageVerification(0, 0, 0, 0, 0, 0, 0);
+ translate::ReportLanguageVerification(
+ translate::LANGUAGE_VERIFICATION_CLD_DISABLED);
+ recorder.CheckLanguageVerification(1, 0, 0, 0, 0, 0, 0);
+ translate::ReportLanguageVerification(
+ translate::LANGUAGE_VERIFICATION_CLD_ONLY);
+ recorder.CheckLanguageVerification(1, 1, 0, 0, 0, 0, 0);
+ translate::ReportLanguageVerification(
+ translate::LANGUAGE_VERIFICATION_UNKNOWN);
+ recorder.CheckLanguageVerification(1, 1, 1, 0, 0, 0, 0);
+ translate::ReportLanguageVerification(
+ translate::LANGUAGE_VERIFICATION_CLD_AGREE);
+ recorder.CheckLanguageVerification(1, 1, 1, 1, 0, 0, 0);
+ translate::ReportLanguageVerification(
+ translate::LANGUAGE_VERIFICATION_CLD_DISAGREE);
+ recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 0, 0);
+ translate::ReportLanguageVerification(
+ translate::LANGUAGE_VERIFICATION_TRUST_CLD);
+ recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 1, 0);
+ translate::ReportLanguageVerification(
+ translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
+ recorder.CheckLanguageVerification(1, 1, 1, 1, 1, 1, 1);
+}
+
+// A single report must add exactly one sample, in a bucket containing the
+// reported value.
+TEST(TranslateMetricsTest, ReportTimeToBeReady) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_TIME_TO_BE_READY));
+ recorder.CheckTotalCount(0);
+ translate::ReportTimeToBeReady(3.14);
+ recorder.CheckValueInLogs(3.14);
+ recorder.CheckTotalCount(1);
+}
+
+// A single report must add exactly one sample, in a bucket containing the
+// reported value.
+TEST(TranslateMetricsTest, ReportTimeToLoad) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_TIME_TO_LOAD));
+ recorder.CheckTotalCount(0);
+ translate::ReportTimeToLoad(573.0);
+ recorder.CheckValueInLogs(573.0);
+ recorder.CheckTotalCount(1);
+}
+
+// A single report must add exactly one sample, in a bucket containing the
+// reported value.
+TEST(TranslateMetricsTest, ReportTimeToTranslate) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_TIME_TO_TRANSLATE));
+ recorder.CheckTotalCount(0);
+ translate::ReportTimeToTranslate(4649.0);
+ recorder.CheckValueInLogs(4649.0);
+ recorder.CheckTotalCount(1);
+}
+
+// A 3776-second interval is expected to show up as one sample in a bucket
+// containing 3776000, i.e. the value expressed in milliseconds.
+TEST(TranslateMetricsTest, ReportUserActionDuration) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_USER_ACTION_DURATION));
+ recorder.CheckTotalCount(0);
+ TimeTicks begin = TimeTicks::Now();
+ TimeTicks end = begin + base::TimeDelta::FromSeconds(3776);
+ translate::ReportUserActionDuration(begin, end);
+ recorder.CheckValueInLogs(3776000.0);
+ recorder.CheckTotalCount(1);
+}
+
+// "http" and "https" each get their own bucket; "ftp" stands in for every
+// other scheme.
+TEST(TranslateMetricsTest, ReportPageScheme) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_PAGE_SCHEME));
+ recorder.CheckScheme(0, 0, 0);
+ translate::ReportPageScheme("http");
+ recorder.CheckScheme(1, 0, 0);
+ translate::ReportPageScheme("https");
+ recorder.CheckScheme(1, 1, 0);
+ translate::ReportPageScheme("ftp");
+ recorder.CheckScheme(1, 1, 1);
+}
+
+// A true report must increment only the kTrue bucket and a false report only
+// the kFalse bucket.
+TEST(TranslateMetricsTest, ReportSimilarLanguageMatch) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_SIMILAR_LANGUAGE_MATCH));
+ recorder.CheckTotalCount(0);
+ EXPECT_EQ(0, recorder.GetCount(kTrue));
+ EXPECT_EQ(0, recorder.GetCount(kFalse));
+ translate::ReportSimilarLanguageMatch(true);
+ EXPECT_EQ(1, recorder.GetCount(kTrue));
+ EXPECT_EQ(0, recorder.GetCount(kFalse));
+ translate::ReportSimilarLanguageMatch(false);
+ EXPECT_EQ(1, recorder.GetCount(kTrue));
+ EXPECT_EQ(1, recorder.GetCount(kFalse));
+}
+
+// A 9009-microsecond interval is expected to show up as one sample in a bucket
+// containing 9.009, i.e. the value expressed in milliseconds.
+TEST(TranslateMetricsTest, ReportLanguageDetectionTime) {
+ MetricsRecorder recorder(
+ translate::GetMetricsName(translate::UMA_LANGUAGE_DETECTION));
+ recorder.CheckTotalCount(0);
+ TimeTicks begin = TimeTicks::Now();
+ TimeTicks end = begin + base::TimeDelta::FromMicroseconds(9009);
+ translate::ReportLanguageDetectionTime(begin, end);
+ recorder.CheckValueInLogs(9.009);
+ recorder.CheckTotalCount(1);
+}
diff --git a/components/translate/common/translate_switches.cc b/components/translate/common/translate_switches.cc
new file mode 100644
index 0000000..714bbfb
--- /dev/null
+++ b/components/translate/common/translate_switches.cc
@@ -0,0 +1,14 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/common/translate_switches.h"
+
+namespace translate {
+namespace switches {
+
+// Overrides security-origin with which Translate runs in an isolated world.
+const char kTranslateSecurityOrigin[] = "translate-security-origin";
+
+} // namespace switches
+} // namespace translate
diff --git a/components/translate/common/translate_switches.h b/components/translate/common/translate_switches.h
new file mode 100644
index 0000000..2b42bb5
--- /dev/null
+++ b/components/translate/common/translate_switches.h
@@ -0,0 +1,16 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_TRANSLATE_COMMON_TRANSLATE_SWITCHES_H_
+#define COMPONENTS_TRANSLATE_COMMON_TRANSLATE_SWITCHES_H_
+
+namespace translate {
+namespace switches {
+
+extern const char kTranslateSecurityOrigin[];
+
+} // namespace switches
+} // namespace translate
+
+#endif // COMPONENTS_TRANSLATE_COMMON_TRANSLATE_SWITCHES_H_
diff --git a/components/translate/common/translate_util.cc b/components/translate/common/translate_util.cc
new file mode 100644
index 0000000..4e151d3
--- /dev/null
+++ b/components/translate/common/translate_util.cc
@@ -0,0 +1,125 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/common/translate_util.h"
+
+#include "base/basictypes.h"
+#include "base/command_line.h"
+#include "base/logging.h"
+#include "base/strings/string_split.h"
+#include "components/translate/common/translate_switches.h"
+#include "url/gurl.h"
+
+namespace {
+
+// Splits |language| into a main part and a tail part. For example, 'en-US'
+// yields the main part 'en' and the tail part '-US'. Both outputs are left
+// untouched if splitting on '-' produces no pieces.
+void SplitIntoMainAndTail(const std::string& language,
+                          std::string* main_part,
+                          std::string* tail_part) {
+  DCHECK(main_part);
+  DCHECK(tail_part);
+
+  std::vector<std::string> pieces;
+  base::SplitString(language, '-', &pieces);
+  if (pieces.empty())
+    return;
+
+  // The tail is whatever follows the main part's length in the original
+  // string, so it keeps its leading '-'.
+  *main_part = pieces.front();
+  *tail_part = language.substr(main_part->size());
+}
+
+} // namespace
+
+namespace translate {
+
+// One row of a language-code mapping table: the same language as named on the
+// Translate side and on the Chrome side.
+struct LanguageCodePair {
+ // Code used in supporting list of Translate.
+ const char* const translate_language;
+
+ // Code used in Chrome internal.
+ const char* const chrome_language;
+};
+
+// Some languages are treated as same languages in Translate even though they
+// are different to be exact.
+//
+// If this table is updated, please sync this with the synonym table in
+// chrome/browser/resources/options/language_options.js
+const LanguageCodePair kLanguageCodeSimilitudes[] = {
+ {"no", "nb"},
+ {"tl", "fil"},
+};
+
+// Some languages have changed codes over the years and sometimes the older
+// codes are used, so we must see them as synonyms.
+//
+// If this table is updated, please sync this with the synonym table in
+// chrome/browser/resources/options/language_options.js
+const LanguageCodePair kLanguageCodeSynonyms[] = {
+ {"iw", "he"},
+ {"jw", "jv"},
+};
+
+const char kSecurityOrigin[] = "https://translate.googleapis.com/";
+
+// Rewrites |*language| from the code Chrome uses to the code Translate uses.
+// An exact match in |kLanguageCodeSimilitudes| replaces the whole code.
+// Otherwise only the main part (the text before the first '-') is mapped
+// through |kLanguageCodeSynonyms| and the tail (e.g. '-US') is re-appended.
+// |language| must not be NULL.
+void ToTranslateLanguageSynonym(std::string* language) {
+  DCHECK(language);
+
+  for (size_t i = 0; i < arraysize(kLanguageCodeSimilitudes); ++i) {
+    if (*language == kLanguageCodeSimilitudes[i].chrome_language) {
+      *language = kLanguageCodeSimilitudes[i].translate_language;
+      return;
+    }
+  }
+
+  std::string main_part, tail_part;
+  SplitIntoMainAndTail(*language, &main_part, &tail_part);
+  if (main_part.empty())
+    return;
+
+  // A linear search is fine here because the synonym table is tiny.
+  for (size_t i = 0; i < arraysize(kLanguageCodeSynonyms); ++i) {
+    if (main_part == kLanguageCodeSynonyms[i].chrome_language) {
+      main_part = kLanguageCodeSynonyms[i].translate_language;
+      break;
+    }
+  }
+
+  *language = main_part + tail_part;
+}
+
+// Rewrites |*language| from the code Translate uses to the code Chrome uses.
+// An exact match in |kLanguageCodeSimilitudes| replaces the whole code.
+// Otherwise only the main part (the text before the first '-') is mapped
+// through |kLanguageCodeSynonyms| and the tail (e.g. '-US') is re-appended.
+// |language| must not be NULL.
+void ToChromeLanguageSynonym(std::string* language) {
+  DCHECK(language);
+
+  for (size_t i = 0; i < arraysize(kLanguageCodeSimilitudes); ++i) {
+    if (*language == kLanguageCodeSimilitudes[i].translate_language) {
+      *language = kLanguageCodeSimilitudes[i].chrome_language;
+      return;
+    }
+  }
+
+  std::string main_part, tail_part;
+  SplitIntoMainAndTail(*language, &main_part, &tail_part);
+  if (main_part.empty())
+    return;
+
+  // A linear search is fine here because the synonym table is tiny.
+  for (size_t i = 0; i < arraysize(kLanguageCodeSynonyms); ++i) {
+    if (main_part == kLanguageCodeSynonyms[i].translate_language) {
+      main_part = kLanguageCodeSynonyms[i].chrome_language;
+      break;
+    }
+  }
+
+  *language = main_part + tail_part;
+}
+
+// Returns the security origin with which Translate runs: the ASCII value of
+// the switches::kTranslateSecurityOrigin switch when it is present on the
+// current process's command line, otherwise |kSecurityOrigin|.
+GURL GetTranslateSecurityOrigin() {
+  CommandLine* cmd = CommandLine::ForCurrentProcess();
+  if (!cmd->HasSwitch(switches::kTranslateSecurityOrigin))
+    return GURL(std::string(kSecurityOrigin));
+  return GURL(cmd->GetSwitchValueASCII(switches::kTranslateSecurityOrigin));
+}
+
+} // namespace translate
diff --git a/components/translate/common/translate_util.h b/components/translate/common/translate_util.h
new file mode 100644
index 0000000..41b3496
--- /dev/null
+++ b/components/translate/common/translate_util.h
@@ -0,0 +1,28 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_TRANSLATE_COMMON_TRANSLATE_UTIL_H_
+#define COMPONENTS_TRANSLATE_COMMON_TRANSLATE_UTIL_H_
+
+#include <string>
+
+class GURL;
+
+namespace translate {
+
+// Default security origin for the Translate isolated world.
+// GetTranslateSecurityOrigin() returns it unless the origin is overridden on
+// the command line.
+extern const char kSecurityOrigin[];
+
+// Converts |*language| in place from the code Chrome uses internally to the
+// synonym the Translate server uses (e.g. "he" becomes "iw"). A code without
+// a synonym is left unchanged.
+void ToTranslateLanguageSynonym(std::string* language);
+
+// Converts |*language| in place from the code the Translate server uses to
+// the synonym Chrome uses internally (e.g. "iw" becomes "he"). A code without
+// a synonym is left unchanged.
+void ToChromeLanguageSynonym(std::string* language);
+
+// Returns the security origin with which Translate runs: the value of the
+// switches::kTranslateSecurityOrigin command-line switch if it is set,
+// |kSecurityOrigin| otherwise.
+GURL GetTranslateSecurityOrigin();
+
+} // namespace translate
+
+#endif // COMPONENTS_TRANSLATE_COMMON_TRANSLATE_UTIL_H_
diff --git a/components/translate/common/translate_util_unittest.cc b/components/translate/common/translate_util_unittest.cc
new file mode 100644
index 0000000..f53996b
--- /dev/null
+++ b/components/translate/common/translate_util_unittest.cc
@@ -0,0 +1,96 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/common/translate_util.h"
+
+#include "base/command_line.h"
+#include "components/translate/common/translate_switches.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "url/gurl.h"
+
+typedef testing::Test TranslateUtilTest;
+
+// Checks that Chrome language codes are converted to the codes used in the
+// Translate server's supported-language list.
+TEST_F(TranslateUtilTest, ToTranslateLanguageSynonym) {
+  static const struct {
+    const char* chrome_code;
+    const char* expected_code;
+  } kCases[] = {
+    {"nb", "no"},
+    {"he", "iw"},
+    {"jv", "jw"},
+    {"fil", "tl"},
+    // The sub code is preserved if the language has a synonym.
+    {"he-IL", "iw-IL"},
+    // A similitude only applies to an exact match, so a code that carries a
+    // sub code is left untouched.
+    {"nb-NO", "nb-NO"},
+    // A code without a synonym is returned as is.
+    {"en", "en"},
+  };
+
+  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
+    std::string language(kCases[i].chrome_code);
+    translate::ToTranslateLanguageSynonym(&language);
+    EXPECT_EQ(kCases[i].expected_code, language);
+  }
+}
+
+// Checks that Translate server language codes are converted to the codes
+// Chrome uses internally.
+TEST_F(TranslateUtilTest, ToChromeLanguageSynonym) {
+  static const struct {
+    const char* translate_code;
+    const char* expected_code;
+  } kCases[] = {
+    {"no", "nb"},
+    {"iw", "he"},
+    {"jw", "jv"},
+    {"tl", "fil"},
+    // The sub code is preserved if the language has a synonym.
+    {"iw-IL", "he-IL"},
+    // A similitude only applies to an exact match, so a code that carries a
+    // sub code is left untouched.
+    {"no-NO", "no-NO"},
+    // A code without a synonym is returned as is.
+    {"en", "en"},
+  };
+
+  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
+    std::string language(kCases[i].translate_code);
+    translate::ToChromeLanguageSynonym(&language);
+    EXPECT_EQ(kCases[i].expected_code, language);
+  }
+}
+
+// Checks that the default origin is used until the security-origin switch is
+// added to the process command line, after which the switch value is used.
+TEST_F(TranslateUtilTest, SecurityOrigin) {
+  // No switch yet: the built-in default is reported.
+  const GURL default_origin = translate::GetTranslateSecurityOrigin();
+  EXPECT_EQ(std::string(translate::kSecurityOrigin), default_origin.spec());
+
+  // With the switch appended, its value takes over.
+  const std::string override_origin("http://www.tamurayukari.com/");
+  CommandLine::ForCurrentProcess()->AppendSwitchASCII(
+      translate::switches::kTranslateSecurityOrigin, override_origin);
+  const GURL overridden_origin = translate::GetTranslateSecurityOrigin();
+  EXPECT_EQ(override_origin, overridden_origin.spec());
+}
diff --git a/components/translate/language_detection/DEPS b/components/translate/language_detection/DEPS
new file mode 100644
index 0000000..a8cdc15
--- /dev/null
+++ b/components/translate/language_detection/DEPS
@@ -0,0 +1,6 @@
+include_rules = [
+ # CLD library. Both third_party/cld and third_party/cld_2 are allowed
+ # because language_detection_util.cc includes headers from each.
+ "+third_party/cld",
+ "+third_party/cld/encodings/compact_lang_det/win",
+ "+third_party/cld_2/src",
+]
diff --git a/components/translate/language_detection/language_detection_util.cc b/components/translate/language_detection/language_detection_util.cc
new file mode 100644
index 0000000..01910a4
--- /dev/null
+++ b/components/translate/language_detection/language_detection_util.cc
@@ -0,0 +1,401 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/language_detection/language_detection_util.h"
+
+#include "base/logging.h"
+#include "base/metrics/field_trial.h"
+#include "base/strings/string_split.h"
+#include "base/strings/string_util.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/time/time.h"
+#include "components/translate/common/translate_constants.h"
+#include "components/translate/common/translate_metrics.h"
+#include "components/translate/common/translate_util.h"
+
+#if !defined(CLD_VERSION) || CLD_VERSION==1
+#include "third_party/cld/encodings/compact_lang_det/compact_lang_det.h"
+#include "third_party/cld/encodings/compact_lang_det/win/cld_unicodetext.h"
+#endif
+
+#if !defined(CLD_VERSION) || CLD_VERSION==2
+#include "third_party/cld_2/src/public/compact_lang_det.h"
+#endif
+
+namespace {
+
+// Similar language code list. Some languages are very similar and difficult
+// for CLD to distinguish.
+// Each entry assigns a language code to a group; GetSimilarLanguageGroupCode()
+// looks codes up in this table and IsSameOrSimilarLanguages() treats two
+// languages with the same non-zero group as similar.
+struct SimilarLanguageCode {
+ // Language code, e.g. "bs".
+ const char* const code;
+ // Group identifier. 0 is never used here because the lookup returns 0 for
+ // "not in any group".
+ int group;
+};
+
+const SimilarLanguageCode kSimilarLanguageCodes[] = {
+ {"bs", 1},
+ {"hr", 1},
+ {"hi", 2},
+ {"ne", 2},
+};
+
+// Returns the similar-language group of |language| as listed in
+// |kSimilarLanguageCodes|, or 0 when it belongs to no group.
+// |language| matches a table entry when it is exactly that code, or that code
+// followed by a '-' sub code (e.g. "hr" and "hr-HR"). A bare prefix test is
+// not enough: an unrelated three-letter code such as "hil" would otherwise be
+// filed under "hi".
+int GetSimilarLanguageGroupCode(const std::string& language) {
+  for (size_t i = 0; i < arraysize(kSimilarLanguageCodes); ++i) {
+    const std::string code(kSimilarLanguageCodes[i].code);
+    if (language.compare(0, code.size(), code) != 0)
+      continue;
+    // The prefix matched; accept it only if it ends at a code boundary.
+    if (language.size() == code.size() || language[code.size()] == '-')
+      return kSimilarLanguageCodes[i].group;
+  }
+  return 0;
+}
+
+// Well-known languages which often have wrong server configuration of
+// Content-Language: en.
+// Both the pointers and the strings they point to are const, so the table
+// cannot be modified at run time.
+// TODO(toyoshim): Move these static tables and caller functions to
+// translate/common, and implement them as std::set<>.
+const char* const kWellKnownCodesOnWrongConfiguration[] = {
+  "es", "pt", "ja", "ru", "de", "zh-CN", "zh-TW", "ar", "id", "fr", "it", "th"
+};
+
+// Normalizes |*code| in three ordered steps: fix well-known format errors,
+// validate the format, then convert to the Translate server's synonym.
+// |*code| is emptied if it is still not a valid language code after the
+// first step.
+void ApplyLanguageCodeCorrection(std::string* code) {
+  translate::CorrectLanguageCodeTypo(code);
+
+  if (translate::IsValidLanguageCode(*code))
+    translate::ToTranslateLanguageSynonym(code);
+  else
+    code->clear();
+}
+
+// Returns the major version of CLD to use. When CLD_VERSION is defined at
+// build time it is returned directly; otherwise the "CLD1VsCLD2" field trial
+// decides, selecting 2 for the "CLD2" group and 1 for anything else.
+int GetCLDMajorVersion() {
+#if defined(CLD_VERSION)
+  return CLD_VERSION;
+#else
+  const std::string trial_group =
+      base::FieldTrialList::FindFullName("CLD1VsCLD2");
+  return trial_group == "CLD2" ? 2 : 1;
+#endif
+}
+
+// Detects the language of |text| with CLD and returns its language code, or
+// translate::kUnknownLanguageCode when the result is not trusted: CLD reports
+// it as unreliable, fewer than 100 bytes of text were used, or CLD returned
+// one of its "unknown" sentinel values.
+// |is_cld_reliable|, if non-NULL, receives CLD's own reliability flag. It is
+// written even when the detected language ends up being rejected.
+std::string DetermineTextLanguage(const base::string16& text,
+ bool* is_cld_reliable) {
+ std::string language = translate::kUnknownLanguageCode;
+ int text_bytes = 0;
+ bool is_reliable = false;
+
+ // Holds the detected language as a plain int because its enum type depends
+ // on the CLD version in use: Language for CLD1, CLD2::Language for CLD2.
+ int cld_language = 0;
+ bool is_valid_language = false;
+
+ // Run the detector picked by GetCLDMajorVersion(). Each case is only
+ // compiled in when the matching CLD version is part of the build.
+ switch (GetCLDMajorVersion()) {
+#if !defined(CLD_VERSION) || CLD_VERSION==1
+ case 1: {
+ int num_languages = 0;
+ cld_language =
+ DetectLanguageOfUnicodeText(NULL, text.c_str(), true, &is_reliable,
+ &num_languages, NULL, &text_bytes);
+ is_valid_language = cld_language != NUM_LANGUAGES &&
+ cld_language != UNKNOWN_LANGUAGE &&
+ cld_language != TG_UNKNOWN_LANGUAGE;
+ break;
+ }
+#endif
+#if !defined(CLD_VERSION) || CLD_VERSION==2
+ case 2: {
+ // CLD2 is given UTF-8 text, so convert from UTF-16 first.
+ std::string utf8_text(UTF16ToUTF8(text));
+ CLD2::Language language3[3];
+ int percent3[3];
+ cld_language = CLD2::DetectLanguageSummary(
+ utf8_text.c_str(), (int)utf8_text.size(), true, language3, percent3,
+ &text_bytes, &is_reliable);
+ is_valid_language = cld_language != CLD2::NUM_LANGUAGES &&
+ cld_language != CLD2::UNKNOWN_LANGUAGE &&
+ cld_language != CLD2::TG_UNKNOWN_LANGUAGE;
+ break;
+ }
+#endif
+ default:
+ NOTREACHED();
+ }
+
+ // Report CLD's raw reliability flag before the extra filtering below.
+ if (is_cld_reliable != NULL)
+ *is_cld_reliable = is_reliable;
+
+ // We don't trust the result if the CLD reports that the detection is not
+ // reliable, or if the actual text used to detect the language was less than
+ // 100 bytes (short texts can often lead to wrong results).
+ // TODO(toyoshim): CLD provides |is_reliable| flag. But, it just says that
+ // the determined language code is correct with 50% confidence. Chrome should
+ // handle the real confidence value to judge.
+ if (is_reliable && text_bytes >= 100 && is_valid_language) {
+ // We should not use LanguageCode_ISO_639_1 because it does not cover all
+ // the languages CLD can detect. As a result, it'll return the invalid
+ // language code for traditional Chinese among others.
+ // |LanguageCodeWithDialect| will go through ISO 639-1, ISO-639-2 and
+ // 'other' tables to do the 'right' thing. In addition, it'll return zh-CN
+ // for Simplified Chinese.
+ switch (GetCLDMajorVersion()) {
+#if !defined(CLD_VERSION) || CLD_VERSION==1
+ case 1:
+ language =
+ LanguageCodeWithDialects(static_cast<Language>(cld_language));
+ break;
+#endif
+#if !defined(CLD_VERSION) || CLD_VERSION==2
+ case 2:
+ // (1) CLD2's LanguageCode returns general Chinese 'zh' for
+ // CLD2::CHINESE, but Translate server doesn't accept it. This is
+ // converted to 'zh-CN' in the same way as CLD1's
+ // LanguageCodeWithDialects.
+ //
+ // (2) CLD2's LanguageCode returns zh-Hant instead of zh-TW for
+ // CLD2::CHINESE_T. This is technically more precise for the language
+ // code of traditional Chinese, while Translate server hasn't accepted
+ // zh-Hant yet.
+ if (cld_language == CLD2::CHINESE) {
+ language = "zh-CN";
+ } else if (cld_language == CLD2::CHINESE_T) {
+ language = "zh-TW";
+ } else {
+ language =
+ CLD2::LanguageCode(static_cast<CLD2::Language>(cld_language));
+ }
+ break;
+#endif
+ default:
+ NOTREACHED();
+ }
+ }
+ // Verbose-only trace of the decision together with the analyzed text.
+ VLOG(9) << "Detected lang_id: " << language << ", from Text:\n" << text
+ << "\n*************************************\n";
+ return language;
+}
+
+// Returns true if the CLD result can supply the sub code that the page
+// language is missing.
+// The Translate server cannot handle general Chinese. So when the page
+// language is plain "zh" and CLD reports a "zh-" dialect, the CLD language
+// takes priority.
+// TODO(hajimehoshi): How about the other dialects like zh-MO?
+bool CanCLDComplementSubCode(
+    const std::string& page_language, const std::string& cld_language) {
+  if (page_language != "zh")
+    return false;
+  return StartsWithASCII(cld_language, "zh-", false);
+}
+
+} // namespace
+
+namespace translate {
+
+// Determines the language of a page by combining the language the page
+// declares with the language CLD detects in its text.
+//   |code|: value of the Content-Language header/meta tag; may be empty.
+//   |html_lang|: value of the html lang attribute; may be empty. It is
+//       preferred over |code| when it is valid.
+//   |contents|: text of the page handed to CLD.
+//   |cld_language_p|: if non-NULL, receives the raw CLD language, before
+//       conversion to the Translate server's synonym.
+//   |is_cld_reliable_p|: if non-NULL, receives CLD's reliability flag.
+// Returns the chosen language code. This is kUnknownLanguageCode when the
+// declared language and CLD disagree and no special case applies.
+std::string DeterminePageLanguage(const std::string& code,
+                                  const std::string& html_lang,
+                                  const base::string16& contents,
+                                  std::string* cld_language_p,
+                                  bool* is_cld_reliable_p) {
+  base::TimeTicks begin_time = base::TimeTicks::Now();
+  bool is_cld_reliable = false;
+  std::string cld_language = DetermineTextLanguage(contents, &is_cld_reliable);
+  translate::ReportLanguageDetectionTime(begin_time, base::TimeTicks::Now());
+
+  if (cld_language_p != NULL)
+    *cld_language_p = cld_language;
+  if (is_cld_reliable_p != NULL)
+    *is_cld_reliable_p = is_cld_reliable;
+  translate::ToTranslateLanguageSynonym(&cld_language);
+
+  // Check if html lang attribute is valid.
+  std::string modified_html_lang;
+  if (!html_lang.empty()) {
+    modified_html_lang = html_lang;
+    ApplyLanguageCodeCorrection(&modified_html_lang);
+    translate::ReportHtmlLang(html_lang, modified_html_lang);
+    VLOG(9) << "html lang based language code: " << modified_html_lang;
+  }
+
+  // Check if Content-Language is valid.
+  std::string modified_code;
+  if (!code.empty()) {
+    modified_code = code;
+    ApplyLanguageCodeCorrection(&modified_code);
+    translate::ReportContentLanguage(code, modified_code);
+  }
+
+  // Adopt |modified_html_lang| if it is valid. Otherwise, adopt
+  // |modified_code|.
+  std::string language = modified_html_lang.empty() ? modified_code :
+                                                      modified_html_lang;
+
+  // If |language| is empty, just use CLD result even though it might be
+  // translate::kUnknownLanguageCode.
+  if (language.empty()) {
+    translate::ReportLanguageVerification(
+        translate::LANGUAGE_VERIFICATION_CLD_ONLY);
+    return cld_language;
+  }
+
+  // CLD has no opinion: trust the language the page declares.
+  if (cld_language == kUnknownLanguageCode) {
+    translate::ReportLanguageVerification(
+        translate::LANGUAGE_VERIFICATION_UNKNOWN);
+    return language;
+  }
+
+  // The page does not give a sub code but CLD can supply it.
+  if (CanCLDComplementSubCode(language, cld_language)) {
+    translate::ReportLanguageVerification(
+        translate::LANGUAGE_VERIFICATION_CLD_COMPLEMENT_SUB_CODE);
+    return cld_language;
+  }
+
+  // The page and CLD agree, or name languages CLD finds hard to tell apart.
+  if (IsSameOrSimilarLanguages(language, cld_language)) {
+    translate::ReportLanguageVerification(
+        translate::LANGUAGE_VERIFICATION_CLD_AGREE);
+    return language;
+  }
+
+  // The declared language looks like a wrong server default: trust CLD.
+  if (MaybeServerWrongConfiguration(language, cld_language)) {
+    translate::ReportLanguageVerification(
+        translate::LANGUAGE_VERIFICATION_TRUST_CLD);
+    return cld_language;
+  }
+
+  // Content-Language value might be wrong because CLD says that this page
+  // is written in another language with confidence.
+  // In this case, Chrome doesn't rely on any of the language codes, and
+  // gives up suggesting a translation.
+  translate::ReportLanguageVerification(
+      translate::LANGUAGE_VERIFICATION_CLD_DISAGREE);
+  return std::string(kUnknownLanguageCode);
+}
+
+// Fixes well-known formatting mistakes in |*code| in place:
+//   - keeps only the first entry of a comma-separated list,
+//   - trims surrounding whitespace,
+//   - turns the first underscore into a dash,
+//   - lower-cases everything before the first dash and upper-cases the rest.
+void CorrectLanguageCodeTypo(std::string* code) {
+  DCHECK(code);
+
+  // More than one language may be specified; drop everything from the first
+  // comma on.
+  const size_t comma_pos = code->find(',');
+  if (comma_pos != std::string::npos)
+    code->erase(comma_pos);
+  TrimWhitespaceASCII(*code, TRIM_ALL, code);
+
+  // An underscore instead of a dash is a frequent mistake.
+  const size_t underscore_pos = code->find('_');
+  if (underscore_pos != std::string::npos)
+    code->replace(underscore_pos, 1, 1, '-');
+
+  // Split at the first dash. Without a dash the whole code is the main part
+  // and the sub part stays empty.
+  const size_t dash_pos = code->find('-');
+  const std::string main_part = code->substr(0, dash_pos);
+  const std::string sub_part =
+      dash_pos == std::string::npos ? std::string() : code->substr(dash_pos);
+
+  *code = StringToLowerASCII(main_part) + StringToUpperASCII(sub_part);
+}
+
+// Returns true if |code| roughly follows /[a-zA-Z]{2,3}(-[a-zA-Z]{2})?/: a
+// main part of two or three ASCII letters, optionally followed by a '-' and
+// a sub code of exactly two ASCII letters.
+// TODO(hajimehoshi): How about es-419, which is used as an Accept language?
+bool IsValidLanguageCode(const std::string& code) {
+  std::vector<std::string> chunks;
+  base::SplitString(code, '-', &chunks);
+
+  // Only "main" or "main-sub" is accepted.
+  if (chunks.empty() || chunks.size() > 2)
+    return false;
+
+  const std::string& main_code = chunks[0];
+
+  // A one-letter main part is rejected so the check matches the pattern
+  // documented above.
+  if (main_code.size() < 2 || main_code.size() > 3)
+    return false;
+
+  for (std::string::const_iterator it = main_code.begin();
+       it != main_code.end(); ++it) {
+    if (!IsAsciiAlpha(*it))
+      return false;
+  }
+
+  if (chunks.size() == 1)
+    return true;
+
+  const std::string& sub_code = chunks[1];
+
+  if (sub_code.size() != 2)
+    return false;
+
+  for (std::string::const_iterator it = sub_code.begin();
+       it != sub_code.end(); ++it) {
+    if (!IsAsciiAlpha(*it))
+      return false;
+  }
+
+  return true;
+}
+
+// Returns true if |page_language| and |cld_language| have the same main part
+// (country code ignored) or belong to the same similar-language group.
+// Unless an input splits into no parts, the outcome is also reported through
+// ReportSimilarLanguageMatch(): false for a strict match, otherwise whether
+// the group lookup matched.
+bool IsSameOrSimilarLanguages(const std::string& page_language,
+                              const std::string& cld_language) {
+  std::vector<std::string> page_parts;
+  base::SplitString(page_language, '-', &page_parts);
+  if (page_parts.empty())
+    return false;
+
+  std::vector<std::string> cld_parts;
+  base::SplitString(cld_language, '-', &cld_parts);
+  if (cld_parts.empty())
+    return false;
+
+  // The main parts are identical: a strict match. This is reported as "not a
+  // similar-language match" to metrics, but the function still returns true.
+  if (page_parts[0] == cld_parts[0]) {
+    translate::ReportSimilarLanguageMatch(false);
+    return true;
+  }
+
+  // Otherwise both languages must be listed in the same similar-language
+  // group. The CLD lookup is skipped if the page language is in no group.
+  const int page_group = GetSimilarLanguageGroupCode(page_language);
+  bool in_same_group = false;
+  if (page_group != 0)
+    in_same_group = (page_group == GetSimilarLanguageGroupCode(cld_language));
+
+  translate::ReportSimilarLanguageMatch(in_same_group);
+  return in_same_group;
+}
+
+// Returns true if |page_language| starts with "en" and |cld_language| is one
+// of |kWellKnownCodesOnWrongConfiguration|.
+// A server may provide "en-*" meta information merely as a default because
+// the user never configured it. If CLD detects one of the well-known
+// languages that often carry such mistaken information, and that are not
+// difficult to distinguish from English, the CLD result should be trusted.
+bool MaybeServerWrongConfiguration(const std::string& page_language,
+                                   const std::string& cld_language) {
+  // Any page language that is not "en*" is respected as is.
+  const bool page_claims_english = StartsWithASCII(page_language, "en", false);
+  if (!page_claims_english)
+    return false;
+
+  size_t i = 0;
+  while (i < arraysize(kWellKnownCodesOnWrongConfiguration) &&
+         cld_language != kWellKnownCodesOnWrongConfiguration[i]) {
+    ++i;
+  }
+  return i < arraysize(kWellKnownCodesOnWrongConfiguration);
+}
+
+// Returns the version string reported by the CLD library selected by
+// GetCLDMajorVersion(). Only the branches for the CLD versions compiled into
+// the build exist; any other major version hits NOTREACHED() and yields an
+// empty string.
+std::string GetCLDVersion() {
+  const int major_version = GetCLDMajorVersion();
+#if !defined(CLD_VERSION) || CLD_VERSION==1
+  if (major_version == 1)
+    return CompactLangDet::DetectLanguageVersion();
+#endif
+#if !defined(CLD_VERSION) || CLD_VERSION==2
+  if (major_version == 2)
+    return CLD2::DetectLanguageVersion();
+#endif
+  NOTREACHED();
+  return "";
+}
+
+} // namespace translate
diff --git a/components/translate/language_detection/language_detection_util.h b/components/translate/language_detection/language_detection_util.h
new file mode 100644
index 0000000..8af9fce
--- /dev/null
+++ b/components/translate/language_detection/language_detection_util.h
@@ -0,0 +1,47 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_TRANSLATE_LANGUAGE_DETECTION_LANGUAGE_DETECTION_UTIL_H_
+#define COMPONENTS_TRANSLATE_LANGUAGE_DETECTION_LANGUAGE_DETECTION_UTIL_H_
+
+#include <string>
+
+#include "base/strings/string16.h"
+
+namespace translate {
+
+// Determines content page language from Content-Language code and contents.
+// |code| is the Content-Language value and |html_lang| the html lang
+// attribute; either may be empty, and a valid |html_lang| is preferred over
+// |code|. |contents| is the page text handed to CLD.
+// |cld_language| and |is_cld_reliable| are optional outputs (may be NULL)
+// that receive the language CLD detected and CLD's reliability flag.
+std::string DeterminePageLanguage(const std::string& code,
+ const std::string& html_lang,
+ const base::string16& contents,
+ std::string* cld_language,
+ bool* is_cld_reliable);
+
+// Corrects language code if it contains well-known mistakes.
+// Used by the implementation of DeterminePageLanguage(); declared here so
+// that tests can call it directly.
+void CorrectLanguageCodeTypo(std::string* code);
+
+// Checks if the language code's format is valid.
+// Used by the implementation of DeterminePageLanguage(); declared here so
+// that tests can call it directly.
+bool IsValidLanguageCode(const std::string& code);
+
+// Checks if languages are matched, or similar. This function returns true
+// against a language pair containing a language which is difficult for CLD to
+// distinguish.
+// Used by DeterminePageLanguage(); declared here so that tests can call it
+// directly.
+bool IsSameOrSimilarLanguages(const std::string& page_language,
+ const std::string& cld_language);
+
+// Checks if languages pair is one of well-known pairs of wrong server
+// configuration.
+// Used by DeterminePageLanguage(); declared here so that tests can call it
+// directly.
+bool MaybeServerWrongConfiguration(const std::string& page_language,
+ const std::string& cld_language);
+
+// Returns the version string of CLD.
+std::string GetCLDVersion();
+
+} // namespace translate
+
+#endif // COMPONENTS_TRANSLATE_LANGUAGE_DETECTION_LANGUAGE_DETECTION_UTIL_H_
diff --git a/components/translate/language_detection/language_detection_util_unittest.cc b/components/translate/language_detection/language_detection_util_unittest.cc
new file mode 100644
index 0000000..c3bf6c1
--- /dev/null
+++ b/components/translate/language_detection/language_detection_util_unittest.cc
@@ -0,0 +1,173 @@
+// Copyright 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "components/translate/language_detection/language_detection_util.h"
+
+#include "base/strings/string16.h"
+#include "base/strings/utf_string_conversions.h"
+#include "components/translate/common/translate_constants.h"
+#include "testing/gtest/include/gtest/gtest.h"
+
+typedef testing::Test LanguageDetectionUtilTest;
+
+// Checks that well-known language code typos are corrected.
+TEST_F(LanguageDetectionUtilTest, LanguageCodeTypoCorrection) {
+  static const struct {
+    const char* input;
+    const char* expected;
+  } kCases[] = {
+    // The second and later codes are stripped.
+    {"ja,en", "ja"},
+    // An underscore is replaced with a hyphen.
+    {"ja_JP", "ja-JP"},
+    // Wrong letter cases are corrected.
+    {"JA-jp", "ja-JP"},
+  };
+
+  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
+    std::string language(kCases[i].input);
+    translate::CorrectLanguageCodeTypo(&language);
+    EXPECT_EQ(kCases[i].expected, language);
+  }
+}
+
+// Checks which language code formats are accepted as valid.
+TEST_F(LanguageDetectionUtilTest, IsValidLanguageCode) {
+  static const struct {
+    const char* code;
+    bool expect_valid;
+  } kCases[] = {
+    {"ja", true},
+    {"ja-JP", true},
+    {"ceb", true},
+    {"ceb-XX", true},
+    // Invalid because the sub code consists of a number.
+    {"utf-8", false},
+    // Invalid because of six characters after hyphen.
+    {"ja-YUKARI", false},
+    // Invalid because of four characters.
+    {"DHMO", false},
+  };
+
+  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
+    const std::string language(kCases[i].code);
+    EXPECT_EQ(kCases[i].expect_valid, translate::IsValidLanguageCode(language));
+  }
+}
+
+// Checks the same-or-similar language comparison, including the similar
+// language table.
+TEST_F(LanguageDetectionUtilTest, SimilarLanguageCode) {
+  static const struct {
+    const char* page_language;
+    const char* cld_language;
+    bool expect_similar;
+  } kCases[] = {
+    {"en", "en", true},
+    {"en", "ja", false},
+
+    // Language codes are the same if their main parts are the same; the sub
+    // code is ignored.
+    {"sr-ME", "sr", true},
+    {"sr", "sr-ME", true},
+    {"he", "he-IL", true},
+    {"eng", "eng-US", true},
+    {"eng-US", "eng", true},
+    {"eng", "enm", false},
+
+    // Even though the main parts are different, some special language pairs
+    // are recognized as the same language.
+    {"bs", "hr", true},
+    {"ne", "hi", true},
+    {"bs", "hi", false},
+  };
+
+  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
+    EXPECT_EQ(kCases[i].expect_similar,
+              translate::IsSameOrSimilarLanguages(kCases[i].page_language,
+                                                  kCases[i].cld_language));
+  }
+}
+
+// Checks that well-known languages which often have a wrong server
+// configuration are handled.
+TEST_F(LanguageDetectionUtilTest, WellKnownWrongConfiguration) {
+  static const struct {
+    const char* page_language;
+    const char* cld_language;
+    bool expect_wrong_configuration;
+  } kCases[] = {
+    {"en", "ja", true},
+    {"en-US", "ja", true},
+    {"en", "zh-CN", true},
+    {"ja", "en", false},
+    {"en", "he", false},
+  };
+
+  for (size_t i = 0; i < sizeof(kCases) / sizeof(kCases[0]); ++i) {
+    EXPECT_EQ(kCases[i].expect_wrong_configuration,
+              translate::MaybeServerWrongConfiguration(
+                  kCases[i].page_language, kCases[i].cld_language));
+  }
+}
+
+// Checks that a Content-Language value CLD disagrees with is ignored: the
+// page language is reported as unknown while CLD's own result is still
+// returned through the output parameters.
+TEST_F(LanguageDetectionUtilTest, CLDDisagreeWithWrongLanguageCode) {
+  const base::string16 page_text = ASCIIToUTF16(
+      "<html><head><meta http-equiv='Content-Language' content='ja'></head>"
+      "<body>This is a page apparently written in English. Even though "
+      "content-language is provided, the value will be ignored if the value "
+      "is suspicious.</body></html>");
+  std::string detected_by_cld;
+  bool cld_was_reliable = false;
+  const std::string page_language = translate::DeterminePageLanguage(
+      std::string("ja"), std::string(), page_text, &detected_by_cld,
+      &cld_was_reliable);
+  EXPECT_EQ(translate::kUnknownLanguageCode, page_language);
+  EXPECT_EQ("en", detected_by_cld);
+  EXPECT_TRUE(cld_was_reliable);
+}
+
+// Checks that an "en-US" style Content-Language value is kept, country code
+// included, when CLD agrees on the main language.
+TEST_F(LanguageDetectionUtilTest, CLDAgreeWithLanguageCodeHavingCountryCode) {
+  const base::string16 page_text = ASCIIToUTF16(
+      "<html><head><meta http-equiv='Content-Language' content='en-US'></head>"
+      "<body>This is a page apparently written in English. Even though "
+      "content-language is provided, the value will be ignored if the value "
+      "is suspicious.</body></html>");
+  std::string detected_by_cld;
+  bool cld_was_reliable = false;
+  const std::string page_language = translate::DeterminePageLanguage(
+      std::string("en-US"), std::string(), page_text, &detected_by_cld,
+      &cld_was_reliable);
+  EXPECT_EQ("en-US", page_language);
+  EXPECT_EQ("en", detected_by_cld);
+  EXPECT_TRUE(cld_was_reliable);
+}
+
+// Checks that an invalid Content-Language value ("utf-8") is discarded and
+// the language detected by CLD is adopted instead.
+TEST_F(LanguageDetectionUtilTest, InvalidLanguageMetaTagProviding) {
+  const base::string16 page_text = ASCIIToUTF16(
+      "<html><head><meta http-equiv='Content-Language' content='utf-8'></head>"
+      "<body>This is a page apparently written in English. Even though "
+      "content-language is provided, the value will be ignored and CLD's"
+      " language will be adopted if the value is invalid.</body></html>");
+  std::string detected_by_cld;
+  bool cld_was_reliable = false;
+  const std::string page_language = translate::DeterminePageLanguage(
+      std::string("utf-8"), std::string(), page_text, &detected_by_cld,
+      &cld_was_reliable);
+  EXPECT_EQ("en", page_language);
+  EXPECT_EQ("en", detected_by_cld);
+  EXPECT_TRUE(cld_was_reliable);
+}
+
+// Checks that a valid html lang attribute takes precedence over a wrong
+// Content-Language value.
+TEST_F(LanguageDetectionUtilTest, AdoptHtmlLang) {
+  const base::string16 page_text = ASCIIToUTF16(
+      "<html lang='en'><head><meta http-equiv='Content-Language' content='ja'>"
+      "</head><body>This is a page apparently written in English. Even though "
+      "content-language is provided, the value will be ignored if the value "
+      "is suspicious.</body></html>");
+  std::string detected_by_cld;
+  bool cld_was_reliable = false;
+  const std::string page_language = translate::DeterminePageLanguage(
+      std::string("ja"), std::string("en"), page_text, &detected_by_cld,
+      &cld_was_reliable);
+  EXPECT_EQ("en", page_language);
+  EXPECT_EQ("en", detected_by_cld);
+  EXPECT_TRUE(cld_was_reliable);
+}