author     sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98>  2009-08-21 17:32:54 +0000
committer  sidchat@google.com <sidchat@google.com@0039d316-1c4b-4281-b951-d872f2087c98>  2009-08-21 17:32:54 +0000
commit     7ed6bc6bcd311741da22bf45d8a88f5de89eadf8 (patch)
tree       b48eba80b69d188cbbd7fe1b83704f84417468b5 /third_party/cld
parent     be9551df15b9d5ebbc1e52cd736c1c95e38f1821 (diff)
Add a method in CLD to get the top 3 languages, and their percentages, for a given Unicode text. This is required for an extension API which returns the top three languages for a web page.
BUG=none TEST=none Review URL: http://codereview.chromium.org/174004 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@23972 0039d316-1c4b-4281-b951-d872f2087c98
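For orientation, this is roughly how a Windows-side caller could use the new summary entry point introduced by this change. Only the DetectLanguageSummaryOfUnicodeText signature below comes from the patch; the <stdio.h> include, the ReportTopLanguages wrapper and the printf reporting are illustrative assumptions, a minimal sketch rather than the extension API itself:

#include <stdio.h>

#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h"

// Hypothetical caller: report the top three languages of some page text.
void ReportTopLanguages(const WCHAR* page_text) {
  Language language3[3];
  int percent3[3];
  int text_bytes = 0;
  bool is_reliable = false;

  // is_plain_text is true here because we assume extracted text, not raw HTML.
  DetectLanguageSummaryOfUnicodeText(page_text, true,
                                     language3, percent3,
                                     &text_bytes, &is_reliable);

  for (int i = 0; i < 3; ++i) {
    // language3[i] is a Language enum value; percent3[i] is its share of the text.
    printf("language %d: %d (%d%%)\n", i + 1, language3[i], percent3[i]);
  }
}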
Diffstat (limited to 'third_party/cld')
-rw-r--r--  third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_unittest_small.cc  | 27
-rw-r--r--  third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc              | 99
-rw-r--r--  third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h               | 14
3 files changed, 118 insertions, 22 deletions
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_unittest_small.cc b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_unittest_small.cc
index 97ff742..e62cd46 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_unittest_small.cc
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det_unittest_small.cc
@@ -24,6 +24,7 @@
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/unittest_data.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_commandlineflags.h"
+#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_google.h"
//DEFINE_bool(html, false, "Print language spans in HTML on stderr");
@@ -38,6 +39,10 @@ const char* kTeststr_en =
"jury of members two courts combine for the purpose the most important cases "
"of all are brought jurors or";
+const char* kTeststr_en_fr_de =
+ " a backup credit card by visiting your billing preferences page or visit the adwords help centre for more details https adwords google com support bin answer py answer hl en we were unable to process the payment of for your outstanding google adwords" // ENGLISH
+ " a accès aux collections et aux frontaux qui lui ont été attribués il peut consulter et modifier ses collections et exporter des configurations de collection toutefois il ne peut pas créer ni supprimer des collections enfin il a accès aux fonctions" // FRENCH
+ " abschnitt ordner aktivieren werden die ordnereinstellungen im farbabschnitt deaktiviert öchten sie wirklich fortfahren eldtypen angeben optional n diesem schritt geben sie für jedesfeld aus dem datenset den typ an ieser schritt ist optional eldtypen"; // GERMAN
// UTF8 constants. Use a UTF-8 aware editor for this file
const char* kTeststr_ks =
@@ -135,6 +140,18 @@ class CompactLangDetTest : public testing::Test {
&is_reliable);
return lang;
}
+
+ // Detect the top three languages using DetectLanguageSummary.
+ void TestDetectLanguageSummary(const char* src, Language* language3) {
+ bool is_plain_text = true;
+ int percent3[3];
+ int text_bytes;
+ bool is_reliable;
+
+ CompactLangDet::DetectLanguageSummary(src, strlen(src), is_plain_text,
+ language3, percent3, &text_bytes,
+ &is_reliable);
+ }
}; // End class CompactLangDetTest.
} // End namespace.
@@ -389,3 +406,13 @@ TEST_F(CompactLangDetTest, ExtendedTests) {
EXPECT_EQ(ENGLISH, TestCompactLangDetPlain(kTeststr_zzh_Latn));
EXPECT_EQ(ENGLISH, TestExtCompactLangDetPlain(kTeststr_zzh_Latn));
}
+
+
+TEST_F(CompactLangDetTest, DetectLanguageSummaryTests) {
+ Language language3[3];
+ TestDetectLanguageSummary(kTeststr_en_fr_de, language3);
+ EXPECT_EQ(FRENCH, language3[0]);
+ EXPECT_EQ(GERMAN, language3[1]);
+ EXPECT_EQ(ENGLISH, language3[2]);
+}
+
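The new test drives CompactLangDet::DetectLanguageSummary directly on UTF-8 input. Pulled out of the test fixture, the call looks roughly like the sketch below; the compact_lang_det.h include path and the SummarizeUtf8 wrapper name are assumptions for illustration, while the argument list matches the calls in this patch:

#include <string.h>

#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/compact_lang_det.h"

// Hypothetical stand-alone equivalent of the TestDetectLanguageSummary helper.
void SummarizeUtf8(const char* utf8_text) {
  Language language3[3];   // Receives the top three detected languages.
  int percent3[3];         // Receives their percentages of the text.
  int text_bytes = 0;
  bool is_reliable = false;

  CompactLangDet::DetectLanguageSummary(utf8_text, strlen(utf8_text),
                                        true,  // is_plain_text
                                        language3, percent3,
                                        &text_bytes, &is_reliable);
}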
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
index 2fc1cfd..85dae05 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.cc
@@ -13,24 +13,16 @@
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_scopedptr.h"
#include "third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/normalizedunicodetext.h"
-
-// Detects a language of the UTF-16 encoded zero-terminated text.
-// Returns: Language enum.
-// TODO : make it reuse already allocated buffers to avoid excessive
-// allocate/free call pairs. The idea is to have two buffers allocated and
-// alternate their use for every Windows API call.
-// Let's leave it as it is, simple and working and optimize it as the next step
-// if it will consume too much resources (after careful measuring, indeed).
-Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
- bool* is_reliable, int* num_languages,
- DWORD* error_code) {
+std::string NormalizeText(const WCHAR* text,
+ int* num_languages,
+ DWORD* error_code) {
if (!text || !num_languages) {
if (error_code)
*error_code = ERROR_INVALID_PARAMETER;
- return NUM_LANGUAGES;
+ return std::string();
}
- // Normalize text first. We do not check the return value here since there
+ // Normalize text here. We do not check the return value here since there
// is no meaningful recovery we can do in case of failure anyway.
// Since the vast majority of texts on the Internet is already normalized
// and languages which require normalization are easy to recognize by CLD
@@ -48,12 +40,12 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
if (!lowercase_text_size) {
if (error_code)
*error_code = ::GetLastError();
- return NUM_LANGUAGES;
+ return std::string();
}
scoped_array<WCHAR> lowercase_text(new WCHAR[lowercase_text_size]);
if (!lowercase_text.get())
- return NUM_LANGUAGES;
+ return std::string();
// Convert text to lowercase.
int lowercasing_result =
@@ -63,7 +55,7 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
if (!lowercasing_result) {
if (error_code)
*error_code = ::GetLastError();
- return NUM_LANGUAGES;
+ return std::string();
}
// Determine the size of the buffer required to convert text to UTF-8.
@@ -75,27 +67,49 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
if (!utf8_encoded_buffer_size) {
if (error_code)
*error_code = ::GetLastError();
- return NUM_LANGUAGES;
+ return std::string();
}
- scoped_array<char> utf8_encoded_buffer(
- new char[utf8_encoded_buffer_size]);
+ scoped_array<char> utf8_encoded_buffer(new char[utf8_encoded_buffer_size]);
// Convert text to UTF-8.
int utf8_encoding_result =
::WideCharToMultiByte(CP_UTF8, 0,
lowercase_text.get(), -1,
- utf8_encoded_buffer.get(), utf8_encoded_buffer_size,
+ utf8_encoded_buffer.get(),
+ utf8_encoded_buffer_size,
NULL, NULL);
if (!utf8_encoding_result) {
if (error_code)
*error_code = ::GetLastError();
- return NUM_LANGUAGES;
+ return std::string();
}
if (error_code)
*error_code = 0;
+ return std::string(utf8_encoded_buffer.get());
+}
+
+
+// Detects a language of the UTF-16 encoded zero-terminated text.
+// Returns: Language enum.
+// TODO : make it reuse already allocated buffers to avoid excessive
+// allocate/free call pairs. The idea is to have two buffers allocated and
+// alternate their use for every Windows API call.
+// Let's leave it as it is, simple and working and optimize it as the next step
+// if it will consume too much resources (after careful measuring, indeed).
+Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
+ bool* is_reliable, int* num_languages,
+ DWORD* error_code) {
+ // Normalize text.
+ std::string utf8_encoded_string_buffer = NormalizeText(text, num_languages,
+ error_code);
+ if (utf8_encoded_string_buffer.empty())
+ return NUM_LANGUAGES;
+
+ int utf8_encoded_buffer_size = utf8_encoded_string_buffer.length();
+
// Engage core CLD library language detection.
Language language3[3] = {
UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE, UNKNOWN_LANGUAGE
@@ -109,7 +123,7 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
// See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
// language3 array is always set according to the detection results and
// is not affected by this heuristic.
- CompactLangDet::DetectLanguageSummary(utf8_encoded_buffer.get(),
+ CompactLangDet::DetectLanguageSummary(utf8_encoded_string_buffer.c_str(),
utf8_encoded_buffer_size,
is_plain_text, language3, percent3,
&text_bytes, is_reliable);
@@ -128,3 +142,44 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
return language3[0];
}
+
+void DetectLanguageSummaryOfUnicodeText(const WCHAR* text,
+ bool is_plain_text,
+ Language language[3],
+ int percent[3],
+ int* text_bytes,
+ bool* is_reliable) {
+  // Set default values for the outputs in case normalization fails.
+  language[0] = language[1] = language[2] = UNKNOWN_LANGUAGE;
+  percent[0] = 100;
+  percent[1] = percent[2] = 0;
+
+  // Normalize text first (lowercase it and convert it to UTF-8).
+  int num_languages;
+  DWORD error_code;
+  std::string utf8_encoded_string_buffer = NormalizeText(text, &num_languages,
+                                                         &error_code);
+  if (utf8_encoded_string_buffer.empty()) {
+    *is_reliable = false;
+    *text_bytes = 0;
+    return;
+  }
+
+  int utf8_encoded_buffer_size = utf8_encoded_string_buffer.length();
+
+  // Engage core CLD library language detection.
+ // We ignore return value here due to the problem described in bug 1800161.
+ // For example, translate.google.com was detected as Indonesian. It happened
+ // due to the heuristic in CLD, which ignores English as a top language
+ // in the presence of another reliably detected language.
+ // See the actual code in compact_lang_det_impl.cc, CalcSummaryLang function.
+ // language3 array is always set according to the detection results and
+ // is not affected by this heuristic.
+ CompactLangDet::DetectLanguageSummary(utf8_encoded_string_buffer.c_str(),
+ utf8_encoded_buffer_size,
+ is_plain_text, language, percent,
+ text_bytes, is_reliable);
+}
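NormalizeText keeps the usual two-call Win32 pattern for the UTF-16 to UTF-8 step: first query the required buffer size, then convert into an allocated buffer. A minimal stand-alone sketch of just that step, assuming the input is already normalized and lowercased (the real function also lowercases the text via the Windows API and reports failures through GetLastError()):

#include <windows.h>

#include <string>

// Sketch of the size-query-then-convert pattern used by NormalizeText.
std::string Utf16ToUtf8(const WCHAR* text) {
  // First call: with a zero-sized output buffer, WideCharToMultiByte returns
  // the number of bytes needed, including the terminating NUL (the input
  // length of -1 means the string is zero-terminated).
  int utf8_size = ::WideCharToMultiByte(CP_UTF8, 0, text, -1,
                                        NULL, 0, NULL, NULL);
  if (!utf8_size)
    return std::string();

  std::string utf8(utf8_size, '\0');
  // Second call: perform the actual conversion into the buffer.
  if (!::WideCharToMultiByte(CP_UTF8, 0, text, -1,
                             &utf8[0], utf8_size, NULL, NULL))
    return std::string();

  utf8.resize(utf8_size - 1);  // Drop the trailing NUL that was written.
  return utf8;
}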
diff --git a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
index 763c81e..5759dcf 100644
--- a/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
+++ b/third_party/cld/bar/toolbar/cld/i18n/encodings/compact_lang_det/win/cld_unicodetext.h
@@ -30,5 +30,19 @@ Language DetectLanguageOfUnicodeText(const WCHAR* text, bool is_plain_text,
bool* is_reliable, int* num_languages,
DWORD* error_code);
+// Detects the top 3 languages in the UTF-16 encoded zero-terminated text.
+// [in] text - UTF-16 encoded text.
+// [in] is_plain_text - true if plain text, false otherwise (e.g. HTML).
+// [out] language[3] - Top 3 languages (default: UNKNOWN_LANGUAGE).
+// [out] percent[3] - Percentages of the languages (default: percent[0] = 100,
+//                    percent[1] = percent[2] = 0).
+// [out] is_reliable - true if the detection is reliable.
+// See CompactLangDet::DetectLanguageSummary() for more information.
+void DetectLanguageSummaryOfUnicodeText(const WCHAR* text,
+ bool is_plain_text,
+ Language language[3],
+ int percent[3],
+ int* text_bytes,
+ bool* is_reliable);
#endif // BAR_TOOLBAR_CLD_I18N_ENCODINGS_COMPACT_LANG_DET_WIN_CLD_UNICODETEXT_H_