summaryrefslogtreecommitdiffstats
path: root/base/i18n
diff options
context:
space:
mode:
Diffstat (limited to 'base/i18n')
-rw-r--r--base/i18n/icu_encoding_detection.cc38
-rw-r--r--base/i18n/icu_encoding_detection.h19
2 files changed, 57 insertions, 0 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc
new file mode 100644
index 0000000..55785c5
--- /dev/null
+++ b/base/i18n/icu_encoding_detection.cc
@@ -0,0 +1,38 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/icu_encoding_detection.h"
+
+#include "base/string_util.h"
+#include "unicode/ucsdet.h"
+
+namespace base {
+
+// Detects the character encoding of |text| using ICU's charset detector.
+//
+// |text|     - raw bytes whose encoding is to be guessed.
+// |encoding| - out-parameter; dereferenced unconditionally, so it must be
+//              non-NULL. Receives the encoding name as reported by ICU, or an
+//              empty string when |text| is pure ASCII.
+//
+// Returns true on success. Returns false, leaving |encoding| untouched, when
+// ICU reports an error or produces no match.
+//
+// TODO(jungshik): We can apply more heuristics here (e.g. using various hints
+// like TLD, the UI language/default encoding of a client, etc).
+bool DetectEncoding(const std::string& text, std::string* encoding) {
+  // ASCII needs no detection; it is reported as success with an empty name.
+  if (IsStringASCII(text)) {
+    *encoding = std::string();
+    return true;
+  }
+
+  UErrorCode status = U_ZERO_ERROR;
+  UCharsetDetector* detector = ucsdet_open(&status);
+  ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()),
+                 &status);
+  // TODO(jungshik): Should we check the quality of the match? A rather
+  // arbitrary number is assigned by ICU and it's hard to come up with
+  // a lower limit.
+  const UCharsetMatch* match = ucsdet_detect(detector, &status);
+
+  bool detected = false;
+  // ucsdet_detect() may hand back NULL when no charset matches the input.
+  if (U_SUCCESS(status) && match != NULL) {
+    const char* detected_encoding = ucsdet_getName(match, &status);
+    if (U_SUCCESS(status) && detected_encoding != NULL) {
+      // The name's storage belongs to |match|, which in turn belongs to
+      // |detector|. Copy it out *before* the detector is closed below.
+      *encoding = detected_encoding;
+      detected = true;
+    }
+  }
+
+  // Closing is done on every path past ucsdet_open(), so the detector is
+  // never leaked; closing a NULL detector is harmless.
+  ucsdet_close(detector);
+  return detected;
+}
+
+} // namespace base
diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h
new file mode 100644
index 0000000..0d8e5d8
--- /dev/null
+++ b/base/i18n/icu_encoding_detection.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2010 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_ICU_ENCODING_DETECTION_H_
+#define BASE_I18N_ICU_ENCODING_DETECTION_H_
+
+#include <string>
+
+namespace base {
+
+// Detects the encoding of |text| and stores its name (as returned by ICU) in
+// |encoding|, which must be non-NULL; ASCII-only |text| yields an empty string.
+// Returns true on success, or false (|encoding| untouched) if ICU fails.
+bool DetectEncoding(const std::string& text, std::string* encoding);
+
+} // namespace base
+
+#endif // BASE_I18N_ICU_ENCODING_DETECTION_H_