diff options
Diffstat (limited to 'base/i18n')
-rw-r--r-- | base/i18n/icu_encoding_detection.cc | 38 | ||||
-rw-r--r-- | base/i18n/icu_encoding_detection.h | 19 |
2 files changed, 57 insertions, 0 deletions
diff --git a/base/i18n/icu_encoding_detection.cc b/base/i18n/icu_encoding_detection.cc new file mode 100644 index 0000000..55785c5 --- /dev/null +++ b/base/i18n/icu_encoding_detection.cc @@ -0,0 +1,38 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/i18n/icu_encoding_detection.h" + +#include "base/string_util.h" +#include "unicode/ucsdet.h" + +namespace base { + +// TODO(jungshik): We can apply more heuristics here (e.g. using various hints +// like TLD, the UI language/default encoding of a client, etc). +bool DetectEncoding(const std::string& text, std::string* encoding) { + if (IsStringASCII(text)) { + *encoding = std::string(); + return true; + } + + UErrorCode status = U_ZERO_ERROR; + UCharsetDetector* detector = ucsdet_open(&status); + ucsdet_setText(detector, text.data(), static_cast<int32_t>(text.length()), + &status); + // TODO(jungshik): Should we check the quality of the match? A rather + // arbitrary number is assigned by ICU and it's hard to come up with + // a lower limit. + const UCharsetMatch* match = ucsdet_detect(detector, &status); + const char* detected_encoding = ucsdet_getName(match, &status); + ucsdet_close(detector); + + if (U_FAILURE(status)) + return false; + + *encoding = detected_encoding; + return true; +} + +} // namespace base diff --git a/base/i18n/icu_encoding_detection.h b/base/i18n/icu_encoding_detection.h new file mode 100644 index 0000000..0d8e5d8 --- /dev/null +++ b/base/i18n/icu_encoding_detection.h @@ -0,0 +1,19 @@ +// Copyright (c) 2010 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
#ifndef BASE_I18N_ICU_ENCODING_DETECTION_H_
#define BASE_I18N_ICU_ENCODING_DETECTION_H_

#include <string>

namespace base {

// Guesses the character encoding of |text| and writes the name of that
// encoding, in the form ICU reports it, to |encoding|. When |text| is plain
// ASCII, |encoding| is set to an empty string instead.
// The return value is true if detection succeeded.
bool DetectEncoding(const std::string& text, std::string* encoding);

}  // namespace base

#endif  // BASE_I18N_ICU_ENCODING_DETECTION_H_