summaryrefslogtreecommitdiffstats
path: root/base/i18n/file_util_icu.cc
diff options
context:
space:
mode:
author brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-10-08 17:38:30 +0000
committer brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-10-08 17:38:30 +0000
commit d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036 (patch)
tree 1c2ee733bf62a44c31dc11f76dad53243a84439f /base/i18n/file_util_icu.cc
parent e91d532339c854ff0a082c6562a519647524fa66 (diff)
downloadchromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.zip
chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.tar.gz
chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.tar.bz2
Separate out some more ICU from base and into base/i18n.
This moves string_util_icu. I moved the number formatting function into base/i18n/number_formatting and just removed the other function in string_util_icu which was TrimWhitespaceUTF8. It is only used in a few places and isn't actually helpful (and the fact that it round-trips through UTF-16 is better for the caller to see). This takes out the sorting from the FileEnumerator. The comment says the sorting is not guaranteed. I moved it into file_util_icu as a standalone function for callers of FileEnumerator to call manually if they need sorted results. I modified the directory lister to use this sorting instead, and filed a bug on doing more optimal JS-based sorting. TEST=none BUG=none Review URL: http://codereview.chromium.org/267001 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@28405 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/i18n/file_util_icu.cc')
-rw-r--r--base/i18n/file_util_icu.cc193
1 files changed, 193 insertions, 0 deletions
diff --git a/base/i18n/file_util_icu.cc b/base/i18n/file_util_icu.cc
new file mode 100644
index 0000000..0bc9db6
--- /dev/null
+++ b/base/i18n/file_util_icu.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// File utilities that use the ICU library go in this file.
+
+#include "base/i18n/file_util_icu.h"
+
+#include "base/file_path.h"
+#include "base/scoped_ptr.h"
+#include "base/singleton.h"
+#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
+#include "build/build_config.h"
+#include "unicode/coll.h"
+#include "unicode/uniset.h"
+
+namespace {
+
+// Holds the set of Unicode code points that must not appear in a file name.
+// The constructor is private and only the Singleton machinery is a friend, so
+// the set is built and frozen once, when the singleton instance is created.
+class IllegalCharacters {
+ public:
+  // Returns true if |ucs4| is one of the illegal code points.
+  bool contains(UChar32 ucs4) const {
+    return !!set_->contains(ucs4);
+  }
+
+  // Returns true if |s| contains none of the illegal code points.
+  bool containsNone(const string16& s) const {
+    return !!set_->containsNone(icu::UnicodeString(s.c_str(), s.size()));
+  }
+
+ private:
+  friend class Singleton<IllegalCharacters>;
+  friend struct DefaultSingletonTraits<IllegalCharacters>;
+
+  IllegalCharacters();
+  ~IllegalCharacters() { }
+
+  // The illegal code points. Populated and frozen by the constructor and only
+  // queried afterwards.
+  scoped_ptr<icu::UnicodeSet> set_;
+
+  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
+};
+
+IllegalCharacters::IllegalCharacters() {
+  UErrorCode status = U_ZERO_ERROR;
+  // Control characters, formatting characters, non-characters, and
+  // some printable ASCII characters regarded as dangerous ('"*/:<>?\\|').
+  // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
+  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
+  // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
+  // are legitimate in Arabic and some S/SE Asian scripts. However, when used
+  // elsewhere, they can be confusing/problematic.
+  // The two branches build the same pattern: where wchar_t is UTF-16 the wide
+  // literal is handed to UnicodeString directly, otherwise the \u escapes are
+  // expanded at run time with unescape().
+#if defined(WCHAR_T_IS_UTF16)
+  set_.reset(new icu::UnicodeSet(icu::UnicodeString(
+      L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
+#else
+  set_.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
+      "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
+      status));
+#endif
+  DCHECK(U_SUCCESS(status));
+  // Add non-characters. If this becomes a performance bottleneck by
+  // any chance, do not add these to |set_| and change IsFilenameLegal()
+  // to check |(ucs4 & 0xFFFEu) == 0xFFFEu|, in addition to calling
+  // containsNone(). (The parentheses matter: == binds tighter than &.)
+  set_->add(0xFDD0, 0xFDEF);
+  // U+xFFFE and U+xFFFF of each of the 17 planes (0x00 through 0x10).
+  for (int i = 0; i <= 0x10; ++i) {
+    int plane_base = 0x10000 * i;
+    set_->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
+  }
+  // No further modification is made after this point.
+  set_->freeze();
+}
+
+// Compares UTF-16 strings with an ICU collator. Callers in this file reach it
+// through Singleton<LocaleAwareComparator>.
+class LocaleAwareComparator {
+ public:
+  LocaleAwareComparator() {
+    // No locale is passed, so the default collator is created. The default
+    // locale is expected to be set up before this constructor runs.
+    UErrorCode status = U_ZERO_ERROR;
+    collator_.reset(icu::Collator::createInstance(status));
+    DCHECK(U_SUCCESS(status));
+    // Tertiary strength makes the comparison case-sensitive.
+    collator_->setStrength(icu::Collator::TERTIARY);
+    // UCOL_NORMALIZATION_MODE is deliberately left unset: we skip the
+    // performance cost of guaranteeing a correct order for non-FCD
+    // (http://unicode.org/notes/tn5/#FCD) file names. Such names should be
+    // rare and the resulting order doesn't change much anyway.
+  }
+
+  // Returns the collator's result for |a| versus |b| as an int; callers in
+  // this file treat a negative value as "|a| sorts before |b|".
+  //
+  // Note: A similar function is available in l10n_util.
+  // We cannot use it because base should not depend on l10n_util.
+  // TODO(yuzo): Move some of l10n_util to base.
+  int Compare(const string16& a, const string16& b) {
+    // Collator::compare is not known to be thread-safe, so every call is
+    // serialized with a lock just in case.
+    AutoLock auto_lock(lock_);
+
+    const UChar* const lhs = static_cast<const UChar*>(a.c_str());
+    const int lhs_length = static_cast<int>(a.length());
+    const UChar* const rhs = static_cast<const UChar*>(b.c_str());
+    const int rhs_length = static_cast<int>(b.length());
+
+    UErrorCode status = U_ZERO_ERROR;
+    const UCollationResult order =
+        collator_->compare(lhs, lhs_length, rhs, rhs_length, status);
+    DCHECK(U_SUCCESS(status));
+    return order;
+  }
+
+ private:
+  friend struct DefaultSingletonTraits<LocaleAwareComparator>;
+
+  // Created in the constructor; used only while |lock_| is held.
+  scoped_ptr<icu::Collator> collator_;
+  Lock lock_;
+
+  DISALLOW_COPY_AND_ASSIGN(LocaleAwareComparator);
+};
+
+} // namespace
+
+namespace file_util {
+
+// Returns true when |file_name| contains none of the code points in the
+// IllegalCharacters singleton's set.
+bool IsFilenameLegal(const string16& file_name) {
+  const bool has_no_illegal_characters =
+      Singleton<IllegalCharacters>()->containsNone(file_name);
+  return has_no_illegal_characters;
+}
+
+// Trims whitespace from both ends of |*file_name| and then replaces every
+// illegal code point in it with |replace_char|. |replace_char| must itself be
+// a legal character below 0x10000. The name is left as trimmed when it is
+// already legal.
+void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char) {
+  DCHECK(file_name);
+
+  DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)) &&
+         replace_char < 0x10000);
+
+  // Strip whitespace from both ends before looking at the content.
+  TrimWhitespace(*file_name, TRIM_ALL, file_name);
+
+  // Nothing to rewrite when the trimmed name is already legal.
+  if (IsFilenameLegal(WideToUTF16(*file_name)))
+    return;
+
+  const std::wstring::size_type length = file_name->size();
+  const wchar_t* wstr = file_name->data();
+#if defined(WCHAR_T_IS_UTF16)
+  // UnicodeSet's |span| method might be a bit faster, but it is unlikely to
+  // matter here.
+  std::wstring sanitized;
+  sanitized.reserve(length);
+  for (std::wstring::size_type pos = 0; pos < length;) {
+    const std::wstring::size_type start = pos;
+    UChar32 code_point;
+    // Decodes one code point and advances |pos| past the one or two code
+    // units it occupied.
+    U16_NEXT(wstr, pos, length, code_point);
+    if (Singleton<IllegalCharacters>()->contains(code_point)) {
+      sanitized.push_back(replace_char);
+      continue;
+    }
+    // Legal: copy the consumed code unit(s) through unchanged.
+    sanitized.append(wstr + start, pos - start);
+  }
+  file_name->swap(sanitized);
+#elif defined(WCHAR_T_IS_UTF32)
+  // One wchar_t is one code point, so the replacement can happen in place.
+  for (std::wstring::size_type pos = 0; pos < length; ++pos) {
+    if (Singleton<IllegalCharacters>()->contains(wstr[pos]))
+      (*file_name)[pos] = replace_char;
+  }
+#else
+#error wchar_t* should be either UTF-16 or UTF-32
+#endif
+}
+
+// Returns true when |a| sorts before |b| according to the
+// LocaleAwareComparator singleton.
+bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
+#if defined(OS_WIN)
+  // The path values are handed to the comparator as string16.
+  const string16 lhs(a.value().c_str());
+  const string16 rhs(b.value().c_str());
+  return Singleton<LocaleAwareComparator>()->Compare(lhs, rhs) < 0;
+
+#elif defined(OS_POSIX)
+  // On linux, the file system encoding is not defined. We assume
+  // SysNativeMBToWide takes care of it.
+  //
+  // ICU's collator can take strings in OS native encoding. But we convert the
+  // strings to UTF-16 ourselves to ensure conversion consistency.
+  // TODO(yuzo): Perhaps we should define SysNativeMBToUTF16?
+  const string16 lhs =
+      WideToUTF16(base::SysNativeMBToWide(a.value().c_str()));
+  const string16 rhs =
+      WideToUTF16(base::SysNativeMBToWide(b.value().c_str()));
+  return Singleton<LocaleAwareComparator>()->Compare(lhs, rhs) < 0;
+#else
+  #error Not implemented on your system
+#endif
+}
+
+} // namespace file_util