Separate out some more ICU from base and into base/i18n.

This moves string_util_icu. I moved the number formatting function into base/i18n/number_formatting and just removed the other function in string_util_icu which was TrimWhitespaceUTF8. It is only used in a few places and isn't actually helpful (and the fact that it round-trips through UTF-16 is better for the caller to see). This takes out the sorting from the FileEnumerator. The comment says the sorting is not guaranteed. I moved it into file_util_icu as a standalone function for callers of FileEnumerator to call manually if they need sorted results. I modified the directory lister to use this sorting instead, and filed a bug on doing more optimal JS-based sorting. TEST=none BUG=none Review URL: http://codereview.chromium.org/267001 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@28405 0039d316-1c4b-4281-b951-d872f2087c98
author: brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-10-08 17:38:30 +0000
committer: brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-10-08 17:38:30 +0000
commit: d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036 (patch)
tree: 1c2ee733bf62a44c31dc11f76dad53243a84439f /base/i18n
parent: e91d532339c854ff0a082c6562a519647524fa66 (diff)
download: chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.zip
chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.tar.gz
chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.tar.bz2
5 files changed, 364 insertions, 0 deletions
diff --git a/base/i18n/file_util_icu.cc b/base/i18n/file_util_icu.cc
new file mode 100644
index 0000000..0bc9db6
--- /dev/null
+++ b/base/i18n/file_util_icu.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// File utilities that use the ICU library go in this file.
+
+#include "base/i18n/file_util_icu.h"
+
+#include "base/file_path.h"
+#include "base/scoped_ptr.h"
+#include "base/singleton.h"
+#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
+#include "build/build_config.h"
+#include "unicode/coll.h"
+#include "unicode/uniset.h"
+
+namespace {
+
+class IllegalCharacters {  // Set of code points not allowed in file names.
+ public:
+  bool contains(UChar32 ucs4) {  // True if |ucs4| is in the illegal set.
+    return !!set->contains(ucs4);  // !! normalizes the result to bool.
+  }
+
+  bool containsNone(const string16 &s) {  // True if nothing in |s| is illegal.
+    return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
+  }
+
+ private:
+  friend class Singleton<IllegalCharacters>;
+  friend struct DefaultSingletonTraits<IllegalCharacters>;
+
+  IllegalCharacters();  // Private; only the befriended types above can call it.
+  ~IllegalCharacters() { }
+
+  scoped_ptr<icu::UnicodeSet> set;  // Populated and frozen by the constructor.
+
+  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
+};
+
+IllegalCharacters::IllegalCharacters() {
+  UErrorCode status = U_ZERO_ERROR;
+  // Control characters, formatting characters, non-characters, and
+  // some printable ASCII characters regarded as dangerous ('"*/:<>?\\|').
+  // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
+  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
+  // TODO(jungshik): Revisit the set. ZWJ and ZWNJ (U+200D, U+200C) are
+  // subtracted (allowed) because they are legitimate in Arabic and some S/SE
+  // Asian scripts; used elsewhere, they can be confusing/problematic.
+  // The set is built and frozen in this constructor. Callers in this file use
+  // Singleton<IllegalCharacters>, so presumably this runs only once. Note that
+  // there's a trade-off between memory and speed.
+#if defined(WCHAR_T_IS_UTF16)
+  set.reset(new icu::UnicodeSet(icu::UnicodeString(
+      L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
+#else
+  set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
+      "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
+      status));
+#endif
+  DCHECK(U_SUCCESS(status));
+  // Add non-characters. If this becomes a performance bottleneck by
+  // any chance, do not add these to |set| and change IsFilenameLegal()
+  // to check |(ucs4 & 0xFFFEu) == 0xFFFEu|, in addition to calling
+  // containsNone().
+  set->add(0xFDD0, 0xFDEF);
+  for (int i = 0; i <= 0x10; ++i) {  // Planes 0 through 16.
+    int plane_base = 0x10000 * i;
+    set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
+  }
+  set->freeze();
+}
+
+class LocaleAwareComparator {  // Compares UTF-16 strings with an ICU collator.
+ public:
+  LocaleAwareComparator() {
+    UErrorCode error_code = U_ZERO_ERROR;
+    // Use the default collator. The default locale should have been properly
+    // set by the time this constructor is called.
+    collator_.reset(icu::Collator::createInstance(error_code));
+    DCHECK(U_SUCCESS(error_code));  // NOTE(review): no other failure handling.
+    // Make it case-sensitive (tertiary strength).
+    collator_->setStrength(icu::Collator::TERTIARY);
+    // Note: We do not set UCOL_NORMALIZATION_MODE attribute. In other words, we
+    // do not pay performance penalty to guarantee sort order correctness for
+    // non-FCD (http://unicode.org/notes/tn5/#FCD) file names. This should be a
+    // reasonable tradeoff because such file names should be rare and the sort
+    // order doesn't change much anyway.
+  }
+
+  // Collates |a| against |b|. Note: a similar function is in l10n_util.
+  // We cannot use it because base should not depend on l10n_util.
+  // TODO(yuzo): Move some of l10n_util to base.
+  int Compare(const string16& a, const string16& b) {
+    // We are not sure if Collator::compare is thread-safe.
+    // Use an AutoLock just in case.
+    AutoLock auto_lock(lock_);
+
+    UErrorCode error_code = U_ZERO_ERROR;
+    UCollationResult result = collator_->compare(
+        static_cast<const UChar*>(a.c_str()),
+        static_cast<int>(a.length()),
+        static_cast<const UChar*>(b.c_str()),
+        static_cast<int>(b.length()),
+        error_code);
+    DCHECK(U_SUCCESS(error_code));
+    return result;  // Returned as int; this file's caller tests it with "< 0".
+  }
+
+ private:
+  scoped_ptr<icu::Collator> collator_;
+  Lock lock_;  // Held for the duration of each Compare() call.
+  friend struct DefaultSingletonTraits<LocaleAwareComparator>;
+
+  DISALLOW_COPY_AND_ASSIGN(LocaleAwareComparator);
+};
+
+}  // namespace
+
+namespace file_util {
+
+bool IsFilenameLegal(const string16& file_name) {  // True if no illegal char.
+  return Singleton<IllegalCharacters>()->containsNone(file_name);
+}
+
+void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char) {
+  DCHECK(file_name);
+
+  DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)) &&
+         replace_char < 0x10000);  // Replacement must be legal and < 0x10000.
+
+  // Remove leading and trailing whitespace (done even for legal names).
+  TrimWhitespace(*file_name, TRIM_ALL, file_name);
+
+  if (IsFilenameLegal(WideToUTF16(*file_name)))
+    return;  // Nothing to replace.
+
+  std::wstring::size_type i = 0;
+  std::wstring::size_type length = file_name->size();
+  const wchar_t* wstr = file_name->data();
+#if defined(WCHAR_T_IS_UTF16)
+  // Rebuild the name in |temp|, one code point at a time. Using the |span|
+  // method of UnicodeSet might be a bit faster, but is unlikely to matter here.
+  std::wstring temp;
+  temp.reserve(length);
+  while (i < length) {
+    UChar32 ucs4;
+    std::wstring::size_type prev = i;
+    U16_NEXT(wstr, i, length, ucs4);  // Reads one code point, advancing |i|.
+    if (Singleton<IllegalCharacters>()->contains(ucs4)) {
+      temp.push_back(replace_char);
+    } else if (ucs4 < 0x10000) {
+      temp.push_back(ucs4);
+    } else {
+      temp.push_back(wstr[prev]);  // Above 0xFFFF: copy both original units.
+      temp.push_back(wstr[prev + 1]);
+    }
+  }
+  file_name->swap(temp);
+#elif defined(WCHAR_T_IS_UTF32)
+  while (i < length) {  // Each element is checked and overwritten in place.
+    if (Singleton<IllegalCharacters>()->contains(wstr[i])) {
+      (*file_name)[i] = replace_char;
+    }
+    ++i;
+  }
+#else
+#error wchar_t* should be either UTF-16 or UTF-32
+#endif
+}
+
+bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
+#if defined(OS_WIN)  // NOTE(review): assumes FilePath values are UTF-16 here.
+  return Singleton<LocaleAwareComparator>()->Compare(a.value().c_str(),
+                                                     b.value().c_str()) < 0;
+
+#elif defined(OS_POSIX)
+  // On Linux, the file system encoding is not defined. We assume
+  // SysNativeMBToWide takes care of it.
+  //
+  // ICU's collator can take strings in OS native encoding. But we convert the
+  // strings to UTF-16 ourselves to ensure conversion consistency.
+  // TODO(yuzo): Perhaps we should define SysNativeMBToUTF16?
+  return Singleton<LocaleAwareComparator>()->Compare(
+      WideToUTF16(base::SysNativeMBToWide(a.value().c_str())),
+      WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))) < 0;
+#else
+  #error Not implemented on your system
+#endif
+}
+
+}  // namespace file_util
diff --git a/base/i18n/file_util_icu.h b/base/i18n/file_util_icu.h
new file mode 100644
index 0000000..c309a9e
--- /dev/null
+++ b/base/i18n/file_util_icu.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// File utilities that use the ICU library go in this file.
+
+#include <string>
+
+#include "base/string16.h"
+
+class FilePath;
+
+namespace file_util {
+
+// Returns true if 'file_name' contains no illegal character. Like
+// ReplaceIllegalCharacters, it expects a file name component, not a path.
+bool IsFilenameLegal(const string16& file_name);
+
+// Replaces characters in 'file_name' that are illegal for file names with
+// 'replace_char'. 'file_name' must not be a full or relative path, but just the
+// file name component. Leading and trailing whitespace in 'file_name' is
+// removed. 'replace_char' must itself be legal and below 0x10000 (DCHECKed).
+// Example:
+//   file_name == "bad:file*name?.txt", changed to: "bad-file-name-.txt" when
+//   'replace_char' is '-'.
+void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char);
+
+// Compares two filenames using the current locale information. This can be
+// used to sort directory listings. It behaves like "operator<" for use in
+// std::sort: true means 'a' collates before 'b'.
+bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b);
+
+}  // namespace file_util
diff --git a/base/i18n/file_util_icu_unittest.cc b/base/i18n/file_util_icu_unittest.cc
new file mode 100644
index 0000000..aebcd0df
--- /dev/null
+++ b/base/i18n/file_util_icu_unittest.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/file_util_icu.h"
+
+#include "base/file_util.h"
+#include "base/path_service.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "testing/platform_test.h"
+
+// file_util winds up using autoreleased objects on the Mac, so this needs
+// to be a PlatformTest
+class FileUtilICUTest : public PlatformTest {
+ protected:
+  virtual void SetUp() {
+    PlatformTest::SetUp();
+    // Name a subdirectory of the temp directory.
+    ASSERT_TRUE(PathService::Get(base::DIR_TEMP, &test_dir_));
+    test_dir_ = test_dir_.Append(FILE_PATH_LITERAL("FileUtilTest"));
+
+    // Create a fresh, empty copy of this directory (results are not checked).
+    file_util::Delete(test_dir_, true);
+    file_util::CreateDirectory(test_dir_);
+  }
+  virtual void TearDown() {
+    PlatformTest::TearDown();
+    // Clean up the test directory and verify that it is gone.
+    ASSERT_TRUE(file_util::Delete(test_dir_, true));
+    ASSERT_FALSE(file_util::PathExists(test_dir_));
+  }
+
+  // Path to the temporary directory that SetUp() creates for each test.
+  FilePath test_dir_;
+};
+
+static const struct goodbad_pair {
+  std::wstring bad_name;  // Input handed to ReplaceIllegalCharacters.
+  std::wstring good_name;  // Expected result with '-' as the replacement.
+} kIllegalCharacterCases[] = {
+  {L"bad*file:name?.jpg", L"bad-file-name-.jpg"},
+  {L"**********::::.txt", L"--------------.txt"},
+  // We can't use UCNs (universal character names) for C0/C1 characters and
+  // U+007F, but \x escape is interpreted by MSVC and gcc as we intend.
+  {L"bad\x0003\x0091 file\u200E\u200Fname.png", L"bad-- file--name.png"},
+#if defined(OS_WIN)
+  {L"bad*file\\name.jpg", L"bad-file-name.jpg"},
+  {L"\t  bad*file\\name/.jpg ", L"bad-file-name-.jpg"},
+#elif defined(OS_POSIX)
+  {L"bad*file?name.jpg", L"bad-file-name.jpg"},
+  {L"\t  bad*file?name/.jpg ", L"bad-file-name-.jpg"},
+#endif
+  {L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
+  {L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
+  {L"\u0635\u200C\u0644.mp3", L"\u0635\u200C\u0644.mp3"},  // ZWNJ is allowed.
+  {L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
+  // Unassigned code points are ok.
+  {L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
+  // Non-characters are replaced; the trailing space is trimmed as well.
+  {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
+  {L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"},
+};
+
+TEST_F(FileUtilICUTest, ReplaceIllegalCharactersTest) {
+  for (unsigned int i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
+    std::wstring bad_name(kIllegalCharacterCases[i].bad_name);
+    file_util::ReplaceIllegalCharacters(&bad_name, L'-');  // Edits in place.
+    EXPECT_EQ(kIllegalCharacterCases[i].good_name, bad_name);
+  }
+}
+
diff --git a/base/i18n/number_formatting.cc b/base/i18n/number_formatting.cc
new file mode 100644
index 0000000..fef1b7d
--- /dev/null
+++ b/base/i18n/number_formatting.cc
@@ -0,0 +1,48 @@
+// Copyright (c) 2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/number_formatting.h"
+
+#include "base/logging.h"
+#include "base/singleton.h"
+#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
+#include "unicode/numfmt.h"
+#include "unicode/ustring.h"
+
+namespace base {
+
+namespace {
+
+struct NumberFormatSingletonTraits
+    : public DefaultSingletonTraits<icu::NumberFormat> {
+  static icu::NumberFormat* New() {
+    UErrorCode status = U_ZERO_ERROR;
+    icu::NumberFormat* formatter = icu::NumberFormat::createInstance(status);
+    DCHECK(U_SUCCESS(status));
+    return formatter;  // FormatNumber() handles a null result.
+  }
+  // There's no ICU call to destroy a NumberFormat object other than
+  // operator delete, so use the default Delete, which calls operator delete.
+  // This can cause problems if a different allocator is used by this file than
+  // by ICU.
+};
+
+}  // namespace
+
+string16 FormatNumber(int64 number) {
+  icu::NumberFormat* number_format =
+      Singleton<icu::NumberFormat, NumberFormatSingletonTraits>::get();
+
+  if (!number_format) {
+    // Fallback: raw digits. NOTE(review): confirm "%lld" matches int64.
+    return UTF8ToUTF16(StringPrintf("%lld", number));
+  }
+  icu::UnicodeString ustr;  // Receives the formatted text.
+  number_format->format(number, ustr);
+
+  return string16(ustr.getBuffer(), static_cast<size_t>(ustr.length()));
+}
+
+}  // namespace base
diff --git a/base/i18n/number_formatting.h b/base/i18n/number_formatting.h
new file mode 100644
index 0000000..9fa2b18
--- /dev/null
+++ b/base/i18n/number_formatting.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_NUMBER_FORMATTING_H_
+#define BASE_I18N_NUMBER_FORMATTING_H_
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/string16.h"
+
+namespace base {
+
+string16 FormatNumber(int64 number);  // Formats |number| via ICU NumberFormat.
+
+}  // namespace base
+
+#endif  // BASE_I18N_NUMBER_FORMATTING_H_
author	brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-10-08 17:38:30 +0000
committer	brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-10-08 17:38:30 +0000
commit	d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036 (patch)
tree	1c2ee733bf62a44c31dc11f76dad53243a84439f /base/i18n
parent	e91d532339c854ff0a082c6562a519647524fa66 (diff)
download	chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.zip chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.tar.gz chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.tar.bz2