From d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036 Mon Sep 17 00:00:00 2001
From: "brettw@chromium.org"
 <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>
Date: Thu, 8 Oct 2009 17:38:30 +0000
Subject: Separate out some more ICU from base and into base/i18n.

This moves string_util_icu. I moved the number formatting function into
base/i18n/number_formatting and just removed the other function in
string_util_icu which was TrimWhitespaceUTF8. It is only used in a few places
and isn't actually helpful (and the fact that it round-trips through UTF-16 is
better for the caller to see).

This takes out the sorting from the FileEnumerator. The comment says the
sorting is not guaranteed. I moved it into file_util_icu as a standalone
function for callers of FileEnumerator to call manually if they need sorted
results. I modified the directory lister to use this sorting instead, and filed
a bug on doing more optimal JS-based sorting.

TEST=none
BUG=none
Review URL: http://codereview.chromium.org/267001

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@28405 0039d316-1c4b-4281-b951-d872f2087c98
---
 base/base.gyp                       |   7 +-
 base/file_util.h                    |  19 +---
 base/file_util_icu.cc               | 129 ------------------------
 base/file_util_posix.cc             |  70 -------------
 base/file_util_unittest.cc          |  89 +----------------
 base/i18n/file_util_icu.cc          | 193 ++++++++++++++++++++++++++++++++++++
 base/i18n/file_util_icu.h           |  33 ++++++
 base/i18n/file_util_icu_unittest.cc |  71 +++++++++++++
 base/i18n/number_formatting.cc      |  48 +++++++++
 base/i18n/number_formatting.h       |  19 ++++
 base/string_util.h                  |   5 -
 base/string_util_icu.cc             |  80 ---------------
 base/string_util_unittest.cc        |  46 ---------
 13 files changed, 371 insertions(+), 438 deletions(-)
 delete mode 100644 base/file_util_icu.cc
 create mode 100644 base/i18n/file_util_icu.cc
 create mode 100644 base/i18n/file_util_icu.h
 create mode 100644 base/i18n/file_util_icu_unittest.cc
 create mode 100644 base/i18n/number_formatting.cc
 create mode 100644 base/i18n/number_formatting.h
 delete mode 100644 base/string_util_icu.cc

(limited to 'base')

diff --git a/base/base.gyp b/base/base.gyp
index 41108dd..570e9f0 100644
--- a/base/base.gyp
+++ b/base/base.gyp
@@ -121,7 +121,6 @@
         'file_path.h',
         'file_util.cc',
         'file_util.h',
-        'file_util_icu.cc',
         'file_util_linux.cc',
         'file_util_mac.mm',
         'file_util_posix.cc',
@@ -141,8 +140,12 @@
         'hmac_mac.cc',
         'hmac_nss.cc',
         'hmac_win.cc',
+        'i18n/file_util_icu.cc',
+        'i18n/file_util_icu.h',
         'i18n/icu_string_conversions.cc',
         'i18n/icu_string_conversions.h',
+        'i18n/number_formatting.cc',
+        'i18n/number_formatting.h',
         'iat_patch.cc',
         'iat_patch.h',
         'icu_util.cc',
@@ -284,7 +287,6 @@
         'string_tokenizer.h',
         'string_util.cc',
         'string_util.h',
-        'string_util_icu.cc',
         'string_util_win.h',
         'sys_info.h',
         'sys_info_chromeos.cc',
@@ -582,6 +584,7 @@
         'histogram_unittest.cc',
         'hmac_unittest.cc',
         'id_map_unittest.cc',
+        'i18n/file_util_icu_unittest.cc',
         'json_reader_unittest.cc',
         'json_writer_unittest.cc',
         'lazy_instance_unittest.cc',
diff --git a/base/file_util.h b/base/file_util.h
index 6474f048..37634b9 100644
--- a/base/file_util.h
+++ b/base/file_util.h
@@ -103,19 +103,6 @@ void InsertBeforeExtension(FilePath* path, const FilePath::StringType& suffix);
 void ReplaceExtension(FilePath* file_name,
                       const FilePath::StringType& extension);
 
-// Replaces characters in 'file_name' that are illegal for file names with
-// 'replace_char'. 'file_name' must not be a full or relative path, but just the
-// file name component. Any leading or trailing whitespace in 'file_name' is
-// removed.
-// Example:
-//   file_name == "bad:file*name?.txt", changed to: "bad-file-name-.txt" when
-//   'replace_char' is '-'.
-void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char);
-
-// Returns true if file_name does not have any illegal character. The input
-// param has the same restriction as that for ReplaceIllegalCharacters.
-bool IsFilenameLegal(const string16& file_name);
-
 //-----------------------------------------------------------------------------
 // Functions that involve filesystem access or modification:
 
@@ -490,10 +477,6 @@ class FileEnumerator {
   static bool ReadDirectory(std::vector<DirectoryEntryInfo>* entries,
                             const FilePath& source, bool show_links);
 
-  // Comparison function to neatly sort directory entries
-  static bool CompareFiles(const DirectoryEntryInfo& a,
-                           const DirectoryEntryInfo& b);
-
   // The files in the current directory
   std::vector<DirectoryEntryInfo> directory_entries_;
 
@@ -501,7 +484,7 @@ class FileEnumerator {
   size_t current_directory_entry_;
 #endif
 
-  DISALLOW_EVIL_CONSTRUCTORS(FileEnumerator);
+  DISALLOW_COPY_AND_ASSIGN(FileEnumerator);
 };
 
 class MemoryMappedFile {
diff --git a/base/file_util_icu.cc b/base/file_util_icu.cc
deleted file mode 100644
index eeffa92..0000000
--- a/base/file_util_icu.cc
+++ /dev/null
@@ -1,129 +0,0 @@
-// Copyright (c) 2009 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-// File utilities that use the ICU library go in this file.  Functions using ICU
-// are separated from the other functions to prevent ICU being pulled in by the
-// linker if there is a false dependency.
-//
-// (The VS2005 linker finds such a false dependency and adds ~300K of ICU to
-// chrome.exe if this code lives in file_util.cc, even though none of this code
-// is called.)
-
-#include "base/file_util.h"
-
-#include "base/singleton.h"
-#include "base/string_util.h"
-#include "unicode/uniset.h"
-
-namespace {
-class IllegalCharacters {
- public:
-  bool contains(UChar32 ucs4) {
-    return !!set->contains(ucs4);
-  }
-
-  bool containsNone(const string16 &s) {
-    return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
-  }
-
- private:
-  friend class Singleton<IllegalCharacters>;
-  friend struct DefaultSingletonTraits<IllegalCharacters>;
-
-  IllegalCharacters();
-  ~IllegalCharacters() { }
-
-  scoped_ptr<icu::UnicodeSet> set;
-
-  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
-};
-
-IllegalCharacters::IllegalCharacters() {
-  UErrorCode status = U_ZERO_ERROR;
-  // Control characters, formatting characters, non-characters, and
-  // some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
-  // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
-  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
-  // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
-  // are legitimate in Arabic and some S/SE Asian scripts. However, when used
-  // elsewhere, they can be confusing/problematic.
-  // Also, consider wrapping the set with our Singleton class to create and
-  // freeze it only once. Note that there's a trade-off between memory and
-  // speed.
-#if defined(WCHAR_T_IS_UTF16)
-  set.reset(new icu::UnicodeSet(icu::UnicodeString(
-      L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
-#else
-  set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
-      "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
-      status));
-#endif
-  DCHECK(U_SUCCESS(status));
-  // Add non-characters. If this becomes a performance bottleneck by
-  // any chance, do not add these to |set| and change IsFilenameLegal()
-  // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling
-  // containsNone().
-  set->add(0xFDD0, 0xFDEF);
-  for (int i = 0; i <= 0x10; ++i) {
-    int plane_base = 0x10000 * i;
-    set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
-  }
-  set->freeze();
-}
-
-}  // namespace
-
-namespace file_util {
-
-bool IsFilenameLegal(const string16& file_name) {
-  return Singleton<IllegalCharacters>()->containsNone(file_name);
-}
-
-void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char) {
-  DCHECK(file_name);
-
-  DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)) &&
-         replace_char < 0x10000);
-
-  // Remove leading and trailing whitespace.
-  TrimWhitespace(*file_name, TRIM_ALL, file_name);
-
-  if (IsFilenameLegal(WideToUTF16(*file_name)))
-    return;
-
-  std::wstring::size_type i = 0;
-  std::wstring::size_type length = file_name->size();
-  const wchar_t* wstr = file_name->data();
-#if defined(WCHAR_T_IS_UTF16)
-  // Using |span| method of UnicodeSet might speed things up a bit, but
-  // it's not likely to matter here.
-  std::wstring temp;
-  temp.reserve(length);
-  while (i < length) {
-    UChar32 ucs4;
-    std::wstring::size_type prev = i;
-    U16_NEXT(wstr, i, length, ucs4);
-    if (Singleton<IllegalCharacters>()->contains(ucs4)) {
-      temp.push_back(replace_char);
-    } else if (ucs4 < 0x10000) {
-      temp.push_back(ucs4);
-    } else {
-      temp.push_back(wstr[prev]);
-      temp.push_back(wstr[prev + 1]);
-    }
-  }
-  file_name->swap(temp);
-#elif defined(WCHAR_T_IS_UTF32)
-  while (i < length) {
-    if (Singleton<IllegalCharacters>()->contains(wstr[i])) {
-      (*file_name)[i] = replace_char;
-    }
-    ++i;
-  }
-#else
-#error wchar_t* should be either UTF-16 or UTF-32
-#endif
-}
-
-}  // namespace
diff --git a/base/file_util_posix.cc b/base/file_util_posix.cc
index 27adbfa..4621bb3 100644
--- a/base/file_util_posix.cc
+++ b/base/file_util_posix.cc
@@ -35,56 +35,6 @@
 #include "base/sys_string_conversions.h"
 #include "base/time.h"
 #include "base/utf_string_conversions.h"
-#include "unicode/coll.h"
-
-
-namespace {
-
-class LocaleAwareComparator {
- public:
-  LocaleAwareComparator() {
-    UErrorCode error_code = U_ZERO_ERROR;
-    // Use the default collator. The default locale should have been properly
-    // set by the time this constructor is called.
-    collator_.reset(icu::Collator::createInstance(error_code));
-    DCHECK(U_SUCCESS(error_code));
-    // Make it case-sensitive.
-    collator_->setStrength(icu::Collator::TERTIARY);
-    // Note: We do not set UCOL_NORMALIZATION_MODE attribute. In other words, we
-    // do not pay performance penalty to guarantee sort order correctness for
-    // non-FCD (http://unicode.org/notes/tn5/#FCD) file names. This should be a
-    // reasonable tradeoff because such file names should be rare and the sort
-    // order doesn't change much anyway.
-  }
-
-  // Note: A similar function is available in l10n_util.
-  // We cannot use it because base should not depend on l10n_util.
-  // TODO(yuzo): Move some of l10n_util to base.
-  int Compare(const string16& a, const string16& b) {
-    // We are not sure if Collator::compare is thread-safe.
-    // Use an AutoLock just in case.
-    AutoLock auto_lock(lock_);
-
-    UErrorCode error_code = U_ZERO_ERROR;
-    UCollationResult result = collator_->compare(
-        static_cast<const UChar*>(a.c_str()),
-        static_cast<int>(a.length()),
-        static_cast<const UChar*>(b.c_str()),
-        static_cast<int>(b.length()),
-        error_code);
-    DCHECK(U_SUCCESS(error_code));
-    return result;
-  }
-
- private:
-  scoped_ptr<icu::Collator> collator_;
-  Lock lock_;
-  friend struct DefaultSingletonTraits<LocaleAwareComparator>;
-
-  DISALLOW_COPY_AND_ASSIGN(LocaleAwareComparator);
-};
-
-}  // namespace
 
 namespace file_util {
 
@@ -623,9 +573,6 @@ FilePath FileEnumerator::Next() {
     if (!ReadDirectory(&entries, root_path_, file_type_ & SHOW_SYM_LINKS))
       continue;
 
-    // The API says that order is not guaranteed, but order affects UX
-    std::sort(entries.begin(), entries.end(), CompareFiles);
-
     directory_entries_.clear();
     current_directory_entry_ = 0;
     for (std::vector<DirectoryEntryInfo>::const_iterator
@@ -691,23 +638,6 @@ bool FileEnumerator::ReadDirectory(std::vector<DirectoryEntryInfo>* entries,
   return true;
 }
 
-bool FileEnumerator::CompareFiles(const DirectoryEntryInfo& a,
-                                  const DirectoryEntryInfo& b) {
-  // Order lexicographically with directories before other files.
-  if (S_ISDIR(a.stat.st_mode) != S_ISDIR(b.stat.st_mode))
-    return S_ISDIR(a.stat.st_mode);
-
-  // On linux, the file system encoding is not defined. We assume
-  // SysNativeMBToWide takes care of it.
-  //
-  // ICU's collator can take strings in OS native encoding. But we convert the
-  // strings to UTF-16 ourselves to ensure conversion consistency.
-  // TODO(yuzo): Perhaps we should define SysNativeMBToUTF16?
-  return Singleton<LocaleAwareComparator>()->Compare(
-      WideToUTF16(base::SysNativeMBToWide(a.filename.value().c_str())),
-      WideToUTF16(base::SysNativeMBToWide(b.filename.value().c_str()))) < 0;
-}
-
 ///////////////////////////////////////////////
 // MemoryMappedFile
 
diff --git a/base/file_util_unittest.cc b/base/file_util_unittest.cc
index 5b606c9..b1f9fed 100644
--- a/base/file_util_unittest.cc
+++ b/base/file_util_unittest.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
@@ -876,41 +876,6 @@ TEST_F(FileUtilTest, DetectDirectoryTest) {
   EXPECT_TRUE(file_util::Delete(test_root, true));
 }
 
-static const struct goodbad_pair {
-  std::wstring bad_name;
-  std::wstring good_name;
-} kIllegalCharacterCases[] = {
-  {L"bad*file:name?.jpg", L"bad-file-name-.jpg"},
-  {L"**********::::.txt", L"--------------.txt"},
-  // We can't use UCNs (universal character names) for C0/C1 characters and
-  // U+007F, but \x escape is interpreted by MSVC and gcc as we intend.
-  {L"bad\x0003\x0091 file\u200E\u200Fname.png", L"bad-- file--name.png"},
-#if defined(OS_WIN)
-  {L"bad*file\\name.jpg", L"bad-file-name.jpg"},
-  {L"\t  bad*file\\name/.jpg ", L"bad-file-name-.jpg"},
-#elif defined(OS_POSIX)
-  {L"bad*file?name.jpg", L"bad-file-name.jpg"},
-  {L"\t  bad*file?name/.jpg ", L"bad-file-name-.jpg"},
-#endif
-  {L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
-  {L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
-  {L"\u0635\u200C\u0644.mp3", L"\u0635\u200C\u0644.mp3"},
-  {L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
-  // Unassigned codepoints are ok.
-  {L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
-  // Non-characters are not allowed.
-  {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
-  {L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"},
-};
-
-TEST_F(FileUtilTest, ReplaceIllegalCharactersTest) {
-  for (unsigned int i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
-    std::wstring bad_name(kIllegalCharacterCases[i].bad_name);
-    file_util::ReplaceIllegalCharacters(&bad_name, L'-');
-    EXPECT_EQ(kIllegalCharacterCases[i].good_name, bad_name);
-  }
-}
-
 static const struct ReplaceExtensionCase {
   std::wstring file_name;
   FilePath::StringType extension;
@@ -1069,58 +1034,6 @@ TEST_F(FileUtilTest, FileEnumeratorTest) {
                                             // (we don't care what).
 }
 
-TEST_F(FileUtilTest, FileEnumeratorOrderTest) {
-  FilePath fileA = test_dir_.Append(FILE_PATH_LITERAL("a"));
-  FilePath fileB = test_dir_.Append(FILE_PATH_LITERAL("B"));
-  FilePath dirC = test_dir_.Append(FILE_PATH_LITERAL("C"));
-  FilePath dirD = test_dir_.Append(FILE_PATH_LITERAL("d"));
-  FilePath dirE = test_dir_.Append(FILE_PATH_LITERAL("e"));
-  FilePath fileF = test_dir_.Append(FILE_PATH_LITERAL("f"));
-
-  // Create files/directories in near random order.
-  CreateTextFile(fileF, L"");
-  CreateTextFile(fileA, L"");
-  CreateTextFile(fileB, L"");
-  EXPECT_TRUE(file_util::CreateDirectory(dirE));
-  EXPECT_TRUE(file_util::CreateDirectory(dirC));
-  EXPECT_TRUE(file_util::CreateDirectory(dirD));
-
-  // On Windows, files and directories are enumerated in the lexicographical
-  // order, ignoring case and whether they are files or directories. On posix,
-  // we order directories before files.
-  file_util::FileEnumerator enumerator(test_dir_, false, FILES_AND_DIRECTORIES);
-  FilePath cur_file = enumerator.Next();
-#if defined(OS_WIN)
-  EXPECT_EQ(fileA.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(fileB.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(dirC.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(dirD.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(dirE.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(fileF.value(), cur_file.value());
-  cur_file = enumerator.Next();
-#elif defined(OS_POSIX)
-  EXPECT_EQ(dirC.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(dirD.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(dirE.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(fileA.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(fileB.value(), cur_file.value());
-  cur_file = enumerator.Next();
-  EXPECT_EQ(fileF.value(), cur_file.value());
-  cur_file = enumerator.Next();
-#endif
-
-  EXPECT_EQ(FILE_PATH_LITERAL(""), cur_file.value());
-}
-
 TEST_F(FileUtilTest, Contains) {
   FilePath data_dir = test_dir_.Append(FILE_PATH_LITERAL("FilePathTest"));
 
diff --git a/base/i18n/file_util_icu.cc b/base/i18n/file_util_icu.cc
new file mode 100644
index 0000000..0bc9db6
--- /dev/null
+++ b/base/i18n/file_util_icu.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+// File utilities that use the ICU library go in this file.
+
+#include "base/i18n/file_util_icu.h"
+
+#include "base/file_path.h"
+#include "base/scoped_ptr.h"
+#include "base/singleton.h"
+#include "base/string_util.h"
+#include "base/sys_string_conversions.h"
+#include "build/build_config.h"
+#include "unicode/coll.h"
+#include "unicode/uniset.h"
+
+namespace {
+
+class IllegalCharacters {
+ public:
+  bool contains(UChar32 ucs4) {
+    return !!set->contains(ucs4);
+  }
+
+  bool containsNone(const string16 &s) {
+    return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
+  }
+
+ private:
+  friend class Singleton<IllegalCharacters>;
+  friend struct DefaultSingletonTraits<IllegalCharacters>;
+
+  IllegalCharacters();
+  ~IllegalCharacters() { }
+
+  scoped_ptr<icu::UnicodeSet> set;
+
+  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
+};
+
+IllegalCharacters::IllegalCharacters() {
+  UErrorCode status = U_ZERO_ERROR;
+  // Control characters, formatting characters, non-characters, and
+  // some printable ASCII characters regarded as dangerous ('"*/:<>?\\|').
+  // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
+  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
+  // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
+  // are legitimate in Arabic and some S/SE Asian scripts. However, when used
+  // elsewhere, they can be confusing/problematic.
+  // The set is wrapped with our Singleton class so that it is created and
+  // frozen only once. Note that there's a trade-off between memory and
+  // speed.
+#if defined(WCHAR_T_IS_UTF16)
+  set.reset(new icu::UnicodeSet(icu::UnicodeString(
+      L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
+#else
+  set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
+      "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
+      status));
+#endif
+  DCHECK(U_SUCCESS(status));
+  // Add non-characters. If this becomes a performance bottleneck by
+  // any chance, do not add these to |set| and change IsFilenameLegal()
+  // to check |(ucs4 & 0xFFFEu) == 0xFFFEu|, in addition to calling
+  // containsNone().
+  set->add(0xFDD0, 0xFDEF);
+  for (int i = 0; i <= 0x10; ++i) {
+    int plane_base = 0x10000 * i;
+    set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
+  }
+  set->freeze();
+}
+
+class LocaleAwareComparator {
+ public:
+  LocaleAwareComparator() {
+    UErrorCode error_code = U_ZERO_ERROR;
+    // Use the default collator. The default locale should have been properly
+    // set by the time this constructor is called.
+    collator_.reset(icu::Collator::createInstance(error_code));
+    DCHECK(U_SUCCESS(error_code));
+    // Make it case-sensitive.
+    collator_->setStrength(icu::Collator::TERTIARY);
+    // Note: We do not set UCOL_NORMALIZATION_MODE attribute. In other words, we
+    // do not pay performance penalty to guarantee sort order correctness for
+    // non-FCD (http://unicode.org/notes/tn5/#FCD) file names. This should be a
+    // reasonable tradeoff because such file names should be rare and the sort
+    // order doesn't change much anyway.
+  }
+
+  // Note: A similar function is available in l10n_util.
+  // We cannot use it because base should not depend on l10n_util.
+  // TODO(yuzo): Move some of l10n_util to base.
+  int Compare(const string16& a, const string16& b) {
+    // We are not sure if Collator::compare is thread-safe.
+    // Use an AutoLock just in case.
+    AutoLock auto_lock(lock_);
+
+    UErrorCode error_code = U_ZERO_ERROR;
+    UCollationResult result = collator_->compare(
+        static_cast<const UChar*>(a.c_str()),
+        static_cast<int>(a.length()),
+        static_cast<const UChar*>(b.c_str()),
+        static_cast<int>(b.length()),
+        error_code);
+    DCHECK(U_SUCCESS(error_code));
+    return result;
+  }
+
+ private:
+  scoped_ptr<icu::Collator> collator_;
+  Lock lock_;
+  friend struct DefaultSingletonTraits<LocaleAwareComparator>;
+
+  DISALLOW_COPY_AND_ASSIGN(LocaleAwareComparator);
+};
+
+}  // namespace
+
+namespace file_util {
+
+bool IsFilenameLegal(const string16& file_name) {
+  return Singleton<IllegalCharacters>()->containsNone(file_name);
+}
+
+void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char) {
+  DCHECK(file_name);
+
+  DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)) &&
+         replace_char < 0x10000);
+
+  // Remove leading and trailing whitespace.
+  TrimWhitespace(*file_name, TRIM_ALL, file_name);
+
+  if (IsFilenameLegal(WideToUTF16(*file_name)))
+    return;
+
+  std::wstring::size_type i = 0;
+  std::wstring::size_type length = file_name->size();
+  const wchar_t* wstr = file_name->data();
+#if defined(WCHAR_T_IS_UTF16)
+  // Using |span| method of UnicodeSet might speed things up a bit, but
+  // it's not likely to matter here.
+  std::wstring temp;
+  temp.reserve(length);
+  while (i < length) {
+    UChar32 ucs4;
+    std::wstring::size_type prev = i;
+    U16_NEXT(wstr, i, length, ucs4);
+    if (Singleton<IllegalCharacters>()->contains(ucs4)) {
+      temp.push_back(replace_char);
+    } else if (ucs4 < 0x10000) {
+      temp.push_back(ucs4);
+    } else {
+      temp.push_back(wstr[prev]);
+      temp.push_back(wstr[prev + 1]);
+    }
+  }
+  file_name->swap(temp);
+#elif defined(WCHAR_T_IS_UTF32)
+  while (i < length) {
+    if (Singleton<IllegalCharacters>()->contains(wstr[i])) {
+      (*file_name)[i] = replace_char;
+    }
+    ++i;
+  }
+#else
+#error wchar_t* should be either UTF-16 or UTF-32
+#endif
+}
+
+bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
+#if defined(OS_WIN)
+  return Singleton<LocaleAwareComparator>()->Compare(a.value().c_str(),
+                                                     b.value().c_str()) < 0;
+
+#elif defined(OS_POSIX)
+  // On linux, the file system encoding is not defined. We assume
+  // SysNativeMBToWide takes care of it.
+  //
+  // ICU's collator can take strings in OS native encoding. But we convert the
+  // strings to UTF-16 ourselves to ensure conversion consistency.
+  // TODO(yuzo): Perhaps we should define SysNativeMBToUTF16?
+  return Singleton<LocaleAwareComparator>()->Compare(
+      WideToUTF16(base::SysNativeMBToWide(a.value().c_str())),
+      WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))) < 0;
+#else
+  #error Not implemented on your system
+#endif
+}
+
+}  // namespace file_util
diff --git a/base/i18n/file_util_icu.h b/base/i18n/file_util_icu.h
new file mode 100644
index 0000000..c309a9e
--- /dev/null
+++ b/base/i18n/file_util_icu.h
@@ -0,0 +1,38 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_FILE_UTIL_ICU_H_
+#define BASE_I18N_FILE_UTIL_ICU_H_
+
+// File utilities that use the ICU library go in this file.
+
+#include <string>
+
+#include "base/string16.h"
+
+class FilePath;
+
+namespace file_util {
+
+// Returns true if file_name does not have any illegal character. The input
+// param has the same restriction as that for ReplaceIllegalCharacters.
+bool IsFilenameLegal(const string16& file_name);
+
+// Replaces characters in 'file_name' that are illegal for file names with
+// 'replace_char'. 'file_name' must not be a full or relative path, but just the
+// file name component. Any leading or trailing whitespace in 'file_name' is
+// removed.
+// Example:
+//   file_name == "bad:file*name?.txt", changed to: "bad-file-name-.txt" when
+//   'replace_char' is '-'.
+void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char);
+
+// Compares two filenames using the current locale information. This can be
+// used to sort directory listings. It behaves like "operator<" for use in
+// std::sort.
+bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b);
+
+}  // namespace file_util
+
+#endif  // BASE_I18N_FILE_UTIL_ICU_H_
diff --git a/base/i18n/file_util_icu_unittest.cc b/base/i18n/file_util_icu_unittest.cc
new file mode 100644
index 0000000..aebcd0df
--- /dev/null
+++ b/base/i18n/file_util_icu_unittest.cc
@@ -0,0 +1,71 @@
+// Copyright (c) 2009 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/file_util_icu.h"
+
+#include "base/file_util.h"
+#include "base/path_service.h"
+#include "testing/gtest/include/gtest/gtest.h"
+#include "testing/platform_test.h"
+
+// file_util winds up using autoreleased objects on the Mac, so this needs
+// to be a PlatformTest
+class FileUtilICUTest : public PlatformTest {
+ protected:
+  virtual void SetUp() {
+    PlatformTest::SetUp();
+    // Name a subdirectory of the temp directory.
+    ASSERT_TRUE(PathService::Get(base::DIR_TEMP, &test_dir_));
+    test_dir_ = test_dir_.Append(FILE_PATH_LITERAL("FileUtilTest"));
+
+    // Create a fresh, empty copy of this directory.
+    file_util::Delete(test_dir_, true);
+    file_util::CreateDirectory(test_dir_);
+  }
+  virtual void TearDown() {
+    PlatformTest::TearDown();
+    // Clean up test directory
+    ASSERT_TRUE(file_util::Delete(test_dir_, true));
+    ASSERT_FALSE(file_util::PathExists(test_dir_));
+  }
+
+  // The path to the temporary directory used to contain the test operations.
+  FilePath test_dir_;
+};
+
+static const struct goodbad_pair {
+  std::wstring bad_name;
+  std::wstring good_name;
+} kIllegalCharacterCases[] = {
+  {L"bad*file:name?.jpg", L"bad-file-name-.jpg"},
+  {L"**********::::.txt", L"--------------.txt"},
+  // We can't use UCNs (universal character names) for C0/C1 characters and
+  // U+007F, but \x escape is interpreted by MSVC and gcc as we intend.
+  {L"bad\x0003\x0091 file\u200E\u200Fname.png", L"bad-- file--name.png"},
+#if defined(OS_WIN)
+  {L"bad*file\\name.jpg", L"bad-file-name.jpg"},
+  {L"\t  bad*file\\name/.jpg ", L"bad-file-name-.jpg"},
+#elif defined(OS_POSIX)
+  {L"bad*file?name.jpg", L"bad-file-name.jpg"},
+  {L"\t  bad*file?name/.jpg ", L"bad-file-name-.jpg"},
+#endif
+  {L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"},
+  {L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"},
+  {L"\u0635\u200C\u0644.mp3", L"\u0635\u200C\u0644.mp3"},
+  {L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"},
+  // Unassigned codepoints are ok.
+  {L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"},
+  // Non-characters are not allowed.
+  {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"},
+  {L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"},
+};
+
+TEST_F(FileUtilICUTest, ReplaceIllegalCharactersTest) {
+  for (unsigned int i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
+    std::wstring bad_name(kIllegalCharacterCases[i].bad_name);
+    file_util::ReplaceIllegalCharacters(&bad_name, L'-');
+    EXPECT_EQ(kIllegalCharacterCases[i].good_name, bad_name);
+  }
+}
+
diff --git a/base/i18n/number_formatting.cc b/base/i18n/number_formatting.cc
new file mode 100644
index 0000000..fef1b7d
--- /dev/null
+++ b/base/i18n/number_formatting.cc
@@ -0,0 +1,48 @@
+// Copyright (c) 2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "base/i18n/number_formatting.h"
+
+#include "base/logging.h"
+#include "base/singleton.h"
+#include "base/string_util.h"
+#include "base/utf_string_conversions.h"
+#include "unicode/numfmt.h"
+#include "unicode/ustring.h"
+
+namespace base {
+
+namespace {
+
+struct NumberFormatSingletonTraits
+    : public DefaultSingletonTraits<icu::NumberFormat> {
+  static icu::NumberFormat* New() {
+    UErrorCode status = U_ZERO_ERROR;
+    icu::NumberFormat* formatter = icu::NumberFormat::createInstance(status);
+    DCHECK(U_SUCCESS(status));
+    return formatter;
+  }
+  // There's no ICU call to destroy a NumberFormat object other than
+  // operator delete, so use the default Delete, which calls operator delete.
+  // This can cause problems if a different allocator is used by this file than
+  // by ICU.
+};
+
+}  // namespace
+
+string16 FormatNumber(int64 number) {
+  icu::NumberFormat* number_format =
+      Singleton<icu::NumberFormat, NumberFormatSingletonTraits>::get();
+
+  if (!number_format) {
+    // As a fallback, just return the raw number in a string.
+    return UTF8ToUTF16(StringPrintf("%lld", number));
+  }
+  icu::UnicodeString ustr;
+  number_format->format(number, ustr);
+
+  return string16(ustr.getBuffer(), static_cast<size_t>(ustr.length()));
+}
+
+}  // namespace base
diff --git a/base/i18n/number_formatting.h b/base/i18n/number_formatting.h
new file mode 100644
index 0000000..9fa2b18
--- /dev/null
+++ b/base/i18n/number_formatting.h
@@ -0,0 +1,19 @@
+// Copyright (c) 2008 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef BASE_I18N_NUMBER_FORMATTING_H_
+#define BASE_I18N_NUMBER_FORMATTING_H_
+
+#include <string>
+
+#include "base/basictypes.h"
+#include "base/string16.h"
+
+namespace base {
+
+string16 FormatNumber(int64 number);
+
+}  // namespace base
+
+#endif  // BASE_I18N_NUMBER_FORMATTING_H_
diff --git a/base/string_util.h b/base/string_util.h
index 254e18f..c6b9fb1 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -140,8 +140,6 @@ bool TrimString(const std::string& input,
 // The non-wide version has two functions:
 // * TrimWhitespaceASCII()
 //   This function is for ASCII strings and only looks for ASCII whitespace;
-// * TrimWhitespaceUTF8()
-//   This function is for UTF-8 strings and looks for Unicode whitespace.
 // Please choose the best one according to your usage.
 // NOTE: Safe to use the same variable for both input and output.
 enum TrimPositions {
@@ -156,9 +154,6 @@ TrimPositions TrimWhitespace(const std::wstring& input,
 TrimPositions TrimWhitespaceASCII(const std::string& input,
                                   TrimPositions positions,
                                   std::string* output);
-TrimPositions TrimWhitespaceUTF8(const std::string& input,
-                                 TrimPositions positions,
-                                 std::string* output);
 
 // Deprecated. This function is only for backward compatibility and calls
 // TrimWhitespaceASCII().
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
deleted file mode 100644
index 68fbd10..0000000
--- a/base/string_util_icu.cc
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style license that can be
-// found in the LICENSE file.
-
-#include "base/string_util.h"
-
-#include <string.h>
-#include <vector>
-
-#include "base/basictypes.h"
-#include "base/logging.h"
-#include "base/singleton.h"
-#include "unicode/numfmt.h"
-#include "unicode/ustring.h"
-
-// Number formatting -----------------------------------------------------------
-
-namespace {
-
-struct NumberFormatSingletonTraits
-    : public DefaultSingletonTraits<icu::NumberFormat> {
-  static icu::NumberFormat* New() {
-    UErrorCode status = U_ZERO_ERROR;
-    icu::NumberFormat* formatter = icu::NumberFormat::createInstance(status);
-    DCHECK(U_SUCCESS(status));
-    return formatter;
-  }
-  // There's no ICU call to destroy a NumberFormat object other than
-  // operator delete, so use the default Delete, which calls operator delete.
-  // This can cause problems if a different allocator is used by this file than
-  // by ICU.
-};
-
-}  // namespace
-
-std::wstring FormatNumber(int64 number) {
-  icu::NumberFormat* number_format =
-      Singleton<icu::NumberFormat, NumberFormatSingletonTraits>::get();
-
-  if (!number_format) {
-    // As a fallback, just return the raw number in a string.
-    return StringPrintf(L"%lld", number);
-  }
-  icu::UnicodeString ustr;
-  number_format->format(number, ustr);
-
-#if defined(WCHAR_T_IS_UTF16)
-  return std::wstring(ustr.getBuffer(),
-                      static_cast<std::wstring::size_type>(ustr.length()));
-#elif defined(WCHAR_T_IS_UTF32)
-  wchar_t buffer[64];  // A int64 is less than 20 chars long,  so 64 chars
-                       // leaves plenty of room for formating stuff.
-  int length = 0;
-  UErrorCode error = U_ZERO_ERROR;
-  u_strToWCS(buffer, 64, &length, ustr.getBuffer(), ustr.length() , &error);
-  if (U_FAILURE(error)) {
-    NOTREACHED();
-    // As a fallback, just return the raw number in a string.
-    return StringPrintf(L"%lld", number);
-  }
-  return std::wstring(buffer, static_cast<std::wstring::size_type>(length));
-#endif  // defined(WCHAR_T_IS_UTF32)
-}
-
-// Although this function isn't specific to ICU, we implemented it here so
-// that chrome.exe won't pull it in.  Moving this function to string_util.cc
-// causes chrome.exe to grow by 400k because of more ICU being pulled in.
-TrimPositions TrimWhitespaceUTF8(const std::string& input,
-                                 TrimPositions positions,
-                                 std::string* output) {
-  // This implementation is not so fast since it converts the text encoding
-  // twice. Please feel free to file a bug if this function hurts the
-  // performance of Chrome.
-  DCHECK(IsStringUTF8(input));
-  std::wstring input_wide = UTF8ToWide(input);
-  std::wstring output_wide;
-  TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide);
-  *output = WideToUTF8(output_wide);
-  return result;
-}
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 2723541..a70b03e 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -100,52 +100,6 @@ TEST(StringUtilTest, TrimWhitespace) {
   }
 }
 
-static const struct trim_case_utf8 {
-  const char* input;
-  const TrimPositions positions;
-  const char* output;
-  const TrimPositions return_value;
-} trim_cases_utf8[] = {
-  // UTF-8 strings that start (and end) with Unicode space characters
-  // (including zero-width spaces).
-  {"\xE2\x80\x80Test String\xE2\x80\x81", TRIM_ALL, "Test String", TRIM_ALL},
-  {"\xE2\x80\x82Test String\xE2\x80\x83", TRIM_ALL, "Test String", TRIM_ALL},
-  {"\xE2\x80\x84Test String\xE2\x80\x85", TRIM_ALL, "Test String", TRIM_ALL},
-  {"\xE2\x80\x86Test String\xE2\x80\x87", TRIM_ALL, "Test String", TRIM_ALL},
-  {"\xE2\x80\x88Test String\xE2\x80\x8A", TRIM_ALL, "Test String", TRIM_ALL},
-  {"\xE3\x80\x80Test String\xE3\x80\x80", TRIM_ALL, "Test String", TRIM_ALL},
-  // UTF-8 strings that end with 0x85 (NEL in ISO-8859).
-  {"\xD0\x85", TRIM_TRAILING, "\xD0\x85", TRIM_NONE},
-  {"\xD9\x85", TRIM_TRAILING, "\xD9\x85", TRIM_NONE},
-  {"\xEC\x97\x85", TRIM_TRAILING, "\xEC\x97\x85", TRIM_NONE},
-  {"\xF0\x90\x80\x85", TRIM_TRAILING, "\xF0\x90\x80\x85", TRIM_NONE},
-  // UTF-8 strings that end with 0xA0 (non-break space in ISO-8859-1).
-  {"\xD0\xA0", TRIM_TRAILING, "\xD0\xA0", TRIM_NONE},
-  {"\xD9\xA0", TRIM_TRAILING, "\xD9\xA0", TRIM_NONE},
-  {"\xEC\x97\xA0", TRIM_TRAILING, "\xEC\x97\xA0", TRIM_NONE},
-  {"\xF0\x90\x80\xA0", TRIM_TRAILING, "\xF0\x90\x80\xA0", TRIM_NONE},
-};
-
-TEST(StringUtilTest, TrimWhitespaceUTF8) {
-  std::string output_ascii;
-  for (size_t i = 0; i < arraysize(trim_cases_ascii); ++i) {
-    const trim_case_ascii& value = trim_cases_ascii[i];
-    EXPECT_EQ(value.return_value,
-              TrimWhitespaceASCII(value.input, value.positions, &output_ascii));
-    EXPECT_EQ(value.output, output_ascii);
-  }
-
-  // Test that TrimWhiteSpaceUTF8() can remove Unicode space characters and
-  // prevent from removing UTF-8 characters that end with an ISO-8859 NEL.
-  std::string output_utf8;
-  for (size_t i = 0; i < arraysize(trim_cases_utf8); ++i) {
-    const trim_case_utf8& value = trim_cases_utf8[i];
-    EXPECT_EQ(value.return_value,
-              TrimWhitespaceUTF8(value.input, value.positions, &output_utf8));
-    EXPECT_EQ(value.output, output_utf8);
-  }
-}
-
 static const struct collapse_case {
   const wchar_t* input;
   const bool trim;
-- 
cgit v1.1