diff options
author | brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-10-08 17:38:30 +0000 |
---|---|---|
committer | brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-10-08 17:38:30 +0000 |
commit | d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036 (patch) | |
tree | 1c2ee733bf62a44c31dc11f76dad53243a84439f /base/i18n | |
parent | e91d532339c854ff0a082c6562a519647524fa66 (diff) | |
download | chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.zip chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.tar.gz chromium_src-d0767cb54b2b5ee4d9cf00b3ee0fa585826b4036.tar.bz2 |
Separate out some more ICU from base and into base/i18n.
This moves string_util_icu. I moved the number formatting function into
base/i18n/number_formatting and removed the only other function in
string_util_icu, TrimWhitespaceUTF8. It was only used in a few places and
wasn't actually helpful; it is better for callers to see explicitly that the
trim round-trips through UTF-16.
This takes out the sorting from the FileEnumerator. The comment says the
sorting is not guaranteed. I moved it into file_util_icu as a standalone
function for callers of FileEnumerator to call manually if they need sorted
results. I modified the directory lister to use this sorting instead, and filed
a bug on doing more optimal JS-based sorting.
TEST=none
BUG=none
Review URL: http://codereview.chromium.org/267001
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@28405 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/i18n')
-rw-r--r-- | base/i18n/file_util_icu.cc | 193 | ||||
-rw-r--r-- | base/i18n/file_util_icu.h | 33 | ||||
-rw-r--r-- | base/i18n/file_util_icu_unittest.cc | 71 | ||||
-rw-r--r-- | base/i18n/number_formatting.cc | 48 | ||||
-rw-r--r-- | base/i18n/number_formatting.h | 19 |
5 files changed, 364 insertions, 0 deletions
diff --git a/base/i18n/file_util_icu.cc b/base/i18n/file_util_icu.cc new file mode 100644 index 0000000..0bc9db6 --- /dev/null +++ b/base/i18n/file_util_icu.cc @@ -0,0 +1,193 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// File utilities that use the ICU library go in this file. + +#include "base/i18n/file_util_icu.h" + +#include "base/file_path.h" +#include "base/scoped_ptr.h" +#include "base/singleton.h" +#include "base/string_util.h" +#include "base/sys_string_conversions.h" +#include "build/build_config.h" +#include "unicode/coll.h" +#include "unicode/uniset.h" + +namespace { + +class IllegalCharacters { + public: + bool contains(UChar32 ucs4) { + return !!set->contains(ucs4); + } + + bool containsNone(const string16 &s) { + return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size())); + } + + private: + friend class Singleton<IllegalCharacters>; + friend struct DefaultSingletonTraits<IllegalCharacters>; + + IllegalCharacters(); + ~IllegalCharacters() { } + + scoped_ptr<icu::UnicodeSet> set; + + DISALLOW_COPY_AND_ASSIGN(IllegalCharacters); +}; + +IllegalCharacters::IllegalCharacters() { + UErrorCode status = U_ZERO_ERROR; + // Control characters, formatting characters, non-characters, and + // some printable ASCII characters regarded as dangerous ('"*/:<>?\\'). + // See http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx + // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx + // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they + // are legitimate in Arabic and some S/SE Asian scripts. However, when used + // elsewhere, they can be confusing/problematic. + // Also, consider wrapping the set with our Singleton class to create and + // freeze it only once. Note that there's a trade-off between memory and + // speed. 
+#if defined(WCHAR_T_IS_UTF16) + set.reset(new icu::UnicodeSet(icu::UnicodeString( + L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status)); +#else + set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE( + "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(), + status)); +#endif + DCHECK(U_SUCCESS(status)); + // Add non-characters. If this becomes a performance bottleneck by + // any chance, do not add these to |set| and change IsFilenameLegal() + // to check |(ucs4 & 0xFFFEu) == 0xFFFEu|, in addition to calling + // containsNone(). + set->add(0xFDD0, 0xFDEF); + for (int i = 0; i <= 0x10; ++i) { + int plane_base = 0x10000 * i; + set->add(plane_base + 0xFFFE, plane_base + 0xFFFF); + } + set->freeze(); +} + +class LocaleAwareComparator { + public: + LocaleAwareComparator() { + UErrorCode error_code = U_ZERO_ERROR; + // Use the default collator. The default locale should have been properly + // set by the time this constructor is called. + collator_.reset(icu::Collator::createInstance(error_code)); + DCHECK(U_SUCCESS(error_code)); + // Make it case-sensitive. + collator_->setStrength(icu::Collator::TERTIARY); + // Note: We do not set UCOL_NORMALIZATION_MODE attribute. In other words, we + // do not pay performance penalty to guarantee sort order correctness for + // non-FCD (http://unicode.org/notes/tn5/#FCD) file names. This should be a + // reasonable tradeoff because such file names should be rare and the sort + // order doesn't change much anyway. + } + + // Note: A similar function is available in l10n_util. + // We cannot use it because base should not depend on l10n_util. + // TODO(yuzo): Move some of l10n_util to base. + int Compare(const string16& a, const string16& b) { + // We are not sure if Collator::compare is thread-safe. + // Use an AutoLock just in case. 
+ AutoLock auto_lock(lock_); + + UErrorCode error_code = U_ZERO_ERROR; + UCollationResult result = collator_->compare( + static_cast<const UChar*>(a.c_str()), + static_cast<int>(a.length()), + static_cast<const UChar*>(b.c_str()), + static_cast<int>(b.length()), + error_code); + DCHECK(U_SUCCESS(error_code)); + return result; + } + + private: + scoped_ptr<icu::Collator> collator_; + Lock lock_; + friend struct DefaultSingletonTraits<LocaleAwareComparator>; + + DISALLOW_COPY_AND_ASSIGN(LocaleAwareComparator); +}; + +} // namespace + +namespace file_util { + +bool IsFilenameLegal(const string16& file_name) { + return Singleton<IllegalCharacters>()->containsNone(file_name); +} + +void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char) { + DCHECK(file_name); + + DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)) && + replace_char < 0x10000); + + // Remove leading and trailing whitespace. + TrimWhitespace(*file_name, TRIM_ALL, file_name); + + if (IsFilenameLegal(WideToUTF16(*file_name))) + return; + + std::wstring::size_type i = 0; + std::wstring::size_type length = file_name->size(); + const wchar_t* wstr = file_name->data(); +#if defined(WCHAR_T_IS_UTF16) + // Using |span| method of UnicodeSet might speed things up a bit, but + // it's not likely to matter here. 
+ std::wstring temp; + temp.reserve(length); + while (i < length) { + UChar32 ucs4; + std::wstring::size_type prev = i; + U16_NEXT(wstr, i, length, ucs4); + if (Singleton<IllegalCharacters>()->contains(ucs4)) { + temp.push_back(replace_char); + } else if (ucs4 < 0x10000) { + temp.push_back(ucs4); + } else { + temp.push_back(wstr[prev]); + temp.push_back(wstr[prev + 1]); + } + } + file_name->swap(temp); +#elif defined(WCHAR_T_IS_UTF32) + while (i < length) { + if (Singleton<IllegalCharacters>()->contains(wstr[i])) { + (*file_name)[i] = replace_char; + } + ++i; + } +#else +#error wchar_t* should be either UTF-16 or UTF-32 +#endif +} + +bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) { +#if defined(OS_WIN) + return Singleton<LocaleAwareComparator>()->Compare(a.value().c_str(), + b.value().c_str()) < 0; + +#elif defined(OS_POSIX) + // On linux, the file system encoding is not defined. We assume + // SysNativeMBToWide takes care of it. + // + // ICU's collator can take strings in OS native encoding. But we convert the + // strings to UTF-16 ourselves to ensure conversion consistency. + // TODO(yuzo): Perhaps we should define SysNativeMBToUTF16? + return Singleton<LocaleAwareComparator>()->Compare( + WideToUTF16(base::SysNativeMBToWide(a.value().c_str())), + WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))) < 0; +#else + #error Not implemented on your system +#endif +} + +} // namespace file_util diff --git a/base/i18n/file_util_icu.h b/base/i18n/file_util_icu.h new file mode 100644 index 0000000..c309a9e --- /dev/null +++ b/base/i18n/file_util_icu.h @@ -0,0 +1,33 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// File utilities that use the ICU library go in this file. 
+ +#include <string> + +#include "base/string16.h" + +class FilePath; + +namespace file_util { + +// Returns true if file_name does not have any illegal character. The input +// param has the same restriction as that for ReplaceIllegalCharacters. +bool IsFilenameLegal(const string16& file_name); + +// Replaces characters in 'file_name' that are illegal for file names with +// 'replace_char'. 'file_name' must not be a full or relative path, but just the +// file name component. Any leading or trailing whitespace in 'file_name' is +// removed. +// Example: +// file_name == "bad:file*name?.txt", changed to: "bad-file-name-.txt" when +// 'replace_char' is '-'. +void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char); + +// Compares two filenames using the current locale information. This can be +// used to sort directory listings. It behaves like "operator<" for use in +// std::sort. +bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b); + +} // namespace file_util diff --git a/base/i18n/file_util_icu_unittest.cc b/base/i18n/file_util_icu_unittest.cc new file mode 100644 index 0000000..aebcd0df --- /dev/null +++ b/base/i18n/file_util_icu_unittest.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/i18n/file_util_icu.h" + +#include "base/file_util.h" +#include "base/path_service.h" +#include "testing/gtest/include/gtest/gtest.h" +#include "testing/platform_test.h" + +// file_util winds up using autoreleased objects on the Mac, so this needs +// to be a PlatformTest +class FileUtilICUTest : public PlatformTest { + protected: + virtual void SetUp() { + PlatformTest::SetUp(); + // Name a subdirectory of the temp directory. 
+ ASSERT_TRUE(PathService::Get(base::DIR_TEMP, &test_dir_)); + test_dir_ = test_dir_.Append(FILE_PATH_LITERAL("FileUtilTest")); + + // Create a fresh, empty copy of this directory. + file_util::Delete(test_dir_, true); + file_util::CreateDirectory(test_dir_); + } + virtual void TearDown() { + PlatformTest::TearDown(); + // Clean up test directory + ASSERT_TRUE(file_util::Delete(test_dir_, true)); + ASSERT_FALSE(file_util::PathExists(test_dir_)); + } + + // the path to temporary directory used to contain the test operations + FilePath test_dir_; +}; + +static const struct goodbad_pair { + std::wstring bad_name; + std::wstring good_name; +} kIllegalCharacterCases[] = { + {L"bad*file:name?.jpg", L"bad-file-name-.jpg"}, + {L"**********::::.txt", L"--------------.txt"}, + // We can't use UCNs (universal character names) for C0/C1 characters and + // U+007F, but \x escape is interpreted by MSVC and gcc as we intend. + {L"bad\x0003\x0091 file\u200E\u200Fname.png", L"bad-- file--name.png"}, +#if defined(OS_WIN) + {L"bad*file\\name.jpg", L"bad-file-name.jpg"}, + {L"\t bad*file\\name/.jpg ", L"bad-file-name-.jpg"}, +#elif defined(OS_POSIX) + {L"bad*file?name.jpg", L"bad-file-name.jpg"}, + {L"\t bad*file?name/.jpg ", L"bad-file-name-.jpg"}, +#endif + {L"this_file_name is okay!.mp3", L"this_file_name is okay!.mp3"}, + {L"\u4E00\uAC00.mp3", L"\u4E00\uAC00.mp3"}, + {L"\u0635\u200C\u0644.mp3", L"\u0635\u200C\u0644.mp3"}, + {L"\U00010330\U00010331.mp3", L"\U00010330\U00010331.mp3"}, + // Unassigned codepoints are ok. + {L"\u0378\U00040001.mp3", L"\u0378\U00040001.mp3"}, + // Non-characters are not allowed. 
+ {L"bad\uFFFFfile\U0010FFFEname.jpg ", L"bad-file-name.jpg"}, + {L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"}, +}; + +TEST_F(FileUtilICUTest, ReplaceIllegalCharactersTest) { + for (unsigned int i = 0; i < arraysize(kIllegalCharacterCases); ++i) { + std::wstring bad_name(kIllegalCharacterCases[i].bad_name); + file_util::ReplaceIllegalCharacters(&bad_name, L'-'); + EXPECT_EQ(kIllegalCharacterCases[i].good_name, bad_name); + } +} + diff --git a/base/i18n/number_formatting.cc b/base/i18n/number_formatting.cc new file mode 100644 index 0000000..fef1b7d --- /dev/null +++ b/base/i18n/number_formatting.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2008 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/i18n/number_formatting.h" + +#include "base/logging.h" +#include "base/singleton.h" +#include "base/string_util.h" +#include "base/utf_string_conversions.h" +#include "unicode/numfmt.h" +#include "unicode/ustring.h" + +namespace base { + +namespace { + +struct NumberFormatSingletonTraits + : public DefaultSingletonTraits<icu::NumberFormat> { + static icu::NumberFormat* New() { + UErrorCode status = U_ZERO_ERROR; + icu::NumberFormat* formatter = icu::NumberFormat::createInstance(status); + DCHECK(U_SUCCESS(status)); + return formatter; + } + // There's no ICU call to destroy a NumberFormat object other than + // operator delete, so use the default Delete, which calls operator delete. + // This can cause problems if a different allocator is used by this file than + // by ICU. +}; + +} // namespace + +string16 FormatNumber(int64 number) { + icu::NumberFormat* number_format = + Singleton<icu::NumberFormat, NumberFormatSingletonTraits>::get(); + + if (!number_format) { + // As a fallback, just return the raw number in a string. 
+ return UTF8ToUTF16(StringPrintf("%lld", number)); + } + icu::UnicodeString ustr; + number_format->format(number, ustr); + + return string16(ustr.getBuffer(), static_cast<size_t>(ustr.length())); +} + +} // namespace base diff --git a/base/i18n/number_formatting.h b/base/i18n/number_formatting.h new file mode 100644 index 0000000..9fa2b18 --- /dev/null +++ b/base/i18n/number_formatting.h @@ -0,0 +1,19 @@ +// Copyright (c) 2008 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_I18N_NUMBER_FORMATTING_H_ +#define BASE_I18N_NUMBER_FORMATTING_H_ + +#include <string> + +#include "base/basictypes.h" +#include "base/string16.h" + +namespace base { + +string16 FormatNumber(int64 number); + +} // namespace base + +#endif // BASE_I18N_NUMBER_FORMATTING_H_ |