summaryrefslogtreecommitdiffstats
path: root/base/i18n/file_util_icu.cc
blob: 4d33e3a2d898ed0f4d4deeb3a7f6b668dc1b8232 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
// Copyright (c) 2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// File utilities that use the ICU library go in this file.

#include "base/i18n/file_util_icu.h"

#include "base/file_path.h"
#include "base/scoped_ptr.h"
#include "base/singleton.h"
#include "base/string_util.h"
#include "base/sys_string_conversions.h"
#include "build/build_config.h"
#include "unicode/coll.h"
#include "unicode/uniset.h"

namespace {

class IllegalCharacters {
 public:
  bool contains(UChar32 ucs4) {
    return !!set->contains(ucs4);
  }

  bool containsNone(const string16 &s) {
    return !!set->containsNone(icu::UnicodeString(s.c_str(), s.size()));
  }

 private:
  friend class Singleton<IllegalCharacters>;
  friend struct DefaultSingletonTraits<IllegalCharacters>;

  IllegalCharacters();
  ~IllegalCharacters() { }

  scoped_ptr<icu::UnicodeSet> set;

  DISALLOW_COPY_AND_ASSIGN(IllegalCharacters);
};

IllegalCharacters::IllegalCharacters() {
  UErrorCode status = U_ZERO_ERROR;
  // Control characters, formatting characters, non-characters, and
  // some printable ASCII characters regarded as dangerous ('"*/:<>?\\').
  // See  http://blogs.msdn.com/michkap/archive/2006/11/03/941420.aspx
  // and http://msdn2.microsoft.com/en-us/library/Aa365247.aspx
  // TODO(jungshik): Revisit the set. ZWJ and ZWNJ are excluded because they
  // are legitimate in Arabic and some S/SE Asian scripts. However, when used
  // elsewhere, they can be confusing/problematic.
  // Also, consider wrapping the set with our Singleton class to create and
  // freeze it only once. Note that there's a trade-off between memory and
  // speed.
#if defined(WCHAR_T_IS_UTF16)
  set.reset(new icu::UnicodeSet(icu::UnicodeString(
      L"[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\u200c\u200d]]"), status));
#else
  set.reset(new icu::UnicodeSet(UNICODE_STRING_SIMPLE(
      "[[\"*/:<>?\\\\|][:Cc:][:Cf:] - [\\u200c\\u200d]]").unescape(),
      status));
#endif
  DCHECK(U_SUCCESS(status));
  // Add non-characters. If this becomes a performance bottleneck by
  // any chance, do not add these to |set| and change IsFilenameLegal()
  // to check |ucs4 & 0xFFFEu == 0xFFFEu|, in addiition to calling
  // containsNone().
  set->add(0xFDD0, 0xFDEF);
  for (int i = 0; i <= 0x10; ++i) {
    int plane_base = 0x10000 * i;
    set->add(plane_base + 0xFFFE, plane_base + 0xFFFF);
  }
  set->freeze();
}

class LocaleAwareComparator {
 public:
  LocaleAwareComparator() {
    UErrorCode error_code = U_ZERO_ERROR;
    // Use the default collator. The default locale should have been properly
    // set by the time this constructor is called.
    collator_.reset(icu::Collator::createInstance(error_code));
    DCHECK(U_SUCCESS(error_code));
    // Make it case-sensitive.
    collator_->setStrength(icu::Collator::TERTIARY);
    // Note: We do not set UCOL_NORMALIZATION_MODE attribute. In other words, we
    // do not pay performance penalty to guarantee sort order correctness for
    // non-FCD (http://unicode.org/notes/tn5/#FCD) file names. This should be a
    // reasonable tradeoff because such file names should be rare and the sort
    // order doesn't change much anyway.
  }

  // Note: A similar function is available in l10n_util.
  // We cannot use it because base should not depend on l10n_util.
  // TODO(yuzo): Move some of l10n_util to base.
  int Compare(const string16& a, const string16& b) {
    // We are not sure if Collator::compare is thread-safe.
    // Use an AutoLock just in case.
    AutoLock auto_lock(lock_);

    UErrorCode error_code = U_ZERO_ERROR;
    UCollationResult result = collator_->compare(
        static_cast<const UChar*>(a.c_str()),
        static_cast<int>(a.length()),
        static_cast<const UChar*>(b.c_str()),
        static_cast<int>(b.length()),
        error_code);
    DCHECK(U_SUCCESS(error_code));
    return result;
  }

 private:
  scoped_ptr<icu::Collator> collator_;
  Lock lock_;
  friend struct DefaultSingletonTraits<LocaleAwareComparator>;

  DISALLOW_COPY_AND_ASSIGN(LocaleAwareComparator);
};

}  // namespace

namespace file_util {

bool IsFilenameLegal(const string16& file_name) {
  return Singleton<IllegalCharacters>()->containsNone(file_name);
}

void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
                                    char replace_char) {
  DCHECK(file_name);

  DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)));

  // Remove leading and trailing whitespace.
  TrimWhitespace(*file_name, TRIM_ALL, file_name);

  IllegalCharacters* illegal = Singleton<IllegalCharacters>::get();
  int cursor = 0;  // The ICU macros expect an int.
  while (cursor < static_cast<int>(file_name->size())) {
    int char_begin = cursor;
    uint32 code_point;
#if defined(OS_MACOSX)
    // Mac uses UTF-8 encoding for filenames.
    U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
            code_point);
#elif defined(OS_WIN)
    // Windows uses UTF-16 encoding for filenames.
    U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
             code_point);
#elif defined(OS_LINUX)
    // Linux doesn't actually define an encoding. It basically allows anything
    // except for a few special ASCII characters.
    unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]);
    if (cur_char >= 0x80)
      continue;
    code_point = cur_char;
#else
    NOTREACHED();
#endif

    if (illegal->contains(code_point)) {
      file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
      // We just made the potentially multi-byte/word char into one that only
      // takes one byte/word, so need to adjust the cursor to point to the next
      // character again.
      cursor = char_begin + 1;
    }
  }
}

bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
#if defined(OS_WIN)
  return Singleton<LocaleAwareComparator>()->Compare(a.value().c_str(),
                                                     b.value().c_str()) < 0;

#elif defined(OS_POSIX)
  // On linux, the file system encoding is not defined. We assume
  // SysNativeMBToWide takes care of it.
  //
  // ICU's collator can take strings in OS native encoding. But we convert the
  // strings to UTF-16 ourselves to ensure conversion consistency.
  // TODO(yuzo): Perhaps we should define SysNativeMBToUTF16?
  return Singleton<LocaleAwareComparator>()->Compare(
      WideToUTF16(base::SysNativeMBToWide(a.value().c_str())),
      WideToUTF16(base::SysNativeMBToWide(b.value().c_str()))) < 0;
#else
  #error Not implemented on your system
#endif
}

}  // namespace