Do some cleanup of file path name handling.

This started trying to cleanup DownloadManager::GenerateFilename which asserts if your system locale isn't UTF-8 (I ran into this when mine got messed up). The solution is to have GetSuggestedFilename return a FilePath rather than calling FromWStringHack. The rest of the patch is a result of trying to write GetSuggestedFilename in a reasonable way. I changed ReplaceIllegalCharacters to work on a FilePath::StringType. Some places in the code calling these functions got cleaner, some got messier. I think overall the ones that got messier are the ones doing sketchy things with paths and the ones that got cleaner are the ones doing things more properly. The only code here that gets called a nontrivial number of times is the weburlloader, and I think the new code does about the same number of string conversions overall (though on certain platforms the number will be higher or lower). BUG=none TEST=none Review URL: http://codereview.chromium.org/271056 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@29832 0039d316-1c4b-4281-b951-d872f2087c98
author: brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-10-22 23:06:12 +0000
committer: brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-10-22 23:06:12 +0000
commit: de29433580baeecc204edc9ca0c2aa47c51aa93a (patch)
tree: ec55f655f63f7e09fb373c1e7d784c2e721c6119 /base
parent: 2c1639589b5932b565c9d420cb79b56a4212a706 (diff)
download: chromium_src-de29433580baeecc204edc9ca0c2aa47c51aa93a.zip
chromium_src-de29433580baeecc204edc9ca0c2aa47c51aa93a.tar.gz
chromium_src-de29433580baeecc204edc9ca0c2aa47c51aa93a.tar.bz2
3 files changed, 77 insertions, 44 deletions
diff --git a/base/i18n/file_util_icu.cc b/base/i18n/file_util_icu.cc
index 0bc9db6..4d33e3a 100644
--- a/base/i18n/file_util_icu.cc
+++ b/base/i18n/file_util_icu.cc
@@ -124,50 +124,47 @@ bool IsFilenameLegal(const string16& file_name) {
   return Singleton<IllegalCharacters>()->containsNone(file_name);
 }
 
-void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char) {
+void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
+                                    char replace_char) {
   DCHECK(file_name);
 
-  DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)) &&
-         replace_char < 0x10000);
+  DCHECK(!(Singleton<IllegalCharacters>()->contains(replace_char)));
 
   // Remove leading and trailing whitespace.
   TrimWhitespace(*file_name, TRIM_ALL, file_name);
 
-  if (IsFilenameLegal(WideToUTF16(*file_name)))
-    return;
+  IllegalCharacters* illegal = Singleton<IllegalCharacters>::get();
+  int cursor = 0;  // The ICU macros expect an int.
+  while (cursor < static_cast<int>(file_name->size())) {
+    int char_begin = cursor;
+    uint32 code_point;
+#if defined(OS_MACOSX)
+    // Mac uses UTF-8 encoding for filenames.
+    U8_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
+            code_point);
+#elif defined(OS_WIN)
+    // Windows uses UTF-16 encoding for filenames.
+    U16_NEXT(file_name->data(), cursor, static_cast<int>(file_name->length()),
+             code_point);
+#elif defined(OS_LINUX)
+    // Linux doesn't actually define an encoding. It basically allows anything
+    // except for a few special ASCII characters.
+    unsigned char cur_char = static_cast<unsigned char>((*file_name)[cursor++]);
+    if (cur_char >= 0x80)
+      continue;
+    code_point = cur_char;
+#else
+    NOTREACHED();
+#endif
 
-  std::wstring::size_type i = 0;
-  std::wstring::size_type length = file_name->size();
-  const wchar_t* wstr = file_name->data();
-#if defined(WCHAR_T_IS_UTF16)
-  // Using |span| method of UnicodeSet might speed things up a bit, but
-  // it's not likely to matter here.
-  std::wstring temp;
-  temp.reserve(length);
-  while (i < length) {
-    UChar32 ucs4;
-    std::wstring::size_type prev = i;
-    U16_NEXT(wstr, i, length, ucs4);
-    if (Singleton<IllegalCharacters>()->contains(ucs4)) {
-      temp.push_back(replace_char);
-    } else if (ucs4 < 0x10000) {
-      temp.push_back(ucs4);
-    } else {
-      temp.push_back(wstr[prev]);
-      temp.push_back(wstr[prev + 1]);
+    if (illegal->contains(code_point)) {
+      file_name->replace(char_begin, cursor - char_begin, 1, replace_char);
+      // We just made the potentially multi-byte/word char into one that only
+      // takes one byte/word, so need to adjust the cursor to point to the next
+      // character again.
+      cursor = char_begin + 1;
     }
   }
-  file_name->swap(temp);
-#elif defined(WCHAR_T_IS_UTF32)
-  while (i < length) {
-    if (Singleton<IllegalCharacters>()->contains(wstr[i])) {
-      (*file_name)[i] = replace_char;
-    }
-    ++i;
-  }
-#else
-#error wchar_t* should be either UTF-16 or UTF-32
-#endif
 }
 
 bool LocaleAwareCompareFilenames(const FilePath& a, const FilePath& b) {
diff --git a/base/i18n/file_util_icu.h b/base/i18n/file_util_icu.h
index c309a9e..54ddb08 100644
--- a/base/i18n/file_util_icu.h
+++ b/base/i18n/file_util_icu.h
@@ -6,6 +6,7 @@
 
 #include <string>
 
+#include "base/file_path.h"
 #include "base/string16.h"
 
 class FilePath;
@@ -18,12 +19,13 @@ bool IsFilenameLegal(const string16& file_name);
 
 // Replaces characters in 'file_name' that are illegal for file names with
 // 'replace_char'. 'file_name' must not be a full or relative path, but just the
-// file name component. Any leading or trailing whitespace in 'file_name' is
-// removed.
+// file name component (since slashes are considered illegal). Any leading or
+// trailing whitespace in 'file_name' is removed.
 // Example:
 //   file_name == "bad:file*name?.txt", changed to: "bad-file-name-.txt" when
 //   'replace_char' is '-'.
-void ReplaceIllegalCharacters(std::wstring* file_name, int replace_char);
+void ReplaceIllegalCharactersInPath(FilePath::StringType* file_name,
+                                    char replace_char);
 
 // Compares two filenames using the current locale information. This can be
 // used to sort directory listings. It behaves like "operator<" for use in
diff --git a/base/i18n/file_util_icu_unittest.cc b/base/i18n/file_util_icu_unittest.cc
index aebcd0df..b46fe55 100644
--- a/base/i18n/file_util_icu_unittest.cc
+++ b/base/i18n/file_util_icu_unittest.cc
@@ -6,6 +6,7 @@
 
 #include "base/file_util.h"
 #include "base/path_service.h"
+#include "base/utf_string_conversions.h"
 #include "testing/gtest/include/gtest/gtest.h"
 #include "testing/platform_test.h"
 
@@ -34,9 +35,34 @@ class FileUtilICUTest : public PlatformTest {
   FilePath test_dir_;
 };
 
+#if defined(OS_LINUX)
+
+// Linux disallows some evil ASCII characters, but passes all non-ASCII.
+static const struct goodbad_pair {
+  const char* bad_name;
+  const char* good_name;
+} kIllegalCharacterCases[] = {
+  {"bad*file:name?.jpg", "bad-file-name-.jpg"},
+  {"**********::::.txt", "--------------.txt"},
+  {"\xe9\xf0zzzz.\xff", "\xe9\xf0zzzz.\xff"},
+};
+
+TEST_F(FileUtilICUTest, ReplaceIllegalCharacersInPathLinuxTest) {
+  for (size_t i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
+    std::string bad_name(kIllegalCharacterCases[i].bad_name);
+    file_util::ReplaceIllegalCharactersInPath(&bad_name, '-');
+    EXPECT_EQ(kIllegalCharacterCases[i].good_name, bad_name);
+  }
+}
+
+#else
+
+// For Mac & Windows, which both do Unicode validation on filenames. These
+// characters are given as wide strings since its more convenient to specify
+// unicode characters. For Mac they should be converted to UTF-8.
 static const struct goodbad_pair {
-  std::wstring bad_name;
-  std::wstring good_name;
+  const wchar_t* bad_name;
+  const wchar_t* good_name;
 } kIllegalCharacterCases[] = {
   {L"bad*file:name?.jpg", L"bad-file-name-.jpg"},
   {L"**********::::.txt", L"--------------.txt"},
@@ -46,7 +72,7 @@ static const struct goodbad_pair {
 #if defined(OS_WIN)
   {L"bad*file\\name.jpg", L"bad-file-name.jpg"},
   {L"\t  bad*file\\name/.jpg ", L"bad-file-name-.jpg"},
-#elif defined(OS_POSIX)
+#elif defined(OS_MACOSX)
   {L"bad*file?name.jpg", L"bad-file-name.jpg"},
   {L"\t  bad*file?name/.jpg ", L"bad-file-name-.jpg"},
 #endif
@@ -61,11 +87,19 @@ static const struct goodbad_pair {
   {L"bad\uFDD0file\uFDEFname.jpg ", L"bad-file-name.jpg"},
 };
 
-TEST_F(FileUtilICUTest, ReplaceIllegalCharactersTest) {
-  for (unsigned int i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
+TEST_F(FileUtilICUTest, ReplaceIllegalCharactersInPathTest) {
+  for (size_t i = 0; i < arraysize(kIllegalCharacterCases); ++i) {
+#if defined(OS_WIN)
     std::wstring bad_name(kIllegalCharacterCases[i].bad_name);
-    file_util::ReplaceIllegalCharacters(&bad_name, L'-');
+    file_util::ReplaceIllegalCharactersInPath(&bad_name, '-');
     EXPECT_EQ(kIllegalCharacterCases[i].good_name, bad_name);
+#elif defined(OS_MACOSX)
+    std::string bad_name(WideToUTF8(kIllegalCharacterCases[i].bad_name));
+    file_util::ReplaceIllegalCharactersInPath(&bad_name, '-');
+    EXPECT_EQ(WideToUTF8(kIllegalCharacterCases[i].good_name), bad_name);
+#endif
   }
 }
 
+#endif
+
author	brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-10-22 23:06:12 +0000
committer	brettw@chromium.org <brettw@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-10-22 23:06:12 +0000
commit	de29433580baeecc204edc9ca0c2aa47c51aa93a (patch)
tree	ec55f655f63f7e09fb373c1e7d784c2e721c6119 /base
parent	2c1639589b5932b565c9d420cb79b56a4212a706 (diff)
download	chromium_src-de29433580baeecc204edc9ca0c2aa47c51aa93a.zip chromium_src-de29433580baeecc204edc9ca0c2aa47c51aa93a.tar.gz chromium_src-de29433580baeecc204edc9ca0c2aa47c51aa93a.tar.bz2