Add support for the extended header parameter syntax in Content-Disposition header (RFC 5987).

The implementation is not generic: it is only used for the 'filename' param. The CL was originally written by James Simonsen, and I reviewed it at http://codereview.chromium.org/4254001/show. I added a check for ASCIIness of the RFC 5987 extended header and a few tests to NetUti*.GetFileNameFromCD (net_unittests) and I*.ConvertTo*Norma* (base_unittests). I also replaced the '\uxxxx' notation with the corresponding UTF-8 byte sequences because Visual Studio does not understand it yet. BUG=57830 TEST="net_unittests --gtest_filter=NetU*.GetFil*", "base_unittests --gtest_filter=I*.Conver*Norm*" and tests at http://greenbytes.de/tech/tc2231/ Original CL / Review: By James Simonsen; at http://codereview.chromium.org/4254001/show Review URL: http://codereview.chromium.org/4435001 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@64987 0039d316-1c4b-4281-b951-d872f2087c98
author: jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-11-03 23:25:55 +0000
committer: jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2010-11-03 23:25:55 +0000
commit: c09fb1c79c0a3e76dbb6091e4b718fd9bb197395 (patch)
tree: aab648921cd1817792614596ba4e6f94c9c3d7e0 /base/i18n
parent: 0912579b25f74d5b66c8adc0d3d8a7f805141e89 (diff)
download: chromium_src-c09fb1c79c0a3e76dbb6091e4b718fd9bb197395.zip
chromium_src-c09fb1c79c0a3e76dbb6091e4b718fd9bb197395.tar.gz
chromium_src-c09fb1c79c0a3e76dbb6091e4b718fd9bb197395.tar.bz2
3 files changed, 62 insertions, 0 deletions
diff --git a/base/i18n/icu_string_conversions.cc b/base/i18n/icu_string_conversions.cc
index 9014a7b..c353feb 100644
--- a/base/i18n/icu_string_conversions.cc
+++ b/base/i18n/icu_string_conversions.cc
@@ -9,9 +9,11 @@
 #include "base/basictypes.h"
 #include "base/logging.h"
 #include "base/string_util.h"
+#include "base/utf_string_conversions.h"
 #include "unicode/ucnv.h"
 #include "unicode/ucnv_cb.h"
 #include "unicode/ucnv_err.h"
+#include "unicode/unorm.h"
 #include "unicode/ustring.h"
 
 namespace base {
@@ -264,4 +266,28 @@ bool CodepageToWide(const std::string& encoded,
 #endif  // defined(WCHAR_T_IS_UTF32)
 }
 
+// Converts |text| from the encoding named by |charset| to UTF-8 in Unicode
+// Normalization Form C (NFC) and stores the outcome in |result|.
+//
+// |result| is always cleared first. Returns false if |text| cannot be decoded
+// as |charset| (decoding uses OnStringConversionError::FAIL), if ICU reports a
+// normalization error, or if the final UTF-16 to UTF-8 conversion fails.
+bool ConvertToUtf8AndNormalize(const std::string& text,
+                               const std::string& charset,
+                               std::string* result) {
+  result->clear();
+  string16 utf16;
+  if (!CodepageToUTF16(
+      text, charset.c_str(), OnStringConversionError::FAIL, &utf16))
+    return false;
+
+  // NFC output is not guaranteed to fit in the input's length: code points
+  // that are excluded from composition normalize to longer sequences. Start
+  // with a buffer sized for the common case and fall back to the exact size
+  // ICU asks for when that is not enough.
+  UErrorCode status = U_ZERO_ERROR;
+  size_t capacity = utf16.length() + 1;  // +1 for the terminating NUL.
+  string16 normalized_utf16;
+  int32_t actual_length = unorm_normalize(
+      utf16.c_str(), static_cast<int32_t>(utf16.length()), UNORM_NFC, 0,
+      WriteInto(&normalized_utf16, capacity),
+      static_cast<int32_t>(capacity), &status);
+  if (status == U_BUFFER_OVERFLOW_ERROR ||
+      status == U_STRING_NOT_TERMINATED_WARNING) {
+    // Either the output was truncated, or it filled the buffer completely and
+    // left no room for the terminator. In both cases |actual_length| is the
+    // number of code units required, so retry with room for it plus the NUL.
+    capacity = static_cast<size_t>(actual_length) + 1;
+    status = U_ZERO_ERROR;
+    actual_length = unorm_normalize(
+        utf16.c_str(), static_cast<int32_t>(utf16.length()), UNORM_NFC, 0,
+        WriteInto(&normalized_utf16, capacity),
+        static_cast<int32_t>(capacity), &status);
+  }
+  if (!U_SUCCESS(status))
+    return false;
+  // Trim the buffer down to the code units ICU actually produced.
+  normalized_utf16.resize(actual_length);
+
+  return UTF16ToUTF8(normalized_utf16.data(),
+                     normalized_utf16.length(), result);
+}
+
 }  // namespace base
diff --git a/base/i18n/icu_string_conversions.h b/base/i18n/icu_string_conversions.h
index 1495cae..901771b 100644
--- a/base/i18n/icu_string_conversions.h
+++ b/base/i18n/icu_string_conversions.h
@@ -64,6 +64,12 @@ bool CodepageToWide(const std::string& encoded,
                     OnStringConversionError::Type on_error,
                     std::wstring* wide);
 
+// Converts |text| from the encoding named by |charset| to UTF-8, applying
+// Unicode NFC normalization to the converted text before it is written to
+// |result|. |result| is cleared before conversion starts. Returns false if
+// |text| cannot be decoded as |charset|, or if normalization or the final
+// UTF-8 conversion reports an error.
+bool ConvertToUtf8AndNormalize(const std::string& text,
+                               const std::string& charset,
+                               std::string* result);
+
 }  // namespace base
 
 #endif  // BASE_I18N_ICU_STRING_CONVERSIONS_H_
diff --git a/base/i18n/icu_string_conversions_unittest.cc b/base/i18n/icu_string_conversions_unittest.cc
index 2083fa9..40b0fed 100644
--- a/base/i18n/icu_string_conversions_unittest.cc
+++ b/base/i18n/icu_string_conversions_unittest.cc
@@ -11,6 +11,7 @@
 #include "base/basictypes.h"
 #include "base/i18n/icu_string_conversions.h"
 #include "base/logging.h"
+#include "base/string_piece.h"
 #include "base/utf_string_conversions.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
@@ -325,4 +326,33 @@ TEST(ICUStringConversionsTest, ConvertBetweenCodepageAndUTF16) {
   }
 }
 
+// Each entry feeds |encoded|, interpreted as |codepage_name|, through
+// ConvertToUtf8AndNormalize() and records whether the call is expected to
+// succeed and which UTF-8 bytes it is expected to leave in the output.
+static const struct {
+  const char* encoded;
+  const char* codepage_name;
+  bool expected_success;
+  const char* expected_value;
+} kConvertAndNormalizeCases[] = {
+  // The same byte (xE4) maps to different characters per codepage.
+  {"foo-\xe4.html", "iso-8859-1", true, "foo-\xc3\xa4.html"},
+  {"foo-\xe4.html", "iso-8859-7", true, "foo-\xce\xb4.html"},
+  // Cases expected to fail; the output is expected to be empty.
+  {"foo-\xe4.html", "foo-bar", false, ""},
+  {"foo-\xff.html", "ascii", false, ""},
+  {"foo.html", "ascii", true, "foo.html"},
+  // 'a' followed by U+0308 is expected to come back as precomposed U+00E4.
+  {"foo-a\xcc\x88.html", "utf-8", true, "foo-\xc3\xa4.html"},
+  // Multi-byte codepages.
+  {"\x95\x32\x82\x36\xD2\xBB", "gb18030", true, "\xF0\xA0\x80\x80\xE4\xB8\x80"},
+  {"\xA7\x41\xA6\x6E", "big5", true, "\xE4\xBD\xA0\xE5\xA5\xBD"},
+  // Windows-1258 does have a combining character at xD2 (which is U+0309).
+  // The sequence of (U+00E2, U+0309) is also encoded as U+1EA9.
+  {"foo\xE2\xD2", "windows-1258", true, "foo\xE1\xBA\xA9"},
+};
+
+// Runs every table entry through ConvertToUtf8AndNormalize() and checks both
+// the returned status and the produced string.
+TEST(ICUStringConversionsTest, ConvertToUtf8AndNormalize) {
+  const size_t case_count = ARRAYSIZE_UNSAFE(kConvertAndNormalizeCases);
+  // One output string is shared by all iterations.
+  std::string converted;
+  for (size_t index = 0; index != case_count; ++index) {
+    const char* const input = kConvertAndNormalizeCases[index].encoded;
+    const char* const charset = kConvertAndNormalizeCases[index].codepage_name;
+    const bool succeeded =
+        ConvertToUtf8AndNormalize(input, charset, &converted);
+    EXPECT_EQ(kConvertAndNormalizeCases[index].expected_success, succeeded);
+    EXPECT_EQ(kConvertAndNormalizeCases[index].expected_value, converted);
+  }
+}
+
 }  // namespace base
author	jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-11-03 23:25:55 +0000
committer	jshin@chromium.org <jshin@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2010-11-03 23:25:55 +0000
commit	c09fb1c79c0a3e76dbb6091e4b718fd9bb197395 (patch)
tree	aab648921cd1817792614596ba4e6f94c9c3d7e0 /base/i18n
parent	0912579b25f74d5b66c8adc0d3d8a7f805141e89 (diff)
download	chromium_src-c09fb1c79c0a3e76dbb6091e4b718fd9bb197395.zip chromium_src-c09fb1c79c0a3e76dbb6091e4b718fd9bb197395.tar.gz chromium_src-c09fb1c79c0a3e76dbb6091e4b718fd9bb197395.tar.bz2