Fixes Issue 7377: Regression: Omnibox trims URL ending with 0x85 (Take 2)

This is the same change as "http://codereview.chromium.org/20219/show", which I reverted because it caused build breaks on sandbox. On investigation, the build break seems to be caused by "base/string_util.cc" including the TrimWhitespaceUTF8() function. To fix this build break, I moved the TrimWhitespaceUTF8() function to "base/string_util_icu.cc". BUG=7377 Review URL: http://codereview.chromium.org/28310 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@10970 0039d316-1c4b-4281-b951-d872f2087c98
author: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-03-05 03:41:51 +0000
committer: hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2009-03-05 03:41:51 +0000
commit: 90f933a3b3ce799ebd2f1e04c8906ba0c6f514d9 (patch)
tree: 6baaad8528a0bc8cf45df75ef3dd69c60ccb99a4
parent: 3a164a1633cc2c8136aa385223a8ebdbda407150 (diff)
download: chromium_src-90f933a3b3ce799ebd2f1e04c8906ba0c6f514d9.zip
chromium_src-90f933a3b3ce799ebd2f1e04c8906ba0c6f514d9.tar.gz
chromium_src-90f933a3b3ce799ebd2f1e04c8906ba0c6f514d9.tar.bz2
6 files changed, 106 insertions, 8 deletions
diff --git a/base/string_util.cc b/base/string_util.cc
index 8f9bcf9..b37b483 100644
--- a/base/string_util.cc
+++ b/base/string_util.cc
@@ -381,8 +381,6 @@ const char kWhitespaceASCII[] = {
   0x0C,
   0x0D,
   0x20,    // Space
-  '\x85',  // <control-0085>
-  '\xa0',  // No-Break Space
   0
 };
 const char* const kCodepageUTF8 = "UTF-8";
@@ -437,10 +435,18 @@ TrimPositions TrimWhitespace(const std::wstring& input,
   return TrimStringT(input, kWhitespaceWide, positions, output);
 }
 
+TrimPositions TrimWhitespaceASCII(const std::string& input,
+                                  TrimPositions positions,
+                                  std::string* output) {
+  return TrimStringT(input, kWhitespaceASCII, positions, output);
+}
+
+// This function is only for backward-compatibility.
+// To be removed when all callers are updated.
 TrimPositions TrimWhitespace(const std::string& input,
                              TrimPositions positions,
                              std::string* output) {
-  return TrimStringT(input, kWhitespaceASCII, positions, output);
+  return TrimWhitespaceASCII(input, positions, output);
 }
 
 std::wstring CollapseWhitespace(const std::wstring& text,
diff --git a/base/string_util.h b/base/string_util.h
index 3f905fe..a688828 100644
--- a/base/string_util.h
+++ b/base/string_util.h
@@ -130,9 +130,13 @@ bool TrimString(const std::string& input,
                 std::string* output);
 
 // Trims any whitespace from either end of the input string.  Returns where
-// whitespace was found.  The non-wide version of this function only looks for
-// ASCII whitespace; UTF-8 code-points are not searched for (use the wide
-// version instead).
+// whitespace was found.
+// The non-wide version has two functions:
+// * TrimWhitespaceASCII()
+//   This function is for ASCII strings and only looks for ASCII whitespace;
+// * TrimWhitespaceUTF8()
+//   This function is for UTF-8 strings and looks for Unicode whitespace.
+// Please choose the best one according to your usage.
 // NOTE: Safe to use the same variable for both input and output.
 enum TrimPositions {
   TRIM_NONE     = 0,
@@ -143,6 +147,15 @@ enum TrimPositions {
 TrimPositions TrimWhitespace(const std::wstring& input,
                              TrimPositions positions,
                              std::wstring* output);
+TrimPositions TrimWhitespaceASCII(const std::string& input,
+                                  TrimPositions positions,
+                                  std::string* output);
+TrimPositions TrimWhitespaceUTF8(const std::string& input,
+                                 TrimPositions positions,
+                                 std::string* output);
+
+// Deprecated. This function is only for backward compatibility and calls
+// TrimWhitespaceASCII().
 TrimPositions TrimWhitespace(const std::string& input,
                              TrimPositions positions,
                              std::string* output);
diff --git a/base/string_util_icu.cc b/base/string_util_icu.cc
index 9036ce6..eae66d1 100644
--- a/base/string_util_icu.cc
+++ b/base/string_util_icu.cc
@@ -530,3 +530,17 @@ std::wstring FormatNumber(int64 number) {
 #endif  // defined(WCHAR_T_IS_UTF32)
 }
 
+TrimPositions TrimWhitespaceUTF8(const std::string& input,
+                                 TrimPositions positions,
+                                 std::string* output) {
+  // Trims by round-tripping through a wide string (UTF-8 -> wide -> UTF-8) so
+  // the wide TrimWhitespace() does the work. The two conversions make this
+  // slow; please file a bug if it hurts the performance of Chrome.
+  DCHECK(IsStringUTF8(input));
+  std::wstring input_wide = UTF8ToWide(input);
+  std::wstring output_wide;
+  TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide);
+  *output = WideToUTF8(output_wide);
+  return result;
+}
+
diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc
index 8123151..9fe449d 100644
--- a/base/string_util_unittest.cc
+++ b/base/string_util_unittest.cc
@@ -49,7 +49,6 @@ static const struct trim_case_ascii {
   {"  ", TRIM_TRAILING, "", TRIM_TRAILING},
   {"  ", TRIM_ALL, "", TRIM_ALL},
   {"\t\rTest String\n", TRIM_ALL, "Test String", TRIM_ALL},
-  {"\x85Test String\xa0\x20", TRIM_ALL, "Test String", TRIM_ALL},
 };
 
 TEST(StringUtilTest, TrimWhitespace) {
@@ -80,6 +79,52 @@ TEST(StringUtilTest, TrimWhitespace) {
   }
 }
 
+static const struct trim_case_utf8 {
+  const char* input;
+  const TrimPositions positions;
+  const char* output;
+  const TrimPositions return_value;
+} trim_cases_utf8[] = {
+  // UTF-8 strings that start (and end) with Unicode space characters
+  // (U+2000..U+2008, U+200A, and the ideographic space U+3000).
+  {"\xE2\x80\x80Test String\xE2\x80\x81", TRIM_ALL, "Test String", TRIM_ALL},
+  {"\xE2\x80\x82Test String\xE2\x80\x83", TRIM_ALL, "Test String", TRIM_ALL},
+  {"\xE2\x80\x84Test String\xE2\x80\x85", TRIM_ALL, "Test String", TRIM_ALL},
+  {"\xE2\x80\x86Test String\xE2\x80\x87", TRIM_ALL, "Test String", TRIM_ALL},
+  {"\xE2\x80\x88Test String\xE2\x80\x8A", TRIM_ALL, "Test String", TRIM_ALL},
+  {"\xE3\x80\x80Test String\xE3\x80\x80", TRIM_ALL, "Test String", TRIM_ALL},
+  // UTF-8 strings whose final byte is 0x85 (NEL in ISO-8859); left untouched.
+  {"\xD0\x85", TRIM_TRAILING, "\xD0\x85", TRIM_NONE},
+  {"\xD9\x85", TRIM_TRAILING, "\xD9\x85", TRIM_NONE},
+  {"\xEC\x97\x85", TRIM_TRAILING, "\xEC\x97\x85", TRIM_NONE},
+  {"\xF0\x90\x80\x85", TRIM_TRAILING, "\xF0\x90\x80\x85", TRIM_NONE},
+  // Likewise for a final byte of 0xA0 (non-break space in ISO-8859-1).
+  {"\xD0\xA0", TRIM_TRAILING, "\xD0\xA0", TRIM_NONE},
+  {"\xD9\xA0", TRIM_TRAILING, "\xD9\xA0", TRIM_NONE},
+  {"\xEC\x97\xA0", TRIM_TRAILING, "\xEC\x97\xA0", TRIM_NONE},
+  {"\xF0\x90\x80\xA0", TRIM_TRAILING, "\xF0\x90\x80\xA0", TRIM_NONE},
+};
+
+TEST(StringUtilTest, TrimWhitespaceUTF8) {
+  std::string output_ascii;
+  for (size_t i = 0; i < arraysize(trim_cases_ascii); ++i) {
+    const trim_case_ascii& value = trim_cases_ascii[i];
+    EXPECT_EQ(value.return_value,
+              TrimWhitespaceASCII(value.input, value.positions, &output_ascii));
+    EXPECT_EQ(value.output, output_ascii);
+  }
+
+  // Test that TrimWhitespaceUTF8() removes Unicode space characters but does
+  // not strip 0x85/0xA0 bytes that belong to multi-byte UTF-8 characters.
+  std::string output_utf8;
+  for (size_t i = 0; i < arraysize(trim_cases_utf8); ++i) {
+    const trim_case_utf8& value = trim_cases_utf8[i];
+    EXPECT_EQ(value.return_value,
+              TrimWhitespaceUTF8(value.input, value.positions, &output_utf8));
+    EXPECT_EQ(value.output, output_utf8);
+  }
+}
+
 static const struct collapse_case {
   const wchar_t* input;
   const bool trim;
diff --git a/chrome/browser/net/url_fixer_upper.cc b/chrome/browser/net/url_fixer_upper.cc
index 121807b..5d28a9d 100644
--- a/chrome/browser/net/url_fixer_upper.cc
+++ b/chrome/browser/net/url_fixer_upper.cc
@@ -265,7 +265,7 @@ string URLFixerUpper::SegmentURL(const string& text,
   *parts = url_parse::Parsed();
 
   string trimmed;
-  TrimWhitespace(text, TRIM_ALL, &trimmed);
+  TrimWhitespaceUTF8(text, TRIM_ALL, &trimmed);
   if (trimmed.empty())
     return string();  // Nothing to segment.
 
diff --git a/chrome/browser/net/url_fixer_upper_unittest.cc b/chrome/browser/net/url_fixer_upper_unittest.cc
index 1e6dbc7..b758d625 100644
--- a/chrome/browser/net/url_fixer_upper_unittest.cc
+++ b/chrome/browser/net/url_fixer_upper_unittest.cc
@@ -177,6 +177,26 @@ struct fixup_case {
   {"ftpblah.google.com", "", "http://ftpblah.google.com/"},
   {"ftp", "", "http://ftp/"},
   {"google.ftp.com", "", "http://google.ftp.com/"},
+  // URLs which end with 0x85 (NEL in ISO-8859).
+  { "http://google.com/search?q=\xd0\x85", "",
+    "http://google.com/search?q=\xd0\x85"
+  },
+  { "http://google.com/search?q=\xec\x97\x85", "",
+    "http://google.com/search?q=\xec\x97\x85"
+  },
+  { "http://google.com/search?q=\xf0\x90\x80\x85", "",
+    "http://google.com/search?q=\xf0\x90\x80\x85"
+  },
+  // URLs which end with 0xA0 (non-break space in ISO-8859).
+  { "http://google.com/search?q=\xd0\xa0", "",
+    "http://google.com/search?q=\xd0\xa0"
+  },
+  { "http://google.com/search?q=\xec\x97\xa0", "",
+    "http://google.com/search?q=\xec\x97\xa0"
+  },
+  { "http://google.com/search?q=\xf0\x90\x80\xa0", "",
+    "http://google.com/search?q=\xf0\x90\x80\xa0"
+  },
 };
 
 TEST(URLFixerUpperTest, FixupURL) {
author	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-03-05 03:41:51 +0000
committer	hbono@chromium.org <hbono@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2009-03-05 03:41:51 +0000
commit	90f933a3b3ce799ebd2f1e04c8906ba0c6f514d9 (patch)
tree	6baaad8528a0bc8cf45df75ef3dd69c60ccb99a4
parent	3a164a1633cc2c8136aa385223a8ebdbda407150 (diff)
download	chromium_src-90f933a3b3ce799ebd2f1e04c8906ba0c6f514d9.zip chromium_src-90f933a3b3ce799ebd2f1e04c8906ba0c6f514d9.tar.gz chromium_src-90f933a3b3ce799ebd2f1e04c8906ba0c6f514d9.tar.bz2