diff options
-rw-r--r-- | base/string_util.cc | 26 | ||||
-rw-r--r-- | base/string_util.h | 19 | ||||
-rw-r--r-- | base/string_util_unittest.cc | 47 | ||||
-rw-r--r-- | build/googleurl.xcodeproj/project.pbxproj | 2 | ||||
-rw-r--r-- | chrome/browser/net/url_fixer_upper.cc | 2 | ||||
-rw-r--r-- | chrome/browser/net/url_fixer_upper_unittest.cc | 20 |
6 files changed, 108 insertions, 8 deletions
diff --git a/base/string_util.cc b/base/string_util.cc index 4ba8c4b..55be9df 100644 --- a/base/string_util.cc +++ b/base/string_util.cc @@ -337,8 +337,6 @@ const char kWhitespaceASCII[] = { 0x0C, 0x0D, 0x20, // Space - '\x85', // <control-0085> - '\xa0', // No-Break Space 0 }; const char* const kCodepageUTF8 = "UTF-8"; @@ -393,10 +391,32 @@ TrimPositions TrimWhitespace(const std::wstring& input, return TrimStringT(input, kWhitespaceWide, positions, output); } +TrimPositions TrimWhitespaceASCII(const std::string& input, + TrimPositions positions, + std::string* output) { + return TrimStringT(input, kWhitespaceASCII, positions, output); +} + +TrimPositions TrimWhitespaceUTF8(const std::string& input, + TrimPositions positions, + std::string* output) { + // This implementation is not so fast since it converts the text encoding + // twice. Please feel free to file a bug if this function hurts the + // performance of Chrome. + DCHECK(IsStringUTF8(input)); + std::wstring input_wide = UTF8ToWide(input); + std::wstring output_wide; + TrimPositions result = TrimWhitespace(input_wide, positions, &output_wide); + *output = WideToUTF8(output_wide); + return result; +} + +// This function is only for backward-compatibility. +// To be removed when all callers are updated. TrimPositions TrimWhitespace(const std::string& input, TrimPositions positions, std::string* output) { - return TrimStringT(input, kWhitespaceASCII, positions, output); + return TrimWhitespaceASCII(input, positions, output); } std::wstring CollapseWhitespace(const std::wstring& text, diff --git a/base/string_util.h b/base/string_util.h index 500a114..64e724e 100644 --- a/base/string_util.h +++ b/base/string_util.h @@ -128,9 +128,13 @@ bool TrimString(const std::string& input, std::string* output); // Trims any whitespace from either end of the input string. Returns where -// whitespace was found. 
The non-wide version of this function only looks for -// ASCII whitespace; UTF-8 code-points are not searched for (use the wide -// version instead). +// whitespace was found. +// The non-wide version has two functions: +// * TrimWhitespaceASCII() +// This function is for ASCII strings and only looks for ASCII whitespace; +// * TrimWhitespaceUTF8() +// This function is for UTF-8 strings and looks for Unicode whitespace. +// Please choose the best one according to your usage. // NOTE: Safe to use the same variable for both input and output. enum TrimPositions { TRIM_NONE = 0, @@ -141,6 +145,15 @@ enum TrimPositions { TrimPositions TrimWhitespace(const std::wstring& input, TrimPositions positions, std::wstring* output); +TrimPositions TrimWhitespaceASCII(const std::string& input, + TrimPositions positions, + std::string* output); +TrimPositions TrimWhitespaceUTF8(const std::string& input, + TrimPositions positions, + std::string* output); + +// Deprecated. This function is only for backward compatibility and calls +// TrimWhitespaceASCII(). TrimPositions TrimWhitespace(const std::string& input, TrimPositions positions, std::string* output); diff --git a/base/string_util_unittest.cc b/base/string_util_unittest.cc index 2b7634f..c5fdb2c 100644 --- a/base/string_util_unittest.cc +++ b/base/string_util_unittest.cc @@ -49,7 +49,6 @@ static const struct trim_case_ascii { {" ", TRIM_TRAILING, "", TRIM_TRAILING}, {" ", TRIM_ALL, "", TRIM_ALL}, {"\t\rTest String\n", TRIM_ALL, "Test String", TRIM_ALL}, - {"\x85Test String\xa0\x20", TRIM_ALL, "Test String", TRIM_ALL}, }; TEST(StringUtilTest, TrimWhitespace) { @@ -80,6 +79,52 @@ TEST(StringUtilTest, TrimWhitespace) { } } +static const struct trim_case_utf8 { + const char* input; + const TrimPositions positions; + const char* output; + const TrimPositions return_value; +} trim_cases_utf8[] = { + // UTF-8 strings that start (and end) with Unicode space characters + // (from the range U+2000..U+200A, plus U+3000 IDEOGRAPHIC SPACE). 
+ {"\xE2\x80\x80Test String\xE2\x80\x81", TRIM_ALL, "Test String", TRIM_ALL}, + {"\xE2\x80\x82Test String\xE2\x80\x83", TRIM_ALL, "Test String", TRIM_ALL}, + {"\xE2\x80\x84Test String\xE2\x80\x85", TRIM_ALL, "Test String", TRIM_ALL}, + {"\xE2\x80\x86Test String\xE2\x80\x87", TRIM_ALL, "Test String", TRIM_ALL}, + {"\xE2\x80\x88Test String\xE2\x80\x8A", TRIM_ALL, "Test String", TRIM_ALL}, + {"\xE3\x80\x80Test String\xE3\x80\x80", TRIM_ALL, "Test String", TRIM_ALL}, + // UTF-8 strings that end with 0x85 (NEL in ISO-8859). + {"\xD0\x85", TRIM_TRAILING, "\xD0\x85", TRIM_NONE}, + {"\xD9\x85", TRIM_TRAILING, "\xD9\x85", TRIM_NONE}, + {"\xEC\x97\x85", TRIM_TRAILING, "\xEC\x97\x85", TRIM_NONE}, + {"\xF0\x90\x80\x85", TRIM_TRAILING, "\xF0\x90\x80\x85", TRIM_NONE}, + // UTF-8 strings that end with 0xA0 (non-break space in ISO-8859-1). + {"\xD0\xA0", TRIM_TRAILING, "\xD0\xA0", TRIM_NONE}, + {"\xD9\xA0", TRIM_TRAILING, "\xD9\xA0", TRIM_NONE}, + {"\xEC\x97\xA0", TRIM_TRAILING, "\xEC\x97\xA0", TRIM_NONE}, + {"\xF0\x90\x80\xA0", TRIM_TRAILING, "\xF0\x90\x80\xA0", TRIM_NONE}, +}; + +TEST(StringUtilTest, TrimWhitespaceUTF8) { + std::string output_ascii; + for (size_t i = 0; i < arraysize(trim_cases_ascii); ++i) { + const trim_case_ascii& value = trim_cases_ascii[i]; + EXPECT_EQ(value.return_value, + TrimWhitespaceASCII(value.input, value.positions, &output_ascii)); + EXPECT_EQ(value.output, output_ascii); + } + + // Test that TrimWhitespaceUTF8() removes Unicode space characters but does + // not strip characters whose UTF-8 encoding ends in 0x85 (NEL) or 0xA0. 
+ std::string output_utf8; + for (size_t i = 0; i < arraysize(trim_cases_utf8); ++i) { + const trim_case_utf8& value = trim_cases_utf8[i]; + EXPECT_EQ(value.return_value, + TrimWhitespaceUTF8(value.input, value.positions, &output_utf8)); + EXPECT_EQ(value.output, output_utf8); + } +} + static const struct collapse_case { const wchar_t* input; const bool trim; diff --git a/build/googleurl.xcodeproj/project.pbxproj b/build/googleurl.xcodeproj/project.pbxproj index 38f074f..1db641c 100644 --- a/build/googleurl.xcodeproj/project.pbxproj +++ b/build/googleurl.xcodeproj/project.pbxproj @@ -60,6 +60,7 @@ 7BA019240E5A2BD700044150 /* libgtest.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 7BA019210E5A2BCB00044150 /* libgtest.a */; }; 7BA019640E5A2C2B00044150 /* libbase.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 7BA019440E5A2BFC00044150 /* libbase.a */; }; 7BA019700E5A2C4700044150 /* libicuuc.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 7BA0195F0E5A2C1200044150 /* libicuuc.a */; }; + 793B6B0D0F4D140000C68483 /* libicui18n.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 7BA0195B0E5A2C1200044150 /* libicui18n.a */; }; 7BA019740E5A2C5C00044150 /* libicudata.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 7BA019570E5A2C1200044150 /* libicudata.a */; }; /* End PBXBuildFile section */ @@ -264,6 +265,7 @@ 7BA018EF0E5A2B3300044150 /* libgoogleurl.a in Frameworks */, 7BA019240E5A2BD700044150 /* libgtest.a in Frameworks */, 7BA019740E5A2C5C00044150 /* libicudata.a in Frameworks */, + 793B6B0D0F4D140000C68483 /* libicui18n.a in Frameworks */, 7BA019700E5A2C4700044150 /* libicuuc.a in Frameworks */, ); runOnlyForDeploymentPostprocessing = 0; diff --git a/chrome/browser/net/url_fixer_upper.cc b/chrome/browser/net/url_fixer_upper.cc index 121807b..08d60d2 100644 --- a/chrome/browser/net/url_fixer_upper.cc +++ b/chrome/browser/net/url_fixer_upper.cc @@ -360,7 +360,7 @@ string URLFixerUpper::SegmentURL(const string& text, string URLFixerUpper::FixupURL(const 
string& text, const string& desired_tld) { string trimmed; - TrimWhitespace(text, TRIM_ALL, &trimmed); + TrimWhitespaceUTF8(text, TRIM_ALL, &trimmed); if (trimmed.empty()) return string(); // Nothing here. diff --git a/chrome/browser/net/url_fixer_upper_unittest.cc b/chrome/browser/net/url_fixer_upper_unittest.cc index 1e6dbc7..ef26b5e 100644 --- a/chrome/browser/net/url_fixer_upper_unittest.cc +++ b/chrome/browser/net/url_fixer_upper_unittest.cc @@ -177,6 +177,26 @@ struct fixup_case { {"ftpblah.google.com", "", "http://ftpblah.google.com/"}, {"ftp", "", "http://ftp/"}, {"google.ftp.com", "", "http://google.ftp.com/"}, + // URLs which end with an ISO-8859 next-line (0x85). + { "http://google.com/search?q=\xd0\x85", "", + "http://google.com/search?q=\xd0\x85" + }, + { "http://google.com/search?q=\xec\x97\x85", "", + "http://google.com/search?q=\xec\x97\x85" + }, + { "http://google.com/search?q=\xf0\x90\x80\x85", "", + "http://google.com/search?q=\xf0\x90\x80\x85" + }, + // URLs which end with a non-break space (0xA0). + { "http://google.com/search?q=\xd0\xa0", "", + "http://google.com/search?q=\xd0\xa0" + }, + { "http://google.com/search?q=\xec\x97\xa0", "", + "http://google.com/search?q=\xec\x97\xa0" + }, + { "http://google.com/search?q=\xf0\x90\x80\xa0", "", + "http://google.com/search?q=\xf0\x90\x80\xa0" + }, }; TEST(URLFixerUpperTest, FixupURL) { |