From 8c0af3834e11d643562c13788c06c695f2666f51 Mon Sep 17 00:00:00 2001 From: "jcampan@chromium.org" Date: Thu, 21 Jan 2010 01:50:50 +0000 Subject: Adding some more escaping method. This will be used by the translate feature. BUG=None TEST=Run the unit-tests. Review URL: http://codereview.chromium.org/548088 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@36715 0039d316-1c4b-4281-b951-d872f2087c98 --- net/base/escape.cc | 67 ++++++++++++++++++++++++++++---- net/base/escape.h | 6 +++ net/base/escape_unittest.cc | 94 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 157 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/base/escape.cc b/net/base/escape.cc index 5a00b07..bf23bcb 100644 --- a/net/base/escape.cc +++ b/net/base/escape.cc @@ -108,9 +108,10 @@ const char kUrlUnescape[128] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; -std::string UnescapeURLImpl(const std::string& escaped_text, - UnescapeRule::Type rules, - size_t* offset_for_adjustment) { +template +STR UnescapeURLImpl(const STR& escaped_text, + UnescapeRule::Type rules, + size_t* offset_for_adjustment) { size_t offset_temp = string16::npos; if (!offset_for_adjustment) offset_for_adjustment = &offset_temp; @@ -124,13 +125,22 @@ std::string UnescapeURLImpl(const std::string& escaped_text, // The output of the unescaping is always smaller than the input, so we can // reserve the input size to make sure we have enough buffer and don't have // to allocate in the loop below. - std::string result; + STR result; result.reserve(escaped_text.length()); for (size_t i = 0, max = escaped_text.size(); i < max; ++i) { - if (escaped_text[i] == '%' && i + 2 < max) { - const std::string::value_type most_sig_digit(escaped_text[i + 1]); - const std::string::value_type least_sig_digit(escaped_text[i + 2]); + if (static_cast(escaped_text[i]) >= 128) { + // Non ASCII character, append as is. 
+ result.push_back(escaped_text[i]); + continue; + } + + char current_char = static_cast(escaped_text[i]); + if (current_char == '%' && i + 2 < max) { + const typename STR::value_type most_sig_digit( + static_cast(escaped_text[i + 1])); + const typename STR::value_type least_sig_digit( + static_cast(escaped_text[i + 2])); if (IsHex(most_sig_digit) && IsHex(least_sig_digit)) { unsigned char value = HexToInt(most_sig_digit) * 16 + HexToInt(least_sig_digit); @@ -272,11 +282,17 @@ std::string UnescapeURLComponent(const std::string& escaped_text, return UnescapeURLImpl(escaped_text, rules, NULL); } +string16 UnescapeURLComponent(const string16& escaped_text, + UnescapeRule::Type rules) { + return UnescapeURLImpl(escaped_text, rules, NULL); +} + + template void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) { static const struct { char key; - const char *replacement; + const char* replacement; } kCharsToEscape[] = { { '<', "<" }, { '>', ">" }, @@ -323,3 +339,38 @@ std::string EscapeForHTML(const std::string& input) { string16 EscapeForHTML(const string16& input) { return EscapeForHTMLImpl(input); } + +string16 UnescapeForHTML(const string16& input) { + static const struct { + const wchar_t* ampersand_code; + const char replacement; + } kEscapeToChars[] = { + { L"<", '<' }, + { L">", '>' }, + { L"&", '&' }, + { L""", '"' }, + { L"'", '\''}, + }; + + if (input.find(WideToUTF16(L"&")) == std::string::npos) + return input; + + string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)]; + string16 text(input); + for (string16::iterator iter = text.begin(); iter != text.end(); ++iter) { + if (*iter == '&') { + // Potential ampersand encode char. 
+ size_t index = iter - text.begin(); + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) { + if (ampersand_chars[i].empty()) + ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code); + if (text.find(ampersand_chars[i], index) == index) { + text.replace(iter, iter + ampersand_chars[i].length(), + 1, kEscapeToChars[i].replacement); + break; + } + } + } + } + return text; +} diff --git a/net/base/escape.h b/net/base/escape.h index 67ccc5f..b9b0b6a 100644 --- a/net/base/escape.h +++ b/net/base/escape.h @@ -92,6 +92,8 @@ class UnescapeRule { // conversions need to take place, it only unescapes. std::string UnescapeURLComponent(const std::string& escaped_text, UnescapeRule::Type rules); +string16 UnescapeURLComponent(const string16& escaped_text, + UnescapeRule::Type rules); // Unescapes the given substring as a URL, and then tries to interpret the // result as being encoded as UTF-8. If the result is convertable into UTF-8, it @@ -106,6 +108,10 @@ string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text, UnescapeRule::Type rules, size_t* offset_for_adjustment); +// Unescape the following ampersand character codes from |text|: +// &lt; &gt; &amp; &quot; &#39; +string16 UnescapeForHTML(const string16& text); + // Deprecated ------------------------------------------------------------------ // Escapes characters in text suitable for use as a query parameter value.
diff --git a/net/base/escape_unittest.cc b/net/base/escape_unittest.cc index c93024c..0049528 100644 --- a/net/base/escape_unittest.cc +++ b/net/base/escape_unittest.cc @@ -19,6 +19,12 @@ struct EscapeCase { }; struct UnescapeURLCase { + const wchar_t* input; + UnescapeRule::Type rules; + const wchar_t* output; +}; + +struct UnescapeURLCaseASCII { const char* input; UnescapeRule::Type rules; const char* output; @@ -144,8 +150,8 @@ TEST(EscapeTest, EscapeUrlEncodedData) { "%7B%7C%7D~%7F%80%FF"); } -TEST(EscapeTest, UnescapeURLComponent) { - const UnescapeURLCase unescape_cases[] = { +TEST(EscapeTest, UnescapeURLComponentASCII) { + const UnescapeURLCaseASCII unescape_cases[] = { {"", UnescapeRule::NORMAL, ""}, {"%2", UnescapeRule::NORMAL, "%2"}, {"%%%%%%", UnescapeRule::NORMAL, "%%%%%%"}, @@ -205,6 +211,70 @@ TEST(EscapeTest, UnescapeURLComponent) { EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL)); } +TEST(EscapeTest, UnescapeURLComponent) { + const UnescapeURLCase unescape_cases[] = { + {L"", UnescapeRule::NORMAL, L""}, + {L"%2", UnescapeRule::NORMAL, L"%2"}, + {L"%%%%%%", UnescapeRule::NORMAL, L"%%%%%%"}, + {L"Don't escape anything", UnescapeRule::NORMAL, L"Don't escape anything"}, + {L"Invalid %escape %2", UnescapeRule::NORMAL, L"Invalid %escape %2"}, + {L"Some%20random text %25%3bOK", UnescapeRule::NONE, + L"Some%20random text %25%3bOK"}, + {L"Some%20random text %25%3bOK", UnescapeRule::NORMAL, + L"Some%20random text %25;OK"}, + {L"Some%20random text %25%3bOK", UnescapeRule::SPACES, + L"Some random text %25;OK"}, + {L"Some%20random text %25%3bOK", UnescapeRule::URL_SPECIAL_CHARS, + L"Some%20random text %;OK"}, + {L"Some%20random text %25%3bOK", + UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS, + L"Some random text %;OK"}, + {L"%A0%B1%C2%D3%E4%F5", UnescapeRule::NORMAL, L"\xA0\xB1\xC2\xD3\xE4\xF5"}, + {L"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, L"\xAa\xBb\xCc\xDd\xEe\xFf"}, + // Certain URL-sensitive characters should not be 
unescaped unless asked. + {L"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", UnescapeRule::SPACES, + L"Hello %13%10world %23# %3F? %3D= %26& %25% %2B+"}, + {L"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", + UnescapeRule::URL_SPECIAL_CHARS, + L"Hello%20%13%10world ## ?? == && %% ++"}, + // Control characters. + {L"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::URL_SPECIAL_CHARS, + L"%01%02%03%04%05%06%07%08%09 %"}, + {L"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::CONTROL_CHARS, + L"\x01\x02\x03\x04\x05\x06\x07\x08\x09 %25"}, + {L"Hello%20%13%10%02", UnescapeRule::SPACES, L"Hello %13%10%02"}, + {L"Hello%20%13%10%02", UnescapeRule::CONTROL_CHARS, + L"Hello%20\x13\x10\x02"}, + {L"Hello\x9824\x9827", UnescapeRule::CONTROL_CHARS, + L"Hello\x9824\x9827"}, + }; + + for (size_t i = 0; i < arraysize(unescape_cases); i++) { + string16 str(WideToUTF16(unescape_cases[i].input)); + EXPECT_EQ(WideToUTF16(unescape_cases[i].output), + UnescapeURLComponent(str, unescape_cases[i].rules)); + } + + // Test the NULL character unescaping (which wouldn't work above since those + // are just char pointers). + string16 input(WideToUTF16(L"Null")); + input.push_back(0); // Also have a NULL in the input. + input.append(WideToUTF16(L"%00%39Test")); + + // When we're unescaping NULLs + string16 expected(WideToUTF16(L"Null")); + expected.push_back(0); + expected.push_back(0); + expected.append(ASCIIToUTF16("9Test")); + EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::CONTROL_CHARS)); + + // When we're not unescaping NULLs. 
+ expected = WideToUTF16(L"Null"); + expected.push_back(0); + expected.append(WideToUTF16(L"%009Test")); + EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL)); +} + TEST(EscapeTest, UnescapeAndDecodeUTF8URLComponent) { const UnescapeAndDecodeCase unescape_cases[] = { { "%", @@ -300,3 +370,23 @@ TEST(EscapeTest, EscapeForHTML) { EXPECT_EQ(std::string(tests[i].expected_output), result); } } + +TEST(EscapeTest, UnescapeForHTML) { + const EscapeForHTMLCase tests[] = { + { "", "" }, + { "<hello>", "" }, + { "don't mess with me", "don\'t mess with me" }, + { "<>&"'", "<>&\"'" }, + { "& lt; & ; &; '", "& lt; & ; &; '" }, + { "&", "&" }, + { """, "\"" }, + { "'", "'" }, + { "<", "<" }, + { ">", ">" }, + { "& &", "& &" }, + }; + for (size_t i = 0; i < arraysize(tests); ++i) { + string16 result = UnescapeForHTML(ASCIIToUTF16(tests[i].input)); + EXPECT_EQ(ASCIIToUTF16(tests[i].expected_output), result); + } +} -- cgit v1.1