From 8c0af3834e11d643562c13788c06c695f2666f51 Mon Sep 17 00:00:00 2001
From: "jcampan@chromium.org"
 <jcampan@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>
Date: Thu, 21 Jan 2010 01:50:50 +0000
Subject: Adding some more escaping methods. This will be used by the translate
 feature.

BUG=None
TEST=Run the unit-tests.

Review URL: http://codereview.chromium.org/548088

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@36715 0039d316-1c4b-4281-b951-d872f2087c98
---
 net/base/escape.cc          | 67 ++++++++++++++++++++++++++++----
 net/base/escape.h           |  6 +++
 net/base/escape_unittest.cc | 94 ++++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 157 insertions(+), 10 deletions(-)

(limited to 'net')
diff --git a/net/base/escape.cc b/net/base/escape.cc
index 5a00b07..bf23bcb 100644
--- a/net/base/escape.cc
+++ b/net/base/escape.cc
@@ -108,9 +108,10 @@ const char kUrlUnescape[128] = {
      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0
 };
 
-std::string UnescapeURLImpl(const std::string& escaped_text,
-                            UnescapeRule::Type rules,
-                            size_t* offset_for_adjustment) {
+template<typename STR>
+STR UnescapeURLImpl(const STR& escaped_text,
+                    UnescapeRule::Type rules,
+                    size_t* offset_for_adjustment) {
   size_t offset_temp = string16::npos;
   if (!offset_for_adjustment)
     offset_for_adjustment = &offset_temp;
@@ -124,13 +125,22 @@ std::string UnescapeURLImpl(const std::string& escaped_text,
   // The output of the unescaping is always smaller than the input, so we can
   // reserve the input size to make sure we have enough buffer and don't have
   // to allocate in the loop below.
-  std::string result;
+  STR result;
   result.reserve(escaped_text.length());
 
   for (size_t i = 0, max = escaped_text.size(); i < max; ++i) {
-    if (escaped_text[i] == '%' && i + 2 < max) {
-      const std::string::value_type most_sig_digit(escaped_text[i + 1]);
-      const std::string::value_type least_sig_digit(escaped_text[i + 2]);
+    if (static_cast<unsigned char>(escaped_text[i]) >= 128) {
+      // Non ASCII, append as is. NOTE(review): cast truncates char16 values.
+      result.push_back(escaped_text[i]);
+      continue;
+    }
+
+    char current_char = static_cast<char>(escaped_text[i]);
+    if (current_char == '%' && i + 2 < max) {
+      const typename STR::value_type most_sig_digit(
+          static_cast<typename STR::value_type>(escaped_text[i + 1]));
+      const typename STR::value_type least_sig_digit(
+          static_cast<typename STR::value_type>(escaped_text[i + 2]));
       if (IsHex(most_sig_digit) && IsHex(least_sig_digit)) {
         unsigned char value = HexToInt(most_sig_digit) * 16 +
             HexToInt(least_sig_digit);
@@ -272,11 +282,17 @@ std::string UnescapeURLComponent(const std::string& escaped_text,
   return UnescapeURLImpl(escaped_text, rules, NULL);
 }
 
+string16 UnescapeURLComponent(const string16& escaped_text,
+                              UnescapeRule::Type rules) {  // string16 overload.
+  return UnescapeURLImpl(escaped_text, rules, NULL);  // No offset to adjust.
+}
+
+
 template <class str>
 void AppendEscapedCharForHTMLImpl(typename str::value_type c, str* output) {
   static const struct {
     char key;
-    const char *replacement;
+    const char* replacement;
   } kCharsToEscape[] = {
     { '<', "&lt;" },
     { '>', "&gt;" },
@@ -323,3 +339,38 @@ std::string EscapeForHTML(const std::string& input) {
 string16 EscapeForHTML(const string16& input) {
   return EscapeForHTMLImpl(input);
 }
+
+// Replaces the entities &lt; &gt; &amp; &quot; &#39; in |input| with the
+// characters they encode; any other '&' is left untouched.
+string16 UnescapeForHTML(const string16& input) {
+  static const struct {
+    const wchar_t* ampersand_code;
+    const char replacement;
+  } kEscapeToChars[] = {
+    { L"&lt;", '<' },
+    { L"&gt;", '>' },
+    { L"&amp;", '&' },
+    { L"&quot;", '"' },
+    { L"&#39;", '\''},
+  };
+
+  if (input.find('&') == string16::npos)
+    return input;
+
+  string16 ampersand_chars[ARRAYSIZE_UNSAFE(kEscapeToChars)];  // Lazy cache.
+  string16 text(input);
+  // Walk by index: replace() below would invalidate a string iterator.
+  for (size_t index = 0; index < text.length(); ++index) {
+    if (text[index] != '&') continue;
+    for (size_t i = 0; i < ARRAYSIZE_UNSAFE(kEscapeToChars); i++) {
+      if (ampersand_chars[i].empty())
+        ampersand_chars[i] = WideToUTF16(kEscapeToChars[i].ampersand_code);
+      const size_t length = ampersand_chars[i].length();
+      if (text.compare(index, length, ampersand_chars[i]) == 0) {
+        text.replace(index, length, 1, kEscapeToChars[i].replacement);
+        break;
+      }
+    }
+  }
+  return text;
+}
diff --git a/net/base/escape.h b/net/base/escape.h
index 67ccc5f..b9b0b6a 100644
--- a/net/base/escape.h
+++ b/net/base/escape.h
@@ -92,6 +92,8 @@ class UnescapeRule {
 // conversions need to take place, it only unescapes.
 std::string UnescapeURLComponent(const std::string& escaped_text,
                                  UnescapeRule::Type rules);
+string16 UnescapeURLComponent(const string16& escaped_text,
+                              UnescapeRule::Type rules);
 
 // Unescapes the given substring as a URL, and then tries to interpret the
 // result as being encoded as UTF-8. If the result is convertable into UTF-8, it
@@ -106,6 +108,10 @@ string16 UnescapeAndDecodeUTF8URLComponent(const std::string& text,
                                            UnescapeRule::Type rules,
                                            size_t* offset_for_adjustment);
 
+// Unescape the following ampersand character codes from |text|:
+// &lt; &gt; &amp; &quot; &#39;
+string16 UnescapeForHTML(const string16& text);
+
 // Deprecated ------------------------------------------------------------------
 
 // Escapes characters in text suitable for use as a query parameter value.
diff --git a/net/base/escape_unittest.cc b/net/base/escape_unittest.cc
index c93024c..0049528 100644
--- a/net/base/escape_unittest.cc
+++ b/net/base/escape_unittest.cc
@@ -19,6 +19,12 @@ struct EscapeCase {
 };
 
 struct UnescapeURLCase {
+  const wchar_t* input;      // Escaped text; converted with WideToUTF16().
+  UnescapeRule::Type rules;  // Rules passed to UnescapeURLComponent().
+  const wchar_t* output;     // Expected result; converted with WideToUTF16().
+};
+
+struct UnescapeURLCaseASCII {
   const char* input;
   UnescapeRule::Type rules;
   const char* output;
@@ -144,8 +150,8 @@ TEST(EscapeTest, EscapeUrlEncodedData) {
     "%7B%7C%7D~%7F%80%FF");
 }
 
-TEST(EscapeTest, UnescapeURLComponent) {
-  const UnescapeURLCase unescape_cases[] = {
+TEST(EscapeTest, UnescapeURLComponentASCII) {
+  const UnescapeURLCaseASCII unescape_cases[] = {
     {"", UnescapeRule::NORMAL, ""},
     {"%2", UnescapeRule::NORMAL, "%2"},
     {"%%%%%%", UnescapeRule::NORMAL, "%%%%%%"},
@@ -205,6 +211,70 @@ TEST(EscapeTest, UnescapeURLComponent) {
   EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL));
 }
 
+TEST(EscapeTest, UnescapeURLComponent) {  // Covers the string16 overload.
+  const UnescapeURLCase unescape_cases[] = {  // { input, rules, expected }
+    {L"", UnescapeRule::NORMAL, L""},
+    {L"%2", UnescapeRule::NORMAL, L"%2"},
+    {L"%%%%%%", UnescapeRule::NORMAL, L"%%%%%%"},
+    {L"Don't escape anything", UnescapeRule::NORMAL, L"Don't escape anything"},
+    {L"Invalid %escape %2", UnescapeRule::NORMAL, L"Invalid %escape %2"},
+    {L"Some%20random text %25%3bOK", UnescapeRule::NONE,
+     L"Some%20random text %25%3bOK"},
+    {L"Some%20random text %25%3bOK", UnescapeRule::NORMAL,
+     L"Some%20random text %25;OK"},
+    {L"Some%20random text %25%3bOK", UnescapeRule::SPACES,
+     L"Some random text %25;OK"},
+    {L"Some%20random text %25%3bOK", UnescapeRule::URL_SPECIAL_CHARS,
+     L"Some%20random text %;OK"},
+    {L"Some%20random text %25%3bOK",
+     UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS,
+     L"Some random text %;OK"},
+    {L"%A0%B1%C2%D3%E4%F5", UnescapeRule::NORMAL, L"\xA0\xB1\xC2\xD3\xE4\xF5"},
+    {L"%Aa%Bb%Cc%Dd%Ee%Ff", UnescapeRule::NORMAL, L"\xAa\xBb\xCc\xDd\xEe\xFf"},
+    // Certain URL-sensitive characters should not be unescaped unless asked.
+    {L"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+", UnescapeRule::SPACES,
+     L"Hello %13%10world %23# %3F? %3D= %26& %25% %2B+"},
+    {L"Hello%20%13%10world %23# %3F? %3D= %26& %25% %2B+",
+     UnescapeRule::URL_SPECIAL_CHARS,
+     L"Hello%20%13%10world ## ?? == && %% ++"},
+    // Control characters are only unescaped under CONTROL_CHARS.
+    {L"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::URL_SPECIAL_CHARS,
+     L"%01%02%03%04%05%06%07%08%09 %"},
+    {L"%01%02%03%04%05%06%07%08%09 %25", UnescapeRule::CONTROL_CHARS,
+     L"\x01\x02\x03\x04\x05\x06\x07\x08\x09 %25"},
+    {L"Hello%20%13%10%02", UnescapeRule::SPACES, L"Hello %13%10%02"},
+    {L"Hello%20%13%10%02", UnescapeRule::CONTROL_CHARS,
+     L"Hello%20\x13\x10\x02"},
+    {L"Hello\x9824\x9827", UnescapeRule::CONTROL_CHARS,
+     L"Hello\x9824\x9827"},  // Non-ASCII input is expected back unchanged.
+  };
+
+  for (size_t i = 0; i < arraysize(unescape_cases); i++) {
+    string16 str(WideToUTF16(unescape_cases[i].input));
+    EXPECT_EQ(WideToUTF16(unescape_cases[i].output),
+              UnescapeURLComponent(str, unescape_cases[i].rules));
+  }
+
+  // Test NUL unescaping separately: the table above holds NUL-terminated
+  // wchar_t pointers, so its strings cannot contain an embedded NUL.
+  string16 input(WideToUTF16(L"Null"));
+  input.push_back(0);  // Also have a NULL in the input.
+  input.append(WideToUTF16(L"%00%39Test"));
+
+  // With CONTROL_CHARS, %00 becomes a second NUL and %39 becomes '9'.
+  string16 expected(WideToUTF16(L"Null"));
+  expected.push_back(0);
+  expected.push_back(0);
+  expected.append(ASCIIToUTF16("9Test"));
+  EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::CONTROL_CHARS));
+
+  // With NORMAL, %00 stays escaped while %39 still becomes '9'.
+  expected = WideToUTF16(L"Null");
+  expected.push_back(0);
+  expected.append(WideToUTF16(L"%009Test"));
+  EXPECT_EQ(expected, UnescapeURLComponent(input, UnescapeRule::NORMAL));
+}
+
 TEST(EscapeTest, UnescapeAndDecodeUTF8URLComponent) {
   const UnescapeAndDecodeCase unescape_cases[] = {
     { "%",
@@ -300,3 +370,23 @@ TEST(EscapeTest, EscapeForHTML) {
     EXPECT_EQ(std::string(tests[i].expected_output), result);
   }
 }
+
+TEST(EscapeTest, UnescapeForHTML) {
+  const EscapeForHTMLCase kCases[] = {  // { escaped input, expected output }
+    { "", "" },
+    { "&lt;hello&gt;", "<hello>" },
+    { "don&#39;t mess with me", "don\'t mess with me" },
+    { "&lt;&gt;&amp;&quot;&#39;", "<>&\"'" },
+    { "& lt; &amp ; &; '", "& lt; &amp ; &; '" },
+    { "&amp;", "&" },
+    { "&quot;", "\"" },
+    { "&#39;", "'" },
+    { "&lt;", "<" },
+    { "&gt;", ">" },
+    { "&amp; &", "& &" },
+  };
+  for (size_t index = 0; index < arraysize(kCases); ++index) {
+    const string16 expected = ASCIIToUTF16(kCases[index].expected_output);
+    EXPECT_EQ(expected, UnescapeForHTML(ASCIIToUTF16(kCases[index].input)));
+  }
+}
-- 
cgit v1.1