Stop doing unnecessary UTF-8 to UTF-16 conversions in JSONWriter.

The JSONReader only accepts UTF-8 input strings and converts \uXXXX sequences back into UTF-8. However, the JSONWriter converts all non-ASCII characters to UTF-16 escape sequences. This round-tripping is sub-optimal, as noted in a TODO from r54359. One reason for this may be that JsonDoubleQuote(), used by JSONWriter, does not handle UTF-8 bytes correctly, interpreting them as code points and writing them out as \u00XX sequences. If this were read back through an RFC-compliant JSON parser, the result would be an invalid encoding error. JsonDoubleQuote() does handle UTF-16 correctly, though. This rewrites the base/json/string_escape.h API and fixes the above UTF-8 issue by dividing callers up into three groups: 1. Those that pass valid UTF-8 to be escaped. Prior to this change, very few callers used this variant. Those that did were likely using ASCII; otherwise the output would be mangled due to the above issue. Now, valid UTF-8 will be passed through to the output unescaped. Invalid UTF-8 sequences are replaced with U+FFFD. 2. Those that pass valid UTF-16 to be escaped. This function now validates that the input is valid UTF-16, and then converts it to unescaped UTF-8 sequences for the output. 3. Those that pass arbitrary byte arrays as std::string and expect a non-RFC-compliant encoding of the binary data using \uXXXX escapes. This behavior is now in the EscapeBytesAsInvalidJSONString() function. It is only used by callers who want a "debug string" but do not expect to actually parse the output as valid JSON, since it is not. Additionally, this removes the JSONWriter::OPTIONS_DO_NOT_ESCAPE flag, since the writer can now handle UTF-8 appropriately. 
BUG=15466 Committed: https://src.chromium.org/viewvc/chrome?view=rev&revision=239800 Reverted: https://src.chromium.org/viewvc/chrome?view=rev&revision=240082 R=asanka@chromium.org, bauerb@chromium.org, mark@chromium.org, thakis@chromium.org, zea@chromium.org Review URL: https://codereview.chromium.org/100823007 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@240190 0039d316-1c4b-4281-b951-d872f2087c98
author: rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-12-11 22:10:45 +0000
committer: rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> 2013-12-11 22:10:45 +0000
commit: bbe1571f0e657d3ba18c05835f06c297b863cc09 (patch)
tree: f45ace7793d8883d50d2d57f543c2e80a57a07a0 /base/json
parent: c1a2b233574df6256681b1ed1f7a18a10d942d10 (diff)
download: chromium_src-bbe1571f0e657d3ba18c05835f06c297b863cc09.zip
chromium_src-bbe1571f0e657d3ba18c05835f06c297b863cc09.tar.gz
chromium_src-bbe1571f0e657d3ba18c05835f06c297b863cc09.tar.bz2
6 files changed, 302 insertions, 179 deletions
diff --git a/base/json/json_value_serializer_unittest.cc b/base/json/json_value_serializer_unittest.cc
index 314cd07..44c0a57 100644
--- a/base/json/json_value_serializer_unittest.cc
+++ b/base/json/json_value_serializer_unittest.cc
@@ -235,22 +235,23 @@ TEST(JSONValueSerializerTest, StringEscape) {
   std::string all_chars_expected =
       "\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n\\u000B\\f\\r"
       "\\u000E\\u000F\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015\\u0016\\u0017"
-      "\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E\\u001F !\\\""
-      "#$%&'()*+,-./0123456789:;\\u003C=\\u003E?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\"
-      "\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\u007F\\u0080\\u0081\\u0082\\u0083"
-      "\\u0084\\u0085\\u0086\\u0087\\u0088\\u0089\\u008A\\u008B\\u008C\\u008D"
-      "\\u008E\\u008F\\u0090\\u0091\\u0092\\u0093\\u0094\\u0095\\u0096\\u0097"
-      "\\u0098\\u0099\\u009A\\u009B\\u009C\\u009D\\u009E\\u009F\\u00A0\\u00A1"
-      "\\u00A2\\u00A3\\u00A4\\u00A5\\u00A6\\u00A7\\u00A8\\u00A9\\u00AA\\u00AB"
-      "\\u00AC\\u00AD\\u00AE\\u00AF\\u00B0\\u00B1\\u00B2\\u00B3\\u00B4\\u00B5"
-      "\\u00B6\\u00B7\\u00B8\\u00B9\\u00BA\\u00BB\\u00BC\\u00BD\\u00BE\\u00BF"
-      "\\u00C0\\u00C1\\u00C2\\u00C3\\u00C4\\u00C5\\u00C6\\u00C7\\u00C8\\u00C9"
-      "\\u00CA\\u00CB\\u00CC\\u00CD\\u00CE\\u00CF\\u00D0\\u00D1\\u00D2\\u00D3"
-      "\\u00D4\\u00D5\\u00D6\\u00D7\\u00D8\\u00D9\\u00DA\\u00DB\\u00DC\\u00DD"
-      "\\u00DE\\u00DF\\u00E0\\u00E1\\u00E2\\u00E3\\u00E4\\u00E5\\u00E6\\u00E7"
-      "\\u00E8\\u00E9\\u00EA\\u00EB\\u00EC\\u00ED\\u00EE\\u00EF\\u00F0\\u00F1"
-      "\\u00F2\\u00F3\\u00F4\\u00F5\\u00F6\\u00F7\\u00F8\\u00F9\\u00FA\\u00FB"
-      "\\u00FC\\u00FD\\u00FE\\u00FF";
+      "\\u0018\\u0019\\u001A\\u001B\\u001C\\u001D\\u001E\\u001F !\\\"#$%&'()*+,"
+      "-./0123456789:;\\u003C=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`abcde"
+      "fghijklmnopqrstuvwxyz{|}~\x7F\xC2\x80\xC2\x81\xC2\x82\xC2\x83\xC2\x84"
+      "\xC2\x85\xC2\x86\xC2\x87\xC2\x88\xC2\x89\xC2\x8A\xC2\x8B\xC2\x8C\xC2\x8D"
+      "\xC2\x8E\xC2\x8F\xC2\x90\xC2\x91\xC2\x92\xC2\x93\xC2\x94\xC2\x95\xC2\x96"
+      "\xC2\x97\xC2\x98\xC2\x99\xC2\x9A\xC2\x9B\xC2\x9C\xC2\x9D\xC2\x9E\xC2\x9F"
+      "\xC2\xA0\xC2\xA1\xC2\xA2\xC2\xA3\xC2\xA4\xC2\xA5\xC2\xA6\xC2\xA7\xC2\xA8"
+      "\xC2\xA9\xC2\xAA\xC2\xAB\xC2\xAC\xC2\xAD\xC2\xAE\xC2\xAF\xC2\xB0\xC2\xB1"
+      "\xC2\xB2\xC2\xB3\xC2\xB4\xC2\xB5\xC2\xB6\xC2\xB7\xC2\xB8\xC2\xB9\xC2\xBA"
+      "\xC2\xBB\xC2\xBC\xC2\xBD\xC2\xBE\xC2\xBF\xC3\x80\xC3\x81\xC3\x82\xC3\x83"
+      "\xC3\x84\xC3\x85\xC3\x86\xC3\x87\xC3\x88\xC3\x89\xC3\x8A\xC3\x8B\xC3\x8C"
+      "\xC3\x8D\xC3\x8E\xC3\x8F\xC3\x90\xC3\x91\xC3\x92\xC3\x93\xC3\x94\xC3\x95"
+      "\xC3\x96\xC3\x97\xC3\x98\xC3\x99\xC3\x9A\xC3\x9B\xC3\x9C\xC3\x9D\xC3\x9E"
+      "\xC3\x9F\xC3\xA0\xC3\xA1\xC3\xA2\xC3\xA3\xC3\xA4\xC3\xA5\xC3\xA6\xC3\xA7"
+      "\xC3\xA8\xC3\xA9\xC3\xAA\xC3\xAB\xC3\xAC\xC3\xAD\xC3\xAE\xC3\xAF\xC3\xB0"
+      "\xC3\xB1\xC3\xB2\xC3\xB3\xC3\xB4\xC3\xB5\xC3\xB6\xC3\xB7\xC3\xB8\xC3\xB9"
+      "\xC3\xBA\xC3\xBB\xC3\xBC\xC3\xBD\xC3\xBE\xC3\xBF";
 
   std::string expected_output = "{\"all_chars\":\"" + all_chars_expected +
                                  "\"}";
@@ -273,7 +274,7 @@ TEST(JSONValueSerializerTest, UnicodeStrings) {
   string16 test(WideToUTF16(L"\x7F51\x9875"));
   root.SetString("web", test);
 
-  std::string expected = "{\"web\":\"\\u7F51\\u9875\"}";
+  std::string expected = "{\"web\":\"\xE7\xBD\x91\xE9\xA1\xB5\"}";
 
   std::string actual;
   JSONStringValueSerializer serializer(&actual);
diff --git a/base/json/json_writer.cc b/base/json/json_writer.cc
index 6a9cc6a..d600663 100644
--- a/base/json/json_writer.cc
+++ b/base/json/json_writer.cc
@@ -21,28 +21,24 @@ static const char kPrettyPrintLineEnding[] = "\r\n";
 static const char kPrettyPrintLineEnding[] = "\n";
 #endif
 
-/* static */
-const char* JSONWriter::kEmptyArray = "[]";
-
-/* static */
+// static
 void JSONWriter::Write(const Value* const node, std::string* json) {
   WriteWithOptions(node, 0, json);
 }
 
-/* static */
+// static
 void JSONWriter::WriteWithOptions(const Value* const node, int options,
                                   std::string* json) {
   json->clear();
   // Is there a better way to estimate the size of the output?
   json->reserve(1024);
 
-  bool escape = !(options & OPTIONS_DO_NOT_ESCAPE);
   bool omit_binary_values = !!(options & OPTIONS_OMIT_BINARY_VALUES);
   bool omit_double_type_preservation =
       !!(options & OPTIONS_OMIT_DOUBLE_TYPE_PRESERVATION);
   bool pretty_print = !!(options & OPTIONS_PRETTY_PRINT);
 
-  JSONWriter writer(escape, omit_binary_values, omit_double_type_preservation,
+  JSONWriter writer(omit_binary_values, omit_double_type_preservation,
                     pretty_print, json);
   writer.BuildJSONString(node, 0);
 
@@ -50,11 +46,10 @@ void JSONWriter::WriteWithOptions(const Value* const node, int options,
     json->append(kPrettyPrintLineEnding);
 }
 
-JSONWriter::JSONWriter(bool escape, bool omit_binary_values,
+JSONWriter::JSONWriter(bool omit_binary_values,
                        bool omit_double_type_preservation, bool pretty_print,
                        std::string* json)
-    : escape_(escape),
-      omit_binary_values_(omit_binary_values),
+    : omit_binary_values_(omit_binary_values),
       omit_double_type_preservation_(omit_double_type_preservation),
       pretty_print_(pretty_print),
       json_string_(json) {
@@ -123,11 +118,7 @@ void JSONWriter::BuildJSONString(const Value* const node, int depth) {
         std::string value;
         bool result = node->GetAsString(&value);
         DCHECK(result);
-        if (escape_) {
-          JsonDoubleQuote(UTF8ToUTF16(value), true, json_string_);
-        } else {
-          JsonDoubleQuote(value, true, json_string_);
-        }
+        EscapeJSONString(value, true, json_string_);
         break;
       }
 
@@ -169,7 +160,7 @@ void JSONWriter::BuildJSONString(const Value* const node, int depth) {
           json_string_->append(kPrettyPrintLineEnding);
 
         const DictionaryValue* dict =
-          static_cast<const DictionaryValue*>(node);
+            static_cast<const DictionaryValue*>(node);
         bool first_entry = true;
         for (DictionaryValue::Iterator itr(*dict); !itr.IsAtEnd();
              itr.Advance(), first_entry = false) {
@@ -186,7 +177,8 @@ void JSONWriter::BuildJSONString(const Value* const node, int depth) {
 
           if (pretty_print_)
             IndentLine(depth + 1);
-          AppendQuotedString(itr.key());
+
+          EscapeJSONString(itr.key(), true, json_string_);
           if (pretty_print_) {
             json_string_->append(": ");
           } else {
@@ -218,12 +210,6 @@ void JSONWriter::BuildJSONString(const Value* const node, int depth) {
   }
 }
 
-void JSONWriter::AppendQuotedString(const std::string& str) {
-  // TODO(viettrungluu): |str| is UTF-8, not ASCII, so to properly escape it we
-  // have to convert it to UTF-16. This round-trip is suboptimal.
-  JsonDoubleQuote(UTF8ToUTF16(str), true, json_string_);
-}
-
 void JSONWriter::IndentLine(int depth) {
   // It may be faster to keep an indent string so we don't have to keep
   // reallocating.
diff --git a/base/json/json_writer.h b/base/json/json_writer.h
index 94052c8..e4a143c 100644
--- a/base/json/json_writer.h
+++ b/base/json/json_writer.h
@@ -17,24 +17,19 @@ class Value;
 class BASE_EXPORT JSONWriter {
  public:
   enum Options {
-    // Do not escape the string, preserving its UTF8 characters. It is useful
-    // if you can pass the resulting string to the JSON parser in binary form
-    // (as UTF8).
-    OPTIONS_DO_NOT_ESCAPE = 1 << 0,
-
     // For values of binary type, the value (and key if within a dictionary)
     // will be omitted from the output.
-    OPTIONS_OMIT_BINARY_VALUES = 1 << 1,
+    OPTIONS_OMIT_BINARY_VALUES = 1 << 0,
 
     // This option instructs the writer to write doubles that have no fractional
     // part as a normal integer (i.e., without using exponential notation
     // or appending a '.0') as long as the value is within the range of a
     // 64-bit int.
-    OPTIONS_OMIT_DOUBLE_TYPE_PRESERVATION = 1 << 2,
+    OPTIONS_OMIT_DOUBLE_TYPE_PRESERVATION = 1 << 1,
 
     // Return a slightly nicer formatted json string (pads with whitespace to
     // help with readability).
-    OPTIONS_PRETTY_PRINT = 1 << 3
+    OPTIONS_PRETTY_PRINT = 1 << 2,
   };
 
   // Given a root node, generates a JSON string and puts it into |json|.
@@ -48,12 +43,8 @@ class BASE_EXPORT JSONWriter {
   static void WriteWithOptions(const Value* const node, int options,
                                std::string* json);
 
-  // A static, constant JSON string representing an empty array.  Useful
-  // for empty JSON argument passing.
-  static const char* kEmptyArray;
-
  private:
-  JSONWriter(bool escape, bool omit_binary_values,
+  JSONWriter(bool omit_binary_values,
              bool omit_double_type_preservation, bool pretty_print,
              std::string* json);
 
@@ -61,13 +52,9 @@ class BASE_EXPORT JSONWriter {
   // json_string_ will contain the JSON.
   void BuildJSONString(const Value* const node, int depth);
 
-  // Appends a quoted, escaped, version of (UTF-8) str to json_string_.
-  void AppendQuotedString(const std::string& str);
-
   // Adds space to json_string_ for the indent level.
   void IndentLine(int depth);
 
-  bool escape_;
   bool omit_binary_values_;
   bool omit_double_type_preservation_;
   bool pretty_print_;
diff --git a/base/json/string_escape.cc b/base/json/string_escape.cc
index 10ea670..a3b0735 100644
--- a/base/json/string_escape.cc
+++ b/base/json/string_escape.cc
@@ -8,40 +8,56 @@
 
 #include "base/strings/string_util.h"
 #include "base/strings/stringprintf.h"
+#include "base/strings/utf_string_conversion_utils.h"
+#include "base/strings/utf_string_conversions.h"
+#include "base/third_party/icu/icu_utf.h"
 
 namespace base {
 
 namespace {
 
-// Try to escape |c| as a "SingleEscapeCharacter" (\n, etc).  If successful,
-// returns true and appends the escape sequence to |dst|.  This isn't required
-// by the spec, but it's more readable by humans than the \uXXXX alternatives.
-template<typename CHAR>
-static bool JsonSingleEscapeChar(const CHAR c, std::string* dst) {
+// Format string for printing a \uXXXX escape sequence.
+const char kU16EscapeFormat[] = "\\u%04X";
+
+// The code point to output for an invalid input code unit.
+const uint32 kReplacementCodePoint = 0xFFFD;
+
+// Used below in EscapeSpecialCodePoint().
+COMPILE_ASSERT('<' == 0x3C, less_than_sign_is_0x3c);
+
+// Try to escape the |code_point| if it is a known special character. If
+// successful, returns true and appends the escape sequence to |dest|. This
+// isn't required by the spec, but it's more readable by humans.
+bool EscapeSpecialCodePoint(uint32 code_point, std::string* dest) {
   // WARNING: if you add a new case here, you need to update the reader as well.
   // Note: \v is in the reader, but not here since the JSON spec doesn't
   // allow it.
-  switch (c) {
+  switch (code_point) {
     case '\b':
-      dst->append("\\b");
+      dest->append("\\b");
       break;
     case '\f':
-      dst->append("\\f");
+      dest->append("\\f");
       break;
     case '\n':
-      dst->append("\\n");
+      dest->append("\\n");
       break;
     case '\r':
-      dst->append("\\r");
+      dest->append("\\r");
       break;
     case '\t':
-      dst->append("\\t");
+      dest->append("\\t");
       break;
     case '\\':
-      dst->append("\\\\");
+      dest->append("\\\\");
       break;
     case '"':
-      dst->append("\\\"");
+      dest->append("\\\"");
+      break;
+    // Escape < to prevent script execution; escaping > is not necessary and
+    // not doing so saves a few bytes.
+    case '<':
+      dest->append("\\u003C");
       break;
     default:
       return false;
@@ -49,57 +65,90 @@ static bool JsonSingleEscapeChar(const CHAR c, std::string* dst) {
   return true;
 }
 
-template <class STR>
-void JsonDoubleQuoteT(const STR& str,
-                      bool put_in_quotes,
-                      std::string* dst) {
+template <typename S>
+bool EscapeJSONStringImpl(const S& str, bool put_in_quotes, std::string* dest) {
+  bool did_replacement = false;
+
   if (put_in_quotes)
-    dst->push_back('"');
-
-  for (typename STR::const_iterator it = str.begin(); it != str.end(); ++it) {
-    typename ToUnsigned<typename STR::value_type>::Unsigned c = *it;
-    if (!JsonSingleEscapeChar(c, dst)) {
-      if (c < 32 || c > 126 || c == '<' || c == '>') {
-        // 1. Escaping <, > to prevent script execution.
-        // 2. Technically, we could also pass through c > 126 as UTF8, but this
-        //    is also optional.  It would also be a pain to implement here.
-        unsigned int as_uint = static_cast<unsigned int>(c);
-        base::StringAppendF(dst, "\\u%04X", as_uint);
-      } else {
-        unsigned char ascii = static_cast<unsigned char>(*it);
-        dst->push_back(ascii);
-      }
+    dest->push_back('"');
+
+  // Casting is necessary because ICU uses int32. Try and do so safely.
+  CHECK_LE(str.length(), static_cast<size_t>(kint32max));
+  const int32 length = static_cast<int32>(str.length());
+
+  for (int32 i = 0; i < length; ++i) {
+    uint32 code_point;
+    if (!ReadUnicodeCharacter(str.data(), length, &i, &code_point)) {
+      code_point = kReplacementCodePoint;
+      did_replacement = true;
     }
+
+    if (EscapeSpecialCodePoint(code_point, dest))
+      continue;
+
+    // Escape non-printing characters.
+    if (code_point < 32)
+      base::StringAppendF(dest, kU16EscapeFormat, code_point);
+    else
+      WriteUnicodeCharacter(code_point, dest);
   }
 
   if (put_in_quotes)
-    dst->push_back('"');
+    dest->push_back('"');
+
+  return !did_replacement;
 }
 
 }  // namespace
 
-void JsonDoubleQuote(const StringPiece& str,
-                     bool put_in_quotes,
-                     std::string* dst) {
-  JsonDoubleQuoteT(str, put_in_quotes, dst);
+bool EscapeJSONString(const StringPiece& str,
+                      bool put_in_quotes,
+                      std::string* dest) {
+  return EscapeJSONStringImpl(str, put_in_quotes, dest);
 }
 
-std::string GetDoubleQuotedJson(const StringPiece& str) {
-  std::string dst;
-  JsonDoubleQuote(str, true, &dst);
-  return dst;
+bool EscapeJSONString(const StringPiece16& str,
+                      bool put_in_quotes,
+                      std::string* dest) {
+  return EscapeJSONStringImpl(str, put_in_quotes, dest);
+}
+
+std::string GetQuotedJSONString(const StringPiece& str) {
+  std::string dest;
+  bool ok = EscapeJSONStringImpl(str, true, &dest);
+  DCHECK(ok);
+  return dest;
 }
 
-void JsonDoubleQuote(const StringPiece16& str,
-                     bool put_in_quotes,
-                     std::string* dst) {
-  JsonDoubleQuoteT(str, put_in_quotes, dst);
+std::string GetQuotedJSONString(const StringPiece16& str) {
+  std::string dest;
+  bool ok = EscapeJSONStringImpl(str, true, &dest);
+  DCHECK(ok);
+  return dest;
 }
 
-std::string GetDoubleQuotedJson(const StringPiece16& str) {
-  std::string dst;
-  JsonDoubleQuote(str, true, &dst);
-  return dst;
+std::string EscapeBytesAsInvalidJSONString(const StringPiece& str,
+                                           bool put_in_quotes) {
+  std::string dest;
+
+  if (put_in_quotes)
+    dest.push_back('"');
+
+  for (StringPiece::const_iterator it = str.begin(); it != str.end(); ++it) {
+    ToUnsigned<StringPiece::value_type>::Unsigned c = *it;
+    if (EscapeSpecialCodePoint(c, &dest))
+      continue;
+
+    if (c < 32 || c > 126)
+      base::StringAppendF(&dest, kU16EscapeFormat, c);
+    else
+      dest.push_back(*it);
+  }
+
+  if (put_in_quotes)
+    dest.push_back('"');
+
+  return dest;
 }
 
 }  // namespace base
diff --git a/base/json/string_escape.h b/base/json/string_escape.h
index 0f16f59..b66b7e5 100644
--- a/base/json/string_escape.h
+++ b/base/json/string_escape.h
@@ -1,8 +1,8 @@
 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
-//
-// This file defines utility functions for escaping strings.
+
+// This file defines utility functions for escaping strings suitable for JSON.
 
 #ifndef BASE_JSON_STRING_ESCAPE_H_
 #define BASE_JSON_STRING_ESCAPE_H_
@@ -14,24 +14,46 @@
 
 namespace base {
 
-// Escape |str| appropriately for a JSON string literal, _appending_ the
-// result to |dst|. This will create unicode escape sequences (\uXXXX).
-// If |put_in_quotes| is true, the result will be surrounded in double quotes.
-// The outputted literal, when interpreted by the browser, should result in a
-// javascript string that is identical and the same length as the input |str|.
-BASE_EXPORT void JsonDoubleQuote(const StringPiece& str,
-                                 bool put_in_quotes,
-                                 std::string* dst);
-
-// Same as above, but always returns the result double quoted.
-BASE_EXPORT std::string GetDoubleQuotedJson(const StringPiece& str);
-
-BASE_EXPORT void JsonDoubleQuote(const StringPiece16& str,
-                                 bool put_in_quotes,
-                                 std::string* dst);
-
-// Same as above, but always returns the result double quoted.
-BASE_EXPORT std::string GetDoubleQuotedJson(const StringPiece16& str);
+// Appends to |dest| an escaped version of |str|. Valid UTF-8 code units will
+// pass through from the input to the output. Invalid code units will be
+// replaced with the U+FFFD replacement character. This function returns true
+// if no replacement was necessary and false if there was a lossy replacement.
+// On return, |dest| will contain a valid UTF-8 JSON string.
+//
+// Non-printing control characters will be escaped as \uXXXX sequences for
+// readability.
+//
+// If |put_in_quotes| is true, then a leading and trailing double-quote mark
+// will be appended to |dest| as well.
+BASE_EXPORT bool EscapeJSONString(const StringPiece& str,
+                                  bool put_in_quotes,
+                                  std::string* dest);
+
+// Performs a similar function to the UTF-8 StringPiece version above,
+// converting UTF-16 code units to UTF-8 code units and escaping non-printing
+// control characters. On return, |dest| will contain a valid UTF-8 JSON string.
+BASE_EXPORT bool EscapeJSONString(const StringPiece16& str,
+                                  bool put_in_quotes,
+                                  std::string* dest);
+
+// Helper functions that wrap the above two functions but return the value
+// instead of appending. |put_in_quotes| is always true.
+BASE_EXPORT std::string GetQuotedJSONString(const StringPiece& str);
+BASE_EXPORT std::string GetQuotedJSONString(const StringPiece16& str);
+
+// Given an arbitrary byte string |str|, this will escape all non-ASCII bytes
+// as \uXXXX escape sequences. This function is *NOT* meant to be used with
+// Unicode strings and does not validate |str| as one.
+//
+// CAVEAT CALLER: The output of this function may not be valid JSON, since
+// JSON requires escape sequences to be valid UTF-16 code units. This output
+// will be mangled if passed to the base::JSONReader, since the reader will
+// interpret it as UTF-16 and convert it to UTF-8.
+//
+// The output of this function takes the *appearance* of JSON but is not in
+// fact valid according to RFC 4627.
+BASE_EXPORT std::string EscapeBytesAsInvalidJSONString(const StringPiece& str,
+                                                       bool put_in_quotes);
 
 }  // namespace base
 
diff --git a/base/json/string_escape_unittest.cc b/base/json/string_escape_unittest.cc
index f921994..7d82f9b 100644
--- a/base/json/string_escape_unittest.cc
+++ b/base/json/string_escape_unittest.cc
@@ -1,104 +1,182 @@
-// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style license that can be
 // found in the LICENSE file.
 
 #include "base/json/string_escape.h"
+
+#include "base/strings/string_util.h"
 #include "base/strings/utf_string_conversions.h"
 #include "testing/gtest/include/gtest/gtest.h"
 
 namespace base {
 
-namespace {
-
-const struct json_narrow_test_data {
-  const char* to_escape;
-  const char* escaped;
-} json_narrow_cases[] = {
-  {"\b\001aZ\"\\wee", "\\b\\u0001aZ\\\"\\\\wee"},
-  {"a\b\f\n\r\t\v\1\\.\"z",
-      "a\\b\\f\\n\\r\\t\\u000B\\u0001\\\\.\\\"z"},
-  {"b\x0f\x7f\xf0\xff!", "b\\u000F\\u007F\\u00F0\\u00FF!"},
-  {"c<>d", "c\\u003C\\u003Ed"},
-};
-
-}  // namespace
-
-TEST(StringEscapeTest, JsonDoubleQuoteNarrow) {
-  for (size_t i = 0; i < arraysize(json_narrow_cases); ++i) {
-    const char* in_ptr = json_narrow_cases[i].to_escape;
+TEST(JSONStringEscapeTest, EscapeUTF8) {
+  const struct {
+    const char* to_escape;
+    const char* escaped;
+  } cases[] = {
+    {"\b\001aZ\"\\wee", "\\b\\u0001aZ\\\"\\\\wee"},
+    {"a\b\f\n\r\t\v\1\\.\"z",
+        "a\\b\\f\\n\\r\\t\\u000B\\u0001\\\\.\\\"z"},
+    {"b\x0f\x7f\xf0\xff!",  // \xf0\xff is not a valid UTF-8 unit.
+        "b\\u000F\x7F\xEF\xBF\xBD\xEF\xBF\xBD!"},
+    {"c<>d", "c\\u003C>d"},
+  };
+
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) {
+    const char* in_ptr = cases[i].to_escape;
     std::string in_str = in_ptr;
+
     std::string out;
-    JsonDoubleQuote(in_ptr, false, &out);
-    EXPECT_EQ(std::string(json_narrow_cases[i].escaped), out);
+    EscapeJSONString(in_ptr, false, &out);
+    EXPECT_EQ(std::string(cases[i].escaped), out);
+    EXPECT_TRUE(IsStringUTF8(out));
+
     out.erase();
-    JsonDoubleQuote(in_str, false, &out);
-    EXPECT_EQ(std::string(json_narrow_cases[i].escaped), out);
+    bool convert_ok = EscapeJSONString(in_str, false, &out);
+    EXPECT_EQ(std::string(cases[i].escaped), out);
+    EXPECT_TRUE(IsStringUTF8(out));
+
+    if (convert_ok) {
+      std::string fooout = GetQuotedJSONString(in_str);
+      EXPECT_EQ("\"" + std::string(cases[i].escaped) + "\"", fooout);
+      EXPECT_TRUE(IsStringUTF8(fooout));  // Validate the quoted form.
+    }
   }
 
-  std::string in = json_narrow_cases[0].to_escape;
+  std::string in = cases[0].to_escape;
   std::string out;
-  JsonDoubleQuote(in, false, &out);
+  EscapeJSONString(in, false, &out);
+  EXPECT_TRUE(IsStringUTF8(out));
 
   // test quoting
   std::string out_quoted;
-  JsonDoubleQuote(in, true, &out_quoted);
+  EscapeJSONString(in, true, &out_quoted);
   EXPECT_EQ(out.length() + 2, out_quoted.length());
   EXPECT_EQ(out_quoted.find(out), 1U);
+  EXPECT_TRUE(IsStringUTF8(out_quoted));
 
   // now try with a NULL in the string
   std::string null_prepend = "test";
   null_prepend.push_back(0);
   in = null_prepend + in;
   std::string expected = "test\\u0000";
-  expected += json_narrow_cases[0].escaped;
+  expected += cases[0].escaped;
   out.clear();
-  JsonDoubleQuote(in, false, &out);
+  EscapeJSONString(in, false, &out);
   EXPECT_EQ(expected, out);
+  EXPECT_TRUE(IsStringUTF8(out));
 }
 
-namespace {
-
-const struct json_wide_test_data {
-  const wchar_t* to_escape;
-  const char* escaped;
-} json_wide_cases[] = {
-  {L"b\uffb1\u00ff", "b\\uFFB1\\u00FF"},
-  {L"\b\001aZ\"\\wee", "\\b\\u0001aZ\\\"\\\\wee"},
-  {L"a\b\f\n\r\t\v\1\\.\"z",
-      "a\\b\\f\\n\\r\\t\\u000B\\u0001\\\\.\\\"z"},
-  {L"b\x0f\x7f\xf0\xff!", "b\\u000F\\u007F\\u00F0\\u00FF!"},
-  {L"c<>d", "c\\u003C\\u003Ed"},
-};
+TEST(JSONStringEscapeTest, EscapeUTF16) {
+  const struct {
+    const wchar_t* to_escape;
+    const char* escaped;
+  } cases[] = {
+    {L"b\uffb1\u00ff", "b\xEF\xBE\xB1\xC3\xBF"},
+    {L"\b\001aZ\"\\wee", "\\b\\u0001aZ\\\"\\\\wee"},
+    {L"a\b\f\n\r\t\v\1\\.\"z",
+        "a\\b\\f\\n\\r\\t\\u000B\\u0001\\\\.\\\"z"},
+    {L"b\x0f\x7f\xf0\xff!", "b\\u000F\x7F\xC3\xB0\xC3\xBF!"},
+    {L"c<>d", "c\\u003C>d"},
+  };
+
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) {
+    string16 in = WideToUTF16(cases[i].to_escape);
 
-}  // namespace
-
-TEST(StringEscapeTest, JsonDoubleQuoteWide) {
-  for (size_t i = 0; i < arraysize(json_wide_cases); ++i) {
     std::string out;
-    string16 in = WideToUTF16(json_wide_cases[i].to_escape);
-    JsonDoubleQuote(in, false, &out);
-    EXPECT_EQ(std::string(json_wide_cases[i].escaped), out);
+    EscapeJSONString(in, false, &out);
+    EXPECT_EQ(std::string(cases[i].escaped), out);
+    EXPECT_TRUE(IsStringUTF8(out));
+
+    out = GetQuotedJSONString(in);
+    EXPECT_EQ("\"" + std::string(cases[i].escaped) + "\"", out);
+    EXPECT_TRUE(IsStringUTF8(out));
   }
 
-  string16 in = WideToUTF16(json_wide_cases[0].to_escape);
+  string16 in = WideToUTF16(cases[0].to_escape);
   std::string out;
-  JsonDoubleQuote(in, false, &out);
+  EscapeJSONString(in, false, &out);
+  EXPECT_TRUE(IsStringUTF8(out));
 
   // test quoting
   std::string out_quoted;
-  JsonDoubleQuote(in, true, &out_quoted);
+  EscapeJSONString(in, true, &out_quoted);
   EXPECT_EQ(out.length() + 2, out_quoted.length());
   EXPECT_EQ(out_quoted.find(out), 1U);
+  EXPECT_TRUE(IsStringUTF8(out_quoted));
 
   // now try with a NULL in the string
   string16 null_prepend = WideToUTF16(L"test");
   null_prepend.push_back(0);
   in = null_prepend + in;
   std::string expected = "test\\u0000";
-  expected += json_wide_cases[0].escaped;
+  expected += cases[0].escaped;
   out.clear();
-  JsonDoubleQuote(in, false, &out);
+  EscapeJSONString(in, false, &out);
   EXPECT_EQ(expected, out);
+  EXPECT_TRUE(IsStringUTF8(out));
+}
+
+TEST(JSONStringEscapeTest, EscapeUTF16OutsideBMP) {
+  {
+    // {a, U+10300, !}, SMP.
+    string16 test;
+    test.push_back('a');
+    test.push_back(0xD800);
+    test.push_back(0xDF00);
+    test.push_back('!');
+    std::string actual;
+    EXPECT_TRUE(EscapeJSONString(test, false, &actual));
+    EXPECT_EQ("a\xF0\x90\x8C\x80!", actual);
+  }
+  {
+    // {U+20021, U+2002B}, SIP.
+    string16 test;
+    test.push_back(0xD840);
+    test.push_back(0xDC21);
+    test.push_back(0xD840);
+    test.push_back(0xDC2B);
+    std::string actual;
+    EXPECT_TRUE(EscapeJSONString(test, false, &actual));
+    EXPECT_EQ("\xF0\xA0\x80\xA1\xF0\xA0\x80\xAB", actual);
+  }
+  {
+    // {?, U+D800, @}, lone surrogate.
+    string16 test;
+    test.push_back('?');
+    test.push_back(0xD800);
+    test.push_back('@');
+    std::string actual;
+    EXPECT_FALSE(EscapeJSONString(test, false, &actual));
+    EXPECT_EQ("?\xEF\xBF\xBD@", actual);
+  }
+}
+
+TEST(JSONStringEscapeTest, EscapeBytes) {
+  const struct {
+    const char* to_escape;
+    const char* escaped;
+  } cases[] = {
+    {"b\x0f\x7f\xf0\xff!", "b\\u000F\\u007F\\u00F0\\u00FF!"},
+    {"\xe5\xc4\x4f\x05\xb6\xfd\0", "\\u00E5\\u00C4O\\u0005\\u00B6\\u00FD"},
+  };
+
+  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) {
+    std::string in = std::string(cases[i].to_escape);
+    EXPECT_FALSE(IsStringUTF8(in));
+
+    EXPECT_EQ(std::string(cases[i].escaped),
+        EscapeBytesAsInvalidJSONString(in, false));
+    EXPECT_EQ("\"" + std::string(cases[i].escaped) + "\"",
+        EscapeBytesAsInvalidJSONString(in, true));
+  }
+
+  const char kEmbedNull[] = { '\xab', '\x39', '\0', '\x9f', '\xab' };
+  std::string in(kEmbedNull, ARRAYSIZE_UNSAFE(kEmbedNull));
+  EXPECT_FALSE(IsStringUTF8(in));
+  EXPECT_EQ(std::string("\\u00AB9\\u0000\\u009F\\u00AB"),
+            EscapeBytesAsInvalidJSONString(in, false));
 }
 
 }  // namespace base
author	rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-12-11 22:10:45 +0000
committer	rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>	2013-12-11 22:10:45 +0000
commit	bbe1571f0e657d3ba18c05835f06c297b863cc09 (patch)
tree	f45ace7793d8883d50d2d57f543c2e80a57a07a0 /base/json
parent	c1a2b233574df6256681b1ed1f7a18a10d942d10 (diff)
download	chromium_src-bbe1571f0e657d3ba18c05835f06c297b863cc09.zip chromium_src-bbe1571f0e657d3ba18c05835f06c297b863cc09.tar.gz chromium_src-bbe1571f0e657d3ba18c05835f06c297b863cc09.tar.bz2