summaryrefslogtreecommitdiffstats
path: root/base
diff options
context:
space:
mode:
authorrsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-03-24 03:57:17 +0000
committerrsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2012-03-24 03:57:17 +0000
commit27cc5a09bba00e877f8815d592d0ba955f62002b (patch)
tree56e2f189ca65b82ee02e5b05fbfbc251cfa7ce2b /base
parent1dd17d77d24a3232976e03b3ec5c7c61a14f1015 (diff)
downloadchromium_src-27cc5a09bba00e877f8815d592d0ba955f62002b.zip
chromium_src-27cc5a09bba00e877f8815d592d0ba955f62002b.tar.gz
chromium_src-27cc5a09bba00e877f8815d592d0ba955f62002b.tar.bz2
Improve JSONReader performance by up to 55% by using std::string instead of wstring.
Before this change, JSONReader would:
1. Take std::string input
2. Convert it to wstring
3. Parse
4. Decode strings for the object representation, converting wstring to string16
5. Create a base::Value with a string16, which internally converts back to std::string

After this change, JSONReader does:
1. Take std::string input
2. Parse
3. Create a base::Value with a std::string

BUG=111581
TEST=Covered by unittests.
Review URL: http://codereview.chromium.org/9801007

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@128678 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r--base/json/json_reader.cc155
-rw-r--r--base/json/json_reader.h30
-rw-r--r--base/json/json_reader_unittest.cc32
3 files changed, 162 insertions, 55 deletions
diff --git a/base/json/json_reader.cc b/base/json/json_reader.cc
index bbaf5fb..31eecb4 100644
--- a/base/json/json_reader.cc
+++ b/base/json/json_reader.cc
@@ -1,4 +1,4 @@
-// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
@@ -9,15 +9,17 @@
#include "base/memory/scoped_ptr.h"
#include "base/stringprintf.h"
#include "base/string_number_conversions.h"
+#include "base/string_piece.h"
#include "base/string_util.h"
+#include "base/third_party/icu/icu_utf.h"
#include "base/utf_string_conversions.h"
#include "base/values.h"
namespace {
-const wchar_t kNullString[] = L"null";
-const wchar_t kTrueString[] = L"true";
-const wchar_t kFalseString[] = L"false";
+const char kNullString[] = "null";
+const char kTrueString[] = "true";
+const char kFalseString[] = "false";
const int kStackLimit = 100;
@@ -25,11 +27,11 @@ const int kStackLimit = 100;
// token. The method returns false if there is no valid integer at the end of
// the token.
bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {
- wchar_t first = token.NextChar();
+ char first = token.NextChar();
int len = 0;
// Read in more digits.
- wchar_t c = first;
+ char c = first;
while ('\0' != c && IsAsciiDigit(c)) {
++token.length;
++len;
@@ -50,7 +52,7 @@ bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) {
// the method returns false.
bool ReadHexDigits(base::JSONReader::Token& token, int digits) {
for (int i = 1; i <= digits; ++i) {
- wchar_t c = *(token.begin + token.length + i);
+ char c = *(token.begin + token.length + i);
if (c == '\0' || !IsHexDigit(c))
return false;
}
@@ -83,6 +85,7 @@ const char* JSONReader::kUnquotedDictionaryKey =
JSONReader::JSONReader()
: start_pos_(NULL),
json_pos_(NULL),
+ end_pos_(NULL),
stack_depth_(0),
allow_trailing_comma_(false),
error_code_(JSON_NO_ERROR),
@@ -148,23 +151,21 @@ std::string JSONReader::GetErrorMessage() const {
Value* JSONReader::JsonToValue(const std::string& json, bool check_root,
bool allow_trailing_comma) {
// The input must be in UTF-8.
- if (!IsStringUTF8(json.c_str())) {
+ if (!IsStringUTF8(json.data())) {
error_code_ = JSON_UNSUPPORTED_ENCODING;
return NULL;
}
- // The conversion from UTF8 to wstring removes null bytes for us
- // (a good thing).
- std::wstring json_wide(UTF8ToWide(json));
- start_pos_ = json_wide.c_str();
-
- // When the input JSON string starts with a UTF-8 Byte-Order-Mark
- // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode
- // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from
- // mis-treating a Unicode BOM as an invalid character and returning NULL,
- // skip a converted Unicode BOM if it exists.
- if (!json_wide.empty() && start_pos_[0] == 0xFEFF) {
- ++start_pos_;
+ start_pos_ = json.data();
+ end_pos_ = start_pos_ + json.size();
+
+ // When the input JSON string starts with a UTF-8 Byte-Order-Mark (U+FEFF,
+ // encoded as <0xEF 0xBB 0xBF>), advance the start position to prevent the
+ // JSONReader::BuildValue() function from mis-treating a Unicode BOM as an
+ // invalid character and returning NULL.
+ if (json.size() >= 3 && start_pos_[0] == 0xEF &&
+ start_pos_[1] == 0xBB && start_pos_[2] == 0xBF) {
+ start_pos_ += 3;
}
json_pos_ = start_pos_;
@@ -356,7 +357,7 @@ JSONReader::Token JSONReader::ParseNumberToken() {
// We just grab the number here. We validate the size in DecodeNumber.
// According to RFC4627, a valid number is: [minus] int [frac] [exp]
Token token(Token::NUMBER, json_pos_, 0);
- wchar_t c = *json_pos_;
+ char c = *json_pos_;
if ('-' == c) {
++token.length;
c = token.NextChar();
@@ -390,15 +391,14 @@ JSONReader::Token JSONReader::ParseNumberToken() {
}
Value* JSONReader::DecodeNumber(const Token& token) {
- const std::wstring num_string(token.begin, token.length);
+ const std::string num_string(token.begin, token.length);
int num_int;
- if (StringToInt(WideToUTF8(num_string), &num_int))
+ if (StringToInt(num_string, &num_int))
return Value::CreateIntegerValue(num_int);
double num_double;
- if (StringToDouble(WideToUTF8(num_string), &num_double) &&
- base::IsFinite(num_double))
+ if (StringToDouble(num_string, &num_double) && base::IsFinite(num_double))
return Value::CreateDoubleValue(num_double);
return NULL;
@@ -406,8 +406,8 @@ Value* JSONReader::DecodeNumber(const Token& token) {
JSONReader::Token JSONReader::ParseStringToken() {
Token token(Token::STRING, json_pos_, 1);
- wchar_t c = token.NextChar();
- while ('\0' != c) {
+ char c = token.NextChar();
+ while (json_pos_ + token.length < end_pos_) {
if ('\\' == c) {
++token.length;
c = token.NextChar();
@@ -450,11 +450,11 @@ JSONReader::Token JSONReader::ParseStringToken() {
}
Value* JSONReader::DecodeString(const Token& token) {
- std::wstring decoded_str;
+ std::string decoded_str;
decoded_str.reserve(token.length - 2);
for (int i = 1; i < token.length - 1; ++i) {
- wchar_t c = *(token.begin + i);
+ char c = *(token.begin + i);
if ('\\' == c) {
++i;
c = *(token.begin + i);
@@ -483,17 +483,19 @@ Value* JSONReader::DecodeString(const Token& token) {
decoded_str.push_back('\v');
break;
- case 'x':
- decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) +
- HexDigitToInt(*(token.begin + i + 2)));
+ case 'x': {
+ if (i + 2 >= token.length)
+ return NULL;
+ int hex_digit = 0;
+ if (!HexStringToInt(StringPiece(token.begin + i + 1, 2), &hex_digit))
+ return NULL;
+ decoded_str.push_back(hex_digit);
i += 2;
break;
+ }
case 'u':
- decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) +
- (HexDigitToInt(*(token.begin + i + 2)) << 8) +
- (HexDigitToInt(*(token.begin + i + 3)) << 4) +
- HexDigitToInt(*(token.begin + i + 4)));
- i += 4;
+ if (!ConvertUTF16Units(token, &i, &decoded_str))
+ return NULL;
break;
default:
@@ -507,7 +509,66 @@ Value* JSONReader::DecodeString(const Token& token) {
decoded_str.push_back(c);
}
}
- return Value::CreateStringValue(WideToUTF16Hack(decoded_str));
+ return Value::CreateStringValue(decoded_str);
+}
+
+bool JSONReader::ConvertUTF16Units(const Token& token,
+ int* i,
+ std::string* dest_string) {
+ if (*i + 4 >= token.length)
+ return false;
+
+ // This is a 32-bit field because the shift operations in the
+ // conversion process below cause MSVC to error about "data loss."
+ // This only stores UTF-16 code units, though.
+ // Consume the UTF-16 code unit, which may be a high surrogate.
+ int code_unit16_high = 0;
+ if (!HexStringToInt(StringPiece(token.begin + *i + 1, 4), &code_unit16_high))
+ return false;
+ *i += 4;
+
+ // If this is a high surrogate, consume the next code unit to get the
+ // low surrogate.
+ int code_unit16_low = 0;
+ if (CBU16_IS_SURROGATE(code_unit16_high)) {
+ // Make sure this is the high surrogate. If not, it's an encoding
+ // error.
+ if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high))
+ return false;
+
+ // Make sure that the token has more characters to consume the
+ // lower surrogate.
+ if (*i + 6 >= token.length)
+ return false;
+ if (*(++(*i) + token.begin) != '\\' || *(++(*i) + token.begin) != 'u')
+ return false;
+
+ if (!HexStringToInt(StringPiece(token.begin + *i + 1, 4), &code_unit16_low))
+ return false;
+ *i += 4;
+ if (!CBU16_IS_SURROGATE(code_unit16_low) ||
+ !CBU16_IS_TRAIL(code_unit16_low)) {
+ return false;
+ }
+ } else if (!CBU16_IS_SINGLE(code_unit16_high)) {
+ // If this is not a code point, it's an encoding error.
+ return false;
+ }
+
+ // Convert the UTF-16 code units to a code point and then to a UTF-8
+ // code unit sequence.
+ char code_point[8] = { 0 };
+ size_t offset = 0;
+ if (!code_unit16_low) {
+ CBU8_APPEND_UNSAFE(code_point, offset, code_unit16_high);
+ } else {
+ uint32 code_unit32 = CBU16_GET_SUPPLEMENTARY(code_unit16_high,
+ code_unit16_low);
+ offset = 0;
+ CBU8_APPEND_UNSAFE(code_point, offset, code_unit32);
+ }
+ dest_string->append(code_point);
+ return true;
}
JSONReader::Token JSONReader::ParseToken() {
@@ -580,7 +641,7 @@ JSONReader::Token JSONReader::ParseToken() {
}
void JSONReader::EatWhitespaceAndComments() {
- while ('\0' != *json_pos_) {
+ while (json_pos_ != end_pos_) {
switch (*json_pos_) {
case ' ':
case '\n':
@@ -604,11 +665,11 @@ bool JSONReader::EatComment() {
if ('/' != *json_pos_)
return false;
- wchar_t next_char = *(json_pos_ + 1);
+ char next_char = *(json_pos_ + 1);
if ('/' == next_char) {
// Line comment, read until \n or \r
json_pos_ += 2;
- while ('\0' != *json_pos_) {
+ while (json_pos_ != end_pos_) {
switch (*json_pos_) {
case '\n':
case '\r':
@@ -621,7 +682,7 @@ bool JSONReader::EatComment() {
} else if ('*' == next_char) {
// Block comment, read until */
json_pos_ += 2;
- while ('\0' != *json_pos_) {
+ while (json_pos_ != end_pos_) {
if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) {
json_pos_ += 2;
return true;
@@ -634,18 +695,18 @@ bool JSONReader::EatComment() {
return true;
}
-bool JSONReader::NextStringMatch(const wchar_t* str, size_t length) {
- return wcsncmp(json_pos_, str, length) == 0;
+bool JSONReader::NextStringMatch(const char* str, size_t length) {
+ return strncmp(json_pos_, str, length) == 0;
}
void JSONReader::SetErrorCode(JsonParseError error,
- const wchar_t* error_pos) {
+ const char* error_pos) {
int line_number = 1;
int column_number = 1;
// Figure out the line and column the error occured at.
- for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) {
- if (*pos == '\0') {
+ for (const char* pos = start_pos_; pos != error_pos; ++pos) {
+ if (pos > end_pos_) {
NOTREACHED();
return;
}
diff --git a/base/json/json_reader.h b/base/json/json_reader.h
index be3aef2..b1edfb0 100644
--- a/base/json/json_reader.h
+++ b/base/json/json_reader.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2011 The Chromium Authors. All rights reserved.
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
@@ -71,11 +71,11 @@ class BASE_EXPORT JSONReader {
INVALID_TOKEN,
};
- Token(Type t, const wchar_t* b, int len)
+ Token(Type t, const char* b, int len)
: type(t), begin(b), length(len) {}
// Get the character that's one past the end of this token.
- wchar_t NextChar() {
+ char NextChar() {
return *(begin + length);
}
@@ -86,7 +86,7 @@ class BASE_EXPORT JSONReader {
Type type;
// A pointer into JSONReader::json_pos_ that's the beginning of this token.
- const wchar_t* begin;
+ const char* begin;
// End should be one char past the end of the token.
int length;
@@ -186,6 +186,17 @@ class BASE_EXPORT JSONReader {
// (otherwise ParseStringToken would have failed).
Value* DecodeString(const Token& token);
+ // Helper function for DecodeString that consumes UTF16 [0,2] code units and
+ // converts them to UTF8 code units. |token| is the string token in which the
+ // units should be read, |i| is the position in the token at which the first
+ // code unit starts, immediately after the |\u|. This will be mutated if code
+ // units are consumed. |dest_string| is a string to which the UTF8 code unit
+ // should be appended. Returns true on success and false if there's an
+ // encoding error.
+ bool ConvertUTF16Units(const Token& token,
+ int* i,
+ std::string* dest_string);
+
// Grabs the next token in the JSON stream. This does not increment the
// stream so it can be used to look ahead at the next token.
Token ParseToken();
@@ -198,17 +209,20 @@ class BASE_EXPORT JSONReader {
bool EatComment();
// Checks if |json_pos_| matches str.
- bool NextStringMatch(const wchar_t* str, size_t length);
+ bool NextStringMatch(const char* str, size_t length);
// Sets the error code that will be returned to the caller. The current
// line and column are determined and added into the final message.
- void SetErrorCode(const JsonParseError error, const wchar_t* error_pos);
+ void SetErrorCode(const JsonParseError error, const char* error_pos);
// Pointer to the starting position in the input string.
- const wchar_t* start_pos_;
+ const char* start_pos_;
// Pointer to the current position in the input string.
- const wchar_t* json_pos_;
+ const char* json_pos_;
+
+ // Pointer to one past the last character of the input string.
+ const char* end_pos_;
// Used to keep track of how many nested lists/dicts there are.
int stack_depth_;
diff --git a/base/json/json_reader_unittest.cc b/base/json/json_reader_unittest.cc
index 432512c..5900781 100644
--- a/base/json/json_reader_unittest.cc
+++ b/base/json/json_reader_unittest.cc
@@ -457,6 +457,38 @@ TEST(JSONReaderTest, Reading) {
false, false));
EXPECT_FALSE(root.get());
+ // Test utf16 encoded strings.
+ root.reset(JSONReader().JsonToValue("\"\\u20ac3,14\"", false, false));
+ ASSERT_TRUE(root.get());
+ EXPECT_TRUE(root->IsType(Value::TYPE_STRING));
+ str_val.clear();
+ EXPECT_TRUE(root->GetAsString(&str_val));
+ EXPECT_EQ("\xe2\x82\xac""3,14", str_val);
+
+ root.reset(JSONReader().JsonToValue("\"\\ud83d\\udca9\\ud83d\\udc6c\"",
+ false, false));
+ ASSERT_TRUE(root.get());
+ EXPECT_TRUE(root->IsType(Value::TYPE_STRING));
+ str_val.clear();
+ EXPECT_TRUE(root->GetAsString(&str_val));
+ EXPECT_EQ("\xf0\x9f\x92\xa9\xf0\x9f\x91\xac", str_val);
+
+ // Test invalid utf16 strings.
+ const char* cases[] = {
+ "\"\\u123\"", // Invalid scalar.
+ "\"\\ud83d\"", // Invalid scalar.
+ "\"\\u$%@!\"", // Invalid scalar.
+ "\"\\uzz89\"", // Invalid scalar.
+ "\"\\ud83d\\udca\"", // Invalid lower surrogate.
+ "\"\\ud83d\\ud83d\"", // Invalid lower surrogate.
+ "\"\\ud83foo\"", // No lower surrogate.
+ "\"\\ud83\\foo\"" // No lower surrogate.
+ };
+ for (size_t i = 0; i < arraysize(cases); ++i) {
+ root.reset(JSONReader().JsonToValue(cases[i], false, false));
+ EXPECT_FALSE(root.get()) << cases[i];
+ }
+
// Test invalid root objects.
root.reset(JSONReader::Read("null", false));
EXPECT_FALSE(root.get());