diff options
author | rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-03-24 03:57:17 +0000 |
---|---|---|
committer | rsesek@chromium.org <rsesek@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-03-24 03:57:17 +0000 |
commit | 27cc5a09bba00e877f8815d592d0ba955f62002b (patch) | |
tree | 56e2f189ca65b82ee02e5b05fbfbc251cfa7ce2b /base | |
parent | 1dd17d77d24a3232976e03b3ec5c7c61a14f1015 (diff) | |
download | chromium_src-27cc5a09bba00e877f8815d592d0ba955f62002b.zip chromium_src-27cc5a09bba00e877f8815d592d0ba955f62002b.tar.gz chromium_src-27cc5a09bba00e877f8815d592d0ba955f62002b.tar.bz2 |
Improve JSONReader performance by up to 55% by using std::string instead of wstring.
Before this change, JSONReader would:
1. Take std::string input
2. Convert it to wstring
3. Parse
4. Decode strings for the object representation, converting wstring to string16
5. Create a base::Value with a string16, which internally converts back to std::string
After this change, JSONReader does:
1. Take std::string input
2. Parse
3. Create a base::Value with a std::string
BUG=111581
TEST=Covered by unittests.
Review URL: http://codereview.chromium.org/9801007
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@128678 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base')
-rw-r--r-- | base/json/json_reader.cc | 155 | ||||
-rw-r--r-- | base/json/json_reader.h | 30 | ||||
-rw-r--r-- | base/json/json_reader_unittest.cc | 32 |
3 files changed, 162 insertions, 55 deletions
diff --git a/base/json/json_reader.cc b/base/json/json_reader.cc index bbaf5fb..31eecb4 100644 --- a/base/json/json_reader.cc +++ b/base/json/json_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. @@ -9,15 +9,17 @@ #include "base/memory/scoped_ptr.h" #include "base/stringprintf.h" #include "base/string_number_conversions.h" +#include "base/string_piece.h" #include "base/string_util.h" +#include "base/third_party/icu/icu_utf.h" #include "base/utf_string_conversions.h" #include "base/values.h" namespace { -const wchar_t kNullString[] = L"null"; -const wchar_t kTrueString[] = L"true"; -const wchar_t kFalseString[] = L"false"; +const char kNullString[] = "null"; +const char kTrueString[] = "true"; +const char kFalseString[] = "false"; const int kStackLimit = 100; @@ -25,11 +27,11 @@ const int kStackLimit = 100; // token. The method returns false if there is no valid integer at the end of // the token. bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) { - wchar_t first = token.NextChar(); + char first = token.NextChar(); int len = 0; // Read in more digits. - wchar_t c = first; + char c = first; while ('\0' != c && IsAsciiDigit(c)) { ++token.length; ++len; @@ -50,7 +52,7 @@ bool ReadInt(base::JSONReader::Token& token, bool can_have_leading_zeros) { // the method returns false. bool ReadHexDigits(base::JSONReader::Token& token, int digits) { for (int i = 1; i <= digits; ++i) { - wchar_t c = *(token.begin + token.length + i); + char c = *(token.begin + token.length + i); if (c == '\0' || !IsHexDigit(c)) return false; } @@ -83,6 +85,7 @@ const char* JSONReader::kUnquotedDictionaryKey = JSONReader::JSONReader() : start_pos_(NULL), json_pos_(NULL), + end_pos_(NULL), stack_depth_(0), allow_trailing_comma_(false), error_code_(JSON_NO_ERROR), @@ -148,23 +151,21 @@ std::string JSONReader::GetErrorMessage() const { Value* JSONReader::JsonToValue(const std::string& json, bool check_root, bool allow_trailing_comma) { // The input must be in UTF-8. - if (!IsStringUTF8(json.c_str())) { + if (!IsStringUTF8(json.data())) { error_code_ = JSON_UNSUPPORTED_ENCODING; return NULL; } - // The conversion from UTF8 to wstring removes null bytes for us - // (a good thing). - std::wstring json_wide(UTF8ToWide(json)); - start_pos_ = json_wide.c_str(); - - // When the input JSON string starts with a UTF-8 Byte-Order-Mark - // (0xEF, 0xBB, 0xBF), the UTF8ToWide() function converts it to a Unicode - // BOM (U+FEFF). To avoid the JSONReader::BuildValue() function from - // mis-treating a Unicode BOM as an invalid character and returning NULL, - // skip a converted Unicode BOM if it exists. - if (!json_wide.empty() && start_pos_[0] == 0xFEFF) { - ++start_pos_; + start_pos_ = json.data(); + end_pos_ = start_pos_ + json.size(); + + // When the input JSON string starts with a UTF-8 Byte-Order-Mark (U+FEFF) + // or <0xEF 0xBB 0xBF>, advance the start position to avoid the + // JSONReader::BuildValue() function from mis-treating a Unicode BOM as an + // invalid character and returning NULL. + if (json.size() >= 3 && start_pos_[0] == 0xEF && + start_pos_[1] == 0xBB && start_pos_[2] == 0xBF) { + start_pos_ += 3; } json_pos_ = start_pos_; @@ -356,7 +357,7 @@ JSONReader::Token JSONReader::ParseNumberToken() { // We just grab the number here. We validate the size in DecodeNumber. // According to RFC4627, a valid number is: [minus] int [frac] [exp] Token token(Token::NUMBER, json_pos_, 0); - wchar_t c = *json_pos_; + char c = *json_pos_; if ('-' == c) { ++token.length; c = token.NextChar(); @@ -390,15 +391,14 @@ JSONReader::Token JSONReader::ParseNumberToken() { } Value* JSONReader::DecodeNumber(const Token& token) { - const std::wstring num_string(token.begin, token.length); + const std::string num_string(token.begin, token.length); int num_int; - if (StringToInt(WideToUTF8(num_string), &num_int)) + if (StringToInt(num_string, &num_int)) return Value::CreateIntegerValue(num_int); double num_double; - if (StringToDouble(WideToUTF8(num_string), &num_double) && - base::IsFinite(num_double)) + if (StringToDouble(num_string, &num_double) && base::IsFinite(num_double)) return Value::CreateDoubleValue(num_double); return NULL; @@ -406,8 +406,8 @@ Value* JSONReader::DecodeNumber(const Token& token) { JSONReader::Token JSONReader::ParseStringToken() { Token token(Token::STRING, json_pos_, 1); - wchar_t c = token.NextChar(); - while ('\0' != c) { + char c = token.NextChar(); + while (json_pos_ + token.length < end_pos_) { if ('\\' == c) { ++token.length; c = token.NextChar(); @@ -450,11 +450,11 @@ JSONReader::Token JSONReader::ParseStringToken() { } Value* JSONReader::DecodeString(const Token& token) { - std::wstring decoded_str; + std::string decoded_str; decoded_str.reserve(token.length - 2); for (int i = 1; i < token.length - 1; ++i) { - wchar_t c = *(token.begin + i); + char c = *(token.begin + i); if ('\\' == c) { ++i; c = *(token.begin + i); @@ -483,17 +483,19 @@ Value* JSONReader::DecodeString(const Token& token) { decoded_str.push_back('\v'); break; - case 'x': - decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 4) + - HexDigitToInt(*(token.begin + i + 2))); + case 'x': { + if (i + 2 >= token.length) + return NULL; + int hex_digit = 0; + if (!HexStringToInt(StringPiece(token.begin + i + 1, 2), &hex_digit)) + return NULL; + decoded_str.push_back(hex_digit); i += 2; break; + } case 'u': - decoded_str.push_back((HexDigitToInt(*(token.begin + i + 1)) << 12 ) + - (HexDigitToInt(*(token.begin + i + 2)) << 8) + - (HexDigitToInt(*(token.begin + i + 3)) << 4) + - HexDigitToInt(*(token.begin + i + 4))); - i += 4; + if (!ConvertUTF16Units(token, &i, &decoded_str)) + return NULL; break; default: @@ -507,7 +509,66 @@ Value* JSONReader::DecodeString(const Token& token) { decoded_str.push_back(c); } } - return Value::CreateStringValue(WideToUTF16Hack(decoded_str)); + return Value::CreateStringValue(decoded_str); +} + +bool JSONReader::ConvertUTF16Units(const Token& token, + int* i, + std::string* dest_string) { + if (*i + 4 >= token.length) + return false; + + // This is a 32-bit field because the shift operations in the + // conversion process below cause MSVC to error about "data loss." + // This only stores UTF-16 code units, though. + // Consume the UTF-16 code unit, which may be a high surrogate. + int code_unit16_high = 0; + if (!HexStringToInt(StringPiece(token.begin + *i + 1, 4), &code_unit16_high)) + return false; + *i += 4; + + // If this is a high surrogate, consume the next code unit to get the + // low surrogate. + int code_unit16_low = 0; + if (CBU16_IS_SURROGATE(code_unit16_high)) { + // Make sure this is the high surrogate. If not, it's an encoding + // error. + if (!CBU16_IS_SURROGATE_LEAD(code_unit16_high)) + return false; + + // Make sure that the token has more characters to consume the + // lower surrogate. + if (*i + 6 >= token.length) + return false; + if (*(++(*i) + token.begin) != '\\' || *(++(*i) + token.begin) != 'u') + return false; + + if (!HexStringToInt(StringPiece(token.begin + *i + 1, 4), &code_unit16_low)) + return false; + *i += 4; + if (!CBU16_IS_SURROGATE(code_unit16_low) || + !CBU16_IS_TRAIL(code_unit16_low)) { + return false; + } + } else if (!CBU16_IS_SINGLE(code_unit16_high)) { + // If this is not a code point, it's an encoding error. + return false; + } + + // Convert the UTF-16 code units to a code point and then to a UTF-8 + // code unit sequence. + char code_point[8] = { 0 }; + size_t offset = 0; + if (!code_unit16_low) { + CBU8_APPEND_UNSAFE(code_point, offset, code_unit16_high); + } else { + uint32 code_unit32 = CBU16_GET_SUPPLEMENTARY(code_unit16_high, + code_unit16_low); + offset = 0; + CBU8_APPEND_UNSAFE(code_point, offset, code_unit32); + } + dest_string->append(code_point); + return true; } JSONReader::Token JSONReader::ParseToken() { @@ -580,7 +641,7 @@ JSONReader::Token JSONReader::ParseToken() { } void JSONReader::EatWhitespaceAndComments() { - while ('\0' != *json_pos_) { + while (json_pos_ != end_pos_) { switch (*json_pos_) { case ' ': case '\n': @@ -604,11 +665,11 @@ bool JSONReader::EatComment() { if ('/' != *json_pos_) return false; - wchar_t next_char = *(json_pos_ + 1); + char next_char = *(json_pos_ + 1); if ('/' == next_char) { // Line comment, read until \n or \r json_pos_ += 2; - while ('\0' != *json_pos_) { + while (json_pos_ != end_pos_) { switch (*json_pos_) { case '\n': case '\r': @@ -621,7 +682,7 @@ bool JSONReader::EatComment() { } else if ('*' == next_char) { // Block comment, read until */ json_pos_ += 2; - while ('\0' != *json_pos_) { + while (json_pos_ != end_pos_) { if ('*' == *json_pos_ && '/' == *(json_pos_ + 1)) { json_pos_ += 2; return true; @@ -634,18 +695,18 @@ bool JSONReader::EatComment() { return true; } -bool JSONReader::NextStringMatch(const wchar_t* str, size_t length) { - return wcsncmp(json_pos_, str, length) == 0; +bool JSONReader::NextStringMatch(const char* str, size_t length) { + return strncmp(json_pos_, str, length) == 0; } void JSONReader::SetErrorCode(JsonParseError error, - const wchar_t* error_pos) { + const char* error_pos) { int line_number = 1; int column_number = 1; // Figure out the line and column the error occured at. - for (const wchar_t* pos = start_pos_; pos != error_pos; ++pos) { - if (*pos == '\0') { + for (const char* pos = start_pos_; pos != error_pos; ++pos) { + if (pos > end_pos_) { NOTREACHED(); return; } diff --git a/base/json/json_reader.h b/base/json/json_reader.h index be3aef2..b1edfb0 100644 --- a/base/json/json_reader.h +++ b/base/json/json_reader.h @@ -1,4 +1,4 @@ -// Copyright (c) 2011 The Chromium Authors. All rights reserved. +// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // @@ -71,11 +71,11 @@ class BASE_EXPORT JSONReader { INVALID_TOKEN, }; - Token(Type t, const wchar_t* b, int len) + Token(Type t, const char* b, int len) : type(t), begin(b), length(len) {} // Get the character that's one past the end of this token. - wchar_t NextChar() { + char NextChar() { return *(begin + length); } @@ -86,7 +86,7 @@ class BASE_EXPORT JSONReader { Type type; // A pointer into JSONReader::json_pos_ that's the beginning of this token. - const wchar_t* begin; + const char* begin; // End should be one char past the end of the token. int length; @@ -186,6 +186,17 @@ class BASE_EXPORT JSONReader { // (otherwise ParseStringToken would have failed). Value* DecodeString(const Token& token); + // Helper function for DecodeString that consumes UTF16 [0,2] code units and + // convers them to UTF8 code untis. |token| is the string token in which the + // units should be read, |i| is the position in the token at which the first + // code unit starts, immediately after the |\u|. This will be mutated if code + // units are consumed. |dest_string| is a string to which the UTF8 code unit + // should be appended. Returns true on success and false if there's an + // encoding error. + bool ConvertUTF16Units(const Token& token, + int* i, + std::string* dest_string); + // Grabs the next token in the JSON stream. This does not increment the // stream so it can be used to look ahead at the next token. Token ParseToken(); @@ -198,17 +209,20 @@ class BASE_EXPORT JSONReader { bool EatComment(); // Checks if |json_pos_| matches str. - bool NextStringMatch(const wchar_t* str, size_t length); + bool NextStringMatch(const char* str, size_t length); // Sets the error code that will be returned to the caller. The current // line and column are determined and added into the final message. - void SetErrorCode(const JsonParseError error, const wchar_t* error_pos); + void SetErrorCode(const JsonParseError error, const char* error_pos); // Pointer to the starting position in the input string. - const wchar_t* start_pos_; + const char* start_pos_; // Pointer to the current position in the input string. - const wchar_t* json_pos_; + const char* json_pos_; + + // Pointer to the last position in the input string. + const char* end_pos_; // Used to keep track of how many nested lists/dicts there are. int stack_depth_; diff --git a/base/json/json_reader_unittest.cc b/base/json/json_reader_unittest.cc index 432512c..5900781 100644 --- a/base/json/json_reader_unittest.cc +++ b/base/json/json_reader_unittest.cc @@ -457,6 +457,38 @@ TEST(JSONReaderTest, Reading) { false, false)); EXPECT_FALSE(root.get()); + // Test utf16 encoded strings. + root.reset(JSONReader().JsonToValue("\"\\u20ac3,14\"", false, false)); + ASSERT_TRUE(root.get()); + EXPECT_TRUE(root->IsType(Value::TYPE_STRING)); + str_val.clear(); + EXPECT_TRUE(root->GetAsString(&str_val)); + EXPECT_EQ("\xe2\x82\xac""3,14", str_val); + + root.reset(JSONReader().JsonToValue("\"\\ud83d\\udca9\\ud83d\\udc6c\"", + false, false)); + ASSERT_TRUE(root.get()); + EXPECT_TRUE(root->IsType(Value::TYPE_STRING)); + str_val.clear(); + EXPECT_TRUE(root->GetAsString(&str_val)); + EXPECT_EQ("\xf0\x9f\x92\xa9\xf0\x9f\x91\xac", str_val); + + // Test invalid utf16 strings. + const char* cases[] = { + "\"\\u123\"", // Invalid scalar. + "\"\\ud83d\"", // Invalid scalar. + "\"\\u$%@!\"", // Invalid scalar. + "\"\\uzz89\"", // Invalid scalar. + "\"\\ud83d\\udca\"", // Invalid lower surrogate. + "\"\\ud83d\\ud83d\"", // Invalid lower surrogate. + "\"\\ud83foo\"", // No lower surrogate. + "\"\\ud83\\foo\"" // No lower surrogate. + }; + for (size_t i = 0; i < arraysize(cases); ++i) { + root.reset(JSONReader().JsonToValue(cases[i], false, false)); + EXPECT_FALSE(root.get()) << cases[i]; + } + // Test invalid root objects. root.reset(JSONReader::Read("null", false)); EXPECT_FALSE(root.get()); |