summaryrefslogtreecommitdiffstats
path: root/base/json_reader.h
blob: 9c4065be205792ff0701bc74f9e353df15a5854b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
// Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// A JSON parser.  Converts strings of JSON into a Value object (see
// base/values.h).
// http://www.ietf.org/rfc/rfc4627.txt?number=4627
//
// Known limitations/deviations from the RFC:
// - Only knows how to parse ints within the range of a signed 32 bit int and
//   decimal numbers within a double.
// - Assumes input is encoded as UTF8.  The spec says we should allow UTF-16
//   (BE or LE) and UTF-32 (BE or LE) as well.
// - We limit nesting to 100 levels to prevent stack overflow (this is allowed
//   by the RFC).
// - A Unicode FAQ ("http://unicode.org/faq/utf_bom.html") writes a data
//   stream may start with a Unicode Byte-Order-Mark (U+FEFF), i.e. the input
//   UTF-8 string for the JSONReader::JsonToValue() function may start with a
//   UTF-8 BOM (0xEF, 0xBB, 0xBF).
//   To avoid the function from mis-treating a UTF-8 BOM as an invalid
//   character, the function skips a Unicode BOM at the beginning of the
//   Unicode string (converted from the input UTF-8 string) before parsing it.
//
// TODO(tc): Add a parsing option to to relax object keys being wrapped in
//   double quotes
// TODO(tc): Add an option to disable comment stripping
// TODO(aa): Consider making the constructor public and the static Read() method
// only a convenience for the common uses with more complex configuration going
// on the instance.

#ifndef BASE_JSON_READER_H_
#define BASE_JSON_READER_H_

#include <string>

#include "base/basictypes.h"
#include "testing/gtest/include/gtest/gtest_prod.h"

class Value;

class JSONReader {
 public:
  // A struct to hold a JS token.
  class Token {
   public:
    enum Type {
     OBJECT_BEGIN,           // {
     OBJECT_END,             // }
     ARRAY_BEGIN,            // [
     ARRAY_END,              // ]
     STRING,
     NUMBER,
     BOOL_TRUE,              // true
     BOOL_FALSE,             // false
     NULL_TOKEN,             // null
     LIST_SEPARATOR,         // ,
     OBJECT_PAIR_SEPARATOR,  // :
     END_OF_INPUT,
     INVALID_TOKEN,
    };
    Token(Type t, const wchar_t* b, int len)
      : type(t), begin(b), length(len) {}

    Type type;

    // A pointer into JSONReader::json_pos_ that's the beginning of this token.
    const wchar_t* begin;

    // End should be one char past the end of the token.
    int length;

    // Get the character that's one past the end of this token.
    wchar_t NextChar() {
      return *(begin + length);
    }
  };

  // Error messages that can be returned.
  static const char* kBadRootElementType;
  static const char* kInvalidEscape;
  static const char* kSyntaxError;
  static const char* kTrailingComma;
  static const char* kTooMuchNesting;
  static const char* kUnexpectedDataAfterRoot;
  static const char* kUnsupportedEncoding;
  static const char* kUnquotedDictionaryKey;

  JSONReader();

  // Reads and parses |json|, returning a Value. The caller owns the returned
  // instance. If |json| is not a properly formed JSON string, returns NULL.
  // If |allow_trailing_comma| is true, we will ignore trailing commas in
  // objects and arrays even though this goes against the RFC.
  static Value* Read(const std::string& json, bool allow_trailing_comma);

  // Reads and parses |json| like Read(). |error_message_out| is optional. If
  // specified and NULL is returned, |error_message_out| will be populated with
  // a string describing the error. Otherwise, |error_message_out| is
  // unmodified.
  static Value* ReadAndReturnError(const std::string& json,
                                   bool allow_trailing_comma,
                                   std::string* error_message_out);

  // Returns the error message if the last call to JsonToValue() failed. If the
  // last call did not fail, returns a valid empty string.
  std::string error_message() { return error_message_; }

  // Reads and parses |json|, returning a Value. The caller owns the returned
  // instance. If |json| is not a properly formed JSON string, returns NULL and
  // a detailed error can be retrieved from |error_message()|.
  // If |check_root| is true, we require that the root object be an object or
  // array. Otherwise, it can be any valid JSON type.
  // If |allow_trailing_comma| is true, we will ignore trailing commas in
  // objects and arrays even though this goes against the RFC.
  Value* JsonToValue(const std::string& json, bool check_root,
                     bool allow_trailing_comma);

 private:
  static std::string FormatErrorMessage(int line, int column,
                                        const char* description);

  DISALLOW_EVIL_CONSTRUCTORS(JSONReader);

  FRIEND_TEST(JSONReaderTest, Reading);
  FRIEND_TEST(JSONReaderTest, ErrorMessages);

  // Recursively build Value.  Returns NULL if we don't have a valid JSON
  // string.  If |is_root| is true, we verify that the root element is either
  // an object or an array.
  Value* BuildValue(bool is_root);

  // Parses a sequence of characters into a Token::NUMBER. If the sequence of
  // characters is not a valid number, returns a Token::INVALID_TOKEN. Note
  // that DecodeNumber is used to actually convert from a string to an
  // int/double.
  Token ParseNumberToken();

  // Try and convert the substring that token holds into an int or a double. If
  // we can (ie., no overflow), return the value, else return NULL.
  Value* DecodeNumber(const Token& token);

  // Parses a sequence of characters into a Token::STRING. If the sequence of
  // characters is not a valid string, returns a Token::INVALID_TOKEN. Note
  // that DecodeString is used to actually decode the escaped string into an
  // actual wstring.
  Token ParseStringToken();

  // Convert the substring into a value string.  This should always succeed
  // (otherwise ParseStringToken would have failed).
  Value* DecodeString(const Token& token);

  // Grabs the next token in the JSON stream.  This does not increment the
  // stream so it can be used to look ahead at the next token.
  Token ParseToken();

  // Increments |json_pos_| past leading whitespace and comments.
  void EatWhitespaceAndComments();

  // If |json_pos_| is at the start of a comment, eat it, otherwise, returns
  // false.
  bool EatComment();

  // Checks if |json_pos_| matches str.
  bool NextStringMatch(const std::wstring& str);

  // Creates the error message that will be returned to the caller. The current
  // line and column are determined and added into the final message.
  void SetErrorMessage(const char* description, const wchar_t* error_pos);

  // Pointer to the starting position in the input string.
  const wchar_t* start_pos_;

  // Pointer to the current position in the input string.
  const wchar_t* json_pos_;

  // Used to keep track of how many nested lists/dicts there are.
  int stack_depth_;

  // A parser flag that allows trailing commas in objects and arrays.
  bool allow_trailing_comma_;

  // Contains the error message for the last call to JsonToValue(), if any.
  std::string error_message_;
};

#endif  // BASE_JSON_READER_H_