// Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "extensions/browser/api/web_request/form_data_parser.h" #include #include "base/lazy_instance.h" #include "base/logging.h" #include "base/macros.h" #include "base/strings/string_util.h" #include "base/values.h" #include "net/base/escape.h" #include "net/url_request/url_request.h" #include "third_party/re2/re2/re2.h" using base::DictionaryValue; using base::ListValue; using base::StringPiece; using re2::RE2; namespace extensions { namespace { const char kContentDisposition[] = "content-disposition:"; const size_t kContentDispositionLength = arraysize(kContentDisposition) - 1; // kCharacterPattern is an allowed character in a URL encoding. Definition is // from RFC 1738, end of section 2.2. const char kCharacterPattern[] = "(?:[a-zA-Z0-9$_.+!*'(),]|-|(?:%[a-fA-F0-9]{2}))"; const char kEscapeClosingQuote[] = "\\\\E"; // A wrapper struct for static RE2 objects to be held as LazyInstance. 
struct Patterns { Patterns(); ~Patterns(); const RE2 transfer_padding_pattern; const RE2 crlf_pattern; const RE2 closing_pattern; const RE2 epilogue_pattern; const RE2 crlf_free_pattern; const RE2 preamble_pattern; const RE2 header_pattern; const RE2 content_disposition_pattern; const RE2 name_pattern; const RE2 value_pattern; const RE2 unquote_pattern; const RE2 url_encoded_pattern; }; Patterns::Patterns() : transfer_padding_pattern("[ \\t]*\\r\\n"), crlf_pattern("\\r\\n"), closing_pattern("--[ \\t]*"), epilogue_pattern("|\\r\\n(?s:.)*"), crlf_free_pattern("(?:[^\\r]|\\r+[^\\r\\n])*"), preamble_pattern(".+?"), header_pattern("[!-9;-~]+:(.|\\r\\n[\\t ])*\\r\\n"), content_disposition_pattern(std::string("(?i:") + kContentDisposition + ")"), name_pattern("\\bname=\"([^\"]*)\""), value_pattern("\\bfilename=\"([^\"]*)\""), unquote_pattern(kEscapeClosingQuote), url_encoded_pattern(std::string("(") + kCharacterPattern + "*)=(" + kCharacterPattern + "*)") { } Patterns::~Patterns() {} base::LazyInstance::Leaky g_patterns = LAZY_INSTANCE_INITIALIZER; } // namespace // Parses URLencoded forms, see // http://www.w3.org/TR/REC-html40-971218/interact/forms.html#h-17.13.4.1 . class FormDataParserUrlEncoded : public FormDataParser { public: FormDataParserUrlEncoded(); ~FormDataParserUrlEncoded() override; // Implementation of FormDataParser. bool AllDataReadOK() override; bool GetNextNameValue(Result* result) override; bool SetSource(base::StringPiece source) override; private: // Returns the pattern to match a single name-value pair. This could be even // static, but then we would have to spend more code on initializing the // cached pointer to g_patterns.Get(). const RE2& pattern() const { return patterns_->url_encoded_pattern; } // Auxiliary constant for using RE2. Number of arguments for parsing // name-value pairs (one for name, one for value). 
static const size_t args_size_ = 2u; static const net::UnescapeRule::Type unescape_rules_; re2::StringPiece source_; bool source_set_; bool source_malformed_; // Auxiliary store for using RE2. std::string name_; std::string value_; const RE2::Arg arg_name_; const RE2::Arg arg_value_; const RE2::Arg* args_[args_size_]; // Caching the pointer to g_patterns.Get(). const Patterns* patterns_; DISALLOW_COPY_AND_ASSIGN(FormDataParserUrlEncoded); }; // The following class, FormDataParserMultipart, parses forms encoded as // multipart, defined in RFCs 2388 (specific to forms), 2046 (multipart // encoding) and 5322 (MIME-headers). // // Implementation details // // The original grammar from RFC 2046 is this, "multipart-body" being the root // non-terminal: // // boundary := 0*69 bcharsnospace // bchars := bcharsnospace / " " // bcharsnospace := DIGIT / ALPHA / "'" / "(" / ")" / "+" / "_" / "," // / "-" / "." / "/" / ":" / "=" / "?" // dash-boundary := "--" boundary // multipart-body := [preamble CRLF] // dash-boundary transport-padding CRLF // body-part *encapsulation // close-delimiter transport-padding // [CRLF epilogue] // transport-padding := *LWSP-char // encapsulation := delimiter transport-padding CRLF body-part // delimiter := CRLF dash-boundary // close-delimiter := delimiter "--" // preamble := discard-text // epilogue := discard-text // discard-text := *(*text CRLF) *text // body-part := MIME-part-headers [CRLF *OCTET] // OCTET := // // Uppercase non-terminals are defined in RFC 5234, Appendix B.1; i.e. CRLF, // DIGIT, and ALPHA stand for "\r\n", '0'-'9' and the set of letters of the // English alphabet, respectively. // The non-terminal "text" is presumably just any text, excluding line breaks. // The non-terminal "LWSP-char" is not directly defined in the original grammar // but it means "linear whitespace", which is a space or a horizontal tab. 
// The non-terminal "MIME-part-headers" is not discussed in RFC 2046, so we use
// the syntax for "optional fields" from Section 3.6.8 of RFC 5322:
//
// MIME-part-headers := field-name ":" unstructured CRLF
// field-name := 1*ftext
// ftext := %d33-57 /          ; Printable US-ASCII
//          %d59-126           ; characters not including ":".
// Based on Section 2.2.1 of RFC 5322, "unstructured" matches any string which
// does not contain a CRLF sub-string, except for substrings "CRLF<space>" and
// "CRLF<horizontal tab>", which serve for "folding".
//
// The FormDataParserMultipart class reads the input source and tries to parse
// it according to the grammar above, rooted at the "multipart-body"
// non-terminal. This happens in stages:
//
// 1. The optional preamble and the initial dash-boundary with transport padding
// and a CRLF are read and ignored.
//
// 2. Repeatedly each body part is read. The body parts can either serve to
//    upload a file, or just a string of bytes.
// 2.a. The headers of that part are searched for the "content-disposition"
//      header, which contains the name of the value represented by that body
//      part. If the body-part is for file upload, that header also contains a
//      filename.
// 2.b. The "*OCTET" part of the body part is then read and passed as the value
//      of the name-value pair for body parts representing a string of bytes.
//      For body parts for uploading a file the "*OCTET" part is just ignored
//      and the filename is used for value instead.
//
// 3. The final close-delimiter and epilogue are read and ignored.
//
// IMPORTANT NOTE
// This parser supports sources split into multiple chunks. Therefore SetSource
// can be called multiple times if the source is spread over several chunks.
// However, the split may only occur inside a body part, right after the
// trailing CRLF of headers.
class FormDataParserMultipart : public FormDataParser { public: explicit FormDataParserMultipart(const std::string& boundary_separator); ~FormDataParserMultipart() override; // Implementation of FormDataParser. bool AllDataReadOK() override; bool GetNextNameValue(Result* result) override; bool SetSource(base::StringPiece source) override; private: enum State { STATE_INIT, // No input read yet. STATE_READY, // Ready to call GetNextNameValue. STATE_FINISHED, // Read the input until the end. STATE_SUSPEND, // Waiting until a new |source_| is set. STATE_ERROR }; // Produces a regexp to match the string "--" + |literal|. The idea is to // represent "--" + |literal| as a "quoted pattern", a verbatim copy enclosed // in "\\Q" and "\\E". The only catch is to watch out for occurences of "\\E" // inside |literal|. Those must be excluded from the quote and the backslash // doubly escaped. For example, for literal == "abc\\Edef" the result is // "\\Q--abc\\E\\\\E\\Qdef\\E". static std::string CreateBoundaryPatternFromLiteral( const std::string& literal); // Tests whether |input| has a prefix matching |pattern|. static bool StartsWithPattern(const re2::StringPiece& input, const RE2& pattern); // If |source_| starts with a header, seeks |source_| beyond the header. If // the header is Content-Disposition, extracts |name| from "name=" and // possibly |value| from "filename=" fields of that header. Only if the // "name" or "filename" fields are found, then |name| or |value| are touched. // Returns true iff |source_| is seeked forward. Sets |value_assigned| // to true iff |value| has been assigned to. bool TryReadHeader(base::StringPiece* name, base::StringPiece* value, bool* value_assigned); // Helper to GetNextNameValue. Expects that the input starts with a data // portion of a body part. An attempt is made to read the input until the end // of that body part. If |data| is not NULL, it is set to contain the data // portion. Returns true iff the reading was successful. 
bool FinishReadingPart(base::StringPiece* data); // These methods could be even static, but then we would have to spend more // code on initializing the cached pointer to g_patterns.Get(). const RE2& transfer_padding_pattern() const { return patterns_->transfer_padding_pattern; } const RE2& crlf_pattern() const { return patterns_->crlf_pattern; } const RE2& closing_pattern() const { return patterns_->closing_pattern; } const RE2& epilogue_pattern() const { return patterns_->epilogue_pattern; } const RE2& crlf_free_pattern() const { return patterns_->crlf_free_pattern; } const RE2& preamble_pattern() const { return patterns_->preamble_pattern; } const RE2& header_pattern() const { return patterns_->header_pattern; } const RE2& content_disposition_pattern() const { return patterns_->content_disposition_pattern; } const RE2& name_pattern() const { return patterns_->name_pattern; } const RE2& value_pattern() const { return patterns_->value_pattern; } // However, this is used in a static method so it needs to be static. static const RE2& unquote_pattern() { return g_patterns.Get().unquote_pattern; // No caching g_patterns here. } const RE2 dash_boundary_pattern_; // Because of initialisation dependency, |state_| needs to be declared after // |dash_boundary_pattern_|. State state_; // The parsed message can be split into multiple sources which we read // sequentially. re2::StringPiece source_; // Caching the pointer to g_patterns.Get(). const Patterns* patterns_; DISALLOW_COPY_AND_ASSIGN(FormDataParserMultipart); }; FormDataParser::Result::Result() {} FormDataParser::Result::~Result() {} FormDataParser::~FormDataParser() {} // static scoped_ptr FormDataParser::Create( const net::URLRequest& request) { std::string value; const bool found = request.extra_request_headers().GetHeader( net::HttpRequestHeaders::kContentType, &value); return CreateFromContentTypeHeader(found ? 
&value : NULL); } // static scoped_ptr FormDataParser::CreateFromContentTypeHeader( const std::string* content_type_header) { enum ParserChoice {URL_ENCODED, MULTIPART, ERROR_CHOICE}; ParserChoice choice = ERROR_CHOICE; std::string boundary; if (content_type_header == NULL) { choice = URL_ENCODED; } else { const std::string content_type( content_type_header->substr(0, content_type_header->find(';'))); if (base::strcasecmp( content_type.c_str(), "application/x-www-form-urlencoded") == 0) { choice = URL_ENCODED; } else if (base::strcasecmp( content_type.c_str(), "multipart/form-data") == 0) { static const char kBoundaryString[] = "boundary="; size_t offset = content_type_header->find(kBoundaryString); if (offset == std::string::npos) { // Malformed header. return scoped_ptr(); } offset += sizeof(kBoundaryString) - 1; boundary = content_type_header->substr( offset, content_type_header->find(';', offset)); if (!boundary.empty()) choice = MULTIPART; } } // Other cases are unparseable, including when |content_type| is "text/plain". switch (choice) { case URL_ENCODED: return scoped_ptr(new FormDataParserUrlEncoded()); case MULTIPART: return scoped_ptr(new FormDataParserMultipart(boundary)); case ERROR_CHOICE: return scoped_ptr(); } NOTREACHED(); // Some compilers do not believe this is unreachable. return scoped_ptr(); } FormDataParser::FormDataParser() {} const net::UnescapeRule::Type FormDataParserUrlEncoded::unescape_rules_ = net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS | net::UnescapeRule::SPACES | net::UnescapeRule::REPLACE_PLUS_WITH_SPACE; FormDataParserUrlEncoded::FormDataParserUrlEncoded() : source_(NULL), source_set_(false), source_malformed_(false), arg_name_(&name_), arg_value_(&value_), patterns_(g_patterns.Pointer()) { args_[0] = &arg_name_; args_[1] = &arg_value_; } FormDataParserUrlEncoded::~FormDataParserUrlEncoded() {} bool FormDataParserUrlEncoded::AllDataReadOK() { // All OK means we read the whole source. 
return source_set_ && source_.empty() && !source_malformed_; } bool FormDataParserUrlEncoded::GetNextNameValue(Result* result) { if (!source_set_ || source_malformed_) return false; bool success = RE2::ConsumeN(&source_, pattern(), args_, args_size_); if (success) { result->set_name(net::UnescapeURLComponent(name_, unescape_rules_)); result->set_value(net::UnescapeURLComponent(value_, unescape_rules_)); } if (source_.length() > 0) { if (source_[0] == '&') source_.remove_prefix(1); // Remove the leading '&'. else source_malformed_ = true; // '&' missing between two name-value pairs. } return success && !source_malformed_; } bool FormDataParserUrlEncoded::SetSource(base::StringPiece source) { if (source_set_) return false; // We do not allow multiple sources for this parser. source_.set(source.data(), source.size()); source_set_ = true; source_malformed_ = false; return true; } // static std::string FormDataParserMultipart::CreateBoundaryPatternFromLiteral( const std::string& literal) { static const char quote[] = "\\Q"; static const char unquote[] = "\\E"; // The result always starts with opening the qoute and then "--". std::string result("\\Q--"); // This StringPiece is used below to record the next occurrence of "\\E" in // |literal|. re2::StringPiece seek_unquote(literal); const char* copy_start = literal.data(); size_t copy_length = literal.size(); // Find all "\\E" in |literal| and exclude them from the \Q...\E quote. while (RE2::FindAndConsume(&seek_unquote, unquote_pattern())) { copy_length = seek_unquote.data() - copy_start; result.append(copy_start, copy_length); result.append(kEscapeClosingQuote); result.append(quote); copy_start = seek_unquote.data(); } // Finish the last \Q...\E quote. 
copy_length = (literal.data() + literal.size()) - copy_start; result.append(copy_start, copy_length); result.append(unquote); return result; } // static bool FormDataParserMultipart::StartsWithPattern(const re2::StringPiece& input, const RE2& pattern) { return pattern.Match(input, 0, input.size(), RE2::ANCHOR_START, NULL, 0); } FormDataParserMultipart::FormDataParserMultipart( const std::string& boundary_separator) : dash_boundary_pattern_( CreateBoundaryPatternFromLiteral(boundary_separator)), state_(dash_boundary_pattern_.ok() ? STATE_INIT : STATE_ERROR), patterns_(g_patterns.Pointer()) {} FormDataParserMultipart::~FormDataParserMultipart() {} bool FormDataParserMultipart::AllDataReadOK() { return state_ == STATE_FINISHED; } bool FormDataParserMultipart::FinishReadingPart(base::StringPiece* data) { const char* data_start = source_.data(); while (!StartsWithPattern(source_, dash_boundary_pattern_)) { if (!RE2::Consume(&source_, crlf_free_pattern()) || !RE2::Consume(&source_, crlf_pattern())) { state_ = STATE_ERROR; return false; } } if (data != NULL) { if (source_.data() == data_start) { // No data in this body part. state_ = STATE_ERROR; return false; } // Subtract 2 for the trailing "\r\n". data->set(data_start, source_.data() - data_start - 2); } // Finally, read the dash-boundary and either skip to the next body part, or // finish reading the source. CHECK(RE2::Consume(&source_, dash_boundary_pattern_)); if (StartsWithPattern(source_, closing_pattern())) { CHECK(RE2::Consume(&source_, closing_pattern())); if (RE2::Consume(&source_, epilogue_pattern())) state_ = STATE_FINISHED; else state_ = STATE_ERROR; } else { // Next body part ahead. if (!RE2::Consume(&source_, transfer_padding_pattern())) state_ = STATE_ERROR; } return state_ != STATE_ERROR; } bool FormDataParserMultipart::GetNextNameValue(Result* result) { if (source_.empty() || state_ != STATE_READY) return false; // 1. Read body-part headers. 
base::StringPiece name; base::StringPiece value; bool value_assigned = false; bool value_assigned_temp; while (TryReadHeader(&name, &value, &value_assigned_temp)) value_assigned |= value_assigned_temp; if (name.empty() || state_ == STATE_ERROR) { state_ = STATE_ERROR; return false; } // 2. Read the trailing CRLF after headers. if (!RE2::Consume(&source_, crlf_pattern())) { state_ = STATE_ERROR; return false; } // 3. Read the data of this body part, i.e., everything until the first // dash-boundary. bool return_value; if (value_assigned && source_.empty()) { // Wait for a new source? return_value = true; state_ = STATE_SUSPEND; } else { return_value = FinishReadingPart(value_assigned ? NULL : &value); } std::string unescaped_name = net::UnescapeURLComponent( name.as_string(), net::UnescapeRule::URL_SPECIAL_CHARS | net::UnescapeRule::CONTROL_CHARS); result->set_name(unescaped_name); result->set_value(value); return return_value; } bool FormDataParserMultipart::SetSource(base::StringPiece source) { if (source.data() == NULL || !source_.empty()) return false; source_.set(source.data(), source.size()); switch (state_) { case STATE_INIT: // Seek behind the preamble. while (!StartsWithPattern(source_, dash_boundary_pattern_)) { if (!RE2::Consume(&source_, preamble_pattern())) { state_ = STATE_ERROR; break; } } // Read dash-boundary, transfer padding, and CRLF. if (state_ != STATE_ERROR) { if (!RE2::Consume(&source_, dash_boundary_pattern_) || !RE2::Consume(&source_, transfer_padding_pattern())) state_ = STATE_ERROR; else state_ = STATE_READY; } break; case STATE_READY: // Nothing to do. break; case STATE_SUSPEND: state_ = FinishReadingPart(NULL) ? 
STATE_READY : STATE_ERROR; break; default: state_ = STATE_ERROR; } return state_ != STATE_ERROR; } bool FormDataParserMultipart::TryReadHeader(base::StringPiece* name, base::StringPiece* value, bool* value_assigned) { *value_assigned = false; const char* header_start = source_.data(); if (!RE2::Consume(&source_, header_pattern())) return false; // (*) After this point we must return true, because we consumed one header. // Subtract 2 for the trailing "\r\n". re2::StringPiece header(header_start, source_.data() - header_start - 2); if (!StartsWithPattern(header, content_disposition_pattern())) return true; // Skip headers that don't describe the content-disposition. re2::StringPiece groups[2]; if (!name_pattern().Match(header, kContentDispositionLength, header.size(), RE2::UNANCHORED, groups, 2)) { state_ = STATE_ERROR; return true; // See (*) for why true. } name->set(groups[1].data(), groups[1].size()); if (value_pattern().Match(header, kContentDispositionLength, header.size(), RE2::UNANCHORED, groups, 2)) { value->set(groups[1].data(), groups[1].size()); *value_assigned = true; } return true; } } // namespace extensions