diff options
Diffstat (limited to 'net/tools/flip_server/balsa_frame.cc')
-rw-r--r-- | net/tools/flip_server/balsa_frame.cc | 1571 |
1 files changed, 1571 insertions, 0 deletions
diff --git a/net/tools/flip_server/balsa_frame.cc b/net/tools/flip_server/balsa_frame.cc new file mode 100644 index 0000000..39695cd --- /dev/null +++ b/net/tools/flip_server/balsa_frame.cc @@ -0,0 +1,1571 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "net/tools/flip_server/balsa_frame.h" + +#include <assert.h> +#include <emmintrin.h> +#include <strings.h> + +#include <limits> +#include <iostream> +#include <string> +#include <utility> +#include <vector> + +#include "base/logging.h" +#include "base/port.h" +#include "net/tools/flip_server/balsa_enums.h" +#include "net/tools/flip_server/balsa_headers.h" +#include "net/tools/flip_server/balsa_visitor_interface.h" +#include "net/tools/flip_server/buffer_interface.h" +#include "net/tools/flip_server/simple_buffer.h" +#ifdef CHROMIUM +#else +#include "strings/split.h" +#include "strings/stringpiece.h" // for StringPiece +#include "strings/stringpiece_utils.h" +#endif + +namespace gfe2 { + +// Constants holding some header names for headers which can affect the way the +// HTTP message is framed, and so must be processed specially: +static const char kContentLength[] = "content-length"; +static const size_t kContentLengthSize = sizeof(kContentLength) - 1; +static const char kTransferEncoding[] = "transfer-encoding"; +static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1; + +void BalsaFrame::Reset() { + last_char_was_slash_r_ = false; + saw_non_newline_char_ = false; + start_was_space_ = true; + chunk_length_character_extracted_ = false; + // is_request_ = true; // not reset between messages. + // request_was_head_ = false; // not reset between messages. + // max_header_length_ = 4096; // not reset between messages. + // max_request_uri_length_ = 2048; // not reset between messages. + // visitor_ = &do_nothing_visitor_; // not reset between messages. + chunk_length_remaining_ = 0; + content_length_remaining_ = 0; + last_slash_n_loc_ = NULL; + last_recorded_slash_n_loc_ = NULL; + last_slash_n_idx_ = 0; + term_chars_ = 0; + parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE; + last_error_ = BalsaFrameEnums::NO_ERROR; + lines_.clear(); + if (headers_ != NULL) { + headers_->Clear(); + } +} + +const char* BalsaFrameEnums::ParseStateToString( + BalsaFrameEnums::ParseState error_code) { + switch (error_code) { + case ERROR: + return "ERROR"; + case READING_HEADER_AND_FIRSTLINE: + return "READING_HEADER_AND_FIRSTLINE"; + case READING_CHUNK_LENGTH: + return "READING_CHUNK_LENGTH"; + case READING_CHUNK_EXTENSION: + return "READING_CHUNK_EXTENSION"; + case READING_CHUNK_DATA: + return "READING_CHUNK_DATA"; + case READING_CHUNK_TERM: + return "READING_CHUNK_TERM"; + case READING_LAST_CHUNK_TERM: + return "READING_LAST_CHUNK_TERM"; + case READING_TRAILER: + return "READING_TRAILER"; + case READING_UNTIL_CLOSE: + return "READING_UNTIL_CLOSE"; + case READING_CONTENT: + return "READING_CONTENT"; + case MESSAGE_FULLY_READ: + return "MESSAGE_FULLY_READ"; + case NUM_STATES: + return "UNKNOWN_STATE"; + } + return "UNKNOWN_STATE"; +} + +const char* BalsaFrameEnums::ErrorCodeToString( + BalsaFrameEnums::ErrorCode error_code) { + switch (error_code) { + case NO_ERROR: + return "NO_ERROR"; + case NO_STATUS_LINE_IN_RESPONSE: + return "NO_STATUS_LINE_IN_RESPONSE"; + case NO_REQUEST_LINE_IN_REQUEST: + return "NO_REQUEST_LINE_IN_REQUEST"; + case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION: + return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION"; + case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD: + return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD"; + case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE: + return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE"; + case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI: + return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI"; + case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE: + return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE"; + case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION: + return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION"; + case FAILED_CONVERTING_STATUS_CODE_TO_INT: + return "FAILED_CONVERTING_STATUS_CODE_TO_INT"; + case REQUEST_URI_TOO_LONG: + return "REQUEST_URI_TOO_LONG"; + case HEADERS_TOO_LONG: + return "HEADERS_TOO_LONG"; + case UNPARSABLE_CONTENT_LENGTH: + return "UNPARSABLE_CONTENT_LENGTH"; + case MAYBE_BODY_BUT_NO_CONTENT_LENGTH: + return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH"; + case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH: + return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH"; + case HEADER_MISSING_COLON: + return "HEADER_MISSING_COLON"; + case INVALID_CHUNK_LENGTH: + return "INVALID_CHUNK_LENGTH"; + case CHUNK_LENGTH_OVERFLOW: + return "CHUNK_LENGTH_OVERFLOW"; + case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO: + return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO"; + case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT: + return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT"; + case MULTIPLE_CONTENT_LENGTH_KEYS: + return "MULTIPLE_CONTENT_LENGTH_KEYS"; + case MULTIPLE_TRANSFER_ENCODING_KEYS: + return "MULTIPLE_TRANSFER_ENCODING_KEYS"; + case UNKNOWN_TRANSFER_ENCODING: + return "UNKNOWN_TRANSFER_ENCODING"; + case INVALID_HEADER_FORMAT: + return "INVALID_HEADER_FORMAT"; + case INTERNAL_LOGIC_ERROR: + return "INTERNAL_LOGIC_ERROR"; + case NUM_ERROR_CODES: + return "UNKNOWN_ERROR"; + } + return "UNKNOWN_ERROR"; +} + +// Summary: +// Parses the first line of either a request or response. +// Note that in the case of a detected warning, error_code will be set +// but the function will not return false. +// Exactly zero or one warning or error (but not both) may be detected +// by this function. +// Note that this function will not write the data of the first-line +// into the header's buffer (that should already have been done elsewhere). +// +// Pre-conditions: +// begin != end +// *begin should be a character which is > ' '. This implies that there +// is at least one non-whitespace characters between [begin, end). +// headers is a valid pointer to a BalsaHeaders class. +// error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value. +// Entire first line must exist between [begin, end) +// Exactly zero or one newlines -may- exist between [begin, end) +// [begin, end) should exist in the header's buffer. +// +// Side-effects: +// headers will be modified +// error_code may be modified if either a warning or error is detected +// +// Returns: +// True if no error (as opposed to warning) is detected. +// False if an error (as opposed to warning) is detected. + +// +// If there is indeed non-whitespace in the line, then the following +// will take care of this for you: +// while (*begin <= ' ') ++begin; +// ProcessFirstLine(begin, end, is_request, &headers, &error_code); +// +bool ParseHTTPFirstLine(const char* begin, + const char* end, + bool is_request, + size_t max_request_uri_length, + BalsaHeaders* headers, + BalsaFrameEnums::ErrorCode* error_code) { + const char* current = begin; + // HTTP firstlines all have the following structure: + // LWS NONWS LWS NONWS LWS NONWS NOTCRLF CRLF + // [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n" + // ws1 nws1 ws2 nws2 ws3 nws3 ws4 + // | [-------) [-------) [----------------) + // REQ: method request_uri version + // RESP: version statuscode reason + // + // The first NONWS->LWS component we'll call firstline_a. + // The second firstline_b, and the third firstline_c. + // + // firstline_a goes from nws1 to (but not including) ws2 + // firstline_b goes from nws2 to (but not including) ws3 + // firstline_c goes from nws3 to (but not including) ws4 + // + // In the code: + // ws1 == whitespace_1_idx_ + // nws1 == non_whitespace_1_idx_ + // ws2 == whitespace_2_idx_ + // nws2 == non_whitespace_2_idx_ + // ws3 == whitespace_3_idx_ + // nws3 == non_whitespace_3_idx_ + // ws4 == whitespace_4_idx_ + + // Kill all whitespace (including '\r\n') at the end of the line. + --end; + if (*end != '\n') { + *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; + LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" + << headers->OriginalHeadersForDebugging(); + return false; + } + while (begin < end && *end <= ' ') { + --end; + } + DCHECK(*end != '\n'); + if (*end == '\n') { + *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; + LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" + << headers->OriginalHeadersForDebugging(); + return false; + } + ++end; + + // The two following statements should not be possible. + if (end == begin) { + *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR; + LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n" + << headers->OriginalHeadersForDebugging(); + return false; + } + + // whitespace_1_idx_ + headers->whitespace_1_idx_ = current - begin; + // This loop is commented out as it is never used in current code. This is + // true only because we don't begin parsing the headers at all until we've + // encountered a non whitespace character at the beginning of the stream, at + // which point we begin our demarcation of header-start. If we did -not- do + // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop + // would be necessary for the proper functioning of this parsing. + // This is left here as this function may (in the future) be refactored out + // of the BalsaFrame class so that it may be shared between code in + // BalsaFrame and BalsaHeaders (where it would be used in some variant of the + // set_first_line() function (at which point it would be necessary). +#if 0 + while (*current <= ' ') { + ++current; + } +#endif + // non_whitespace_1_idx_ + headers->non_whitespace_1_idx_ = current - begin; + do { + // The first time through, we're guaranteed that the current character + // won't be a whitespace (else the loop above wouldn't have terminated). + // That implies that we're guaranteed to get at least one non-whitespace + // character if we get into this loop at all. + ++current; + if (current == end) { + headers->whitespace_2_idx_ = current - begin; + headers->non_whitespace_2_idx_ = current - begin; + headers->whitespace_3_idx_ = current - begin; + headers->non_whitespace_3_idx_ = current - begin; + headers->whitespace_4_idx_ = current - begin; + // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD for request + // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response + *error_code = + static_cast<BalsaFrameEnums::ErrorCode>( + BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION + + is_request); + if (!is_request) { // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION + return false; + } + goto output_exhausted; + } + } while (*current > ' '); + // whitespace_2_idx_ + headers->whitespace_2_idx_ = current - begin; + do { + ++current; + // Note that due to the loop which consumes all of the whitespace + // at the end of the line, current can never == end while in this function. + } while (*current <= ' '); + // non_whitespace_2_idx_ + headers->non_whitespace_2_idx_ = current - begin; + do { + ++current; + if (current == end) { + headers->whitespace_3_idx_ = current - begin; + headers->non_whitespace_3_idx_ = current - begin; + headers->whitespace_4_idx_ = current - begin; + // FAILED_TO_FIND_START_OF_REQUEST_REQUEST_URI for request + // FAILED_TO_FIND_START_OF_RESPONSE_STATUSCODE for response + *error_code = + static_cast<BalsaFrameEnums::ErrorCode>( + BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE + + is_request); + goto output_exhausted; + } + } while (*current > ' '); + // whitespace_3_idx_ + headers->whitespace_3_idx_ = current - begin; + do { + ++current; + // Note that due to the loop which consumes all of the whitespace + // at the end of the line, current can never == end while in this function. + } while (*current <= ' '); + // non_whitespace_3_idx_ + headers->non_whitespace_3_idx_ = current - begin; + headers->whitespace_4_idx_ = end - begin; + + output_exhausted: + // Note that we don't fail the parse immediately when parsing of the + // firstline fails. Depending on the protocol type, we may want to accept + // a firstline with only one or two elements, e.g., for HTTP/0.9: + // GET\r\n + // or + // GET /\r\n + // should be parsed without issue (though the visitor should know that + // parsing the entire line was not exactly as it should be). + // + // Eventually, these errors may be removed alltogether, as the visitor can + // detect them on its own by examining the size of the various fields. + // headers->set_first_line(non_whitespace_1_idx_, current); + + if (is_request) { + if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) > + max_request_uri_length) { + // For requests, we need at least the method. We could assume that a + // blank URI means "/". If version isn't stated, it should be assumed + // to be HTTP/0.9 by the visitor. + *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG; + return false; + } + } else { + headers->parsed_response_code_ = 0; + { + const char* parsed_response_code_current = + begin + headers->non_whitespace_2_idx_; + const char* parsed_response_code_end = begin + headers->whitespace_3_idx_; + const size_t kMaxDiv10 = numeric_limits<size_t>::max() / 10; + + // Convert a string of [0-9]* into an int. + // Note that this allows for the conversion of response codes which + // are outside the bounds of normal HTTP response codes (no checking + // is done to ensure that these are valid-- they're merely parsed)! + while (parsed_response_code_current < parsed_response_code_end) { + if (*parsed_response_code_current < '0' || + *parsed_response_code_current > '9') { + *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT; + return false; + } + size_t status_code_x_10 = headers->parsed_response_code_ * 10; + uint8 c = *parsed_response_code_current - '0'; + if ((headers->parsed_response_code_ > kMaxDiv10) || + (numeric_limits<size_t>::max() - status_code_x_10) < c) { + // overflow. + *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT; + return false; + } + headers->parsed_response_code_ = status_code_x_10 + c; + ++parsed_response_code_current; + } + } + } + return true; +} + +// begin - beginning of the firstline +// end - end of the firstline +// +// A precondition for this function is that there is non-whitespace between +// [begin, end). If this precondition is not met, the function will not perform +// as expected (and bad things may happen, and it will eat your first, second, +// and third unborn children!). +// +// Another precondition for this function is that [begin, end) includes +// at most one newline, which must be at the end of the line. +void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) { + BalsaFrameEnums::ErrorCode previous_error = last_error_; + if (!ParseHTTPFirstLine(begin, + end, + is_request_, + max_request_uri_length_, + headers_, + &last_error_)) { + parse_state_ = BalsaFrameEnums::ERROR; + visitor_->HandleHeaderError(this); + return; + } + if (previous_error != last_error_) { + visitor_->HandleHeaderWarning(this); + } + + if (is_request_) { + int version_length = + headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_; + visitor_->ProcessRequestFirstLine( + begin + headers_->non_whitespace_1_idx_, + headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, + begin + headers_->non_whitespace_1_idx_, + headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, + begin + headers_->non_whitespace_2_idx_, + headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, + begin + headers_->non_whitespace_3_idx_, + version_length); + if (version_length == 0) + parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; + } else { + visitor_->ProcessResponseFirstLine( + begin + headers_->non_whitespace_1_idx_, + headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_, + begin + headers_->non_whitespace_1_idx_, + headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_, + begin + headers_->non_whitespace_2_idx_, + headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_, + begin + headers_->non_whitespace_3_idx_, + headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_); + } +} + +// 'stream_begin' points to the first character of the headers buffer. +// 'line_begin' points to the first character of the line. +// 'current' points to a char which is ':'. +// 'line_end' points to the position of '\n' + 1. +// 'line_begin' points to the position of first character of line. +void BalsaFrame::CleanUpKeyValueWhitespace( + const char* stream_begin, + const char* line_begin, + const char* current, + const char* line_end, + HeaderLineDescription* current_header_line) { + const char* colon_loc = current; + DCHECK_LT(colon_loc, line_end); + DCHECK_EQ(':', *colon_loc); + DCHECK_EQ(':', *current); + DCHECK_GE(' ', *line_end) << "\"" << string(line_begin, line_end) << "\""; + + // TODO(fenix): Investigate whether or not the bounds tests in the + // while loops here are redundant, and if so, remove them. + --current; + while (current > line_begin && *current <= ' ') --current; + current += (current != colon_loc); + current_header_line->key_end_idx = current - stream_begin; + + current = colon_loc; + DCHECK_EQ(':', *current); + ++current; + while (current < line_end && *current <= ' ') ++current; + current_header_line->value_begin_idx = current - stream_begin; + + DCHECK_GE(current_header_line->key_end_idx, + current_header_line->first_char_idx); + DCHECK_GE(current_header_line->value_begin_idx, + current_header_line->key_end_idx); + DCHECK_GE(current_header_line->last_char_idx, + current_header_line->value_begin_idx); +} + +inline void BalsaFrame::FindColonsAndParseIntoKeyValue() { + DCHECK(!lines_.empty()); + const char* stream_begin = headers_->OriginalHeaderStreamBegin(); + // The last line is always just a newline (and is uninteresting). + const Lines::size_type lines_size_m1 = lines_.size() - 1; +#if __SSE2__ + const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':', + ':', ':', ':', ':', ':', ':', ':', ':'}; + const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16; +#endif // __SSE2__ + const char* current = stream_begin + lines_[1].first; + // This code is a bit more subtle than it may appear at first glance. + // This code looks for a colon in the current line... but it also looks + // beyond the current line. If there is no colon in the current line, then + // for each subsequent line (until the colon which -has- been found is + // associated with a line), no searching for a colon will be performed. In + // this way, we minimize the amount of bytes we have scanned for a colon. + for (Lines::size_type i = 1; i < lines_size_m1;) { + const char* line_begin = stream_begin + lines_[i].first; + + // Here we handle possible continuations. Note that we do not replace + // the '\n' in the line before a continuation (at least, as of now), + // which implies that any code which looks for a value must deal with + // "\r\n", etc -within- the line (and not just at the end of it). + for (++i; i < lines_size_m1; ++i) { + const char c = *(stream_begin + lines_[i].first); + if (c > ' ') { + // Not a continuation, so stop. Note that if the 'original' i = 1, + // and the next line is not a continuation, we'll end up with i = 2 + // when we break. This handles the incrementing of i for the outer + // loop. + break; + } + } + const char* line_end = stream_begin + lines_[i - 1].second; + DCHECK_LT(line_begin - stream_begin, line_end - stream_begin); + + // We cleanup the whitespace at the end of the line before doing anything + // else of interest as it allows us to do nothing when irregularly formatted + // headers are parsed (e.g. those with only keys, only values, or no colon). + // + // We're guaranteed to have *line_end > ' ' while line_end >= line_begin. + --line_end; + DCHECK_EQ('\n', *line_end) << "\"" << string(line_begin, line_end) << "\""; + while (*line_end <= ' ' && line_end > line_begin) { + --line_end; + } + ++line_end; + DCHECK_GE(' ', *line_end); + DCHECK_LT(line_begin, line_end); + + // We use '0' for the block idx, because we're always writing to the first + // block from the framer (we do this because the framer requires that the + // entire header sequence be in a contiguous buffer). + headers_->header_lines_.push_back( + HeaderLineDescription(line_begin - stream_begin, + line_end - stream_begin, + line_end - stream_begin, + line_end - stream_begin, + 0)); + if (current >= line_end) { + last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; + visitor_->HandleHeaderWarning(this); + // Then the next colon will not be found within this header line-- time + // to try again with another header-line. + continue; + } else if (current < line_begin) { + // When this condition is true, the last detected colon was part of a + // previous line. We reset to the beginning of the line as we don't care + // about the presence of any colon before the beginning of the current + // line. + current = line_begin; + } +#if __SSE2__ + while (current < header_lines_end_m16) { + __m128i header_bytes = + _mm_loadu_si128(reinterpret_cast<const __m128i *>(current)); + __m128i colon_cmp = + _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons)); + int colon_msk = _mm_movemask_epi8(colon_cmp); + if (colon_msk == 0) { + current += 16; + continue; + } + current += (ffs(colon_msk) - 1); + if (current > line_end) { + break; + } + goto found_colon; + } +#endif // __SSE2__ + for (; current < line_end; ++current) { + if (*current != ':') { + continue; + } + goto found_colon; + } + // If we've gotten to here, then there was no colon + // in the line. The arguments we passed into the construction + // for the HeaderLineDescription object should be OK-- it assumes + // that the entire content is 'key' by default (which is true, as + // there was no colon, there can be no value). Note that this is a + // construct which is technically not allowed by the spec. + last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON; + visitor_->HandleHeaderWarning(this); + continue; + found_colon: + DCHECK_EQ(*current, ':'); + DCHECK_LE(current - stream_begin, line_end - stream_begin); + DCHECK_LE(stream_begin - stream_begin, current - stream_begin); + + HeaderLineDescription& current_header_line = headers_->header_lines_.back(); + current_header_line.key_end_idx = current - stream_begin; + current_header_line.value_begin_idx = current_header_line.key_end_idx; + if (current < line_end) { + ++current_header_line.key_end_idx; + + CleanUpKeyValueWhitespace(stream_begin, + line_begin, + current, + line_end, + ¤t_header_line); + } + } +} + +void BalsaFrame::ProcessContentLengthLine( + HeaderLines::size_type line_idx, + BalsaHeadersEnums::ContentLengthStatus* status, + size_t* length) { + const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; + const char* stream_begin = headers_->OriginalHeaderStreamBegin(); + const char* line_end = stream_begin + header_line.last_char_idx; + const char* value_begin = (stream_begin + header_line.value_begin_idx); + + if (value_begin >= line_end) { + // There is no non-whitespace value data. +#if DEBUGFRAMER + LOG(INFO) << "invalid content-length -- no non-whitespace value data"; +#endif + *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; + return; + } + + *length = 0; + while (value_begin < line_end) { + if (*value_begin < '0' || *value_begin > '9') { + // bad! content-length found, and couldn't parse all of it! + *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH; +#if DEBUGFRAMER + LOG(INFO) << "invalid content-length - non numeric character detected"; +#endif // DEBUGFRAMER + return; + } + const size_t kMaxDiv10 = numeric_limits<size_t>::max() / 10; + size_t length_x_10 = *length * 10; + const char c = *value_begin - '0'; + if (*length > kMaxDiv10 || + (numeric_limits<size_t>::max() - length_x_10) < c) { + *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW; +#if DEBUGFRAMER + LOG(INFO) << "content-length overflow"; +#endif // DEBUGFRAMER + return; + } + *length = length_x_10 + c; + ++value_begin; + } +#if DEBUGFRAMER + LOG(INFO) << "content_length parsed: " << *length; +#endif // DEBUGFRAMER + *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH; +} + +void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) { + const HeaderLineDescription& header_line = headers_->header_lines_[line_idx]; + const char* stream_begin = headers_->OriginalHeaderStreamBegin(); + const char* line_end = stream_begin + header_line.last_char_idx; + const char* value_begin = stream_begin + header_line.value_begin_idx; + size_t value_length = line_end - value_begin; + + if ((value_length == 7) && + !strncasecmp(value_begin, "chunked", 7)) { + headers_->transfer_encoding_is_chunked_ = true; + } else if ((value_length == 8) && + !strncasecmp(value_begin, "identity", 8)) { + headers_->transfer_encoding_is_chunked_ = false; + } else { + last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING; + parse_state_ = BalsaFrameEnums::ERROR; + visitor_->HandleHeaderError(this); + return; + } +} + +namespace { +bool SplitStringPiece(StringPiece original, char delim, + StringPiece* before, StringPiece* after) { + const char* p = original.data(); + const char* end = p + original.size(); + + while (p != end) { + if (*p == delim) { + ++p; + } else { + const char* start = p; + while (++p != end && *p != delim) { + // Skip to the next occurence of the delimiter. + } + *before = StringPiece(start, p - start); + if (p != end) + *after = StringPiece(p + 1, end - (p + 1)); + else + *after = StringPiece(""); + StringPieceUtils::RemoveWhitespaceContext(before); + StringPieceUtils::RemoveWhitespaceContext(after); + return true; + } + } + + *before = original; + *after = ""; + return false; +} + +// TODO(phython): Fix this function to properly deal with quoted values. +// E.g. ";;foo", "\";;\"", or \"aa; +// The last example, the semi-colon is a separator between extensions. +void ProcessChunkExtensionsManual(StringPiece all_extensions, + BalsaHeaders* extensions) { + StringPiece extension; + StringPiece remaining; + StringPieceUtils::RemoveWhitespaceContext(&all_extensions); + SplitStringPiece(all_extensions, ';', &extension, &remaining); + while (!extension.empty()) { + StringPiece key; + StringPiece value; + SplitStringPiece(extension, '=', &key, &value); + if (!value.empty()) { + // Strip quotation marks if they exist. + if (!value.empty() && value[0] == '"') + value.remove_prefix(1); + if (!value.empty() && value[value.length() - 1] == '"') + value.remove_suffix(1); + } + + extensions->AppendHeader(key, value); + + StringPieceUtils::RemoveWhitespaceContext(&remaining); + SplitStringPiece(remaining, ';', &extension, &remaining); + } +} + +// TODO(phython): Fix this function to properly deal with quoted values. +// E.g. ";;foo", "\";;\"", or \"aa; +// The last example, the semi-colon is a separator between extensions. +void ProcessChunkExtensionsGoogle3(const char* input, size_t size, + BalsaHeaders* extensions) { + vector<StringPiece> key_values; + SplitStringPieceToVector(StringPiece(input, size), ";", &key_values, true); + for (int i = 0; i < key_values.size(); ++i) { + StringPiece key = key_values[i].substr(0, key_values[i].find('=')); + StringPiece value; + if (key.length() < key_values[i].length()) { + value = key_values[i].substr(key.length() + 1); + // Remove any leading and trailing whitespace. + StringPieceUtils::RemoveWhitespaceContext(&value); + + // Strip quotation marks if they exist. + if (!value.empty() && value[0] == '"') + value.remove_prefix(1); + if (!value.empty() && value[value.length() - 1] == '"') + value.remove_suffix(1); + } + + // Strip the key whitespace after checking that there is a value. + StringPieceUtils::RemoveWhitespaceContext(&key); + extensions->AppendHeader(key, value); + } +} + +} // anonymous namespace + +void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size, + BalsaHeaders* extensions) { +#if 0 + ProcessChunkExtensionsGoogle3(input, size, extensions); +#else + ProcessChunkExtensionsManual(StringPiece(input, size), extensions); +#endif +} + +void BalsaFrame::ProcessHeaderLines() { + HeaderLines::size_type content_length_idx = 0; + HeaderLines::size_type transfer_encoding_idx = 0; + + DCHECK(!lines_.empty()); +#if DEBUGFRAMER + LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n"; +#endif // DEBUGFRAMER + + // There is no need to attempt to process headers if no header lines exist. + // There are at least two lines in the message which are not header lines. + // These two non-header lines are the first line of the message, and the + // last line of the message (which is an empty line). + // Thus, we test to see if we have more than two lines total before attempting + // to parse any header lines. + if (lines_.size() > 2) { + const char* stream_begin = headers_->OriginalHeaderStreamBegin(); + + // Then, for the rest of the header data, we parse these into key-value + // pairs. + FindColonsAndParseIntoKeyValue(); + // At this point, we've parsed all of the headers. Time to look for those + // headers which we require for framing. + const HeaderLines::size_type + header_lines_size = headers_->header_lines_.size(); + for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) { + const HeaderLineDescription& current_header_line = + headers_->header_lines_[i]; + const char* key_begin = + (stream_begin + current_header_line.first_char_idx); + const char* key_end = (stream_begin + current_header_line.key_end_idx); + const size_t key_len = key_end - key_begin; + const char c = *key_begin; +#if DEBUGFRAMER + LOG(INFO) << "[" << i << "]: " << string(key_begin, key_len) + << " c: '" << c << "' key_len: " << key_len; +#endif // DEBUGFRAMER + // If a header begins with either lowercase or uppercase 'c' or 't', then + // the header may be one of content-length, connection, content-encoding + // or transfer-encoding. These headers are special, as they change the way + // that the message is framed, and so the framer is required to search + // for them. + + + if (c == 'c' || c == 'C') { + if ((key_len == kContentLengthSize) && + 0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) { + BalsaHeadersEnums::ContentLengthStatus content_length_status = + BalsaHeadersEnums::NO_CONTENT_LENGTH; + size_t length = 0; + ProcessContentLengthLine(i, &content_length_status, &length); + if (content_length_idx != 0) { // then we've already seen one! + if ((headers_->content_length_status_ != content_length_status) || + ((headers_->content_length_status_ == + BalsaHeadersEnums::VALID_CONTENT_LENGTH) && + length != headers_->content_length_)) { + last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS; + parse_state_ = BalsaFrameEnums::ERROR; + visitor_->HandleHeaderError(this); + return; + } + continue; + } else { + content_length_idx = i + 1; + headers_->content_length_status_ = content_length_status; + headers_->content_length_ = length; + content_length_remaining_ = length; + } + + } + } else if (c == 't' || c == 'T') { + if ((key_len == kTransferEncodingSize) && + 0 == strncasecmp(key_begin, kTransferEncoding, + kTransferEncodingSize)) { + if (transfer_encoding_idx != 0) { + last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS; + parse_state_ = BalsaFrameEnums::ERROR; + visitor_->HandleHeaderError(this); + return; + } + transfer_encoding_idx = i + 1; + } + } else if (i == 0 && (key_len == 0 || c == ' ')) { + last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT; + parse_state_ = BalsaFrameEnums::ERROR; + visitor_->HandleHeaderError(this); + return; + } + } + if (headers_->transfer_encoding_is_chunked_) { + headers_->content_length_ = 0; + headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH; + content_length_remaining_ = 0; + } + if (transfer_encoding_idx != 0) { + ProcessTransferEncodingLine(transfer_encoding_idx - 1); + } + } +} + +void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() { + // For responses, can't have a body if the request was a HEAD, or if it is + // one of these response-codes. rfc2616 section 4.3 + parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; + if (is_request_ || + !(request_was_head_ || + (headers_->parsed_response_code_ >= 100 && + headers_->parsed_response_code_ < 200) || + (headers_->parsed_response_code_ == 204) || + (headers_->parsed_response_code_ == 304))) { + // Then we can have a body. + if (headers_->transfer_encoding_is_chunked_) { + // Note that + // if ( Transfer-Encoding: chunked && Content-length: ) + // then Transfer-Encoding: chunked trumps. + // This is as specified in the spec. + // rfc2616 section 4.4.3 + parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; + } else { + // Errors parsing content-length definitely can cause + // protocol errors/warnings + switch (headers_->content_length_status_) { + // If we have a content-length, and it is parsed + // properly, there are two options. + // 1) zero content, in which case the message is done, and + // 2) nonzero content, in which case we have to + // consume the body. + case BalsaHeadersEnums::VALID_CONTENT_LENGTH: + if (headers_->content_length_ == 0) { + parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; + } else { + parse_state_ = BalsaFrameEnums::READING_CONTENT; + } + break; + case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW: + case BalsaHeadersEnums::INVALID_CONTENT_LENGTH: + // If there were characters left-over after parsing the + // content length, we should flag an error and stop. + parse_state_ = BalsaFrameEnums::ERROR; + last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH; + visitor_->HandleHeaderError(this); + break; + // We can have: no transfer-encoding, no content length, and no + // connection: close... + // Unfortunately, this case doesn't seem to be covered in the spec. + // We'll assume that the safest thing to do here is what the google + // binaries before 2008 already do, which is to assume that + // everything until the connection is closed is body. + case BalsaHeadersEnums::NO_CONTENT_LENGTH: + if (is_request_) { + StringPiece method = headers_->request_method(); + // POSTs and PUTs should have a detectable body length. If they + // do not we consider it an error. + if ((method.size() == 4 && + strncmp(method.data(), "POST", 4) == 0) || + (method.size() == 3 && + strncmp(method.data(), "PUT", 3) == 0)) { + parse_state_ = BalsaFrameEnums::ERROR; + last_error_ = + BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH; + visitor_->HandleHeaderError(this); + break; + } + parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; + } else { + parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE; + last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH; + visitor_->HandleHeaderWarning(this); + } + break; + // The COV_NF_... statements here provide hints to the apparatus + // which computes coverage reports/ratios that this code is never + // intended to be executed, and should technically be impossible. + // COV_NF_START + default: + LOG(FATAL) << "Saw a content_length_status: " + << headers_->content_length_status_ << " which is unknown."; + // COV_NF_END + } + } + } +} + +size_t BalsaFrame::ProcessHeaders(const char* message_start, + size_t message_length) { + const char* const original_message_start = message_start; + const char* const message_end = message_start + message_length; + const char* message_current = message_start; + const char* checkpoint = message_start; + + if (message_length == 0) { + goto bottom; + } + + while (message_current < message_end) { + size_t base_idx = headers_->GetReadableBytesFromHeaderStream(); + + // Yes, we could use strchr (assuming null termination), or + // memchr, but as it turns out that is slower than this tight loop + // for the input that we see. + if (!saw_non_newline_char_) { + do { + const char c = *message_current; + if (c != '\r' && c != '\n') { + if (c <= ' ') { + parse_state_ = BalsaFrameEnums::ERROR; + last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST; + visitor_->HandleHeaderError(this); + goto bottom; + } else { + saw_non_newline_char_ = true; + checkpoint = message_start = message_current; + goto read_real_message; + } + } + ++message_current; + } while (message_current < message_end); + goto bottom; // this is necessary to skip 'last_char_was_slash_r' checks + } else { + read_real_message: + // Note that SSE2 can be enabled on certain piii platforms. +#if __SSE2__ + { + const char* const message_end_m16 = message_end - 16; + __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n', + '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' }; + while (message_current < message_end_m16) { + // What this does (using compiler intrinsics): + // + // Load 16 '\n's into an xmm register + // Load 16 bytes of currennt message into an xmm register + // Do byte-wise equals on those two xmm registers + // Take the first bit of each byte, and put that into the first + // 16 bits of a mask + // If the mask is zero, no '\n' found. increment by 16 and try again + // Else scan forward to find the first set bit. + // Increment current by the index of the first set bit + // (ffs returns index of first set bit + 1) + __m128i msg_bytes = + _mm_loadu_si128(const_cast<__m128i *>( + reinterpret_cast<const __m128i *>(message_current))); + __m128i newline_cmp = + _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines)); + int newline_msk = _mm_movemask_epi8(newline_cmp); + if (newline_msk == 0) { + message_current += 16; + continue; + } + message_current += (ffs(newline_msk) - 1); + const size_t relative_idx = message_current - message_start; + const size_t message_current_idx = 1 + base_idx + relative_idx; + lines_.push_back(make_pair(last_slash_n_idx_, message_current_idx)); + if (lines_.size() == 1) { + headers_->WriteFromFramer(checkpoint, + 1 + message_current - checkpoint); + checkpoint = message_current + 1; + const char* begin = headers_->OriginalHeaderStreamBegin(); +#if DEBUGFRAMER + LOG(INFO) << "First line " << string(begin, lines_[0].second); + LOG(INFO) << "is_request_: " << is_request_; +#endif + ProcessFirstLine(begin, begin + lines_[0].second); + if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) + goto process_lines; + else if (parse_state_ == BalsaFrameEnums::ERROR) + goto bottom; + } + const size_t chars_since_last_slash_n = (message_current_idx - + last_slash_n_idx_); + last_slash_n_idx_ = message_current_idx; + if (chars_since_last_slash_n > 2) { + // We have a slash-n, but the last slash n was + // more than 2 characters away from this. Thus, we know + // that this cannot be an end-of-header. + ++message_current; + continue; + } + if ((chars_since_last_slash_n == 1) || + (((message_current > message_start) && + (*(message_current - 1) == '\r')) || + (last_char_was_slash_r_))) { + goto process_lines; + } + ++message_current; + } + } +#endif // __SSE2__ + while (message_current < message_end) { + if (*message_current != '\n') { + ++message_current; + continue; + } + const size_t relative_idx = message_current - message_start; + const size_t message_current_idx = 1 + base_idx + relative_idx; + lines_.push_back(make_pair(last_slash_n_idx_, message_current_idx)); + if (lines_.size() == 1) { + headers_->WriteFromFramer(checkpoint, + 1 + message_current - checkpoint); + checkpoint = message_current + 1; + const char* begin = headers_->OriginalHeaderStreamBegin(); +#if DEBUGFRAMER + LOG(INFO) << "First line " << string(begin, lines_[0].second); + LOG(INFO) << "is_request_: " << is_request_; +#endif + ProcessFirstLine(begin, begin + lines_[0].second); + if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) + goto process_lines; + else if (parse_state_ == BalsaFrameEnums::ERROR) + goto bottom; + } + const size_t chars_since_last_slash_n = (message_current_idx - + last_slash_n_idx_); + last_slash_n_idx_ = message_current_idx; + if (chars_since_last_slash_n > 2) { + // false positive. + ++message_current; + continue; + } + if ((chars_since_last_slash_n == 1) || + (((message_current > message_start) && + (*(message_current - 1) == '\r')) || + (last_char_was_slash_r_))) { + goto process_lines; + } + ++message_current; + } + } + continue; + process_lines: + ++message_current; + DCHECK(message_current >= message_start); + if (message_current > message_start) { + headers_->WriteFromFramer(checkpoint, message_current - checkpoint); + } + + // Check if we have exceeded maximum headers length + // Although we check for this limit before and after we call this function + // we check it here as well to make sure that in case the visitor changed + // the max_header_length_ (for example after processing the first line) + // we handle it gracefully. + if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) { + parse_state_ = BalsaFrameEnums::ERROR; + last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; + visitor_->HandleHeaderError(this); + goto bottom; + } + + // Since we know that we won't be writing any more bytes of the header, + // we tell that to the headers object. The headers object may make + // more efficient allocation decisions when this is signaled. + headers_->DoneWritingFromFramer(); + { + const char* readable_ptr = NULL; + size_t readable_size = 0; + headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size); + visitor_->ProcessHeaderInput(readable_ptr, readable_size); + } + + // Ok, now that we've written everything into our header buffer, it is + // time to process the header lines (extract proper values for headers + // which are important for framing). + ProcessHeaderLines(); + if (parse_state_ == BalsaFrameEnums::ERROR) { + goto bottom; + } + AssignParseStateAfterHeadersHaveBeenParsed(); + if (parse_state_ == BalsaFrameEnums::ERROR) { + goto bottom; + } + visitor_->ProcessHeaders(*headers_); + visitor_->HeaderDone(); + if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) { + visitor_->MessageDone(); + } + goto bottom; + } + // If we've gotten to here, it means that we've consumed all of the + // available input. We need to record whether or not the last character we + // saw was a '\r' so that a subsequent call to ProcessInput correctly finds + // a header framing that is split across the two calls. + last_char_was_slash_r_ = (*(message_end - 1) == '\r'); + DCHECK(message_current >= message_start); + if (message_current > message_start) { + headers_->WriteFromFramer(checkpoint, message_current - checkpoint); + } + bottom: + return message_current - original_message_start; +} + + +size_t BalsaFrame::BytesSafeToSplice() const { + switch (parse_state_) { + case BalsaFrameEnums::READING_CHUNK_DATA: + return chunk_length_remaining_; + case BalsaFrameEnums::READING_UNTIL_CLOSE: + return numeric_limits<size_t>::max(); + case BalsaFrameEnums::READING_CONTENT: + return content_length_remaining_; + default: + return 0; + } +} + +void BalsaFrame::BytesSpliced(size_t bytes_spliced) { + switch (parse_state_) { + case BalsaFrameEnums::READING_CHUNK_DATA: + if (chunk_length_remaining_ >= bytes_spliced) { + chunk_length_remaining_ -= bytes_spliced; + if (chunk_length_remaining_ == 0) { + parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; + } + return; + } else { + last_error_ = + BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; + goto error_exit; + } + + case BalsaFrameEnums::READING_UNTIL_CLOSE: + return; + + case BalsaFrameEnums::READING_CONTENT: + if (content_length_remaining_ >= bytes_spliced) { + content_length_remaining_ -= bytes_spliced; + if (content_length_remaining_ == 0) { + parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; + visitor_->MessageDone(); + } + return; + } else { + last_error_ = + BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT; + goto error_exit; + } + + default: + last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO; + goto error_exit; + } + + error_exit: + parse_state_ = BalsaFrameEnums::ERROR; + visitor_->HandleBodyError(this); +}; + +// You may note that the state-machine contained within this function has both +// switch and goto labels for nearly the same thing. For instance, the +// following two labels refer to the same code block: +// label_reading_chunk_data: +// case BalsaFrameEnums::READING_CHUNK_DATA: +// The 'case' statement is required for the switch statement which occurs when +// ProcessInput is invoked. The goto label is required as the state-machine +// does not use a computed goto in any subsequent operations. +// +// Since several states exit the state machine for various reasons, there is +// also one label at the bottom of the function. When it is appropriate to +// return from the function, that part of the state machine instead issues a +// goto bottom; This results in less code duplication, and makes debugging +// easier (as you can add a statement to a section of code which is guaranteed +// to be invoked when the function is exiting. +size_t BalsaFrame::ProcessInput(const char* input, size_t size) { + const char* current = input; + const char* on_entry = current; + const char* end = current + size; +#if DEBUGFRAMER + LOG(INFO) << "\n==============" + << BalsaFrameEnums::ParseStateToString(parse_state_) + << "===============\n"; +#endif // DEBUGFRAMER + + DCHECK(headers_ != NULL); + if (headers_ == NULL) return 0; + + if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) { + const size_t header_length = headers_->GetReadableBytesFromHeaderStream(); + // Yes, we still have to check this here as the user can change the + // max_header_length amount! + // Also it is possible that we have reached the maximum allowed header size, + // and we have more to consume (remember we are still inside + // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error. + if (header_length > max_header_length_ || + (header_length == max_header_length_ && size > 0)) { + parse_state_ = BalsaFrameEnums::ERROR; + last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; + visitor_->HandleHeaderError(this); + goto bottom; + } + size_t bytes_to_process = max_header_length_ - header_length; + if (bytes_to_process > size) { + bytes_to_process = size; + } + current += ProcessHeaders(input, bytes_to_process); + // If we are still reading headers check if we have crossed the headers + // limit. Note that we check for >= as opposed to >. This is because if + // header_length_after equals max_header_length_ and we are still in the + // parse_state_ BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for + // sure that the headers limit will be crossed later on + if ((parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE)) { + // Note that headers_ is valid only if we are still reading headers. + const size_t header_length_after = + headers_->GetReadableBytesFromHeaderStream(); + if (header_length_after >= max_header_length_) { + parse_state_ = BalsaFrameEnums::ERROR; + last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG; + visitor_->HandleHeaderError(this); + } + } + goto bottom; + } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ || + parse_state_ == BalsaFrameEnums::ERROR) { + // Can do nothing more 'till we're reset. + goto bottom; + } + + while (current < end) { + switch (parse_state_) { + label_reading_chunk_length: + case BalsaFrameEnums::READING_CHUNK_LENGTH: + // In this state we read the chunk length. + // Note that once we hit a character which is not in: + // [0-9;A-Fa-f\n], we transition to a different state. + // + { + // If we used strtol, etc, we'd have to buffer this line. + // This is more annoying than simply doing the conversion + // here. This code accounts for overflow. + static const signed char buf[] = { + // %0 %1 %2 %3 %4 %5 %6 %7 %8 \t \n %b %c \r %e %f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1, + // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f + -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -2, -1, -1, -1, -1, + // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + }; + // valid cases: + // "09123\n" // -> 09123 + // "09123\r\n" // -> 09123 + // "09123 \n" // -> 09123 + // "09123 \r\n" // -> 09123 + // "09123 12312\n" // -> 09123 + // "09123 12312\r\n" // -> 09123 + // "09123; foo=bar\n" // -> 09123 + // "09123; foo=bar\r\n" // -> 09123 + // "FFFFFFFFFFFFFFFF\r\n" // -> FFFFFFFFFFFFFFFF + // "FFFFFFFFFFFFFFFF 22\r\n" // -> FFFFFFFFFFFFFFFF + // invalid cases: + // "[ \t]+[^\n]*\n" + // "FFFFFFFFFFFFFFFFF\r\n" (would overflow) + // "\r\n" + // "\n" + while (current < end) { + const char c = *current; + ++current; + const signed char addition = buf[c]; + if (addition >= 0) { + chunk_length_character_extracted_ = true; + size_t length_x_16 = chunk_length_remaining_ * 16; + const size_t kMaxDiv16 = numeric_limits<size_t>::max() / 16; + if ((chunk_length_remaining_ > kMaxDiv16) || + (numeric_limits<size_t>::max() - length_x_16) < addition) { + // overflow -- asked for a chunk-length greater than 2^64 - 1!! + parse_state_ = BalsaFrameEnums::ERROR; + last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW; + visitor_->ProcessBodyInput(on_entry, current - on_entry); + visitor_->HandleChunkingError(this); + goto bottom; + } + chunk_length_remaining_ = length_x_16 + addition; + continue; + } + + if (!chunk_length_character_extracted_ || addition == -1) { + // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no + // characters were converted, or an unexpected character was + // seen. + parse_state_ = BalsaFrameEnums::ERROR; + last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH; + visitor_->ProcessBodyInput(on_entry, current - on_entry); + visitor_->HandleChunkingError(this); + goto bottom; + } + + --current; + parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION; + visitor_->ProcessChunkLength(chunk_length_remaining_); + goto label_reading_chunk_extension; + } + } + visitor_->ProcessBodyInput(on_entry, current - on_entry); + goto bottom; // case BalsaFrameEnums::READING_CHUNK_LENGTH + + label_reading_chunk_extension: + case BalsaFrameEnums::READING_CHUNK_EXTENSION: + { + // TODO(phython): Convert this scanning to be 16 bytes at a time if + // there is data to be read. + const char* extensions_start = current; + size_t extensions_length = 0; + while (current < end) { + const char c = *current; + if (c == '\r' || c == '\n') { + extensions_length = + (extensions_start == current) ? + 0 : + current - extensions_start - 1; + } + + ++current; + if (c == '\n') { + chunk_length_character_extracted_ = false; + visitor_->ProcessChunkExtensions( + extensions_start, extensions_length); + if (chunk_length_remaining_ != 0) { + parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA; + goto label_reading_chunk_data; + } + HeaderFramingFound('\n'); + parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM; + goto label_reading_last_chunk_term; + } + } + visitor_->ProcessChunkExtensions( + extensions_start, extensions_length); + } + + visitor_->ProcessBodyInput(on_entry, current - on_entry); + goto bottom; // case BalsaFrameEnums::READING_CHUNK_EXTENSION + + label_reading_chunk_data: + case BalsaFrameEnums::READING_CHUNK_DATA: + while (current < end) { + if (chunk_length_remaining_ == 0) { + break; + } + // read in the chunk + size_t bytes_remaining = end - current; + size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ? + chunk_length_remaining_ : bytes_remaining; + const char* tmp_current = current + consumed_bytes; + visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry); + visitor_->ProcessBodyData(current, consumed_bytes); + on_entry = current = tmp_current; + chunk_length_remaining_ -= consumed_bytes; + } + if (chunk_length_remaining_ == 0) { + parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM; + goto label_reading_chunk_term; + } + visitor_->ProcessBodyInput(on_entry, current - on_entry); + goto bottom; // case BalsaFrameEnums::READING_CHUNK_DATA + + label_reading_chunk_term: + case BalsaFrameEnums::READING_CHUNK_TERM: + while (current < end) { + const char c = *current; + ++current; + + if (c == '\n') { + parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH; + goto label_reading_chunk_length; + } + } + visitor_->ProcessBodyInput(on_entry, current - on_entry); + goto bottom; // case BalsaFrameEnums::READING_CHUNK_TERM + + label_reading_last_chunk_term: + case BalsaFrameEnums::READING_LAST_CHUNK_TERM: + while (current < end) { + const char c = *current; + + if (!HeaderFramingFound(c)) { + // If not, however, since the spec only suggests that the + // client SHOULD indicate the presence of trailers, we get to + // *test* that they did or didn't. + // If all of the bytes we've seen since: + // OPTIONAL_WS 0 OPTIONAL_STUFF CRLF + // are either '\r', or '\n', then we can assume that we don't yet + // know if we need to parse headers, or if the next byte will make + // the HeaderFramingFound condition (above) true. + if (HeaderFramingMayBeFound()) { + // If true, then we have seen only characters '\r' or '\n'. + ++current; + + // Lets try again! There is no state change here. + continue; + } else { + // If (!HeaderFramingMayBeFound()), then we know that we must be + // reading the first non CRLF character of a trailer. + parse_state_ = BalsaFrameEnums::READING_TRAILER; + visitor_->ProcessBodyInput(on_entry, current - on_entry); + on_entry = current; + goto label_reading_trailer; + } + } else { + // If we've found a "\r\n\r\n", then the message + // is done. + ++current; + parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; + visitor_->ProcessBodyInput(on_entry, current - on_entry); + visitor_->MessageDone(); + goto bottom; + } + break; // from while loop + } + visitor_->ProcessBodyInput(on_entry, current - on_entry); + goto bottom; // case BalsaFrameEnums::READING_LAST_CHUNK_TERM + + label_reading_trailer: + case BalsaFrameEnums::READING_TRAILER: + while (current < end) { + const char c = *current; + ++current; + // TODO(fenix): If we ever care about trailers as part of framing, + // deal with them here (see below for part of the 'solution') + // if (LineFramingFound(c)) { + // trailer_lines_.push_back(make_pair(start_of_line_, + // trailer_length_ - 1)); + // start_of_line_ = trailer_length_; + // } + if (HeaderFramingFound(c)) { + // ProcessTrailers(visitor_, &trailers_); + parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; + visitor_->ProcessTrailerInput(on_entry, current - on_entry); + visitor_->MessageDone(); + goto bottom; + } + } + visitor_->ProcessTrailerInput(on_entry, current - on_entry); + break; // case BalsaFrameEnums::READING_TRAILER + + // Note that there is no label: + // 'label_reading_until_close' + // here. This is because the state-machine exists immediately after + // reading the headers instead of transitioning here (as it would + // do if it was consuming all the data it could, all the time). + case BalsaFrameEnums::READING_UNTIL_CLOSE: + { + const size_t bytes_remaining = end - current; + if (bytes_remaining > 0) { + visitor_->ProcessBodyInput(current, bytes_remaining); + visitor_->ProcessBodyData(current, bytes_remaining); + current += bytes_remaining; + } + } + goto bottom; // case BalsaFrameEnums::READING_UNTIL_CLOSE + + // label_reading_content: + case BalsaFrameEnums::READING_CONTENT: +#if DEBUGFRAMER + LOG(INFO) << "ReadingContent: " << content_length_remaining_; +#endif // DEBUGFRAMER + while (content_length_remaining_ && current < end) { + // read in the content + const size_t bytes_remaining = end - current; + const size_t consumed_bytes = + (content_length_remaining_ < bytes_remaining) ? + content_length_remaining_ : bytes_remaining; + visitor_->ProcessBodyInput(current, consumed_bytes); + visitor_->ProcessBodyData(current, consumed_bytes); + current += consumed_bytes; + content_length_remaining_ -= consumed_bytes; + } + if (content_length_remaining_ == 0) { + parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ; + visitor_->MessageDone(); + } + goto bottom; // case BalsaFrameEnums::READING_CONTENT + + default: + // The state-machine should never be in a state that isn't handled + // above. This is a glaring logic error, and we should do something + // drastic to ensure that this gets looked-at and fixed. + LOG(FATAL) << "Unknown state: " << parse_state_ // COV_NF_LINE + << " memory corruption?!"; // COV_NF_LINE + } + } + bottom: +#if DEBUGFRAMER + LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n" + << string(input, current) + << "\n$$$$$$$$$$$$$$" + << BalsaFrameEnums::ParseStateToString(parse_state_) + << "$$$$$$$$$$$$$$$" + << " consumed: " << (current - input); + if (Error()) { + LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode()); + } +#endif // DEBUGFRAMER + return current - input; +} + +const int32 BalsaFrame::kValidTerm1; +const int32 BalsaFrame::kValidTerm1Mask; +const int32 BalsaFrame::kValidTerm2; +const int32 BalsaFrame::kValidTerm2Mask; + +} // namespace gfe2 + |