1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/tools/balsa/balsa_frame.h"
6
7// Visual C++ defines _M_IX86_FP as 2 if the /arch:SSE2 compiler option is
8// specified.
9#if !defined(__SSE2__) && _M_IX86_FP == 2
10#define __SSE2__ 1
11#endif
12
13#include <assert.h>
14#if __SSE2__
15#include <emmintrin.h>
16#endif  // __SSE2__
17
18#include <limits>
19#include <string>
20#include <utility>
21#include <vector>
22
23#include "base/logging.h"
24#include "base/port.h"
25#include "base/strings/string_piece.h"
26#include "net/tools/balsa/balsa_enums.h"
27#include "net/tools/balsa/balsa_headers.h"
28#include "net/tools/balsa/balsa_visitor_interface.h"
29#include "net/tools/balsa/buffer_interface.h"
30#include "net/tools/balsa/simple_buffer.h"
31#include "net/tools/balsa/split.h"
32#include "net/tools/balsa/string_piece_utils.h"
33
34#if defined(COMPILER_MSVC)
35#include <intrin.h>
36#include <string.h>
37
38#pragma intrinsic(_BitScanForward)
39
40static int ffs(int i) {
41  unsigned long index;
42  return _BitScanForward(&index, i) ? index + 1 : 0;
43}
44
45#define strncasecmp _strnicmp
46#else
47#include <strings.h>
48#endif
49
50namespace net {
51
// Constants holding some header names for headers which can affect the way the
// HTTP message is framed, and so must be processed specially.
// The names are spelled in lowercase.  Each k...Size constant is the length of
// the corresponding name without its terminating NUL (sizeof() counts the NUL,
// hence the "- 1").
static const char kContentLength[] = "content-length";
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const char kTransferEncoding[] = "transfer-encoding";
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
58
// Constructs a framer in its initial state: it expects a request
// (is_request_ == true), is waiting for the header and first line, has no
// headers object attached (headers_ == NULL) and sends visitor callbacks to
// the member do_nothing_visitor_ until another visitor is installed.
BalsaFrame::BalsaFrame()
    : last_char_was_slash_r_(false),
      saw_non_newline_char_(false),
      start_was_space_(true),
      chunk_length_character_extracted_(false),
      is_request_(true),
      request_was_head_(false),
      // Default limits; Reset() does not restore these.
      max_header_length_(16 * 1024),
      max_request_uri_length_(2048),
      visitor_(&do_nothing_visitor_),
      chunk_length_remaining_(0),
      content_length_remaining_(0),
      last_slash_n_loc_(NULL),
      last_recorded_slash_n_loc_(NULL),
      last_slash_n_idx_(0),
      term_chars_(0),
      parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE),
      last_error_(BalsaFrameEnums::NO_ERROR),
      headers_(NULL) {
}
79
// The destructor body is intentionally empty; nothing is released explicitly.
BalsaFrame::~BalsaFrame() {}
81
82void BalsaFrame::Reset() {
83  last_char_was_slash_r_ = false;
84  saw_non_newline_char_ = false;
85  start_was_space_ = true;
86  chunk_length_character_extracted_ = false;
87  // is_request_ = true;               // not reset between messages.
88  // request_was_head_ = false;        // not reset between messages.
89  // max_header_length_ = 4096;        // not reset between messages.
90  // max_request_uri_length_ = 2048;   // not reset between messages.
91  // visitor_ = &do_nothing_visitor_;  // not reset between messages.
92  chunk_length_remaining_ = 0;
93  content_length_remaining_ = 0;
94  last_slash_n_loc_ = NULL;
95  last_recorded_slash_n_loc_ = NULL;
96  last_slash_n_idx_ = 0;
97  term_chars_ = 0;
98  parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
99  last_error_ = BalsaFrameEnums::NO_ERROR;
100  lines_.clear();
101  if (headers_ != NULL) {
102    headers_->Clear();
103  }
104}
105
106const char* BalsaFrameEnums::ParseStateToString(
107    BalsaFrameEnums::ParseState error_code) {
108  switch (error_code) {
109    case PARSE_ERROR:
110      return "PARSE_ERROR";
111    case READING_HEADER_AND_FIRSTLINE:
112      return "READING_HEADER_AND_FIRSTLINE";
113    case READING_CHUNK_LENGTH:
114      return "READING_CHUNK_LENGTH";
115    case READING_CHUNK_EXTENSION:
116      return "READING_CHUNK_EXTENSION";
117    case READING_CHUNK_DATA:
118      return "READING_CHUNK_DATA";
119    case READING_CHUNK_TERM:
120      return "READING_CHUNK_TERM";
121    case READING_LAST_CHUNK_TERM:
122      return "READING_LAST_CHUNK_TERM";
123    case READING_TRAILER:
124      return "READING_TRAILER";
125    case READING_UNTIL_CLOSE:
126      return "READING_UNTIL_CLOSE";
127    case READING_CONTENT:
128      return "READING_CONTENT";
129    case MESSAGE_FULLY_READ:
130      return "MESSAGE_FULLY_READ";
131    case NUM_STATES:
132      return "UNKNOWN_STATE";
133  }
134  return "UNKNOWN_STATE";
135}
136
137const char* BalsaFrameEnums::ErrorCodeToString(
138    BalsaFrameEnums::ErrorCode error_code) {
139  switch (error_code) {
140    case NO_ERROR:
141      return "NO_ERROR";
142    case NO_STATUS_LINE_IN_RESPONSE:
143      return "NO_STATUS_LINE_IN_RESPONSE";
144    case NO_REQUEST_LINE_IN_REQUEST:
145      return "NO_REQUEST_LINE_IN_REQUEST";
146    case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
147      return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
148    case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
149      return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
150    case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
151      return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
152    case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
153      return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
154    case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
155      return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
156    case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
157      return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
158    case FAILED_CONVERTING_STATUS_CODE_TO_INT:
159      return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
160    case REQUEST_URI_TOO_LONG:
161      return "REQUEST_URI_TOO_LONG";
162    case HEADERS_TOO_LONG:
163      return "HEADERS_TOO_LONG";
164    case UNPARSABLE_CONTENT_LENGTH:
165      return "UNPARSABLE_CONTENT_LENGTH";
166    case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
167      return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
168    case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
169      return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
170    case HEADER_MISSING_COLON:
171      return "HEADER_MISSING_COLON";
172    case INVALID_CHUNK_LENGTH:
173      return "INVALID_CHUNK_LENGTH";
174    case CHUNK_LENGTH_OVERFLOW:
175      return "CHUNK_LENGTH_OVERFLOW";
176    case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
177      return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
178    case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
179      return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
180    case MULTIPLE_CONTENT_LENGTH_KEYS:
181      return "MULTIPLE_CONTENT_LENGTH_KEYS";
182    case MULTIPLE_TRANSFER_ENCODING_KEYS:
183      return "MULTIPLE_TRANSFER_ENCODING_KEYS";
184    case UNKNOWN_TRANSFER_ENCODING:
185      return "UNKNOWN_TRANSFER_ENCODING";
186    case INVALID_HEADER_FORMAT:
187      return "INVALID_HEADER_FORMAT";
188    case INTERNAL_LOGIC_ERROR:
189      return "INTERNAL_LOGIC_ERROR";
190    case NUM_ERROR_CODES:
191      return "UNKNOWN_ERROR";
192  }
193  return "UNKNOWN_ERROR";
194}
195
// Summary:
//     Parses the first line of either a request or response, recording the
//   offsets (relative to begin) of its whitespace-separated components in the
//   *_idx_ fields of headers.  For a response, the status code is also
//   converted to an integer and stored in headers->parsed_response_code_.
//     Note that in the case of a detected warning, error_code will be set
//   but the function will not return false.
//     Exactly zero or one warning or error (but not both) may be detected
//   by this function.
//     Note that this function will not write the data of the first-line
//   into the header's buffer (that should already have been done elsewhere).
//
// Pre-conditions:
//     begin != end
//     *begin should be a character which is > ' '. This implies that there
//   is at least one non-whitespace character between [begin, end).
//     headers is a valid pointer to a BalsaHeaders class.
//     error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
//     Entire first line must exist between [begin, end)
//     Exactly zero or one newlines -may- exist between [begin, end)
//     [begin, end) should exist in the header's buffer.
//
// Side-effects:
//   headers will be modified
//   error_code may be modified if either a warning or error is detected
//
// Returns:
//   True if no error (as opposed to warning) is detected.
//   False if an error (as opposed to warning) is detected.  The errors
//   detected here are: a line that does not end in '\n' (or is otherwise
//   empty after trimming), a response with nothing after the version, a
//   request URI longer than max_request_uri_length, and a response status
//   code that is not purely decimal digits or that overflows size_t.

//
// If there is indeed non-whitespace in the line, then the following
// will take care of the leading-whitespace precondition for you:
//  while (*begin <= ' ') ++begin;
//  ParseHTTPFirstLine(begin, end, is_request, max_request_uri_length,
//                     &headers, &error_code);
//
bool ParseHTTPFirstLine(const char* begin,
                        const char* end,
                        bool is_request,
                        size_t max_request_uri_length,
                        BalsaHeaders* headers,
                        BalsaFrameEnums::ErrorCode* error_code) {
  const char* current = begin;
  // HTTP firstlines all have the following structure:
  //  LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
  //  [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
  //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
  //  |          [-------)      [-------)      [----------------)
  //    REQ:     method         request_uri    version
  //   RESP:     version        statuscode     reason
  //
  //   The first NONWS->LWS component we'll call firstline_a.
  //   The second firstline_b, and the third firstline_c.
  //
  //   firstline_a goes from nws1 to (but not including) ws2
  //   firstline_b goes from nws2 to (but not including) ws3
  //   firstline_c goes from nws3 to (but not including) ws4
  //
  // In the code:
  //    ws1 == whitespace_1_idx_
  //   nws1 == non_whitespace_1_idx_
  //    ws2 == whitespace_2_idx_
  //   nws2 == non_whitespace_2_idx_
  //    ws3 == whitespace_3_idx_
  //   nws3 == non_whitespace_3_idx_
  //    ws4 == whitespace_4_idx_
  //
  // Throughout this function "whitespace" means any char that compares
  // <= ' ', and "non-whitespace" any char that compares > ' '.

  // Kill all whitespace (including '\r\n') at the end of the line.
  // The line handed to us must end in '\n'; anything else is a caller bug.
  --end;
  if (*end != '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  while (begin < end && *end <= ' ') {
    --end;
  }
  // If end still points at '\n' the loop above could not move at all, i.e.
  // the line consists of nothing but the newline.
  DCHECK(*end != '\n');
  if (*end == '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // end now points one past the last non-whitespace character of the line.
  ++end;

  // The following condition should not be possible.
  if (end == begin) {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }

  // whitespace_1_idx_
  headers->whitespace_1_idx_ = current - begin;
  // This loop is commented out as it is never used in current code.  This is
  // true only because we don't begin parsing the headers at all until we've
  // encountered a non whitespace character at the beginning of the stream, at
  // which point we begin our demarcation of header-start.  If we did -not- do
  // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
  // would be necessary for the proper functioning of this parsing.
  // This is left here as this function may (in the future) be refactored out
  // of the BalsaFrame class so that it may be shared between code in
  // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
  // set_first_line() function (at which point it would be necessary).
#if 0
  while (*current <= ' ') {
    ++current;
  }
#endif
  // non_whitespace_1_idx_
  headers->non_whitespace_1_idx_ = current - begin;
  do {
    // The first time through, the current character is not whitespace: with
    // the skipping loop above disabled, that is guaranteed only by the
    // precondition that *begin > ' '.  So firstline_a always contains at
    // least one non-whitespace character.
    ++current;
    if (current == end) {
      // The line consists of a single component; every remaining index
      // collapses onto the end of the line.
      headers->whitespace_2_idx_ = current - begin;
      headers->non_whitespace_2_idx_ = current - begin;
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
      // (Adding is_request relies on the request variant immediately
      // following the response variant in the ErrorCode enum.)
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
            is_request);
      if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
        return false;
      }
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_2_idx_
  headers->whitespace_2_idx_ = current - begin;
  do {
    ++current;
    // No bounds check is needed: trailing whitespace was stripped above, so
    // the character just before end is non-whitespace and stops this loop
    // before current can reach end.
  } while (*current <= ' ');
  // non_whitespace_2_idx_
  headers->non_whitespace_2_idx_ = current - begin;
  do {
    ++current;
    if (current == end) {
      // Only two components are present; the third is recorded as empty.
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE for response
      // (Same enum-adjacency assumption as above.)  This is treated as a
      // warning for both requests and responses: parsing continues below.
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
                                 + is_request);
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_3_idx_
  headers->whitespace_3_idx_ = current - begin;
  do {
    ++current;
    // As above, the stripped trailing whitespace guarantees that this loop
    // stops on a non-whitespace character before current reaches end.
  } while (*current <= ' ');
  // non_whitespace_3_idx_
  headers->non_whitespace_3_idx_ = current - begin;
  // The third component runs to the (trimmed) end of the line and may itself
  // contain whitespace, e.g. a multi-word reason phrase.
  headers->whitespace_4_idx_ = end - begin;

 output_exhausted:
  // Note that we don't fail the parse immediately when parsing of the
  // firstline fails.  Depending on the protocol type, we may want to accept
  // a firstline with only one or two elements, e.g., for HTTP/0.9:
  //   GET\r\n
  // or
  //   GET /\r\n
  // should be parsed without issue (though the visitor should know that
  // parsing the entire line was not exactly as it should be).
  //
  // Eventually, these errors may be removed alltogether, as the visitor can
  // detect them on its own by examining the size of the various fields.
  // headers->set_first_line(non_whitespace_1_idx_, current);

  if (is_request) {
    // For requests, we need at least the method.  We could assume that a
    // blank URI means "/".  If version isn't stated, it should be assumed
    // to be HTTP/0.9 by the visitor.  The only hard failure here is a
    // request URI (firstline_b) longer than the configured maximum.
    if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
        max_request_uri_length) {
      *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
      return false;
    }
  } else {
    headers->parsed_response_code_ = 0;
    {
      // The status code is firstline_b: [non_whitespace_2_idx_,
      // whitespace_3_idx_).
      const char* parsed_response_code_current =
        begin + headers->non_whitespace_2_idx_;
      const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
      const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;

      // Convert a string of [0-9]* into an int.
      // Note that this allows for the conversion of response codes which
      // are outside the bounds of normal HTTP response codes (no checking
      // is done to ensure that these are valid-- they're merely parsed)!
      while (parsed_response_code_current < parsed_response_code_end) {
        if (*parsed_response_code_current < '0' ||
            *parsed_response_code_current > '9') {
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        // The multiplication may wrap; the result is only used after the
        // checks below have established that it did not.
        size_t status_code_x_10 = headers->parsed_response_code_ * 10;
        uint8 c = *parsed_response_code_current - '0';
        if ((headers->parsed_response_code_ > kMaxDiv10) ||
            (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
          // overflow.
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        headers->parsed_response_code_ = status_code_x_10 + c;
        ++parsed_response_code_current;
      }
    }
  }
  return true;
}
422
423// begin - beginning of the firstline
424// end - end of the firstline
425//
426// A precondition for this function is that there is non-whitespace between
427// [begin, end). If this precondition is not met, the function will not perform
428// as expected (and bad things may happen, and it will eat your first, second,
429// and third unborn children!).
430//
431// Another precondition for this function is that [begin, end) includes
432// at most one newline, which must be at the end of the line.
433void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
434  BalsaFrameEnums::ErrorCode previous_error = last_error_;
435  if (!ParseHTTPFirstLine(begin,
436                          end,
437                          is_request_,
438                          max_request_uri_length_,
439                          headers_,
440                          &last_error_)) {
441    parse_state_ = BalsaFrameEnums::PARSE_ERROR;
442    visitor_->HandleHeaderError(this);
443    return;
444  }
445  if (previous_error != last_error_) {
446    visitor_->HandleHeaderWarning(this);
447  }
448
449  if (is_request_) {
450    size_t version_length =
451        headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
452    visitor_->ProcessRequestFirstLine(
453        begin + headers_->non_whitespace_1_idx_,
454        headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
455        begin + headers_->non_whitespace_1_idx_,
456        headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
457        begin + headers_->non_whitespace_2_idx_,
458        headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
459        begin + headers_->non_whitespace_3_idx_,
460        version_length);
461    if (version_length == 0)
462      parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
463  } else {
464    visitor_->ProcessResponseFirstLine(
465        begin + headers_->non_whitespace_1_idx_,
466        headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
467        begin + headers_->non_whitespace_1_idx_,
468        headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
469        begin + headers_->non_whitespace_2_idx_,
470        headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
471        begin + headers_->non_whitespace_3_idx_,
472        headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
473  }
474}
475
476// 'stream_begin' points to the first character of the headers buffer.
477// 'line_begin' points to the first character of the line.
478// 'current' points to a char which is ':'.
479// 'line_end' points to the position of '\n' + 1.
480// 'line_begin' points to the position of first character of line.
481void BalsaFrame::CleanUpKeyValueWhitespace(
482    const char* stream_begin,
483    const char* line_begin,
484    const char* current,
485    const char* line_end,
486    HeaderLineDescription* current_header_line) {
487  const char* colon_loc = current;
488  DCHECK_LT(colon_loc, line_end);
489  DCHECK_EQ(':', *colon_loc);
490  DCHECK_EQ(':', *current);
491  DCHECK_GE(' ', *line_end)
492    << "\"" << std::string(line_begin, line_end) << "\"";
493
494  // TODO(fenix): Investigate whether or not the bounds tests in the
495  // while loops here are redundant, and if so, remove them.
496  --current;
497  while (current > line_begin && *current <= ' ') --current;
498  current += (current != colon_loc);
499  current_header_line->key_end_idx = current - stream_begin;
500
501  current = colon_loc;
502  DCHECK_EQ(':', *current);
503  ++current;
504  while (current < line_end && *current <= ' ') ++current;
505  current_header_line->value_begin_idx = current - stream_begin;
506
507  DCHECK_GE(current_header_line->key_end_idx,
508            current_header_line->first_char_idx);
509  DCHECK_GE(current_header_line->value_begin_idx,
510            current_header_line->key_end_idx);
511  DCHECK_GE(current_header_line->last_char_idx,
512            current_header_line->value_begin_idx);
513}
514
// Turns the raw line boundaries in lines_ into key/value header descriptions.
//
// lines_[0] (the first line of the message) and the last entry (the blank
// line ending the headers) are skipped.  For every other line, any following
// lines that start with a character <= ' ' are treated as continuations and
// folded into it.  One HeaderLineDescription per logical header line is
// appended to headers_->header_lines_, and the first ':' in the line (if any)
// is used to split it into key and value.  Lines without a colon are kept as
// "all key", last_error_ is set to HEADER_MISSING_COLON and the visitor gets
// HandleHeaderWarning().
//
// Requires lines_ to hold at least two entries (lines_[1] is read
// unconditionally).
inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
  DCHECK(!lines_.empty());
  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
  // The last line is always just a newline (and is uninteresting).
  const Lines::size_type lines_size_m1 = lines_.size() - 1;
#if __SSE2__
  // Vector of sixteen ':' bytes, and the limit below which a 16-byte load
  // starting at 'current' stays inside the header stream.
  const __m128i colons = _mm_set1_epi8(':');
  const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
#endif  // __SSE2__
  // 'current' is the colon-search cursor.  It persists across iterations of
  // the loop below and may run ahead of the line being processed.
  const char* current = stream_begin + lines_[1].first;
  // This code is a bit more subtle than it may appear at first glance.
  // This code looks for a colon in the current line... but it also looks
  // beyond the current line. If there is no colon in the current line, then
  // for each subsequent line (until the colon which -has- been found is
  // associated with a line), no searching for a colon will be performed. In
  // this way, we minimize the amount of bytes we have scanned for a colon.
  for (Lines::size_type i = 1; i < lines_size_m1;) {
    const char* line_begin = stream_begin + lines_[i].first;

    // Here we handle possible continuations.  Note that we do not replace
    // the '\n' in the line before a continuation (at least, as of now),
    // which implies that any code which looks for a value must deal with
    // "\r\n", etc -within- the line (and not just at the end of it).
    for (++i; i < lines_size_m1; ++i) {
      const char c = *(stream_begin + lines_[i].first);
      if (c > ' ') {
        // Not a continuation, so stop.  Note that if the 'original' i = 1,
        // and the next line is not a continuation, we'll end up with i = 2
        // when we break. This handles the incrementing of i for the outer
        // loop.
        break;
      }
    }
    // lines_[i - 1] is the last physical line belonging to this header.
    const char* line_end = stream_begin + lines_[i - 1].second;
    DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);

    // We cleanup the whitespace at the end of the line before doing anything
    // else of interest as it allows us to do nothing when irregularly formatted
    // headers are parsed (e.g. those with only keys, only values, or no colon).
    //
    // After the trimming below, line_end points one past the last
    // non-whitespace character of the line (a non-continuation line starts
    // with a character > ' ', so there is always at least one).
    --line_end;
    DCHECK_EQ('\n', *line_end)
      << "\"" << std::string(line_begin, line_end) << "\"";
    while (*line_end <= ' ' && line_end > line_begin) {
      --line_end;
    }
    ++line_end;
    DCHECK_GE(' ', *line_end);
    DCHECK_LT(line_begin, line_end);

    // We use '0' for the block idx, because we're always writing to the first
    // block from the framer (we do this because the framer requires that the
    // entire header sequence be in a contiguous buffer).
    // Key end, value begin and last char all start out at line_end, i.e. the
    // whole line is the key until a colon says otherwise.
    headers_->header_lines_.push_back(
        HeaderLineDescription(line_begin - stream_begin,
                              line_end - stream_begin,
                              line_end - stream_begin,
                              line_end - stream_begin,
                              0));
    if (current >= line_end) {
      // The search cursor is already past this line, so an earlier scan
      // covered it without finding a colon inside it.
      last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
      visitor_->HandleHeaderWarning(this);
      // Then the next colon will not be found within this header line-- time
      // to try again with another header-line.
      continue;
    } else if (current < line_begin) {
      // When this condition is true, the last detected colon was part of a
      // previous line.  We reset to the beginning of the line as we don't care
      // about the presence of any colon before the beginning of the current
      // line.
      current = line_begin;
    }
#if __SSE2__
    // Fast path: test 16 bytes at a time for ':' while a full 16-byte load
    // fits in the header stream.
    while (current < header_lines_end_m16) {
      __m128i header_bytes =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
      __m128i colon_cmp = _mm_cmpeq_epi8(header_bytes, colons);
      // One bit per byte; bit k is set when byte k equals ':'.
      int colon_msk = _mm_movemask_epi8(colon_cmp);
      if (colon_msk == 0) {
        current += 16;
        continue;
      }
      // Advance to the first matching byte (ffs() is 1-based).
      current += (ffs(colon_msk) - 1);
      if (current > line_end) {
        // The colon belongs to a later line.  Leave 'current' on it so that
        // line can pick it up; this line falls through to the "no colon"
        // handling below because the scalar loop will not execute.
        break;
      }
      goto found_colon;
    }
#endif  // __SSE2__
    // Scalar search (also handles the tail the SSE2 loop cannot cover).
    for (; current < line_end; ++current) {
      if (*current != ':') {
        continue;
      }
      goto found_colon;
    }
    // If we've gotten to here, then there was no colon
    // in the line. The arguments we passed into the construction
    // for the HeaderLineDescription object should be OK-- it assumes
    // that the entire content is 'key' by default (which is true, as
    // there was no colon, there can be no value). Note that this is a
    // construct which is technically not allowed by the spec.
    last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
    visitor_->HandleHeaderWarning(this);
    continue;
 found_colon:
    DCHECK_EQ(*current, ':');
    DCHECK_LE(current - stream_begin, line_end - stream_begin);
    DCHECK_LE(stream_begin - stream_begin, current - stream_begin);

    // Provisionally put both the key end and the value start on the colon;
    // CleanUpKeyValueWhitespace() then overwrites both with whitespace-trimmed
    // positions.
    HeaderLineDescription& current_header_line = headers_->header_lines_.back();
    current_header_line.key_end_idx = current - stream_begin;
    current_header_line.value_begin_idx = current_header_line.key_end_idx;
    if (current < line_end) {
      ++current_header_line.key_end_idx;

      CleanUpKeyValueWhitespace(stream_begin,
                                line_begin,
                                current,
                                line_end,
                                &current_header_line);
    }
  }
}
639
640void BalsaFrame::ProcessContentLengthLine(
641    HeaderLines::size_type line_idx,
642    BalsaHeadersEnums::ContentLengthStatus* status,
643    size_t* length) {
644  const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
645  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
646  const char* line_end = stream_begin + header_line.last_char_idx;
647  const char* value_begin = (stream_begin + header_line.value_begin_idx);
648
649  if (value_begin >= line_end) {
650    // There is no non-whitespace value data.
651#if DEBUGFRAMER
652      LOG(INFO) << "invalid content-length -- no non-whitespace value data";
653#endif
654    *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
655    return;
656  }
657
658  *length = 0;
659  while (value_begin < line_end) {
660    if (*value_begin < '0' || *value_begin > '9') {
661      // bad! content-length found, and couldn't parse all of it!
662      *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
663#if DEBUGFRAMER
664      LOG(INFO) << "invalid content-length - non numeric character detected";
665#endif  // DEBUGFRAMER
666      return;
667    }
668    const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
669    size_t length_x_10 = *length * 10;
670    const unsigned char c = *value_begin - '0';
671    if (*length > kMaxDiv10 ||
672        (std::numeric_limits<size_t>::max() - length_x_10) < c) {
673      *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
674#if DEBUGFRAMER
675      LOG(INFO) << "content-length overflow";
676#endif  // DEBUGFRAMER
677      return;
678    }
679    *length = length_x_10 + c;
680    ++value_begin;
681  }
682#if DEBUGFRAMER
683  LOG(INFO) << "content_length parsed: " << *length;
684#endif  // DEBUGFRAMER
685  *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
686}
687
688void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
689  const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
690  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
691  const char* line_end = stream_begin + header_line.last_char_idx;
692  const char* value_begin = stream_begin + header_line.value_begin_idx;
693  size_t value_length = line_end - value_begin;
694
695  if ((value_length == 7) &&
696      !strncasecmp(value_begin, "chunked", 7)) {
697    headers_->transfer_encoding_is_chunked_ = true;
698  } else if ((value_length == 8) &&
699      !strncasecmp(value_begin, "identity", 8)) {
700    headers_->transfer_encoding_is_chunked_ = false;
701  } else {
702    last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
703    parse_state_ = BalsaFrameEnums::PARSE_ERROR;
704    visitor_->HandleHeaderError(this);
705    return;
706  }
707}
708
709namespace {
710bool SplitStringPiece(base::StringPiece original, char delim,
711                      base::StringPiece* before, base::StringPiece* after) {
712  const char* p = original.data();
713  const char* end = p + original.size();
714
715  while (p != end) {
716    if (*p == delim) {
717      ++p;
718    } else {
719      const char* start = p;
720      while (++p != end && *p != delim) {
721        // Skip to the next occurence of the delimiter.
722      }
723      *before = base::StringPiece(start, p - start);
724      if (p != end)
725        *after = base::StringPiece(p + 1, end - (p + 1));
726      else
727        *after = base::StringPiece("");
728      StringPieceUtils::RemoveWhitespaceContext(before);
729      StringPieceUtils::RemoveWhitespaceContext(after);
730      return true;
731    }
732  }
733
734  *before = original;
735  *after = "";
736  return false;
737}
738
739// TODO(phython): Fix this function to properly deal with quoted values.
740// E.g. ";;foo", "\";;\"", or \"aa;
741// The last example, the semi-colon is a separator between extensions.
742void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
743                                  BalsaHeaders* extensions) {
744  base::StringPiece extension;
745  base::StringPiece remaining;
746  StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
747  SplitStringPiece(all_extensions, ';', &extension, &remaining);
748  while (!extension.empty()) {
749    base::StringPiece key;
750    base::StringPiece value;
751    SplitStringPiece(extension, '=', &key, &value);
752    if (!value.empty()) {
753      // Strip quotation marks if they exist.
754      if (!value.empty() && value[0] == '"')
755        value.remove_prefix(1);
756      if (!value.empty() && value[value.length() - 1] == '"')
757        value.remove_suffix(1);
758    }
759
760    extensions->AppendHeader(key, value);
761
762    StringPieceUtils::RemoveWhitespaceContext(&remaining);
763    SplitStringPiece(remaining, ';', &extension, &remaining);
764  }
765}
766
767}  // anonymous namespace
768
769void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
770                                        BalsaHeaders* extensions) {
771  ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
772}
773
774void BalsaFrame::ProcessHeaderLines() {
775  HeaderLines::size_type content_length_idx = 0;
776  HeaderLines::size_type transfer_encoding_idx = 0;
777
778  DCHECK(!lines_.empty());
779#if DEBUGFRAMER
780  LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
781#endif  // DEBUGFRAMER
782
783  // There is no need to attempt to process headers if no header lines exist.
784  // There are at least two lines in the message which are not header lines.
785  // These two non-header lines are the first line of the message, and the
786  // last line of the message (which is an empty line).
787  // Thus, we test to see if we have more than two lines total before attempting
788  // to parse any header lines.
789  if (lines_.size() > 2) {
790    const char* stream_begin = headers_->OriginalHeaderStreamBegin();
791
792    // Then, for the rest of the header data, we parse these into key-value
793    // pairs.
794    FindColonsAndParseIntoKeyValue();
795    // At this point, we've parsed all of the headers.  Time to look for those
796    // headers which we require for framing.
797    const HeaderLines::size_type
798      header_lines_size = headers_->header_lines_.size();
799    for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
800      const HeaderLineDescription& current_header_line =
801        headers_->header_lines_[i];
802      const char* key_begin =
803        (stream_begin + current_header_line.first_char_idx);
804      const char* key_end = (stream_begin + current_header_line.key_end_idx);
805      const size_t key_len = key_end - key_begin;
806      const char c = *key_begin;
807#if DEBUGFRAMER
808      LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
809                << " c: '" << c << "' key_len: " << key_len;
810#endif  // DEBUGFRAMER
811      // If a header begins with either lowercase or uppercase 'c' or 't', then
812      // the header may be one of content-length, connection, content-encoding
813      // or transfer-encoding. These headers are special, as they change the way
814      // that the message is framed, and so the framer is required to search
815      // for them.
816
817
818      if (c == 'c' || c == 'C') {
819        if ((key_len == kContentLengthSize) &&
820            0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
821          BalsaHeadersEnums::ContentLengthStatus content_length_status =
822            BalsaHeadersEnums::NO_CONTENT_LENGTH;
823          size_t length = 0;
824          ProcessContentLengthLine(i, &content_length_status, &length);
825          if (content_length_idx != 0) {  // then we've already seen one!
826            if ((headers_->content_length_status_ != content_length_status) ||
827                ((headers_->content_length_status_ ==
828                  BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
829                 length != headers_->content_length_)) {
830              last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
831              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
832              visitor_->HandleHeaderError(this);
833              return;
834            }
835            continue;
836          } else {
837            content_length_idx = i + 1;
838            headers_->content_length_status_ = content_length_status;
839            headers_->content_length_ = length;
840            content_length_remaining_ = length;
841          }
842
843        }
844      } else if (c == 't' || c == 'T') {
845        if ((key_len == kTransferEncodingSize) &&
846            0 == strncasecmp(key_begin, kTransferEncoding,
847                             kTransferEncodingSize)) {
848          if (transfer_encoding_idx != 0) {
849            last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
850            parse_state_ = BalsaFrameEnums::PARSE_ERROR;
851            visitor_->HandleHeaderError(this);
852            return;
853          }
854          transfer_encoding_idx = i + 1;
855        }
856      } else if (i == 0 && (key_len == 0 || c == ' ')) {
857        last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
858        parse_state_ = BalsaFrameEnums::PARSE_ERROR;
859        visitor_->HandleHeaderError(this);
860        return;
861      }
862    }
863    if (headers_->transfer_encoding_is_chunked_) {
864      headers_->content_length_ = 0;
865      headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
866      content_length_remaining_ = 0;
867    }
868    if (transfer_encoding_idx != 0) {
869      ProcessTransferEncodingLine(transfer_encoding_idx - 1);
870    }
871  }
872}
873
874void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
875  // For responses, can't have a body if the request was a HEAD, or if it is
876  // one of these response-codes.  rfc2616 section 4.3
877  parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
878  if (is_request_ ||
879      !(request_was_head_ ||
880        (headers_->parsed_response_code_ >= 100 &&
881         headers_->parsed_response_code_ < 200) ||
882        (headers_->parsed_response_code_ == 204) ||
883        (headers_->parsed_response_code_ == 304))) {
884    // Then we can have a body.
885    if (headers_->transfer_encoding_is_chunked_) {
886      // Note that
887      // if ( Transfer-Encoding: chunked &&  Content-length: )
888      // then Transfer-Encoding: chunked trumps.
889      // This is as specified in the spec.
890      // rfc2616 section 4.4.3
891      parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
892    } else {
893      // Errors parsing content-length definitely can cause
894      // protocol errors/warnings
895      switch (headers_->content_length_status_) {
896        // If we have a content-length, and it is parsed
897        // properly, there are two options.
898        // 1) zero content, in which case the message is done, and
899        // 2) nonzero content, in which case we have to
900        //    consume the body.
901        case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
902          if (headers_->content_length_ == 0) {
903            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
904          } else {
905            parse_state_ = BalsaFrameEnums::READING_CONTENT;
906          }
907          break;
908        case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
909        case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
910          // If there were characters left-over after parsing the
911          // content length, we should flag an error and stop.
912          parse_state_ = BalsaFrameEnums::PARSE_ERROR;
913          last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
914          visitor_->HandleHeaderError(this);
915          break;
916          // We can have: no transfer-encoding, no content length, and no
917          // connection: close...
918          // Unfortunately, this case doesn't seem to be covered in the spec.
919          // We'll assume that the safest thing to do here is what the google
920          // binaries before 2008 already do, which is to assume that
921          // everything until the connection is closed is body.
922        case BalsaHeadersEnums::NO_CONTENT_LENGTH:
923          if (is_request_) {
924            base::StringPiece method = headers_->request_method();
925            // POSTs and PUTs should have a detectable body length.  If they
926            // do not we consider it an error.
927            if ((method.size() == 4 &&
928                 strncmp(method.data(), "POST", 4) == 0) ||
929                (method.size() == 3 &&
930                 strncmp(method.data(), "PUT", 3) == 0)) {
931              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
932              last_error_ =
933                  BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
934              visitor_->HandleHeaderError(this);
935              break;
936            }
937            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
938          } else {
939            parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
940            last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
941            visitor_->HandleHeaderWarning(this);
942          }
943          break;
944          // The COV_NF_... statements here provide hints to the apparatus
945          // which computes coverage reports/ratios that this code is never
946          // intended to be executed, and should technically be impossible.
947          // COV_NF_START
948        default:
949          LOG(FATAL) << "Saw a content_length_status: "
950           << headers_->content_length_status_ << " which is unknown.";
951          // COV_NF_END
952      }
953    }
954  }
955}
956
// Consumes header bytes from [message_start, message_start + message_length).
//
// The input is scanned for '\n' characters. Every newline found is recorded in
// |lines_|, and the scanned bytes are copied into |headers_| through
// WriteFromFramer(). When the first newline is seen the first line is handed
// to ProcessFirstLine(). Once the end of the header block is detected the
// buffered header is passed to the visitor, ProcessHeaderLines() and
// AssignParseStateAfterHeadersHaveBeenParsed() are run, and the visitor is
// told that the headers are done.
//
// '\r' and '\n' bytes that precede the first line are skipped and are not
// copied into |headers_|. Any other byte <= ' ' in that position is reported
// as NO_REQUEST_LINE_IN_REQUEST.
//
// Returns the number of input bytes consumed, counted from the original
// |message_start|.
size_t BalsaFrame::ProcessHeaders(const char* message_start,
                                  size_t message_length) {
  const char* const original_message_start = message_start;
  const char* const message_end = message_start + message_length;
  const char* message_current = message_start;
  // |checkpoint| marks the first input byte that has not yet been written
  // into |headers_|.
  const char* checkpoint = message_start;

  if (message_length == 0) {
    goto bottom;
  }

  while (message_current < message_end) {
    // Number of bytes already readable from the header stream when this pass
    // starts; used to turn buffer-relative positions into stream indices.
    size_t base_idx = headers_->GetReadableBytesFromHeaderStream();

    // Yes, we could use strchr (assuming null termination), or
    // memchr, but as it turns out that is slower than this tight loop
    // for the input that we see.
    if (!saw_non_newline_char_) {
      // Still in front of the first line: skip over leading CR/LF bytes.
      do {
        const char c = *message_current;
        if (c != '\r' && c != '\n') {
          if (c <= ' ') {
            parse_state_ = BalsaFrameEnums::PARSE_ERROR;
            last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
            visitor_->HandleHeaderError(this);
            goto bottom;
          } else {
            saw_non_newline_char_ = true;
            // The real message starts here; the skipped bytes are dropped
            // from both the relative-index base and the copy checkpoint.
            checkpoint = message_start = message_current;
            goto read_real_message;
          }
        }
        ++message_current;
      } while (message_current < message_end);
      goto bottom;  // this is necessary to skip 'last_char_was_slash_r' checks
    } else {
 read_real_message:
      // Note that SSE2 can be enabled on certain piii platforms.
#if __SSE2__
      {
        // The vectorized scan only runs while more than 16 bytes remain, so
        // each 16-byte load below stays inside the input. The scalar loop
        // after this block handles the tail.
        const char* const message_end_m16 = message_end - 16;
        __m128i newlines = _mm_set1_epi8('\n');
        while (message_current < message_end_m16) {
          // What this does (using compiler intrinsics):
          //
          // Load 16 '\n's into an xmm register
          // Load 16 bytes of current message into an xmm register
          // Do byte-wise equals on those two xmm registers
          // Take the first bit of each byte, and put that into the first
          //   16 bits of a mask
          // If the mask is zero, no '\n' found. increment by 16 and try again
          // Else scan forward to find the first set bit.
          // Increment current by the index of the first set bit
          //   (ffs returns index of first set bit + 1)
          __m128i msg_bytes =
            _mm_loadu_si128(const_cast<__m128i *>(
                    reinterpret_cast<const __m128i *>(message_current)));
          __m128i newline_cmp = _mm_cmpeq_epi8(msg_bytes, newlines);
          int newline_msk = _mm_movemask_epi8(newline_cmp);
          if (newline_msk == 0) {
            message_current += 16;
            continue;
          }
          message_current += (ffs(newline_msk) - 1);
          // From here on the handling of the newline mirrors the scalar loop
          // below. |message_current_idx| is the header-stream index one past
          // this '\n'.
          const size_t relative_idx = message_current - message_start;
          const size_t message_current_idx = 1 + base_idx + relative_idx;
          lines_.push_back(std::make_pair(last_slash_n_idx_,
                                          message_current_idx));
          if (lines_.size() == 1) {
            // First newline of the message: flush the first line into
            // |headers_| and parse it right away.
            headers_->WriteFromFramer(checkpoint,
                                      1 + message_current - checkpoint);
            checkpoint = message_current + 1;
            const char* begin = headers_->OriginalHeaderStreamBegin();
#if DEBUGFRAMER
          LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
          LOG(INFO) << "is_request_: " << is_request_;
#endif
            ProcessFirstLine(begin, begin + lines_[0].second);
            if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
              goto process_lines;
            else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
              goto bottom;
          }
          const size_t chars_since_last_slash_n = (message_current_idx -
                                                   last_slash_n_idx_);
          last_slash_n_idx_ = message_current_idx;
          if (chars_since_last_slash_n > 2) {
            // We have a slash-n, but the last slash n was
            // more than 2 characters away from this. Thus, we know
            // that this cannot be an end-of-header.
            ++message_current;
            continue;
          }
          // A distance of 1 means two consecutive '\n'. Otherwise the line is
          // treated as empty when the byte before this '\n' is a '\r', either
          // in this buffer or carried over in |last_char_was_slash_r_|.
          if ((chars_since_last_slash_n == 1) ||
              (((message_current > message_start) &&
                (*(message_current - 1) == '\r')) ||
               (last_char_was_slash_r_))) {
            goto process_lines;
          }
          ++message_current;
        }
      }
#endif  // __SSE2__
      // Scalar scan: handles everything when SSE2 is unavailable, and the
      // bytes left over by the vectorized loop otherwise.
      while (message_current < message_end) {
        if (*message_current != '\n') {
          ++message_current;
          continue;
        }
        const size_t relative_idx = message_current - message_start;
        const size_t message_current_idx = 1 + base_idx + relative_idx;
        lines_.push_back(std::make_pair(last_slash_n_idx_,
                                        message_current_idx));
        if (lines_.size() == 1) {
          // First newline of the message: flush the first line into
          // |headers_| and parse it right away.
          headers_->WriteFromFramer(checkpoint,
                                    1 + message_current - checkpoint);
          checkpoint = message_current + 1;
          const char* begin = headers_->OriginalHeaderStreamBegin();
#if DEBUGFRAMER
          LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
          LOG(INFO) << "is_request_: " << is_request_;
#endif
          ProcessFirstLine(begin, begin + lines_[0].second);
          if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
            goto process_lines;
          else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
            goto bottom;
        }
        const size_t chars_since_last_slash_n = (message_current_idx -
                                                 last_slash_n_idx_);
        last_slash_n_idx_ = message_current_idx;
        if (chars_since_last_slash_n > 2) {
          // false positive.
          ++message_current;
          continue;
        }
        // Same end-of-header test as in the SSE2 loop above.
        if ((chars_since_last_slash_n == 1) ||
            (((message_current > message_start) &&
              (*(message_current - 1) == '\r')) ||
             (last_char_was_slash_r_))) {
          goto process_lines;
        }
        ++message_current;
      }
    }
    continue;
 process_lines:
    // Reached with |message_current| on the '\n' that ends the header block
    // (or the first line, when ProcessFirstLine() already declared the message
    // fully read). Step past it and flush whatever has not been copied yet.
    ++message_current;
    DCHECK(message_current >= message_start);
    if (message_current > message_start) {
      headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
    }

    // Check if we have exceeded maximum headers length
    // Although we check for this limit before and after we call this function
    // we check it here as well to make sure that in case the visitor changed
    // the max_header_length_ (for example after processing the first line)
    // we handle it gracefully.
    if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
      parse_state_ = BalsaFrameEnums::PARSE_ERROR;
      last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
      visitor_->HandleHeaderError(this);
      goto bottom;
    }

    // Since we know that we won't be writing any more bytes of the header,
    // we tell that to the headers object. The headers object may make
    // more efficient allocation decisions when this is signaled.
    headers_->DoneWritingFromFramer();
    {
      // Hand the complete raw header block to the visitor.
      const char* readable_ptr = NULL;
      size_t readable_size = 0;
      headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
      visitor_->ProcessHeaderInput(readable_ptr, readable_size);
    }

    // Ok, now that we've written everything into our header buffer, it is
    // time to process the header lines (extract proper values for headers
    // which are important for framing).
    ProcessHeaderLines();
    if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
      goto bottom;
    }
    AssignParseStateAfterHeadersHaveBeenParsed();
    if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
      goto bottom;
    }
    visitor_->ProcessHeaders(*headers_);
    visitor_->HeaderDone();
    if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
      visitor_->MessageDone();
    }
    goto bottom;
  }
  // If we've gotten to here, it means that we've consumed all of the
  // available input. We need to record whether or not the last character we
  // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
  // a header framing that is split across the two calls.
  last_char_was_slash_r_ = (*(message_end - 1) == '\r');
  DCHECK(message_current >= message_start);
  if (message_current > message_start) {
    // Buffer the partial header data seen so far.
    headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
  }
 bottom:
  // Single exit point: report how many input bytes were consumed.
  return message_current - original_message_start;
}
1162
1163
1164size_t BalsaFrame::BytesSafeToSplice() const {
1165  switch (parse_state_) {
1166    case BalsaFrameEnums::READING_CHUNK_DATA:
1167      return chunk_length_remaining_;
1168    case BalsaFrameEnums::READING_UNTIL_CLOSE:
1169      return std::numeric_limits<size_t>::max();
1170    case BalsaFrameEnums::READING_CONTENT:
1171      return content_length_remaining_;
1172    default:
1173      return 0;
1174  }
1175}
1176
1177void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1178  switch (parse_state_) {
1179    case BalsaFrameEnums::READING_CHUNK_DATA:
1180      if (chunk_length_remaining_ >= bytes_spliced) {
1181        chunk_length_remaining_ -= bytes_spliced;
1182        if (chunk_length_remaining_ == 0) {
1183          parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1184        }
1185        return;
1186      } else {
1187        last_error_ =
1188          BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1189        goto error_exit;
1190      }
1191
1192    case BalsaFrameEnums::READING_UNTIL_CLOSE:
1193      return;
1194
1195    case BalsaFrameEnums::READING_CONTENT:
1196      if (content_length_remaining_ >= bytes_spliced) {
1197        content_length_remaining_ -= bytes_spliced;
1198        if (content_length_remaining_ == 0) {
1199          parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1200          visitor_->MessageDone();
1201        }
1202        return;
1203      } else {
1204        last_error_ =
1205          BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1206        goto error_exit;
1207      }
1208
1209    default:
1210      last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1211      goto error_exit;
1212  }
1213
1214 error_exit:
1215  parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1216  visitor_->HandleBodyError(this);
1217};
1218
1219// You may note that the state-machine contained within this function has both
1220// switch and goto labels for nearly the same thing. For instance, the
1221// following two labels refer to the same code block:
1222//   label_reading_chunk_data:
1223//   case BalsaFrameEnums::READING_CHUNK_DATA:
1224// The 'case' statement is required for the switch statement which occurs when
1225// ProcessInput is invoked. The goto label is required as the state-machine
1226// does not use a computed goto in any subsequent operations.
1227//
1228// Since several states exit the state machine for various reasons, there is
1229// also one label at the bottom of the function. When it is appropriate to
1230// return from the function, that part of the state machine instead issues a
// goto bottom; This results in less code duplication, and makes debugging
// easier (as you can add a statement to a section of code which is guaranteed
// to be invoked when the function is exiting).
1234size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1235  const char* current = input;
1236  const char* on_entry = current;
1237  const char* end = current + size;
1238#if DEBUGFRAMER
1239  LOG(INFO) << "\n=============="
1240            << BalsaFrameEnums::ParseStateToString(parse_state_)
1241            << "===============\n";
1242#endif  // DEBUGFRAMER
1243
1244  DCHECK(headers_ != NULL);
1245  if (headers_ == NULL) return 0;
1246
1247  if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1248    const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1249    // Yes, we still have to check this here as the user can change the
1250    // max_header_length amount!
1251    // Also it is possible that we have reached the maximum allowed header size,
1252    // and we have more to consume (remember we are still inside
1253    // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1254    if (header_length > max_header_length_ ||
1255        (header_length == max_header_length_ && size > 0)) {
1256      parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1257      last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1258      visitor_->HandleHeaderError(this);
1259      goto bottom;
1260    }
1261    size_t bytes_to_process = max_header_length_ - header_length;
1262    if (bytes_to_process > size) {
1263      bytes_to_process = size;
1264    }
1265    current += ProcessHeaders(input, bytes_to_process);
1266    // If we are still reading headers check if we have crossed the headers
1267    // limit. Note that we check for >= as opposed to >. This is because if
1268    // header_length_after equals max_header_length_ and we are still in the
1269    // parse_state_  BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1270    // sure that the headers limit will be crossed later on
1271    if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1272      // Note that headers_ is valid only if we are still reading headers.
1273      const size_t header_length_after =
1274          headers_->GetReadableBytesFromHeaderStream();
1275      if (header_length_after >= max_header_length_) {
1276        parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1277        last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1278        visitor_->HandleHeaderError(this);
1279      }
1280    }
1281    goto bottom;
1282  } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1283             parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1284    // Can do nothing more 'till we're reset.
1285    goto bottom;
1286  }
1287
1288  while (current < end) {
1289    switch (parse_state_) {
1290 label_reading_chunk_length:
1291      case BalsaFrameEnums::READING_CHUNK_LENGTH:
1292        // In this state we read the chunk length.
1293        // Note that once we hit a character which is not in:
1294        // [0-9;A-Fa-f\n], we transition to a different state.
1295        //
1296        {
1297          // If we used strtol, etc, we'd have to buffer this line.
1298          // This is more annoying than simply doing the conversion
1299          // here. This code accounts for overflow.
1300          static const signed char buf[] = {
1301            // %0  %1  %2  %3  %4  %5  %6  %7  %8  \t  \n  %b  %c  \r  %e  %f
1302               -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1303            // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1304               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1305            // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1306               -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1307            // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1308                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -2, -1, -1, -1, -1,
1309            // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1310               -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1311            // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1312               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1313            // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1314               -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1315            // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1316               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1317          };
1318          // valid cases:
1319          //  "09123\n"                      // -> 09123
1320          //  "09123\r\n"                    // -> 09123
1321          //  "09123  \n"                    // -> 09123
1322          //  "09123  \r\n"                  // -> 09123
1323          //  "09123  12312\n"               // -> 09123
1324          //  "09123  12312\r\n"             // -> 09123
1325          //  "09123; foo=bar\n"             // -> 09123
1326          //  "09123; foo=bar\r\n"           // -> 09123
1327          //  "FFFFFFFFFFFFFFFF\r\n"         // -> FFFFFFFFFFFFFFFF
1328          //  "FFFFFFFFFFFFFFFF 22\r\n"      // -> FFFFFFFFFFFFFFFF
1329          // invalid cases:
1330          // "[ \t]+[^\n]*\n"
1331          // "FFFFFFFFFFFFFFFFF\r\n"  (would overflow)
1332          // "\r\n"
1333          // "\n"
1334          while (current < end) {
1335            const char c = *current;
1336            ++current;
1337            const signed char addition = buf[static_cast<int>(c)];
1338            if (addition >= 0) {
1339              chunk_length_character_extracted_ = true;
1340              size_t length_x_16 = chunk_length_remaining_ * 16;
1341              const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1342              if ((chunk_length_remaining_ > kMaxDiv16) ||
1343                  ((std::numeric_limits<size_t>::max() - length_x_16) <
1344                   static_cast<size_t>(addition))) {
1345                // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1346                parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1347                last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1348                visitor_->ProcessBodyInput(on_entry, current - on_entry);
1349                visitor_->HandleChunkingError(this);
1350                goto bottom;
1351              }
1352              chunk_length_remaining_ = length_x_16 + addition;
1353              continue;
1354            }
1355
1356            if (!chunk_length_character_extracted_ || addition == -1) {
1357              // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1358              // characters were converted, or an unexpected character was
1359              // seen.
1360              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1361              last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1362              visitor_->ProcessBodyInput(on_entry, current - on_entry);
1363              visitor_->HandleChunkingError(this);
1364              goto bottom;
1365            }
1366
1367            --current;
1368            parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1369            visitor_->ProcessChunkLength(chunk_length_remaining_);
1370            goto label_reading_chunk_extension;
1371          }
1372        }
1373        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1374        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_LENGTH
1375
1376 label_reading_chunk_extension:
1377      case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1378        {
1379          // TODO(phython): Convert this scanning to be 16 bytes at a time if
1380          // there is data to be read.
1381          const char* extensions_start = current;
1382          size_t extensions_length = 0;
1383          while (current < end) {
1384            const char c = *current;
1385            if (c == '\r' || c == '\n') {
1386              extensions_length =
1387                  (extensions_start == current) ?
1388                  0 :
1389                  current - extensions_start - 1;
1390            }
1391
1392            ++current;
1393            if (c == '\n') {
1394              chunk_length_character_extracted_ = false;
1395              visitor_->ProcessChunkExtensions(
1396                  extensions_start, extensions_length);
1397              if (chunk_length_remaining_ != 0) {
1398                parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1399                goto label_reading_chunk_data;
1400              }
1401              HeaderFramingFound('\n');
1402              parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1403              goto label_reading_last_chunk_term;
1404            }
1405          }
1406          visitor_->ProcessChunkExtensions(
1407              extensions_start, extensions_length);
1408        }
1409
1410        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1411        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1412
1413 label_reading_chunk_data:
1414      case BalsaFrameEnums::READING_CHUNK_DATA:
1415        while (current < end) {
1416          if (chunk_length_remaining_ == 0) {
1417            break;
1418          }
1419          // read in the chunk
1420          size_t bytes_remaining = end - current;
1421          size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1422            chunk_length_remaining_ : bytes_remaining;
1423          const char* tmp_current = current + consumed_bytes;
1424          visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1425          visitor_->ProcessBodyData(current, consumed_bytes);
1426          on_entry = current = tmp_current;
1427          chunk_length_remaining_ -= consumed_bytes;
1428        }
1429        if (chunk_length_remaining_ == 0) {
1430          parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1431          goto label_reading_chunk_term;
1432        }
1433        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1434        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_DATA
1435
1436 label_reading_chunk_term:
1437      case BalsaFrameEnums::READING_CHUNK_TERM:
1438        while (current < end) {
1439          const char c = *current;
1440          ++current;
1441
1442          if (c == '\n') {
1443            parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1444            goto label_reading_chunk_length;
1445          }
1446        }
1447        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1448        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_TERM
1449
1450 label_reading_last_chunk_term:
1451      case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1452        while (current < end) {
1453          const char c = *current;
1454
1455          if (!HeaderFramingFound(c)) {
1456            // If not, however, since the spec only suggests that the
1457            // client SHOULD indicate the presence of trailers, we get to
1458            // *test* that they did or didn't.
1459            // If all of the bytes we've seen since:
1460            //   OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1461            // are either '\r', or '\n', then we can assume that we don't yet
1462            // know if we need to parse headers, or if the next byte will make
1463            // the HeaderFramingFound condition (above) true.
1464            if (HeaderFramingMayBeFound()) {
1465              // If true, then we have seen only characters '\r' or '\n'.
1466              ++current;
1467
1468              // Let's try again! There is no state change here.
1469              continue;
1470            } else {
1471              // If (!HeaderFramingMayBeFound()), then we know that we must be
1472              // reading the first non CRLF character of a trailer.
1473              parse_state_ = BalsaFrameEnums::READING_TRAILER;
1474              visitor_->ProcessBodyInput(on_entry, current - on_entry);
1475              on_entry = current;
1476              goto label_reading_trailer;
1477            }
1478          } else {
1479            // If we've found a "\r\n\r\n", then the message
1480            // is done.
1481            ++current;
1482            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1483            visitor_->ProcessBodyInput(on_entry, current - on_entry);
1484            visitor_->MessageDone();
1485            goto bottom;
1486          }
1487          break;  // from while loop
1488        }
1489        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1490        goto bottom;  // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1491
1492 label_reading_trailer:
1493      case BalsaFrameEnums::READING_TRAILER:
1494        while (current < end) {
1495          const char c = *current;
1496          ++current;
1497          // TODO(fenix): If we ever care about trailers as part of framing,
1498          // deal with them here (see below for part of the 'solution')
1499          // if (LineFramingFound(c)) {
1500          // trailer_lines_.push_back(make_pair(start_of_line_,
1501          //                                   trailer_length_ - 1));
1502          // start_of_line_ = trailer_length_;
1503          // }
1504          if (HeaderFramingFound(c)) {
1505            // ProcessTrailers(visitor_, &trailers_);
1506            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1507            visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1508            visitor_->MessageDone();
1509            goto bottom;
1510          }
1511        }
1512        visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1513        break;  // case BalsaFrameEnums::READING_TRAILER
1514
1515        // Note that there is no label:
1516        //   'label_reading_until_close'
1517        // here. This is because the state-machine exits immediately after
1518        // reading the headers instead of transitioning here (as it would
1519        // do if it was consuming all the data it could, all the time).
1520      case BalsaFrameEnums::READING_UNTIL_CLOSE:
1521        {
1522          const size_t bytes_remaining = end - current;
1523          if (bytes_remaining > 0) {
1524            visitor_->ProcessBodyInput(current, bytes_remaining);
1525            visitor_->ProcessBodyData(current, bytes_remaining);
1526            current += bytes_remaining;
1527          }
1528        }
1529        goto bottom;  // case BalsaFrameEnums::READING_UNTIL_CLOSE
1530
1531        // label_reading_content:
1532      case BalsaFrameEnums::READING_CONTENT:
1533#if DEBUGFRAMER
1534        LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1535#endif  // DEBUGFRAMER
1536        while (content_length_remaining_ && current < end) {
1537          // read in the content
1538          const size_t bytes_remaining = end - current;
1539          const size_t consumed_bytes =
1540            (content_length_remaining_ < bytes_remaining) ?
1541            content_length_remaining_ : bytes_remaining;
1542          visitor_->ProcessBodyInput(current, consumed_bytes);
1543          visitor_->ProcessBodyData(current, consumed_bytes);
1544          current += consumed_bytes;
1545          content_length_remaining_ -= consumed_bytes;
1546        }
1547        if (content_length_remaining_ == 0) {
1548          parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1549          visitor_->MessageDone();
1550        }
1551        goto bottom;  // case BalsaFrameEnums::READING_CONTENT
1552
1553      default:
1554        // The state-machine should never be in a state that isn't handled
1555        // above.  This is a glaring logic error, and we should do something
1556        // drastic to ensure that this gets looked-at and fixed.
1557        LOG(FATAL) << "Unknown state: " << parse_state_  // COV_NF_LINE
1558          << " memory corruption?!";                     // COV_NF_LINE
1559    }
1560  }
1561 bottom:
1562#if DEBUGFRAMER
1563  LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1564    << std::string(input, current)
1565    << "\n$$$$$$$$$$$$$$"
1566    << BalsaFrameEnums::ParseStateToString(parse_state_)
1567    << "$$$$$$$$$$$$$$$"
1568    << " consumed: " << (current - input);
1569  if (Error()) {
1570    LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1571  }
1572#endif  // DEBUGFRAMER
1573  return current - input;
1574}
1575
1576}  // namespace net
1577