1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/tools/flip_server/balsa_frame.h"
6
7#include <assert.h>
8#if __SSE2__
9#include <emmintrin.h>
10#endif  // __SSE2__
11#include <strings.h>
12
13#include <limits>
14#include <string>
15#include <utility>
16#include <vector>
17
18#include "base/logging.h"
19#include "base/port.h"
20#include "base/strings/string_piece.h"
21#include "net/tools/flip_server/balsa_enums.h"
22#include "net/tools/flip_server/balsa_headers.h"
23#include "net/tools/flip_server/balsa_visitor_interface.h"
24#include "net/tools/flip_server/buffer_interface.h"
25#include "net/tools/flip_server/simple_buffer.h"
26#include "net/tools/flip_server/split.h"
27#include "net/tools/flip_server/string_piece_utils.h"
28
29namespace net {
30
// Constants holding some header names for headers which can affect the way the
// HTTP message is framed, and so must be processed specially.
// The names are spelled in lower case here.
// NOTE(review): presumably matched case-insensitively against incoming header
// keys -- confirm at the use sites.
static const char kContentLength[] = "content-length";
// Length of kContentLength, excluding the terminating NUL.
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const char kTransferEncoding[] = "transfer-encoding";
// Length of kTransferEncoding, excluding the terminating NUL.
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
37
// Constructs a framer in its initial state: it expects a request
// (is_request_ is true), starts in READING_HEADER_AND_FIRSTLINE with no error
// recorded, forwards events to the built-in do_nothing_visitor_, and has no
// BalsaHeaders object attached yet (headers_ is NULL).
BalsaFrame::BalsaFrame()
    : last_char_was_slash_r_(false),
      saw_non_newline_char_(false),
      start_was_space_(true),
      chunk_length_character_extracted_(false),
      is_request_(true),
      request_was_head_(false),
      // Default limits; Reset() does not touch them.
      max_header_length_(16 * 1024),
      max_request_uri_length_(2048),
      visitor_(&do_nothing_visitor_),
      chunk_length_remaining_(0),
      content_length_remaining_(0),
      last_slash_n_loc_(NULL),
      last_recorded_slash_n_loc_(NULL),
      last_slash_n_idx_(0),
      term_chars_(0),
      parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE),
      last_error_(BalsaFrameEnums::NO_ERROR),
      headers_(NULL) {
}
58
// Empty body: no explicit cleanup is performed here (in particular, headers_
// and visitor_ are not deleted by this destructor).
BalsaFrame::~BalsaFrame() {}
60
61void BalsaFrame::Reset() {
62  last_char_was_slash_r_ = false;
63  saw_non_newline_char_ = false;
64  start_was_space_ = true;
65  chunk_length_character_extracted_ = false;
66  // is_request_ = true;               // not reset between messages.
67  // request_was_head_ = false;        // not reset between messages.
68  // max_header_length_ = 4096;        // not reset between messages.
69  // max_request_uri_length_ = 2048;   // not reset between messages.
70  // visitor_ = &do_nothing_visitor_;  // not reset between messages.
71  chunk_length_remaining_ = 0;
72  content_length_remaining_ = 0;
73  last_slash_n_loc_ = NULL;
74  last_recorded_slash_n_loc_ = NULL;
75  last_slash_n_idx_ = 0;
76  term_chars_ = 0;
77  parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
78  last_error_ = BalsaFrameEnums::NO_ERROR;
79  lines_.clear();
80  if (headers_ != NULL) {
81    headers_->Clear();
82  }
83}
84
85const char* BalsaFrameEnums::ParseStateToString(
86    BalsaFrameEnums::ParseState error_code) {
87  switch (error_code) {
88    case PARSE_ERROR:
89      return "PARSE_ERROR";
90    case READING_HEADER_AND_FIRSTLINE:
91      return "READING_HEADER_AND_FIRSTLINE";
92    case READING_CHUNK_LENGTH:
93      return "READING_CHUNK_LENGTH";
94    case READING_CHUNK_EXTENSION:
95      return "READING_CHUNK_EXTENSION";
96    case READING_CHUNK_DATA:
97      return "READING_CHUNK_DATA";
98    case READING_CHUNK_TERM:
99      return "READING_CHUNK_TERM";
100    case READING_LAST_CHUNK_TERM:
101      return "READING_LAST_CHUNK_TERM";
102    case READING_TRAILER:
103      return "READING_TRAILER";
104    case READING_UNTIL_CLOSE:
105      return "READING_UNTIL_CLOSE";
106    case READING_CONTENT:
107      return "READING_CONTENT";
108    case MESSAGE_FULLY_READ:
109      return "MESSAGE_FULLY_READ";
110    case NUM_STATES:
111      return "UNKNOWN_STATE";
112  }
113  return "UNKNOWN_STATE";
114}
115
116const char* BalsaFrameEnums::ErrorCodeToString(
117    BalsaFrameEnums::ErrorCode error_code) {
118  switch (error_code) {
119    case NO_ERROR:
120      return "NO_ERROR";
121    case NO_STATUS_LINE_IN_RESPONSE:
122      return "NO_STATUS_LINE_IN_RESPONSE";
123    case NO_REQUEST_LINE_IN_REQUEST:
124      return "NO_REQUEST_LINE_IN_REQUEST";
125    case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
126      return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
127    case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
128      return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
129    case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
130      return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
131    case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
132      return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
133    case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
134      return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
135    case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
136      return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
137    case FAILED_CONVERTING_STATUS_CODE_TO_INT:
138      return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
139    case REQUEST_URI_TOO_LONG:
140      return "REQUEST_URI_TOO_LONG";
141    case HEADERS_TOO_LONG:
142      return "HEADERS_TOO_LONG";
143    case UNPARSABLE_CONTENT_LENGTH:
144      return "UNPARSABLE_CONTENT_LENGTH";
145    case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
146      return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
147    case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
148      return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
149    case HEADER_MISSING_COLON:
150      return "HEADER_MISSING_COLON";
151    case INVALID_CHUNK_LENGTH:
152      return "INVALID_CHUNK_LENGTH";
153    case CHUNK_LENGTH_OVERFLOW:
154      return "CHUNK_LENGTH_OVERFLOW";
155    case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
156      return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
157    case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
158      return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
159    case MULTIPLE_CONTENT_LENGTH_KEYS:
160      return "MULTIPLE_CONTENT_LENGTH_KEYS";
161    case MULTIPLE_TRANSFER_ENCODING_KEYS:
162      return "MULTIPLE_TRANSFER_ENCODING_KEYS";
163    case UNKNOWN_TRANSFER_ENCODING:
164      return "UNKNOWN_TRANSFER_ENCODING";
165    case INVALID_HEADER_FORMAT:
166      return "INVALID_HEADER_FORMAT";
167    case INTERNAL_LOGIC_ERROR:
168      return "INTERNAL_LOGIC_ERROR";
169    case NUM_ERROR_CODES:
170      return "UNKNOWN_ERROR";
171  }
172  return "UNKNOWN_ERROR";
173}
174
175// Summary:
176//     Parses the first line of either a request or response.
177//     Note that in the case of a detected warning, error_code will be set
178//   but the function will not return false.
179//     Exactly zero or one warning or error (but not both) may be detected
180//   by this function.
181//     Note that this function will not write the data of the first-line
182//   into the header's buffer (that should already have been done elsewhere).
183//
184// Pre-conditions:
185//     begin != end
186//     *begin should be a character which is > ' '. This implies that there
187//   is at least one non-whitespace characters between [begin, end).
188//   headers is a valid pointer to a BalsaHeaders class.
189//     error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
190//     Entire first line must exist between [begin, end)
191//     Exactly zero or one newlines -may- exist between [begin, end)
192//     [begin, end) should exist in the header's buffer.
193//
194// Side-effects:
195//   headers will be modified
196//   error_code may be modified if either a warning or error is detected
197//
198// Returns:
199//   True if no error (as opposed to warning) is detected.
200//   False if an error (as opposed to warning) is detected.
201
202//
// If there is indeed non-whitespace in the line, then the following
// will satisfy the "*begin > ' '" pre-condition for you:
//  while (*begin <= ' ') ++begin;
//  ParseHTTPFirstLine(begin, end, is_request, max_request_uri_length,
//                     &headers, &error_code);
//
// Splits the first line found in [begin, end) into up to three
// whitespace-separated tokens and records their boundaries, as offsets from
// |begin|, in headers->whitespace_{1..4}_idx_ and
// headers->non_whitespace_{1..3}_idx_.
//
// For requests, a second token (the request URI) longer than
// |max_request_uri_length| is an error.  For responses, the second token is
// parsed as a decimal number into headers->parsed_response_code_.
//
// Returns false when an error was detected (and stored in |*error_code|);
// returns true otherwise, possibly with a warning stored in |*error_code|.
bool ParseHTTPFirstLine(const char* begin,
                        const char* end,
                        bool is_request,
                        size_t max_request_uri_length,
                        BalsaHeaders* headers,
                        BalsaFrameEnums::ErrorCode* error_code) {
  const char* current = begin;
  // HTTP firstlines all have the following structure:
  //  LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
  //  [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
  //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
  //  |          [-------)      [-------)      [----------------)
  //    REQ:     method         request_uri    version
  //   RESP:     version        statuscode     reason
  //
  //   The first NONWS->LWS component we'll call firstline_a.
  //   The second firstline_b, and the third firstline_c.
  //
  //   firstline_a goes from nws1 to (but not including) ws2
  //   firstline_b goes from nws2 to (but not including) ws3
  //   firstline_c goes from nws3 to (but not including) ws4
  //
  // In the code:
  //    ws1 == whitespace_1_idx_
  //   nws1 == non_whitespace_1_idx_
  //    ws2 == whitespace_2_idx_
  //   nws2 == non_whitespace_2_idx_
  //    ws3 == whitespace_3_idx_
  //   nws3 == non_whitespace_3_idx_
  //    ws4 == whitespace_4_idx_

  // Kill all whitespace (including '\r\n') at the end of the line.
  // The line must end in '\n'; anything else means the caller broke the
  // pre-conditions.
  --end;
  if (*end != '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // Back up over every trailing character that is <= ' ', stopping at |begin|
  // at the latest.
  while (begin < end && *end <= ' ') {
    --end;
  }
  // If |end| still points at '\n' here, the loop above never ran, i.e. the
  // '\n' was the only character in the line.
  DCHECK(*end != '\n');
  if (*end == '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // |end| now points one past the last character that is > ' '.
  ++end;

  // The following condition should not be possible.
  if (end == begin) {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }

  // whitespace_1_idx_
  headers->whitespace_1_idx_ = current - begin;
  // This loop is commented out as it is never used in current code.  This is
  // true only because we don't begin parsing the headers at all until we've
  // encountered a non whitespace character at the beginning of the stream, at
  // which point we begin our demarcation of header-start.  If we did -not- do
  // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
  // would be necessary for the proper functioning of this parsing.
  // This is left here as this function may (in the future) be refactored out
  // of the BalsaFrame class so that it may be shared between code in
  // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
  // set_first_line() function (at which point it would be necessary).
#if 0
  while (*current <= ' ') {
    ++current;
  }
#endif
  // non_whitespace_1_idx_
  headers->non_whitespace_1_idx_ = current - begin;
  // Scan the first token (request method / response version).
  do {
    // The first time through, the current character is expected not to be
    // whitespace (pre-condition: *begin > ' '; the skipping loop above is
    // compiled out).  That implies that we get at least one non-whitespace
    // character if we get into this loop at all.
    ++current;
    if (current == end) {
      // The line holds a single token: collapse every remaining index onto
      // the end of the line so that the second and third tokens are empty.
      headers->whitespace_2_idx_ = current - begin;
      headers->non_whitespace_2_idx_ = current - begin;
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
      // Adding is_request (0 or 1) picks between the two codes.
      // NOTE(review): this assumes the request code is declared directly
      // after the response code in BalsaFrameEnums::ErrorCode -- confirm in
      // balsa_enums.h.
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
            is_request);
      if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
        // A response with nothing after the version is a hard error.
        return false;
      }
      // For a request this is only a warning; continue with the checks below.
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_2_idx_
  headers->whitespace_2_idx_ = current - begin;
  // Skip the whitespace between the first and second tokens.
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_2_idx_
  headers->non_whitespace_2_idx_ = current - begin;
  // Scan the second token (request URI / response status code).
  do {
    ++current;
    if (current == end) {
      // The line holds exactly two tokens: the third token is empty.
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE for response
      // (same "+ is_request" selection, and same enum-order assumption, as
      // above).
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
                                 + is_request);
      // Treated as a warning for both requests and responses.
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_3_idx_
  headers->whitespace_3_idx_ = current - begin;
  // Skip the whitespace between the second and third tokens.
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_3_idx_
  // The third token runs to the (whitespace-trimmed) end of the line, so it
  // may itself contain whitespace (e.g. a multi-word reason phrase).
  headers->non_whitespace_3_idx_ = current - begin;
  headers->whitespace_4_idx_ = end - begin;

 output_exhausted:
  // Note that we don't fail the parse immediately when parsing of the
  // firstline fails.  Depending on the protocol type, we may want to accept
  // a firstline with only one or two elements, e.g., for HTTP/0.9:
  //   GET\r\n
  // or
  //   GET /\r\n
  // should be parsed without issue (though the visitor should know that
  // parsing the entire line was not exactly as it should be).
  //
  // Eventually, these errors may be removed alltogether, as the visitor can
  // detect them on its own by examining the size of the various fields.
  // headers->set_first_line(non_whitespace_1_idx_, current);

  if (is_request) {
    // Length of the second token (the request URI); zero when it is missing.
    if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
        max_request_uri_length) {
      // For requests, we need at least the method.  We could assume that a
      // blank URI means "/".  If version isn't stated, it should be assumed
      // to be HTTP/0.9 by the visitor.
      // An over-long URI, however, is a hard error.
      *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
      return false;
    }
  } else {
    headers->parsed_response_code_ = 0;
    {
      // The second token of a response is the status code.
      const char* parsed_response_code_current =
        begin + headers->non_whitespace_2_idx_;
      const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
      const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;

      // Convert a string of [0-9]* into an int.
      // Note that this allows for the conversion of response codes which
      // are outside the bounds of normal HTTP response codes (no checking
      // is done to ensure that these are valid-- they're merely parsed)!
      while (parsed_response_code_current < parsed_response_code_end) {
        if (*parsed_response_code_current < '0' ||
            *parsed_response_code_current > '9') {
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        size_t status_code_x_10 = headers->parsed_response_code_ * 10;
        uint8 c = *parsed_response_code_current - '0';
        // The first test catches overflow in the multiplication by 10, the
        // second catches overflow when adding the new digit.
        if ((headers->parsed_response_code_ > kMaxDiv10) ||
            (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
          // overflow.
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        headers->parsed_response_code_ = status_code_x_10 + c;
        ++parsed_response_code_current;
      }
    }
  }
  return true;
}
401
402// begin - beginning of the firstline
403// end - end of the firstline
404//
405// A precondition for this function is that there is non-whitespace between
406// [begin, end). If this precondition is not met, the function will not perform
407// as expected (and bad things may happen, and it will eat your first, second,
408// and third unborn children!).
409//
410// Another precondition for this function is that [begin, end) includes
411// at most one newline, which must be at the end of the line.
412void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
413  BalsaFrameEnums::ErrorCode previous_error = last_error_;
414  if (!ParseHTTPFirstLine(begin,
415                          end,
416                          is_request_,
417                          max_request_uri_length_,
418                          headers_,
419                          &last_error_)) {
420    parse_state_ = BalsaFrameEnums::PARSE_ERROR;
421    visitor_->HandleHeaderError(this);
422    return;
423  }
424  if (previous_error != last_error_) {
425    visitor_->HandleHeaderWarning(this);
426  }
427
428  if (is_request_) {
429    int version_length =
430        headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
431    visitor_->ProcessRequestFirstLine(
432        begin + headers_->non_whitespace_1_idx_,
433        headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
434        begin + headers_->non_whitespace_1_idx_,
435        headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
436        begin + headers_->non_whitespace_2_idx_,
437        headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
438        begin + headers_->non_whitespace_3_idx_,
439        version_length);
440    if (version_length == 0)
441      parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
442  } else {
443    visitor_->ProcessResponseFirstLine(
444        begin + headers_->non_whitespace_1_idx_,
445        headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
446        begin + headers_->non_whitespace_1_idx_,
447        headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
448        begin + headers_->non_whitespace_2_idx_,
449        headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
450        begin + headers_->non_whitespace_3_idx_,
451        headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
452  }
453}
454
455// 'stream_begin' points to the first character of the headers buffer.
456// 'line_begin' points to the first character of the line.
457// 'current' points to a char which is ':'.
458// 'line_end' points to the position of '\n' + 1.
459// 'line_begin' points to the position of first character of line.
460void BalsaFrame::CleanUpKeyValueWhitespace(
461    const char* stream_begin,
462    const char* line_begin,
463    const char* current,
464    const char* line_end,
465    HeaderLineDescription* current_header_line) {
466  const char* colon_loc = current;
467  DCHECK_LT(colon_loc, line_end);
468  DCHECK_EQ(':', *colon_loc);
469  DCHECK_EQ(':', *current);
470  DCHECK_GE(' ', *line_end)
471    << "\"" << std::string(line_begin, line_end) << "\"";
472
473  // TODO(fenix): Investigate whether or not the bounds tests in the
474  // while loops here are redundant, and if so, remove them.
475  --current;
476  while (current > line_begin && *current <= ' ') --current;
477  current += (current != colon_loc);
478  current_header_line->key_end_idx = current - stream_begin;
479
480  current = colon_loc;
481  DCHECK_EQ(':', *current);
482  ++current;
483  while (current < line_end && *current <= ' ') ++current;
484  current_header_line->value_begin_idx = current - stream_begin;
485
486  DCHECK_GE(current_header_line->key_end_idx,
487            current_header_line->first_char_idx);
488  DCHECK_GE(current_header_line->value_begin_idx,
489            current_header_line->key_end_idx);
490  DCHECK_GE(current_header_line->last_char_idx,
491            current_header_line->value_begin_idx);
492}
493
// Walks the header lines recorded in lines_ (skipping lines_[0] and the final
// line), merges continuation lines into the header line they follow, and
// appends one HeaderLineDescription per resulting header line to
// headers_->header_lines_.  For each line the first ':' is located and the
// key end / value begin offsets are filled in; a line without a ':' raises a
// HEADER_MISSING_COLON warning on the visitor.
//
// lines_[1] is read unconditionally, so lines_ must hold at least two
// entries.
inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
  DCHECK(!lines_.empty());
  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
  // The last line is always just a newline (and is uninteresting).
  const Lines::size_type lines_size_m1 = lines_.size() - 1;
#if __SSE2__
  // Sixteen copies of ':' to compare a 16-byte block against in one go.
  const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':',
                           ':', ':', ':', ':', ':', ':', ':', ':'};
  // The vectorized scan is only used while |current| is below this bound,
  // i.e. more than 16 bytes before the end of the header stream.
  const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
#endif  // __SSE2__
  // |current| is the colon-search cursor.  It persists across iterations of
  // the loop below and may run ahead of the line being processed.
  const char* current = stream_begin + lines_[1].first;
  // This code is a bit more subtle than it may appear at first glance.
  // This code looks for a colon in the current line... but it also looks
  // beyond the current line. If there is no colon in the current line, then
  // for each subsequent line (until the colon which -has- been found is
  // associated with a line), no searching for a colon will be performed. In
  // this way, we minimize the amount of bytes we have scanned for a colon.
  for (Lines::size_type i = 1; i < lines_size_m1;) {
    const char* line_begin = stream_begin + lines_[i].first;

    // Here we handle possible continuations.  Note that we do not replace
    // the '\n' in the line before a continuation (at least, as of now),
    // which implies that any code which looks for a value must deal with
    // "\r\n", etc -within- the line (and not just at the end of it).
    for (++i; i < lines_size_m1; ++i) {
      // A following line whose first character is <= ' ' is a continuation
      // of the current header line.
      const char c = *(stream_begin + lines_[i].first);
      if (c > ' ') {
        // Not a continuation, so stop.  Note that if the 'original' i = 1,
        // and the next line is not a continuation, we'll end up with i = 2
        // when we break. This handles the incrementing of i for the outer
        // loop.
        break;
      }
    }
    // End of the last physical line belonging to this header line (one past
    // its '\n').
    const char* line_end = stream_begin + lines_[i - 1].second;
    DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);

    // We cleanup the whitespace at the end of the line before doing anything
    // else of interest as it allows us to do nothing when irregularly formatted
    // headers are parsed (e.g. those with only keys, only values, or no colon).
    //
    // Step back onto the '\n', then over every trailing character that is
    // <= ' ' (never moving before line_begin), and finally one forward again
    // so that line_end points just past the last kept character.
    --line_end;
    DCHECK_EQ('\n', *line_end)
      << "\"" << std::string(line_begin, line_end) << "\"";
    while (*line_end <= ' ' && line_end > line_begin) {
      --line_end;
    }
    ++line_end;
    DCHECK_GE(' ', *line_end);
    DCHECK_LT(line_begin, line_end);

    // We use '0' for the block idx, because we're always writing to the first
    // block from the framer (we do this because the framer requires that the
    // entire header sequence be in a contiguous buffer).
    // The middle arguments all start out as the end of the line; they are
    // only refined below if a colon is found inside the line.
    headers_->header_lines_.push_back(
        HeaderLineDescription(line_begin - stream_begin,
                              line_end - stream_begin,
                              line_end - stream_begin,
                              line_end - stream_begin,
                              0));
    if (current >= line_end) {
      // The search cursor is already at or past the end of this line, so no
      // colon was seen inside it.
      last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
      visitor_->HandleHeaderWarning(this);
      // Then the next colon will not be found within this header line-- time
      // to try again with another header-line.
      continue;
    } else if (current < line_begin) {
      // When this condition is true, the last detected colon was part of a
      // previous line.  We reset to the beginning of the line as we don't care
      // about the presence of any colon before the beginning of the current
      // line.
      current = line_begin;
    }
#if __SSE2__
    // Vectorized scan: test 16 bytes at a time for ':'.
    while (current < header_lines_end_m16) {
      // Unaligned 16-byte load starting at |current|.
      __m128i header_bytes =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
      __m128i colon_cmp =
        _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons));
      // One bit per byte; a set bit marks a byte equal to ':'.
      int colon_msk = _mm_movemask_epi8(colon_cmp);
      if (colon_msk == 0) {
        // No colon in this block; this may carry |current| past line_end.
        current += 16;
        continue;
      }
      // ffs() is 1-based, so this lands |current| on the first colon found.
      current += (ffs(colon_msk) - 1);
      if (current > line_end) {
        // The colon belongs to a later line.  Leave |current| on it; the
        // scalar loop below will not run and this line is reported as
        // missing its colon.
        break;
      }
      goto found_colon;
    }
#endif  // __SSE2__
    // Scalar scan of whatever part of the line has not been covered yet.
    for (; current < line_end; ++current) {
      if (*current != ':') {
        continue;
      }
      goto found_colon;
    }
    // If we've gotten to here, then there was no colon
    // in the line. The arguments we passed into the construction
    // for the HeaderLineDescription object should be OK-- it assumes
    // that the entire content is 'key' by default (which is true, as
    // there was no colon, there can be no value). Note that this is a
    // construct which is technically not allowed by the spec.
    last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
    visitor_->HandleHeaderWarning(this);
    continue;
 found_colon:
    DCHECK_EQ(*current, ':');
    DCHECK_LE(current - stream_begin, line_end - stream_begin);
    DCHECK_LE(stream_begin - stream_begin, current - stream_begin);

    // Provisionally place both the key end and the value begin at the colon.
    HeaderLineDescription& current_header_line = headers_->header_lines_.back();
    current_header_line.key_end_idx = current - stream_begin;
    current_header_line.value_begin_idx = current_header_line.key_end_idx;
    if (current < line_end) {
      ++current_header_line.key_end_idx;

      // Trims whitespace around the colon; this overwrites both key_end_idx
      // and value_begin_idx.
      CleanUpKeyValueWhitespace(stream_begin,
                                line_begin,
                                current,
                                line_end,
                                &current_header_line);
    }
  }
}
620
621void BalsaFrame::ProcessContentLengthLine(
622    HeaderLines::size_type line_idx,
623    BalsaHeadersEnums::ContentLengthStatus* status,
624    size_t* length) {
625  const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
626  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
627  const char* line_end = stream_begin + header_line.last_char_idx;
628  const char* value_begin = (stream_begin + header_line.value_begin_idx);
629
630  if (value_begin >= line_end) {
631    // There is no non-whitespace value data.
632#if DEBUGFRAMER
633      LOG(INFO) << "invalid content-length -- no non-whitespace value data";
634#endif
635    *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
636    return;
637  }
638
639  *length = 0;
640  while (value_begin < line_end) {
641    if (*value_begin < '0' || *value_begin > '9') {
642      // bad! content-length found, and couldn't parse all of it!
643      *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
644#if DEBUGFRAMER
645      LOG(INFO) << "invalid content-length - non numeric character detected";
646#endif  // DEBUGFRAMER
647      return;
648    }
649    const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
650    size_t length_x_10 = *length * 10;
651    const unsigned char c = *value_begin - '0';
652    if (*length > kMaxDiv10 ||
653        (std::numeric_limits<size_t>::max() - length_x_10) < c) {
654      *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
655#if DEBUGFRAMER
656      LOG(INFO) << "content-length overflow";
657#endif  // DEBUGFRAMER
658      return;
659    }
660    *length = length_x_10 + c;
661    ++value_begin;
662  }
663#if DEBUGFRAMER
664  LOG(INFO) << "content_length parsed: " << *length;
665#endif  // DEBUGFRAMER
666  *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
667}
668
669void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
670  const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
671  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
672  const char* line_end = stream_begin + header_line.last_char_idx;
673  const char* value_begin = stream_begin + header_line.value_begin_idx;
674  size_t value_length = line_end - value_begin;
675
676  if ((value_length == 7) &&
677      !strncasecmp(value_begin, "chunked", 7)) {
678    headers_->transfer_encoding_is_chunked_ = true;
679  } else if ((value_length == 8) &&
680      !strncasecmp(value_begin, "identity", 8)) {
681    headers_->transfer_encoding_is_chunked_ = false;
682  } else {
683    last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
684    parse_state_ = BalsaFrameEnums::PARSE_ERROR;
685    visitor_->HandleHeaderError(this);
686    return;
687  }
688}
689
690namespace {
691bool SplitStringPiece(base::StringPiece original, char delim,
692                      base::StringPiece* before, base::StringPiece* after) {
693  const char* p = original.data();
694  const char* end = p + original.size();
695
696  while (p != end) {
697    if (*p == delim) {
698      ++p;
699    } else {
700      const char* start = p;
701      while (++p != end && *p != delim) {
702        // Skip to the next occurence of the delimiter.
703      }
704      *before = base::StringPiece(start, p - start);
705      if (p != end)
706        *after = base::StringPiece(p + 1, end - (p + 1));
707      else
708        *after = base::StringPiece("");
709      StringPieceUtils::RemoveWhitespaceContext(before);
710      StringPieceUtils::RemoveWhitespaceContext(after);
711      return true;
712    }
713  }
714
715  *before = original;
716  *after = "";
717  return false;
718}
719
720// TODO(phython): Fix this function to properly deal with quoted values.
721// E.g. ";;foo", "\";;\"", or \"aa;
722// The last example, the semi-colon is a separator between extensions.
723void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
724                                  BalsaHeaders* extensions) {
725  base::StringPiece extension;
726  base::StringPiece remaining;
727  StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
728  SplitStringPiece(all_extensions, ';', &extension, &remaining);
729  while (!extension.empty()) {
730    base::StringPiece key;
731    base::StringPiece value;
732    SplitStringPiece(extension, '=', &key, &value);
733    if (!value.empty()) {
734      // Strip quotation marks if they exist.
735      if (!value.empty() && value[0] == '"')
736        value.remove_prefix(1);
737      if (!value.empty() && value[value.length() - 1] == '"')
738        value.remove_suffix(1);
739    }
740
741    extensions->AppendHeader(key, value);
742
743    StringPieceUtils::RemoveWhitespaceContext(&remaining);
744    SplitStringPiece(remaining, ';', &extension, &remaining);
745  }
746}
747
748// TODO(phython): Fix this function to properly deal with quoted values.
749// E.g. ";;foo", "\";;\"", or \"aa;
750// The last example, the semi-colon is a separator between extensions.
751void ProcessChunkExtensionsGoogle3(const char* input, size_t size,
752                                   BalsaHeaders* extensions) {
753  std::vector<base::StringPiece> key_values;
754  SplitStringPieceToVector(base::StringPiece(input, size), ";",
755                           &key_values, true);
756  for (unsigned int i = 0; i < key_values.size(); ++i) {
757    base::StringPiece key = key_values[i].substr(0, key_values[i].find('='));
758    base::StringPiece value;
759    if (key.length() < key_values[i].length()) {
760      value = key_values[i].substr(key.length() + 1);
761      // Remove any leading and trailing whitespace.
762      StringPieceUtils::RemoveWhitespaceContext(&value);
763
764      // Strip quotation marks if they exist.
765      if (!value.empty() && value[0] == '"')
766        value.remove_prefix(1);
767      if (!value.empty() && value[value.length() - 1] == '"')
768        value.remove_suffix(1);
769    }
770
771    // Strip the key whitespace after checking that there is a value.
772    StringPieceUtils::RemoveWhitespaceContext(&key);
773    extensions->AppendHeader(key, value);
774  }
775}
776
777}  // anonymous namespace
778
779void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
780                                        BalsaHeaders* extensions) {
781#if 0
782  ProcessChunkExtensionsGoogle3(input, size, extensions);
783#else
784  ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
785#endif
786}
787
788void BalsaFrame::ProcessHeaderLines() {
789  HeaderLines::size_type content_length_idx = 0;
790  HeaderLines::size_type transfer_encoding_idx = 0;
791
792  DCHECK(!lines_.empty());
793#if DEBUGFRAMER
794  LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
795#endif  // DEBUGFRAMER
796
797  // There is no need to attempt to process headers if no header lines exist.
798  // There are at least two lines in the message which are not header lines.
799  // These two non-header lines are the first line of the message, and the
800  // last line of the message (which is an empty line).
801  // Thus, we test to see if we have more than two lines total before attempting
802  // to parse any header lines.
803  if (lines_.size() > 2) {
804    const char* stream_begin = headers_->OriginalHeaderStreamBegin();
805
806    // Then, for the rest of the header data, we parse these into key-value
807    // pairs.
808    FindColonsAndParseIntoKeyValue();
809    // At this point, we've parsed all of the headers.  Time to look for those
810    // headers which we require for framing.
811    const HeaderLines::size_type
812      header_lines_size = headers_->header_lines_.size();
813    for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
814      const HeaderLineDescription& current_header_line =
815        headers_->header_lines_[i];
816      const char* key_begin =
817        (stream_begin + current_header_line.first_char_idx);
818      const char* key_end = (stream_begin + current_header_line.key_end_idx);
819      const size_t key_len = key_end - key_begin;
820      const char c = *key_begin;
821#if DEBUGFRAMER
822      LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
823                << " c: '" << c << "' key_len: " << key_len;
824#endif  // DEBUGFRAMER
825      // If a header begins with either lowercase or uppercase 'c' or 't', then
826      // the header may be one of content-length, connection, content-encoding
827      // or transfer-encoding. These headers are special, as they change the way
828      // that the message is framed, and so the framer is required to search
829      // for them.
830
831
832      if (c == 'c' || c == 'C') {
833        if ((key_len == kContentLengthSize) &&
834            0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
835          BalsaHeadersEnums::ContentLengthStatus content_length_status =
836            BalsaHeadersEnums::NO_CONTENT_LENGTH;
837          size_t length = 0;
838          ProcessContentLengthLine(i, &content_length_status, &length);
839          if (content_length_idx != 0) {  // then we've already seen one!
840            if ((headers_->content_length_status_ != content_length_status) ||
841                ((headers_->content_length_status_ ==
842                  BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
843                 length != headers_->content_length_)) {
844              last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
845              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
846              visitor_->HandleHeaderError(this);
847              return;
848            }
849            continue;
850          } else {
851            content_length_idx = i + 1;
852            headers_->content_length_status_ = content_length_status;
853            headers_->content_length_ = length;
854            content_length_remaining_ = length;
855          }
856
857        }
858      } else if (c == 't' || c == 'T') {
859        if ((key_len == kTransferEncodingSize) &&
860            0 == strncasecmp(key_begin, kTransferEncoding,
861                             kTransferEncodingSize)) {
862          if (transfer_encoding_idx != 0) {
863            last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
864            parse_state_ = BalsaFrameEnums::PARSE_ERROR;
865            visitor_->HandleHeaderError(this);
866            return;
867          }
868          transfer_encoding_idx = i + 1;
869        }
870      } else if (i == 0 && (key_len == 0 || c == ' ')) {
871        last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
872        parse_state_ = BalsaFrameEnums::PARSE_ERROR;
873        visitor_->HandleHeaderError(this);
874        return;
875      }
876    }
877    if (headers_->transfer_encoding_is_chunked_) {
878      headers_->content_length_ = 0;
879      headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
880      content_length_remaining_ = 0;
881    }
882    if (transfer_encoding_idx != 0) {
883      ProcessTransferEncodingLine(transfer_encoding_idx - 1);
884    }
885  }
886}
887
888void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
889  // For responses, can't have a body if the request was a HEAD, or if it is
890  // one of these response-codes.  rfc2616 section 4.3
891  parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
892  if (is_request_ ||
893      !(request_was_head_ ||
894        (headers_->parsed_response_code_ >= 100 &&
895         headers_->parsed_response_code_ < 200) ||
896        (headers_->parsed_response_code_ == 204) ||
897        (headers_->parsed_response_code_ == 304))) {
898    // Then we can have a body.
899    if (headers_->transfer_encoding_is_chunked_) {
900      // Note that
901      // if ( Transfer-Encoding: chunked &&  Content-length: )
902      // then Transfer-Encoding: chunked trumps.
903      // This is as specified in the spec.
904      // rfc2616 section 4.4.3
905      parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
906    } else {
907      // Errors parsing content-length definitely can cause
908      // protocol errors/warnings
909      switch (headers_->content_length_status_) {
910        // If we have a content-length, and it is parsed
911        // properly, there are two options.
912        // 1) zero content, in which case the message is done, and
913        // 2) nonzero content, in which case we have to
914        //    consume the body.
915        case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
916          if (headers_->content_length_ == 0) {
917            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
918          } else {
919            parse_state_ = BalsaFrameEnums::READING_CONTENT;
920          }
921          break;
922        case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
923        case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
924          // If there were characters left-over after parsing the
925          // content length, we should flag an error and stop.
926          parse_state_ = BalsaFrameEnums::PARSE_ERROR;
927          last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
928          visitor_->HandleHeaderError(this);
929          break;
930          // We can have: no transfer-encoding, no content length, and no
931          // connection: close...
932          // Unfortunately, this case doesn't seem to be covered in the spec.
933          // We'll assume that the safest thing to do here is what the google
934          // binaries before 2008 already do, which is to assume that
935          // everything until the connection is closed is body.
936        case BalsaHeadersEnums::NO_CONTENT_LENGTH:
937          if (is_request_) {
938            base::StringPiece method = headers_->request_method();
939            // POSTs and PUTs should have a detectable body length.  If they
940            // do not we consider it an error.
941            if ((method.size() == 4 &&
942                 strncmp(method.data(), "POST", 4) == 0) ||
943                (method.size() == 3 &&
944                 strncmp(method.data(), "PUT", 3) == 0)) {
945              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
946              last_error_ =
947                  BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
948              visitor_->HandleHeaderError(this);
949              break;
950            }
951            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
952          } else {
953            parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
954            last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
955            visitor_->HandleHeaderWarning(this);
956          }
957          break;
958          // The COV_NF_... statements here provide hints to the apparatus
959          // which computes coverage reports/ratios that this code is never
960          // intended to be executed, and should technically be impossible.
961          // COV_NF_START
962        default:
963          LOG(FATAL) << "Saw a content_length_status: "
964           << headers_->content_length_status_ << " which is unknown.";
965          // COV_NF_END
966      }
967    }
968  }
969}
970
971size_t BalsaFrame::ProcessHeaders(const char* message_start,
972                                  size_t message_length) {
973  const char* const original_message_start = message_start;
974  const char* const message_end = message_start + message_length;
975  const char* message_current = message_start;
976  const char* checkpoint = message_start;
977
978  if (message_length == 0) {
979    goto bottom;
980  }
981
982  while (message_current < message_end) {
983    size_t base_idx = headers_->GetReadableBytesFromHeaderStream();
984
985    // Yes, we could use strchr (assuming null termination), or
986    // memchr, but as it turns out that is slower than this tight loop
987    // for the input that we see.
988    if (!saw_non_newline_char_) {
989      do {
990        const char c = *message_current;
991        if (c != '\r' && c != '\n') {
992          if (c <= ' ') {
993            parse_state_ = BalsaFrameEnums::PARSE_ERROR;
994            last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
995            visitor_->HandleHeaderError(this);
996            goto bottom;
997          } else {
998            saw_non_newline_char_ = true;
999            checkpoint = message_start = message_current;
1000            goto read_real_message;
1001          }
1002        }
1003        ++message_current;
1004      } while (message_current < message_end);
1005      goto bottom;  // this is necessary to skip 'last_char_was_slash_r' checks
1006    } else {
1007 read_real_message:
1008      // Note that SSE2 can be enabled on certain piii platforms.
1009#if __SSE2__
1010      {
1011        const char* const message_end_m16 = message_end - 16;
1012        __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
1013                             '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' };
1014        while (message_current < message_end_m16) {
1015          // What this does (using compiler intrinsics):
1016          //
1017          // Load 16 '\n's into an xmm register
1018          // Load 16 bytes of currennt message into an xmm register
1019          // Do byte-wise equals on those two xmm registers
1020          // Take the first bit of each byte, and put that into the first
1021          //   16 bits of a mask
1022          // If the mask is zero, no '\n' found. increment by 16 and try again
1023          // Else scan forward to find the first set bit.
1024          // Increment current by the index of the first set bit
1025          //   (ffs returns index of first set bit + 1)
1026          __m128i msg_bytes =
1027            _mm_loadu_si128(const_cast<__m128i *>(
1028                    reinterpret_cast<const __m128i *>(message_current)));
1029          __m128i newline_cmp =
1030            _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines));
1031          int newline_msk = _mm_movemask_epi8(newline_cmp);
1032          if (newline_msk == 0) {
1033            message_current += 16;
1034            continue;
1035          }
1036          message_current += (ffs(newline_msk) - 1);
1037          const size_t relative_idx = message_current - message_start;
1038          const size_t message_current_idx = 1 + base_idx + relative_idx;
1039          lines_.push_back(std::make_pair(last_slash_n_idx_,
1040                                          message_current_idx));
1041          if (lines_.size() == 1) {
1042            headers_->WriteFromFramer(checkpoint,
1043                                      1 + message_current - checkpoint);
1044            checkpoint = message_current + 1;
1045            const char* begin = headers_->OriginalHeaderStreamBegin();
1046#if DEBUGFRAMER
1047          LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1048          LOG(INFO) << "is_request_: " << is_request_;
1049#endif
1050            ProcessFirstLine(begin, begin + lines_[0].second);
1051            if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1052              goto process_lines;
1053            else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1054              goto bottom;
1055          }
1056          const size_t chars_since_last_slash_n = (message_current_idx -
1057                                                   last_slash_n_idx_);
1058          last_slash_n_idx_ = message_current_idx;
1059          if (chars_since_last_slash_n > 2) {
1060            // We have a slash-n, but the last slash n was
1061            // more than 2 characters away from this. Thus, we know
1062            // that this cannot be an end-of-header.
1063            ++message_current;
1064            continue;
1065          }
1066          if ((chars_since_last_slash_n == 1) ||
1067              (((message_current > message_start) &&
1068                (*(message_current - 1) == '\r')) ||
1069               (last_char_was_slash_r_))) {
1070            goto process_lines;
1071          }
1072          ++message_current;
1073        }
1074      }
1075#endif  // __SSE2__
1076      while (message_current < message_end) {
1077        if (*message_current != '\n') {
1078          ++message_current;
1079          continue;
1080        }
1081        const size_t relative_idx = message_current - message_start;
1082        const size_t message_current_idx = 1 + base_idx + relative_idx;
1083        lines_.push_back(std::make_pair(last_slash_n_idx_,
1084                                        message_current_idx));
1085        if (lines_.size() == 1) {
1086          headers_->WriteFromFramer(checkpoint,
1087                                    1 + message_current - checkpoint);
1088          checkpoint = message_current + 1;
1089          const char* begin = headers_->OriginalHeaderStreamBegin();
1090#if DEBUGFRAMER
1091          LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1092          LOG(INFO) << "is_request_: " << is_request_;
1093#endif
1094          ProcessFirstLine(begin, begin + lines_[0].second);
1095          if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1096            goto process_lines;
1097          else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1098            goto bottom;
1099        }
1100        const size_t chars_since_last_slash_n = (message_current_idx -
1101                                                 last_slash_n_idx_);
1102        last_slash_n_idx_ = message_current_idx;
1103        if (chars_since_last_slash_n > 2) {
1104          // false positive.
1105          ++message_current;
1106          continue;
1107        }
1108        if ((chars_since_last_slash_n == 1) ||
1109            (((message_current > message_start) &&
1110              (*(message_current - 1) == '\r')) ||
1111             (last_char_was_slash_r_))) {
1112          goto process_lines;
1113        }
1114        ++message_current;
1115      }
1116    }
1117    continue;
1118 process_lines:
1119    ++message_current;
1120    DCHECK(message_current >= message_start);
1121    if (message_current > message_start) {
1122      headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1123    }
1124
1125    // Check if we have exceeded maximum headers length
1126    // Although we check for this limit before and after we call this function
1127    // we check it here as well to make sure that in case the visitor changed
1128    // the max_header_length_ (for example after processing the first line)
1129    // we handle it gracefully.
1130    if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
1131      parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1132      last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1133      visitor_->HandleHeaderError(this);
1134      goto bottom;
1135    }
1136
1137    // Since we know that we won't be writing any more bytes of the header,
1138    // we tell that to the headers object. The headers object may make
1139    // more efficient allocation decisions when this is signaled.
1140    headers_->DoneWritingFromFramer();
1141    {
1142      const char* readable_ptr = NULL;
1143      size_t readable_size = 0;
1144      headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
1145      visitor_->ProcessHeaderInput(readable_ptr, readable_size);
1146    }
1147
1148    // Ok, now that we've written everything into our header buffer, it is
1149    // time to process the header lines (extract proper values for headers
1150    // which are important for framing).
1151    ProcessHeaderLines();
1152    if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1153      goto bottom;
1154    }
1155    AssignParseStateAfterHeadersHaveBeenParsed();
1156    if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1157      goto bottom;
1158    }
1159    visitor_->ProcessHeaders(*headers_);
1160    visitor_->HeaderDone();
1161    if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
1162      visitor_->MessageDone();
1163    }
1164    goto bottom;
1165  }
1166  // If we've gotten to here, it means that we've consumed all of the
1167  // available input. We need to record whether or not the last character we
1168  // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
1169  // a header framing that is split across the two calls.
1170  last_char_was_slash_r_ = (*(message_end - 1) == '\r');
1171  DCHECK(message_current >= message_start);
1172  if (message_current > message_start) {
1173    headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1174  }
1175 bottom:
1176  return message_current - original_message_start;
1177}
1178
1179
1180size_t BalsaFrame::BytesSafeToSplice() const {
1181  switch (parse_state_) {
1182    case BalsaFrameEnums::READING_CHUNK_DATA:
1183      return chunk_length_remaining_;
1184    case BalsaFrameEnums::READING_UNTIL_CLOSE:
1185      return std::numeric_limits<size_t>::max();
1186    case BalsaFrameEnums::READING_CONTENT:
1187      return content_length_remaining_;
1188    default:
1189      return 0;
1190  }
1191}
1192
1193void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1194  switch (parse_state_) {
1195    case BalsaFrameEnums::READING_CHUNK_DATA:
1196      if (chunk_length_remaining_ >= bytes_spliced) {
1197        chunk_length_remaining_ -= bytes_spliced;
1198        if (chunk_length_remaining_ == 0) {
1199          parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1200        }
1201        return;
1202      } else {
1203        last_error_ =
1204          BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1205        goto error_exit;
1206      }
1207
1208    case BalsaFrameEnums::READING_UNTIL_CLOSE:
1209      return;
1210
1211    case BalsaFrameEnums::READING_CONTENT:
1212      if (content_length_remaining_ >= bytes_spliced) {
1213        content_length_remaining_ -= bytes_spliced;
1214        if (content_length_remaining_ == 0) {
1215          parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1216          visitor_->MessageDone();
1217        }
1218        return;
1219      } else {
1220        last_error_ =
1221          BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1222        goto error_exit;
1223      }
1224
1225    default:
1226      last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1227      goto error_exit;
1228  }
1229
1230 error_exit:
1231  parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1232  visitor_->HandleBodyError(this);
1233};
1234
1235// You may note that the state-machine contained within this function has both
1236// switch and goto labels for nearly the same thing. For instance, the
1237// following two labels refer to the same code block:
1238//   label_reading_chunk_data:
1239//   case BalsaFrameEnums::READING_CHUNK_DATA:
1240// The 'case' statement is required for the switch statement which occurs when
1241// ProcessInput is invoked. The goto label is required as the state-machine
1242// does not use a computed goto in any subsequent operations.
1243//
1244// Since several states exit the state machine for various reasons, there is
1245// also one label at the bottom of the function. When it is appropriate to
// return from the function, that part of the state machine instead issues a
// "goto bottom;". This results in less code duplication and makes debugging
// easier, as you can add a statement to a section of code which is guaranteed
// to be invoked when the function is exiting.
1250size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1251  const char* current = input;
1252  const char* on_entry = current;
1253  const char* end = current + size;
1254#if DEBUGFRAMER
1255  LOG(INFO) << "\n=============="
1256            << BalsaFrameEnums::ParseStateToString(parse_state_)
1257            << "===============\n";
1258#endif  // DEBUGFRAMER
1259
1260  DCHECK(headers_ != NULL);
1261  if (headers_ == NULL) return 0;
1262
1263  if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1264    const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1265    // Yes, we still have to check this here as the user can change the
1266    // max_header_length amount!
1267    // Also it is possible that we have reached the maximum allowed header size,
1268    // and we have more to consume (remember we are still inside
1269    // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1270    if (header_length > max_header_length_ ||
1271        (header_length == max_header_length_ && size > 0)) {
1272      parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1273      last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1274      visitor_->HandleHeaderError(this);
1275      goto bottom;
1276    }
1277    size_t bytes_to_process = max_header_length_ - header_length;
1278    if (bytes_to_process > size) {
1279      bytes_to_process = size;
1280    }
1281    current += ProcessHeaders(input, bytes_to_process);
1282    // If we are still reading headers check if we have crossed the headers
1283    // limit. Note that we check for >= as opposed to >. This is because if
1284    // header_length_after equals max_header_length_ and we are still in the
1285    // parse_state_  BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1286    // sure that the headers limit will be crossed later on
1287    if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1288      // Note that headers_ is valid only if we are still reading headers.
1289      const size_t header_length_after =
1290          headers_->GetReadableBytesFromHeaderStream();
1291      if (header_length_after >= max_header_length_) {
1292        parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1293        last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1294        visitor_->HandleHeaderError(this);
1295      }
1296    }
1297    goto bottom;
1298  } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1299             parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1300    // Can do nothing more 'till we're reset.
1301    goto bottom;
1302  }
1303
1304  while (current < end) {
1305    switch (parse_state_) {
1306 label_reading_chunk_length:
1307      case BalsaFrameEnums::READING_CHUNK_LENGTH:
1308        // In this state we read the chunk length.
1309        // Note that once we hit a character which is not in:
1310        // [0-9;A-Fa-f\n], we transition to a different state.
1311        //
1312        {
1313          // If we used strtol, etc, we'd have to buffer this line.
1314          // This is more annoying than simply doing the conversion
1315          // here. This code accounts for overflow.
1316          static const signed char buf[] = {
1317            // %0  %1  %2  %3  %4  %5  %6  %7  %8  \t  \n  %b  %c  \r  %e  %f
1318               -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1319            // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1320               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1321            // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1322               -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1323            // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1324                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -2, -1, -1, -1, -1,
1325            // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1326               -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1327            // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1328               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1329            // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1330               -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1331            // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1332               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1333          };
1334          // valid cases:
1335          //  "09123\n"                      // -> 09123
1336          //  "09123\r\n"                    // -> 09123
1337          //  "09123  \n"                    // -> 09123
1338          //  "09123  \r\n"                  // -> 09123
1339          //  "09123  12312\n"               // -> 09123
1340          //  "09123  12312\r\n"             // -> 09123
1341          //  "09123; foo=bar\n"             // -> 09123
1342          //  "09123; foo=bar\r\n"           // -> 09123
1343          //  "FFFFFFFFFFFFFFFF\r\n"         // -> FFFFFFFFFFFFFFFF
1344          //  "FFFFFFFFFFFFFFFF 22\r\n"      // -> FFFFFFFFFFFFFFFF
1345          // invalid cases:
1346          // "[ \t]+[^\n]*\n"
1347          // "FFFFFFFFFFFFFFFFF\r\n"  (would overflow)
1348          // "\r\n"
1349          // "\n"
1350          while (current < end) {
1351            const char c = *current;
1352            ++current;
1353            const signed char addition = buf[static_cast<int>(c)];
1354            if (addition >= 0) {
1355              chunk_length_character_extracted_ = true;
1356              size_t length_x_16 = chunk_length_remaining_ * 16;
1357              const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1358              if ((chunk_length_remaining_ > kMaxDiv16) ||
1359                  ((std::numeric_limits<size_t>::max() - length_x_16) <
1360                   static_cast<size_t>(addition))) {
1361                // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1362                parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1363                last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1364                visitor_->ProcessBodyInput(on_entry, current - on_entry);
1365                visitor_->HandleChunkingError(this);
1366                goto bottom;
1367              }
1368              chunk_length_remaining_ = length_x_16 + addition;
1369              continue;
1370            }
1371
1372            if (!chunk_length_character_extracted_ || addition == -1) {
1373              // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1374              // characters were converted, or an unexpected character was
1375              // seen.
1376              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1377              last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1378              visitor_->ProcessBodyInput(on_entry, current - on_entry);
1379              visitor_->HandleChunkingError(this);
1380              goto bottom;
1381            }
1382
1383            --current;
1384            parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1385            visitor_->ProcessChunkLength(chunk_length_remaining_);
1386            goto label_reading_chunk_extension;
1387          }
1388        }
1389        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1390        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_LENGTH
1391
1392 label_reading_chunk_extension:
1393      case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1394        {
1395          // TODO(phython): Convert this scanning to be 16 bytes at a time if
1396          // there is data to be read.
1397          const char* extensions_start = current;
1398          size_t extensions_length = 0;
1399          while (current < end) {
1400            const char c = *current;
1401            if (c == '\r' || c == '\n') {
1402              extensions_length =
1403                  (extensions_start == current) ?
1404                  0 :
1405                  current - extensions_start - 1;
1406            }
1407
1408            ++current;
1409            if (c == '\n') {
1410              chunk_length_character_extracted_ = false;
1411              visitor_->ProcessChunkExtensions(
1412                  extensions_start, extensions_length);
1413              if (chunk_length_remaining_ != 0) {
1414                parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1415                goto label_reading_chunk_data;
1416              }
1417              HeaderFramingFound('\n');
1418              parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1419              goto label_reading_last_chunk_term;
1420            }
1421          }
1422          visitor_->ProcessChunkExtensions(
1423              extensions_start, extensions_length);
1424        }
1425
1426        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1427        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1428
1429 label_reading_chunk_data:
1430      case BalsaFrameEnums::READING_CHUNK_DATA:
1431        while (current < end) {
1432          if (chunk_length_remaining_ == 0) {
1433            break;
1434          }
1435          // read in the chunk
1436          size_t bytes_remaining = end - current;
1437          size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1438            chunk_length_remaining_ : bytes_remaining;
1439          const char* tmp_current = current + consumed_bytes;
1440          visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1441          visitor_->ProcessBodyData(current, consumed_bytes);
1442          on_entry = current = tmp_current;
1443          chunk_length_remaining_ -= consumed_bytes;
1444        }
1445        if (chunk_length_remaining_ == 0) {
1446          parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1447          goto label_reading_chunk_term;
1448        }
1449        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1450        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_DATA
1451
1452 label_reading_chunk_term:
1453      case BalsaFrameEnums::READING_CHUNK_TERM:
1454        while (current < end) {
1455          const char c = *current;
1456          ++current;
1457
1458          if (c == '\n') {
1459            parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1460            goto label_reading_chunk_length;
1461          }
1462        }
1463        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1464        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_TERM
1465
1466 label_reading_last_chunk_term:
1467      case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1468        while (current < end) {
1469          const char c = *current;
1470
1471          if (!HeaderFramingFound(c)) {
1472            // If not, however, since the spec only suggests that the
1473            // client SHOULD indicate the presence of trailers, we get to
1474            // *test* that they did or didn't.
1475            // If all of the bytes we've seen since:
1476            //   OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1477            // are either '\r', or '\n', then we can assume that we don't yet
1478            // know if we need to parse headers, or if the next byte will make
1479            // the HeaderFramingFound condition (above) true.
1480            if (HeaderFramingMayBeFound()) {
1481              // If true, then we have seen only characters '\r' or '\n'.
1482              ++current;
1483
1484              // Let's try again! There is no state change here.
1485              continue;
1486            } else {
1487              // If (!HeaderFramingMayBeFound()), then we know that we must be
1488              // reading the first non CRLF character of a trailer.
1489              parse_state_ = BalsaFrameEnums::READING_TRAILER;
1490              visitor_->ProcessBodyInput(on_entry, current - on_entry);
1491              on_entry = current;
1492              goto label_reading_trailer;
1493            }
1494          } else {
1495            // If we've found a "\r\n\r\n", then the message
1496            // is done.
1497            ++current;
1498            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1499            visitor_->ProcessBodyInput(on_entry, current - on_entry);
1500            visitor_->MessageDone();
1501            goto bottom;
1502          }
1503          break;  // from while loop
1504        }
1505        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1506        goto bottom;  // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1507
1508 label_reading_trailer:
1509      case BalsaFrameEnums::READING_TRAILER:
1510        while (current < end) {
1511          const char c = *current;
1512          ++current;
1513          // TODO(fenix): If we ever care about trailers as part of framing,
1514          // deal with them here (see below for part of the 'solution')
1515          // if (LineFramingFound(c)) {
1516          // trailer_lines_.push_back(make_pair(start_of_line_,
1517          //                                   trailer_length_ - 1));
1518          // start_of_line_ = trailer_length_;
1519          // }
1520          if (HeaderFramingFound(c)) {
1521            // ProcessTrailers(visitor_, &trailers_);
1522            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1523            visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1524            visitor_->MessageDone();
1525            goto bottom;
1526          }
1527        }
1528        visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1529        break;  // case BalsaFrameEnums::READING_TRAILER
1530
1531        // Note that there is no label:
1532        //   'label_reading_until_close'
1533        // here. This is because the state-machine exits immediately after
1534        // reading the headers instead of transitioning here (as it would
1535        // do if it was consuming all the data it could, all the time).
1536      case BalsaFrameEnums::READING_UNTIL_CLOSE:
1537        {
1538          const size_t bytes_remaining = end - current;
1539          if (bytes_remaining > 0) {
1540            visitor_->ProcessBodyInput(current, bytes_remaining);
1541            visitor_->ProcessBodyData(current, bytes_remaining);
1542            current += bytes_remaining;
1543          }
1544        }
1545        goto bottom;  // case BalsaFrameEnums::READING_UNTIL_CLOSE
1546
1547        // label_reading_content:
1548      case BalsaFrameEnums::READING_CONTENT:
1549#if DEBUGFRAMER
1550        LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1551#endif  // DEBUGFRAMER
1552        while (content_length_remaining_ && current < end) {
1553          // read in the content
1554          const size_t bytes_remaining = end - current;
1555          const size_t consumed_bytes =
1556            (content_length_remaining_ < bytes_remaining) ?
1557            content_length_remaining_ : bytes_remaining;
1558          visitor_->ProcessBodyInput(current, consumed_bytes);
1559          visitor_->ProcessBodyData(current, consumed_bytes);
1560          current += consumed_bytes;
1561          content_length_remaining_ -= consumed_bytes;
1562        }
1563        if (content_length_remaining_ == 0) {
1564          parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1565          visitor_->MessageDone();
1566        }
1567        goto bottom;  // case BalsaFrameEnums::READING_CONTENT
1568
1569      default:
1570        // The state-machine should never be in a state that isn't handled
1571        // above.  This is a glaring logic error, and we should do something
1572        // drastic to ensure that this gets looked-at and fixed.
1573        LOG(FATAL) << "Unknown state: " << parse_state_  // COV_NF_LINE
1574          << " memory corruption?!";                     // COV_NF_LINE
1575    }
1576  }
1577 bottom:
1578#if DEBUGFRAMER
1579  LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1580    << std::string(input, current)
1581    << "\n$$$$$$$$$$$$$$"
1582    << BalsaFrameEnums::ParseStateToString(parse_state_)
1583    << "$$$$$$$$$$$$$$$"
1584    << " consumed: " << (current - input);
1585  if (Error()) {
1586    LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1587  }
1588#endif  // DEBUGFRAMER
1589  return current - input;
1590}
1591
// Out-of-class (namespace-scope) definitions of BalsaFrame's static const
// data members. No initializers appear here, so the values must be supplied
// at the in-class declarations (presumably in balsa_frame.h -- not visible
// from this file; confirm there). Providing these definitions gives the
// constants a single definition in this translation unit, which is required
// whenever one of them is odr-used (e.g. bound to a const reference or has
// its address taken).
1592const uint32 BalsaFrame::kValidTerm1;
1593const uint32 BalsaFrame::kValidTerm1Mask;
1594const uint32 BalsaFrame::kValidTerm2;
1595const uint32 BalsaFrame::kValidTerm2Mask;
1596
1597}  // namespace net
1598