balsa_frame.cc revision f8ee788a64d60abd8f2d742a5fdedde054ecd910
1// Copyright 2013 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/tools/balsa/balsa_frame.h"
6
7#include <assert.h>
8#if __SSE2__
9#include <emmintrin.h>
10#endif  // __SSE2__
11
12#include <limits>
13#include <string>
14#include <utility>
15#include <vector>
16
17#include "base/logging.h"
18#include "base/port.h"
19#include "base/strings/string_piece.h"
20#include "net/tools/balsa/balsa_enums.h"
21#include "net/tools/balsa/balsa_headers.h"
22#include "net/tools/balsa/balsa_visitor_interface.h"
23#include "net/tools/balsa/buffer_interface.h"
24#include "net/tools/balsa/simple_buffer.h"
25#include "net/tools/balsa/split.h"
26#include "net/tools/balsa/string_piece_utils.h"
27
28#if defined(COMPILER_MSVC)
29#include <string.h>
30#define strncasecmp _strnicmp
31#else
32#include <strings.h>
33#endif
34
35namespace net {
36
// Names of the headers which can affect the way the HTTP message is framed,
// and so must be processed specially. Both names are spelled in lower case.
static const char kContentLength[] = "content-length";
// Length of kContentLength, excluding the terminating NUL.
static const size_t kContentLengthSize = sizeof(kContentLength) - 1;
static const char kTransferEncoding[] = "transfer-encoding";
// Length of kTransferEncoding, excluding the terminating NUL.
static const size_t kTransferEncodingSize = sizeof(kTransferEncoding) - 1;
43
// Builds a framer in its initial state: parsing starts in
// READING_HEADER_AND_FIRSTLINE with NO_ERROR recorded, the framer is set to
// parse requests (is_request_ == true), events are delivered to the built-in
// do_nothing_visitor_, and no BalsaHeaders object is attached yet (headers_
// is NULL).
BalsaFrame::BalsaFrame()
    : last_char_was_slash_r_(false),
      saw_non_newline_char_(false),
      start_was_space_(true),
      chunk_length_character_extracted_(false),
      is_request_(true),
      request_was_head_(false),
      max_header_length_(16 * 1024),  // Default; Reset() does not restore it.
      max_request_uri_length_(2048),  // Default; Reset() does not restore it.
      visitor_(&do_nothing_visitor_),
      chunk_length_remaining_(0),
      content_length_remaining_(0),
      last_slash_n_loc_(NULL),
      last_recorded_slash_n_loc_(NULL),
      last_slash_n_idx_(0),
      term_chars_(0),
      parse_state_(BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE),
      last_error_(BalsaFrameEnums::NO_ERROR),
      headers_(NULL) {
}
64
// The destructor body is empty; nothing is released explicitly here.
BalsaFrame::~BalsaFrame() {}
66
67void BalsaFrame::Reset() {
68  last_char_was_slash_r_ = false;
69  saw_non_newline_char_ = false;
70  start_was_space_ = true;
71  chunk_length_character_extracted_ = false;
72  // is_request_ = true;               // not reset between messages.
73  // request_was_head_ = false;        // not reset between messages.
74  // max_header_length_ = 4096;        // not reset between messages.
75  // max_request_uri_length_ = 2048;   // not reset between messages.
76  // visitor_ = &do_nothing_visitor_;  // not reset between messages.
77  chunk_length_remaining_ = 0;
78  content_length_remaining_ = 0;
79  last_slash_n_loc_ = NULL;
80  last_recorded_slash_n_loc_ = NULL;
81  last_slash_n_idx_ = 0;
82  term_chars_ = 0;
83  parse_state_ = BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE;
84  last_error_ = BalsaFrameEnums::NO_ERROR;
85  lines_.clear();
86  if (headers_ != NULL) {
87    headers_->Clear();
88  }
89}
90
91const char* BalsaFrameEnums::ParseStateToString(
92    BalsaFrameEnums::ParseState error_code) {
93  switch (error_code) {
94    case PARSE_ERROR:
95      return "PARSE_ERROR";
96    case READING_HEADER_AND_FIRSTLINE:
97      return "READING_HEADER_AND_FIRSTLINE";
98    case READING_CHUNK_LENGTH:
99      return "READING_CHUNK_LENGTH";
100    case READING_CHUNK_EXTENSION:
101      return "READING_CHUNK_EXTENSION";
102    case READING_CHUNK_DATA:
103      return "READING_CHUNK_DATA";
104    case READING_CHUNK_TERM:
105      return "READING_CHUNK_TERM";
106    case READING_LAST_CHUNK_TERM:
107      return "READING_LAST_CHUNK_TERM";
108    case READING_TRAILER:
109      return "READING_TRAILER";
110    case READING_UNTIL_CLOSE:
111      return "READING_UNTIL_CLOSE";
112    case READING_CONTENT:
113      return "READING_CONTENT";
114    case MESSAGE_FULLY_READ:
115      return "MESSAGE_FULLY_READ";
116    case NUM_STATES:
117      return "UNKNOWN_STATE";
118  }
119  return "UNKNOWN_STATE";
120}
121
122const char* BalsaFrameEnums::ErrorCodeToString(
123    BalsaFrameEnums::ErrorCode error_code) {
124  switch (error_code) {
125    case NO_ERROR:
126      return "NO_ERROR";
127    case NO_STATUS_LINE_IN_RESPONSE:
128      return "NO_STATUS_LINE_IN_RESPONSE";
129    case NO_REQUEST_LINE_IN_REQUEST:
130      return "NO_REQUEST_LINE_IN_REQUEST";
131    case FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION:
132      return "FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION";
133    case FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD:
134      return "FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD";
135    case FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE:
136      return "FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE";
137    case FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI:
138      return "FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI";
139    case FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE:
140      return "FAILED_TO_FIND_NL_AFTER_RESPONSE_REASON_PHRASE";
141    case FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION:
142      return "FAILED_TO_FIND_NL_AFTER_REQUEST_HTTP_VERSION";
143    case FAILED_CONVERTING_STATUS_CODE_TO_INT:
144      return "FAILED_CONVERTING_STATUS_CODE_TO_INT";
145    case REQUEST_URI_TOO_LONG:
146      return "REQUEST_URI_TOO_LONG";
147    case HEADERS_TOO_LONG:
148      return "HEADERS_TOO_LONG";
149    case UNPARSABLE_CONTENT_LENGTH:
150      return "UNPARSABLE_CONTENT_LENGTH";
151    case MAYBE_BODY_BUT_NO_CONTENT_LENGTH:
152      return "MAYBE_BODY_BUT_NO_CONTENT_LENGTH";
153    case REQUIRED_BODY_BUT_NO_CONTENT_LENGTH:
154      return "REQUIRED_BODY_BUT_NO_CONTENT_LENGTH";
155    case HEADER_MISSING_COLON:
156      return "HEADER_MISSING_COLON";
157    case INVALID_CHUNK_LENGTH:
158      return "INVALID_CHUNK_LENGTH";
159    case CHUNK_LENGTH_OVERFLOW:
160      return "CHUNK_LENGTH_OVERFLOW";
161    case CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO:
162      return "CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO";
163    case CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT:
164      return "CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT";
165    case MULTIPLE_CONTENT_LENGTH_KEYS:
166      return "MULTIPLE_CONTENT_LENGTH_KEYS";
167    case MULTIPLE_TRANSFER_ENCODING_KEYS:
168      return "MULTIPLE_TRANSFER_ENCODING_KEYS";
169    case UNKNOWN_TRANSFER_ENCODING:
170      return "UNKNOWN_TRANSFER_ENCODING";
171    case INVALID_HEADER_FORMAT:
172      return "INVALID_HEADER_FORMAT";
173    case INTERNAL_LOGIC_ERROR:
174      return "INTERNAL_LOGIC_ERROR";
175    case NUM_ERROR_CODES:
176      return "UNKNOWN_ERROR";
177  }
178  return "UNKNOWN_ERROR";
179}
180
// Summary:
//     Parses the first line of either a request or response and stores the
//   offsets (relative to |begin|) of its whitespace-delimited components in
//   |headers|. For responses the status code is additionally converted to an
//   integer and stored in headers->parsed_response_code_.
//     Note that in the case of a detected warning, error_code will be set
//   but the function will not return false.
//     Exactly zero or one warning or error (but not both) may be detected
//   by this function.
//     Note that this function will not write the data of the first-line
//   into the header's buffer (that should already have been done elsewhere).
//
// Pre-conditions:
//     begin != end
//     *begin should be a character which is > ' '. This implies that there
//   is at least one non-whitespace character between [begin, end).
//     headers is a valid pointer to a BalsaHeaders class.
//     error_code is a valid pointer to a BalsaFrameEnums::ErrorCode value.
//     Entire first line must exist between [begin, end), and the last
//   character of that range must be '\n' (anything else is reported as
//   INTERNAL_LOGIC_ERROR).
//     Exactly zero or one newlines -may- exist between [begin, end)
//     [begin, end) should exist in the header's buffer.
//
// Side-effects:
//   headers will be modified
//   error_code may be modified if either a warning or error is detected
//
// Returns:
//   True if no error (as opposed to warning) is detected.
//   False if an error (as opposed to warning) is detected.
//
// If there is indeed non-whitespace in the line, then skipping the leading
// whitespace before the call satisfies the *begin > ' ' precondition:
//  while (*begin <= ' ') ++begin;
//  ParseHTTPFirstLine(begin, end, is_request, max_request_uri_length,
//                     &headers, &error_code);
//
// NOTE(review): whitespace is detected throughout by comparing plain char
// values against ' '. Where plain char is signed, bytes >= 0x80 compare as
// less than ' ' and are therefore treated as whitespace -- confirm that this
// is intended.
bool ParseHTTPFirstLine(const char* begin,
                        const char* end,
                        bool is_request,
                        size_t max_request_uri_length,
                        BalsaHeaders* headers,
                        BalsaFrameEnums::ErrorCode* error_code) {
  const char* current = begin;
  // HTTP firstlines all have the following structure:
  //  LWS         NONWS  LWS    NONWS   LWS    NONWS   NOTCRLF  CRLF
  //  [\t \r\n]+ [^\t ]+ [\t ]+ [^\t ]+ [\t ]+ [^\t ]+ [^\r\n]+ "\r\n"
  //  ws1        nws1    ws2    nws2    ws3    nws3             ws4
  //  |          [-------)      [-------)      [----------------)
  //    REQ:     method         request_uri    version
  //   RESP:     version        statuscode     reason
  //
  //   The first NONWS->LWS component we'll call firstline_a.
  //   The second firstline_b, and the third firstline_c.
  //
  //   firstline_a goes from nws1 to (but not including) ws2
  //   firstline_b goes from nws2 to (but not including) ws3
  //   firstline_c goes from nws3 to (but not including) ws4
  //
  // In the code:
  //    ws1 == whitespace_1_idx_
  //   nws1 == non_whitespace_1_idx_
  //    ws2 == whitespace_2_idx_
  //   nws2 == non_whitespace_2_idx_
  //    ws3 == whitespace_3_idx_
  //   nws3 == non_whitespace_3_idx_
  //    ws4 == whitespace_4_idx_

  // Kill all whitespace (including '\r\n') at the end of the line.
  // After the decrement |end| points at the last character of the range,
  // which must be the newline.
  --end;
  if (*end != '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // Walk backwards over trailing whitespace; stops on the last
  // non-whitespace character, or on |begin| at the latest.
  while (begin < end && *end <= ' ') {
    --end;
  }
  DCHECK(*end != '\n');
  // The same condition is re-checked so that builds where DCHECK does not
  // stop execution still fail the parse.
  if (*end == '\n') {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }
  // Make |end| exclusive again: one past the last non-whitespace character.
  ++end;

  // The following condition should not be possible.
  if (end == begin) {
    *error_code = BalsaFrameEnums::INTERNAL_LOGIC_ERROR;
    LOG(DFATAL) << "INTERNAL_LOGIC_ERROR Headers: \n"
                << headers->OriginalHeadersForDebugging();
    return false;
  }

  // whitespace_1_idx_
  headers->whitespace_1_idx_ = current - begin;
  // This loop is commented out as it is never used in current code.  This is
  // true only because we don't begin parsing the headers at all until we've
  // encountered a non whitespace character at the beginning of the stream, at
  // which point we begin our demarcation of header-start.  If we did -not- do
  // this (for instance, only looked for [\r\n] instead of (< ' ')), this loop
  // would be necessary for the proper functioning of this parsing.
  // This is left here as this function may (in the future) be refactored out
  // of the BalsaFrame class so that it may be shared between code in
  // BalsaFrame and BalsaHeaders (where it would be used in some variant of the
  // set_first_line() function (at which point it would be necessary).
#if 0
  while (*current <= ' ') {
    ++current;
  }
#endif
  // non_whitespace_1_idx_
  headers->non_whitespace_1_idx_ = current - begin;
  // Scan firstline_a (method for requests, version for responses).
  do {
    // The first time through, the current character is not whitespace (this
    // is the *begin > ' ' precondition), so at least one non-whitespace
    // character is consumed by this loop.
    ++current;
    if (current == end) {
      // The line consists of a single token: every remaining index collapses
      // to the end of the line.
      headers->whitespace_2_idx_ = current - begin;
      headers->non_whitespace_2_idx_ = current - begin;
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_METHOD   for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION for response
      // (the arithmetic relies on the request enumerator directly following
      // the response enumerator).
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION +
            is_request);
      if (!is_request) {  // FAILED_TO_FIND_WS_AFTER_RESPONSE_VERSION
        return false;
      }
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_2_idx_
  headers->whitespace_2_idx_ = current - begin;
  // Skip the whitespace between firstline_a and firstline_b.
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_2_idx_
  headers->non_whitespace_2_idx_ = current - begin;
  // Scan firstline_b (request URI for requests, status code for responses).
  do {
    ++current;
    if (current == end) {
      // The line consists of two tokens only: the third component is empty.
      headers->whitespace_3_idx_ = current - begin;
      headers->non_whitespace_3_idx_ = current - begin;
      headers->whitespace_4_idx_ = current - begin;
      // FAILED_TO_FIND_WS_AFTER_REQUEST_REQUEST_URI for request
      // FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE for response
      // This is only a warning for both message types: parsing continues at
      // output_exhausted.
      *error_code =
        static_cast<BalsaFrameEnums::ErrorCode>(
            BalsaFrameEnums::FAILED_TO_FIND_WS_AFTER_RESPONSE_STATUSCODE
                                 + is_request);
      goto output_exhausted;
    }
  } while (*current > ' ');
  // whitespace_3_idx_
  headers->whitespace_3_idx_ = current - begin;
  // Skip the whitespace between firstline_b and firstline_c.
  do {
    ++current;
    // Note that due to the loop which consumes all of the whitespace
    // at the end of the line, current can never == end while in this function.
  } while (*current <= ' ');
  // non_whitespace_3_idx_
  headers->non_whitespace_3_idx_ = current - begin;
  // firstline_c runs to the trimmed end of the line and may itself contain
  // whitespace.
  headers->whitespace_4_idx_ = end - begin;

 output_exhausted:
  // Note that we don't fail the parse immediately when parsing of the
  // firstline fails.  Depending on the protocol type, we may want to accept
  // a firstline with only one or two elements, e.g., for HTTP/0.9:
  //   GET\r\n
  // or
  //   GET /\r\n
  // should be parsed without issue (though the visitor should know that
  // parsing the entire line was not exactly as it should be).
  //
  // Eventually, these errors may be removed altogether, as the visitor can
  // detect them on its own by examining the size of the various fields.
  // headers->set_first_line(non_whitespace_1_idx_, current);

  if (is_request) {
    // The request URI is firstline_b: [non_whitespace_2_idx_,
    // whitespace_3_idx_). Reject it when it exceeds the caller's limit.
    if ((headers->whitespace_3_idx_ - headers->non_whitespace_2_idx_) >
        max_request_uri_length) {
      // For requests, we need at least the method.  We could assume that a
      // blank URI means "/".  If version isn't stated, it should be assumed
      // to be HTTP/0.9 by the visitor.
      *error_code = BalsaFrameEnums::REQUEST_URI_TOO_LONG;
      return false;
    }
  } else {
    headers->parsed_response_code_ = 0;
    {
      // The status code is firstline_b: [non_whitespace_2_idx_,
      // whitespace_3_idx_).
      const char* parsed_response_code_current =
        begin + headers->non_whitespace_2_idx_;
      const char* parsed_response_code_end = begin + headers->whitespace_3_idx_;
      const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;

      // Convert a string of [0-9]* into an int.
      // Note that this allows for the conversion of response codes which
      // are outside the bounds of normal HTTP response codes (no checking
      // is done to ensure that these are valid-- they're merely parsed)!
      while (parsed_response_code_current < parsed_response_code_end) {
        if (*parsed_response_code_current < '0' ||
            *parsed_response_code_current > '9') {
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        size_t status_code_x_10 = headers->parsed_response_code_ * 10;
        uint8 c = *parsed_response_code_current - '0';
        // Overflow happens either in the multiplication by 10 (first test)
        // or when adding the new digit (second test).
        if ((headers->parsed_response_code_ > kMaxDiv10) ||
            (std::numeric_limits<size_t>::max() - status_code_x_10) < c) {
          // overflow.
          *error_code = BalsaFrameEnums::FAILED_CONVERTING_STATUS_CODE_TO_INT;
          return false;
        }
        headers->parsed_response_code_ = status_code_x_10 + c;
        ++parsed_response_code_current;
      }
    }
  }
  return true;
}
407
408// begin - beginning of the firstline
409// end - end of the firstline
410//
411// A precondition for this function is that there is non-whitespace between
412// [begin, end). If this precondition is not met, the function will not perform
413// as expected (and bad things may happen, and it will eat your first, second,
414// and third unborn children!).
415//
416// Another precondition for this function is that [begin, end) includes
417// at most one newline, which must be at the end of the line.
418void BalsaFrame::ProcessFirstLine(const char* begin, const char* end) {
419  BalsaFrameEnums::ErrorCode previous_error = last_error_;
420  if (!ParseHTTPFirstLine(begin,
421                          end,
422                          is_request_,
423                          max_request_uri_length_,
424                          headers_,
425                          &last_error_)) {
426    parse_state_ = BalsaFrameEnums::PARSE_ERROR;
427    visitor_->HandleHeaderError(this);
428    return;
429  }
430  if (previous_error != last_error_) {
431    visitor_->HandleHeaderWarning(this);
432  }
433
434  if (is_request_) {
435    size_t version_length =
436        headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_;
437    visitor_->ProcessRequestFirstLine(
438        begin + headers_->non_whitespace_1_idx_,
439        headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
440        begin + headers_->non_whitespace_1_idx_,
441        headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
442        begin + headers_->non_whitespace_2_idx_,
443        headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
444        begin + headers_->non_whitespace_3_idx_,
445        version_length);
446    if (version_length == 0)
447      parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
448  } else {
449    visitor_->ProcessResponseFirstLine(
450        begin + headers_->non_whitespace_1_idx_,
451        headers_->whitespace_4_idx_ - headers_->non_whitespace_1_idx_,
452        begin + headers_->non_whitespace_1_idx_,
453        headers_->whitespace_2_idx_ - headers_->non_whitespace_1_idx_,
454        begin + headers_->non_whitespace_2_idx_,
455        headers_->whitespace_3_idx_ - headers_->non_whitespace_2_idx_,
456        begin + headers_->non_whitespace_3_idx_,
457        headers_->whitespace_4_idx_ - headers_->non_whitespace_3_idx_);
458  }
459}
460
461// 'stream_begin' points to the first character of the headers buffer.
462// 'line_begin' points to the first character of the line.
463// 'current' points to a char which is ':'.
464// 'line_end' points to the position of '\n' + 1.
465// 'line_begin' points to the position of first character of line.
466void BalsaFrame::CleanUpKeyValueWhitespace(
467    const char* stream_begin,
468    const char* line_begin,
469    const char* current,
470    const char* line_end,
471    HeaderLineDescription* current_header_line) {
472  const char* colon_loc = current;
473  DCHECK_LT(colon_loc, line_end);
474  DCHECK_EQ(':', *colon_loc);
475  DCHECK_EQ(':', *current);
476  DCHECK_GE(' ', *line_end)
477    << "\"" << std::string(line_begin, line_end) << "\"";
478
479  // TODO(fenix): Investigate whether or not the bounds tests in the
480  // while loops here are redundant, and if so, remove them.
481  --current;
482  while (current > line_begin && *current <= ' ') --current;
483  current += (current != colon_loc);
484  current_header_line->key_end_idx = current - stream_begin;
485
486  current = colon_loc;
487  DCHECK_EQ(':', *current);
488  ++current;
489  while (current < line_end && *current <= ' ') ++current;
490  current_header_line->value_begin_idx = current - stream_begin;
491
492  DCHECK_GE(current_header_line->key_end_idx,
493            current_header_line->first_char_idx);
494  DCHECK_GE(current_header_line->value_begin_idx,
495            current_header_line->key_end_idx);
496  DCHECK_GE(current_header_line->last_char_idx,
497            current_header_line->value_begin_idx);
498}
499
// Splits the header lines into key and value.
//
// Walks lines_[1] up to (but excluding) the last entry of lines_ and appends
// one HeaderLineDescription per header to headers_->header_lines_. A line
// whose first character is <= ' ' is folded into the preceding header as a
// continuation. A header in which no colon is found is left as "key only" and
// is reported through visitor_->HandleHeaderWarning() with last_error_ set to
// HEADER_MISSING_COLON.
//
// lines_[1] is read unconditionally, so lines_ must hold at least two entries
// when this is called.
//
// NOTE(review): whitespace is detected by comparing plain char values against
// ' '. Where plain char is signed, bytes >= 0x80 compare as less than ' ' --
// confirm that treating them as whitespace/continuations is intended.
inline void BalsaFrame::FindColonsAndParseIntoKeyValue() {
  DCHECK(!lines_.empty());
  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
  // The last line is always just a newline (and is uninteresting).
  const Lines::size_type lines_size_m1 = lines_.size() - 1;
#if __SSE2__
  // Sixteen copies of ':' to compare against sixteen header bytes at once.
  const __v16qi colons = { ':', ':', ':', ':', ':', ':', ':', ':',
                           ':', ':', ':', ':', ':', ':', ':', ':'};
  // The 16-byte loads below are only issued from positions before this
  // pointer, which keeps each load inside the header stream.
  const char* header_lines_end_m16 = headers_->OriginalHeaderStreamEnd() - 16;
#endif  // __SSE2__
  // |current| is the colon search cursor. It is shared across iterations of
  // the loop below and may run ahead of the line being processed.
  const char* current = stream_begin + lines_[1].first;
  // This code is a bit more subtle than it may appear at first glance.
  // This code looks for a colon in the current line... but it also looks
  // beyond the current line. If there is no colon in the current line, then
  // for each subsequent line (until the colon which -has- been found is
  // associated with a line), no searching for a colon will be performed. In
  // this way, we minimize the amount of bytes we have scanned for a colon.
  for (Lines::size_type i = 1; i < lines_size_m1;) {
    const char* line_begin = stream_begin + lines_[i].first;

    // Here we handle possible continuations.  Note that we do not replace
    // the '\n' in the line before a continuation (at least, as of now),
    // which implies that any code which looks for a value must deal with
    // "\r\n", etc -within- the line (and not just at the end of it).
    for (++i; i < lines_size_m1; ++i) {
      const char c = *(stream_begin + lines_[i].first);
      if (c > ' ') {
        // Not a continuation, so stop.  Note that if the 'original' i = 1,
        // and the next line is not a continuation, we'll end up with i = 2
        // when we break. This handles the incrementing of i for the outer
        // loop.
        break;
      }
    }
    // lines_[i - 1] is the last physical line belonging to this header
    // (the header's own line or its final continuation).
    const char* line_end = stream_begin + lines_[i - 1].second;
    DCHECK_LT(line_begin - stream_begin, line_end - stream_begin);

    // We cleanup the whitespace at the end of the line before doing anything
    // else of interest as it allows us to do nothing when irregularly formatted
    // headers are parsed (e.g. those with only keys, only values, or no colon).
    //
    // After this trimming, line_end is one past the last non-whitespace
    // character of the header, so *line_end itself is <= ' '.
    --line_end;
    DCHECK_EQ('\n', *line_end)
      << "\"" << std::string(line_begin, line_end) << "\"";
    while (*line_end <= ' ' && line_end > line_begin) {
      --line_end;
    }
    ++line_end;
    DCHECK_GE(' ', *line_end);
    DCHECK_LT(line_begin, line_end);

    // We use '0' for the block idx, because we're always writing to the first
    // block from the framer (we do this because the framer requires that the
    // entire header sequence be in a contiguous buffer).
    // The three middle arguments are all set to the end of the line, i.e. the
    // header starts out as "key only"; they are refined at found_colon.
    headers_->header_lines_.push_back(
        HeaderLineDescription(line_begin - stream_begin,
                              line_end - stream_begin,
                              line_end - stream_begin,
                              line_end - stream_begin,
                              0));
    if (current >= line_end) {
      // The search cursor is already at or past the end of this header, so
      // this header contains no colon.
      last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
      visitor_->HandleHeaderWarning(this);
      // Then the next colon will not be found within this header line-- time
      // to try again with another header-line.
      continue;
    } else if (current < line_begin) {
      // When this condition is true, the last detected colon was part of a
      // previous line.  We reset to the beginning of the line as we don't care
      // about the presence of any colon before the beginning of the current
      // line.
      current = line_begin;
    }
#if __SSE2__
    // Vectorized search: test 16 bytes per iteration for ':'. This loop is
    // not bounded by line_end, so it may advance |current| into later lines.
    while (current < header_lines_end_m16) {
      __m128i header_bytes =
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(current));
      __m128i colon_cmp =
        _mm_cmpeq_epi8(header_bytes, reinterpret_cast<__m128i>(colons));
      // One bit per byte; bit k is set when byte k equals ':'.
      int colon_msk = _mm_movemask_epi8(colon_cmp);
      if (colon_msk == 0) {
        current += 16;
        continue;
      }
      // ffs() is 1-based, so this lands on the first ':' of the 16 bytes.
      current += (ffs(colon_msk) - 1);
      if (current > line_end) {
        // The colon belongs to a later header; leave |current| on it so that
        // it is reused when that header is processed.
        break;
      }
      goto found_colon;
    }
#endif  // __SSE2__
    // Scalar search within the current header. Does nothing if |current| is
    // already at or past line_end.
    for (; current < line_end; ++current) {
      if (*current != ':') {
        continue;
      }
      goto found_colon;
    }
    // If we've gotten to here, then there was no colon
    // in the line. The arguments we passed into the construction
    // for the HeaderLineDescription object should be OK-- it assumes
    // that the entire content is 'key' by default (which is true, as
    // there was no colon, there can be no value). Note that this is a
    // construct which is technically not allowed by the spec.
    last_error_ = BalsaFrameEnums::HEADER_MISSING_COLON;
    visitor_->HandleHeaderWarning(this);
    continue;
 found_colon:
    DCHECK_EQ(*current, ':');
    DCHECK_LE(current - stream_begin, line_end - stream_begin);
    DCHECK_LE(stream_begin - stream_begin, current - stream_begin);

    // Record the colon position for the header that was just appended, then
    // let CleanUpKeyValueWhitespace() trim the whitespace around it.
    HeaderLineDescription& current_header_line = headers_->header_lines_.back();
    current_header_line.key_end_idx = current - stream_begin;
    current_header_line.value_begin_idx = current_header_line.key_end_idx;
    if (current < line_end) {
      ++current_header_line.key_end_idx;

      CleanUpKeyValueWhitespace(stream_begin,
                                line_begin,
                                current,
                                line_end,
                                &current_header_line);
    }
  }
}
626
627void BalsaFrame::ProcessContentLengthLine(
628    HeaderLines::size_type line_idx,
629    BalsaHeadersEnums::ContentLengthStatus* status,
630    size_t* length) {
631  const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
632  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
633  const char* line_end = stream_begin + header_line.last_char_idx;
634  const char* value_begin = (stream_begin + header_line.value_begin_idx);
635
636  if (value_begin >= line_end) {
637    // There is no non-whitespace value data.
638#if DEBUGFRAMER
639      LOG(INFO) << "invalid content-length -- no non-whitespace value data";
640#endif
641    *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
642    return;
643  }
644
645  *length = 0;
646  while (value_begin < line_end) {
647    if (*value_begin < '0' || *value_begin > '9') {
648      // bad! content-length found, and couldn't parse all of it!
649      *status = BalsaHeadersEnums::INVALID_CONTENT_LENGTH;
650#if DEBUGFRAMER
651      LOG(INFO) << "invalid content-length - non numeric character detected";
652#endif  // DEBUGFRAMER
653      return;
654    }
655    const size_t kMaxDiv10 = std::numeric_limits<size_t>::max() / 10;
656    size_t length_x_10 = *length * 10;
657    const unsigned char c = *value_begin - '0';
658    if (*length > kMaxDiv10 ||
659        (std::numeric_limits<size_t>::max() - length_x_10) < c) {
660      *status = BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW;
661#if DEBUGFRAMER
662      LOG(INFO) << "content-length overflow";
663#endif  // DEBUGFRAMER
664      return;
665    }
666    *length = length_x_10 + c;
667    ++value_begin;
668  }
669#if DEBUGFRAMER
670  LOG(INFO) << "content_length parsed: " << *length;
671#endif  // DEBUGFRAMER
672  *status = BalsaHeadersEnums::VALID_CONTENT_LENGTH;
673}
674
675void BalsaFrame::ProcessTransferEncodingLine(HeaderLines::size_type line_idx) {
676  const HeaderLineDescription& header_line = headers_->header_lines_[line_idx];
677  const char* stream_begin = headers_->OriginalHeaderStreamBegin();
678  const char* line_end = stream_begin + header_line.last_char_idx;
679  const char* value_begin = stream_begin + header_line.value_begin_idx;
680  size_t value_length = line_end - value_begin;
681
682  if ((value_length == 7) &&
683      !strncasecmp(value_begin, "chunked", 7)) {
684    headers_->transfer_encoding_is_chunked_ = true;
685  } else if ((value_length == 8) &&
686      !strncasecmp(value_begin, "identity", 8)) {
687    headers_->transfer_encoding_is_chunked_ = false;
688  } else {
689    last_error_ = BalsaFrameEnums::UNKNOWN_TRANSFER_ENCODING;
690    parse_state_ = BalsaFrameEnums::PARSE_ERROR;
691    visitor_->HandleHeaderError(this);
692    return;
693  }
694}
695
696namespace {
697bool SplitStringPiece(base::StringPiece original, char delim,
698                      base::StringPiece* before, base::StringPiece* after) {
699  const char* p = original.data();
700  const char* end = p + original.size();
701
702  while (p != end) {
703    if (*p == delim) {
704      ++p;
705    } else {
706      const char* start = p;
707      while (++p != end && *p != delim) {
708        // Skip to the next occurence of the delimiter.
709      }
710      *before = base::StringPiece(start, p - start);
711      if (p != end)
712        *after = base::StringPiece(p + 1, end - (p + 1));
713      else
714        *after = base::StringPiece("");
715      StringPieceUtils::RemoveWhitespaceContext(before);
716      StringPieceUtils::RemoveWhitespaceContext(after);
717      return true;
718    }
719  }
720
721  *before = original;
722  *after = "";
723  return false;
724}
725
726// TODO(phython): Fix this function to properly deal with quoted values.
727// E.g. ";;foo", "\";;\"", or \"aa;
728// The last example, the semi-colon is a separator between extensions.
729void ProcessChunkExtensionsManual(base::StringPiece all_extensions,
730                                  BalsaHeaders* extensions) {
731  base::StringPiece extension;
732  base::StringPiece remaining;
733  StringPieceUtils::RemoveWhitespaceContext(&all_extensions);
734  SplitStringPiece(all_extensions, ';', &extension, &remaining);
735  while (!extension.empty()) {
736    base::StringPiece key;
737    base::StringPiece value;
738    SplitStringPiece(extension, '=', &key, &value);
739    if (!value.empty()) {
740      // Strip quotation marks if they exist.
741      if (!value.empty() && value[0] == '"')
742        value.remove_prefix(1);
743      if (!value.empty() && value[value.length() - 1] == '"')
744        value.remove_suffix(1);
745    }
746
747    extensions->AppendHeader(key, value);
748
749    StringPieceUtils::RemoveWhitespaceContext(&remaining);
750    SplitStringPiece(remaining, ';', &extension, &remaining);
751  }
752}
753
754}  // anonymous namespace
755
756void BalsaFrame::ProcessChunkExtensions(const char* input, size_t size,
757                                        BalsaHeaders* extensions) {
758  ProcessChunkExtensionsManual(base::StringPiece(input, size), extensions);
759}
760
761void BalsaFrame::ProcessHeaderLines() {
762  HeaderLines::size_type content_length_idx = 0;
763  HeaderLines::size_type transfer_encoding_idx = 0;
764
765  DCHECK(!lines_.empty());
766#if DEBUGFRAMER
767  LOG(INFO) << "******@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@**********\n";
768#endif  // DEBUGFRAMER
769
770  // There is no need to attempt to process headers if no header lines exist.
771  // There are at least two lines in the message which are not header lines.
772  // These two non-header lines are the first line of the message, and the
773  // last line of the message (which is an empty line).
774  // Thus, we test to see if we have more than two lines total before attempting
775  // to parse any header lines.
776  if (lines_.size() > 2) {
777    const char* stream_begin = headers_->OriginalHeaderStreamBegin();
778
779    // Then, for the rest of the header data, we parse these into key-value
780    // pairs.
781    FindColonsAndParseIntoKeyValue();
782    // At this point, we've parsed all of the headers.  Time to look for those
783    // headers which we require for framing.
784    const HeaderLines::size_type
785      header_lines_size = headers_->header_lines_.size();
786    for (HeaderLines::size_type i = 0; i < header_lines_size; ++i) {
787      const HeaderLineDescription& current_header_line =
788        headers_->header_lines_[i];
789      const char* key_begin =
790        (stream_begin + current_header_line.first_char_idx);
791      const char* key_end = (stream_begin + current_header_line.key_end_idx);
792      const size_t key_len = key_end - key_begin;
793      const char c = *key_begin;
794#if DEBUGFRAMER
795      LOG(INFO) << "[" << i << "]: " << std::string(key_begin, key_len)
796                << " c: '" << c << "' key_len: " << key_len;
797#endif  // DEBUGFRAMER
798      // If a header begins with either lowercase or uppercase 'c' or 't', then
799      // the header may be one of content-length, connection, content-encoding
800      // or transfer-encoding. These headers are special, as they change the way
801      // that the message is framed, and so the framer is required to search
802      // for them.
803
804
805      if (c == 'c' || c == 'C') {
806        if ((key_len == kContentLengthSize) &&
807            0 == strncasecmp(key_begin, kContentLength, kContentLengthSize)) {
808          BalsaHeadersEnums::ContentLengthStatus content_length_status =
809            BalsaHeadersEnums::NO_CONTENT_LENGTH;
810          size_t length = 0;
811          ProcessContentLengthLine(i, &content_length_status, &length);
812          if (content_length_idx != 0) {  // then we've already seen one!
813            if ((headers_->content_length_status_ != content_length_status) ||
814                ((headers_->content_length_status_ ==
815                  BalsaHeadersEnums::VALID_CONTENT_LENGTH) &&
816                 length != headers_->content_length_)) {
817              last_error_ = BalsaFrameEnums::MULTIPLE_CONTENT_LENGTH_KEYS;
818              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
819              visitor_->HandleHeaderError(this);
820              return;
821            }
822            continue;
823          } else {
824            content_length_idx = i + 1;
825            headers_->content_length_status_ = content_length_status;
826            headers_->content_length_ = length;
827            content_length_remaining_ = length;
828          }
829
830        }
831      } else if (c == 't' || c == 'T') {
832        if ((key_len == kTransferEncodingSize) &&
833            0 == strncasecmp(key_begin, kTransferEncoding,
834                             kTransferEncodingSize)) {
835          if (transfer_encoding_idx != 0) {
836            last_error_ = BalsaFrameEnums::MULTIPLE_TRANSFER_ENCODING_KEYS;
837            parse_state_ = BalsaFrameEnums::PARSE_ERROR;
838            visitor_->HandleHeaderError(this);
839            return;
840          }
841          transfer_encoding_idx = i + 1;
842        }
843      } else if (i == 0 && (key_len == 0 || c == ' ')) {
844        last_error_ = BalsaFrameEnums::INVALID_HEADER_FORMAT;
845        parse_state_ = BalsaFrameEnums::PARSE_ERROR;
846        visitor_->HandleHeaderError(this);
847        return;
848      }
849    }
850    if (headers_->transfer_encoding_is_chunked_) {
851      headers_->content_length_ = 0;
852      headers_->content_length_status_ = BalsaHeadersEnums::NO_CONTENT_LENGTH;
853      content_length_remaining_ = 0;
854    }
855    if (transfer_encoding_idx != 0) {
856      ProcessTransferEncodingLine(transfer_encoding_idx - 1);
857    }
858  }
859}
860
861void BalsaFrame::AssignParseStateAfterHeadersHaveBeenParsed() {
862  // For responses, can't have a body if the request was a HEAD, or if it is
863  // one of these response-codes.  rfc2616 section 4.3
864  parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
865  if (is_request_ ||
866      !(request_was_head_ ||
867        (headers_->parsed_response_code_ >= 100 &&
868         headers_->parsed_response_code_ < 200) ||
869        (headers_->parsed_response_code_ == 204) ||
870        (headers_->parsed_response_code_ == 304))) {
871    // Then we can have a body.
872    if (headers_->transfer_encoding_is_chunked_) {
873      // Note that
874      // if ( Transfer-Encoding: chunked &&  Content-length: )
875      // then Transfer-Encoding: chunked trumps.
876      // This is as specified in the spec.
877      // rfc2616 section 4.4.3
878      parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
879    } else {
880      // Errors parsing content-length definitely can cause
881      // protocol errors/warnings
882      switch (headers_->content_length_status_) {
883        // If we have a content-length, and it is parsed
884        // properly, there are two options.
885        // 1) zero content, in which case the message is done, and
886        // 2) nonzero content, in which case we have to
887        //    consume the body.
888        case BalsaHeadersEnums::VALID_CONTENT_LENGTH:
889          if (headers_->content_length_ == 0) {
890            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
891          } else {
892            parse_state_ = BalsaFrameEnums::READING_CONTENT;
893          }
894          break;
895        case BalsaHeadersEnums::CONTENT_LENGTH_OVERFLOW:
896        case BalsaHeadersEnums::INVALID_CONTENT_LENGTH:
897          // If there were characters left-over after parsing the
898          // content length, we should flag an error and stop.
899          parse_state_ = BalsaFrameEnums::PARSE_ERROR;
900          last_error_ = BalsaFrameEnums::UNPARSABLE_CONTENT_LENGTH;
901          visitor_->HandleHeaderError(this);
902          break;
903          // We can have: no transfer-encoding, no content length, and no
904          // connection: close...
905          // Unfortunately, this case doesn't seem to be covered in the spec.
906          // We'll assume that the safest thing to do here is what the google
907          // binaries before 2008 already do, which is to assume that
908          // everything until the connection is closed is body.
909        case BalsaHeadersEnums::NO_CONTENT_LENGTH:
910          if (is_request_) {
911            base::StringPiece method = headers_->request_method();
912            // POSTs and PUTs should have a detectable body length.  If they
913            // do not we consider it an error.
914            if ((method.size() == 4 &&
915                 strncmp(method.data(), "POST", 4) == 0) ||
916                (method.size() == 3 &&
917                 strncmp(method.data(), "PUT", 3) == 0)) {
918              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
919              last_error_ =
920                  BalsaFrameEnums::REQUIRED_BODY_BUT_NO_CONTENT_LENGTH;
921              visitor_->HandleHeaderError(this);
922              break;
923            }
924            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
925          } else {
926            parse_state_ = BalsaFrameEnums::READING_UNTIL_CLOSE;
927            last_error_ = BalsaFrameEnums::MAYBE_BODY_BUT_NO_CONTENT_LENGTH;
928            visitor_->HandleHeaderWarning(this);
929          }
930          break;
931          // The COV_NF_... statements here provide hints to the apparatus
932          // which computes coverage reports/ratios that this code is never
933          // intended to be executed, and should technically be impossible.
934          // COV_NF_START
935        default:
936          LOG(FATAL) << "Saw a content_length_status: "
937           << headers_->content_length_status_ << " which is unknown.";
938          // COV_NF_END
939      }
940    }
941  }
942}
943
// Consumes header bytes from [message_start, message_start + message_length).
//
// The input is scanned for '\n'.  Every line found is recorded in lines_ as a
// (begin, end) index pair, and the scanned bytes are copied into headers_ with
// WriteFromFramer().  The first completed line is handed to ProcessFirstLine().
// When a '\n' directly follows the previous line terminator ("\n\n" or
// "\n\r\n"), the header block is complete: ProcessHeaderLines() and
// AssignParseStateAfterHeadersHaveBeenParsed() are run and the visitor is
// notified (ProcessHeaderInput, ProcessHeaders, HeaderDone and, if the message
// has no body, MessageDone).
//
// Errors detected here set parse_state_ to PARSE_ERROR, set last_error_ and
// call visitor_->HandleHeaderError().
//
// Returns the distance between the final scan position and the start of the
// input, i.e. the number of input bytes this call consumed.
944size_t BalsaFrame::ProcessHeaders(const char* message_start,
945                                  size_t message_length) {
946  const char* const original_message_start = message_start;
947  const char* const message_end = message_start + message_length;
948  const char* message_current = message_start;
  // First input byte that has not yet been copied into headers_.
949  const char* checkpoint = message_start;
950
  // Nothing to scan.  Leaving early also keeps the *(message_end - 1) read
  // after the main loop from touching memory before the buffer.
951  if (message_length == 0) {
952    goto bottom;
953  }
954
955  while (message_current < message_end) {
    // Bytes already present in the header stream; line indices are offset by
    // this amount.
956    size_t base_idx = headers_->GetReadableBytesFromHeaderStream();
957
958    // Yes, we could use strchr (assuming null termination), or
959    // memchr, but as it turns out that is slower than this tight loop
960    // for the input that we see.
961    if (!saw_non_newline_char_) {
      // Skip '\r' and '\n' bytes that precede the first line.
962      do {
963        const char c = *message_current;
964        if (c != '\r' && c != '\n') {
          // The first real byte must compare greater than ' '.
965          if (c <= ' ') {
966            parse_state_ = BalsaFrameEnums::PARSE_ERROR;
967            last_error_ = BalsaFrameEnums::NO_REQUEST_LINE_IN_REQUEST;
968            visitor_->HandleHeaderError(this);
969            goto bottom;
970          } else {
971            saw_non_newline_char_ = true;
            // Line offsets and header-stream writes are measured from this
            // byte, so the skipped newlines are never stored.
972            checkpoint = message_start = message_current;
973            goto read_real_message;
974          }
975        }
976        ++message_current;
977      } while (message_current < message_end);
978      goto bottom;  // this is necessary to skip 'last_char_was_slash_r' checks
979    } else {
980 read_real_message:
981      // Note that SSE2 can be enabled on certain piii platforms.
982#if __SSE2__
983      {
        // Vectorized search: runs while more than 16 bytes remain.  The
        // scalar loop after this block handles whatever is left.
984        const char* const message_end_m16 = message_end - 16;
985        __v16qi newlines = { '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n',
986                             '\n', '\n', '\n', '\n', '\n', '\n', '\n', '\n' };
987        while (message_current < message_end_m16) {
988          // What this does (using compiler intrinsics):
989          //
990          // Load 16 '\n's into an xmm register
991          // Load 16 bytes of current message into an xmm register
992          // Do byte-wise equals on those two xmm registers
993          // Take the first bit of each byte, and put that into the first
994          //   16 bits of a mask
995          // If the mask is zero, no '\n' found. increment by 16 and try again
996          // Else scan forward to find the first set bit.
997          // Increment current by the index of the first set bit
998          //   (ffs returns index of first set bit + 1)
999          __m128i msg_bytes =
1000            _mm_loadu_si128(const_cast<__m128i *>(
1001                    reinterpret_cast<const __m128i *>(message_current)));
1002          __m128i newline_cmp =
1003            _mm_cmpeq_epi8(msg_bytes, reinterpret_cast<__m128i>(newlines));
1004          int newline_msk = _mm_movemask_epi8(newline_cmp);
1005          if (newline_msk == 0) {
1006            message_current += 16;
1007            continue;
1008          }
1009          message_current += (ffs(newline_msk) - 1);
          // message_current now points at a '\n'.  The index recorded for
          // the end of the line is one past it: base_idx plus the offset
          // from message_start, plus one.
1010          const size_t relative_idx = message_current - message_start;
1011          const size_t message_current_idx = 1 + base_idx + relative_idx;
1012          lines_.push_back(std::make_pair(last_slash_n_idx_,
1013                                          message_current_idx));
          // The first line is copied into headers_ and parsed right away.
1014          if (lines_.size() == 1) {
1015            headers_->WriteFromFramer(checkpoint,
1016                                      1 + message_current - checkpoint);
1017            checkpoint = message_current + 1;
1018            const char* begin = headers_->OriginalHeaderStreamBegin();
1019#if DEBUGFRAMER
1020          LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1021          LOG(INFO) << "is_request_: " << is_request_;
1022#endif
1023            ProcessFirstLine(begin, begin + lines_[0].second);
1024            if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1025              goto process_lines;
1026            else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1027              goto bottom;
1028          }
1029          const size_t chars_since_last_slash_n = (message_current_idx -
1030                                                   last_slash_n_idx_);
1031          last_slash_n_idx_ = message_current_idx;
1032          if (chars_since_last_slash_n > 2) {
1033            // We have a slash-n, but the last slash n was
1034            // more than 2 characters away from this. Thus, we know
1035            // that this cannot be an end-of-header.
1036            ++message_current;
1037            continue;
1038          }
          // End of headers: either "\n\n" (distance 1), or distance 2 with a
          // '\r' in between -- seen in this buffer or as the last byte of the
          // previous call (last_char_was_slash_r_).
1039          if ((chars_since_last_slash_n == 1) ||
1040              (((message_current > message_start) &&
1041                (*(message_current - 1) == '\r')) ||
1042               (last_char_was_slash_r_))) {
1043            goto process_lines;
1044          }
1045          ++message_current;
1046        }
1047      }
1048#endif  // __SSE2__
      // Byte-at-a-time search; same per-line handling as the SSE2 loop above.
1049      while (message_current < message_end) {
1050        if (*message_current != '\n') {
1051          ++message_current;
1052          continue;
1053        }
1054        const size_t relative_idx = message_current - message_start;
1055        const size_t message_current_idx = 1 + base_idx + relative_idx;
1056        lines_.push_back(std::make_pair(last_slash_n_idx_,
1057                                        message_current_idx));
1058        if (lines_.size() == 1) {
1059          headers_->WriteFromFramer(checkpoint,
1060                                    1 + message_current - checkpoint);
1061          checkpoint = message_current + 1;
1062          const char* begin = headers_->OriginalHeaderStreamBegin();
1063#if DEBUGFRAMER
1064          LOG(INFO) << "First line " << std::string(begin, lines_[0].second);
1065          LOG(INFO) << "is_request_: " << is_request_;
1066#endif
1067          ProcessFirstLine(begin, begin + lines_[0].second);
1068          if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ)
1069            goto process_lines;
1070          else if (parse_state_ == BalsaFrameEnums::PARSE_ERROR)
1071            goto bottom;
1072        }
1073        const size_t chars_since_last_slash_n = (message_current_idx -
1074                                                 last_slash_n_idx_);
1075        last_slash_n_idx_ = message_current_idx;
1076        if (chars_since_last_slash_n > 2) {
1077          // false positive.
1078          ++message_current;
1079          continue;
1080        }
1081        if ((chars_since_last_slash_n == 1) ||
1082            (((message_current > message_start) &&
1083              (*(message_current - 1) == '\r')) ||
1084             (last_char_was_slash_r_))) {
1085          goto process_lines;
1086        }
1087        ++message_current;
1088      }
1089    }
1090    continue;
1091 process_lines:
    // message_current points at the terminating '\n'; step past it so that it
    // is part of both the final write and the returned byte count.
1092    ++message_current;
1093    DCHECK(message_current >= message_start);
1094    if (message_current > message_start) {
1095      headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1096    }
1097
1098    // Check if we have exceeded maximum headers length
1099    // Although we check for this limit before and after we call this function
1100    // we check it here as well to make sure that in case the visitor changed
1101    // the max_header_length_ (for example after processing the first line)
1102    // we handle it gracefully.
1103    if (headers_->GetReadableBytesFromHeaderStream() > max_header_length_) {
1104      parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1105      last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1106      visitor_->HandleHeaderError(this);
1107      goto bottom;
1108    }
1109
1110    // Since we know that we won't be writing any more bytes of the header,
1111    // we tell that to the headers object. The headers object may make
1112    // more efficient allocation decisions when this is signaled.
1113    headers_->DoneWritingFromFramer();
1114    {
      // Hand the complete, buffered header bytes to the visitor.
1115      const char* readable_ptr = NULL;
1116      size_t readable_size = 0;
1117      headers_->GetReadablePtrFromHeaderStream(&readable_ptr, &readable_size);
1118      visitor_->ProcessHeaderInput(readable_ptr, readable_size);
1119    }
1120
1121    // Ok, now that we've written everything into our header buffer, it is
1122    // time to process the header lines (extract proper values for headers
1123    // which are important for framing).
1124    ProcessHeaderLines();
1125    if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1126      goto bottom;
1127    }
1128    AssignParseStateAfterHeadersHaveBeenParsed();
1129    if (parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1130      goto bottom;
1131    }
1132    visitor_->ProcessHeaders(*headers_);
1133    visitor_->HeaderDone();
1134    if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ) {
1135      visitor_->MessageDone();
1136    }
1137    goto bottom;
1138  }
1139  // If we've gotten to here, it means that we've consumed all of the
1140  // available input. We need to record whether or not the last character we
1141  // saw was a '\r' so that a subsequent call to ProcessInput correctly finds
1142  // a header framing that is split across the two calls.
1143  last_char_was_slash_r_ = (*(message_end - 1) == '\r');
1144  DCHECK(message_current >= message_start);
1145  if (message_current > message_start) {
    // Buffer the bytes scanned so far; the header block continues in a later
    // call.
1146    headers_->WriteFromFramer(checkpoint, message_current - checkpoint);
1147  }
1148 bottom:
1149  return message_current - original_message_start;
1150}
1151
1152
1153size_t BalsaFrame::BytesSafeToSplice() const {
1154  switch (parse_state_) {
1155    case BalsaFrameEnums::READING_CHUNK_DATA:
1156      return chunk_length_remaining_;
1157    case BalsaFrameEnums::READING_UNTIL_CLOSE:
1158      return std::numeric_limits<size_t>::max();
1159    case BalsaFrameEnums::READING_CONTENT:
1160      return content_length_remaining_;
1161    default:
1162      return 0;
1163  }
1164}
1165
1166void BalsaFrame::BytesSpliced(size_t bytes_spliced) {
1167  switch (parse_state_) {
1168    case BalsaFrameEnums::READING_CHUNK_DATA:
1169      if (chunk_length_remaining_ >= bytes_spliced) {
1170        chunk_length_remaining_ -= bytes_spliced;
1171        if (chunk_length_remaining_ == 0) {
1172          parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1173        }
1174        return;
1175      } else {
1176        last_error_ =
1177          BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1178        goto error_exit;
1179      }
1180
1181    case BalsaFrameEnums::READING_UNTIL_CLOSE:
1182      return;
1183
1184    case BalsaFrameEnums::READING_CONTENT:
1185      if (content_length_remaining_ >= bytes_spliced) {
1186        content_length_remaining_ -= bytes_spliced;
1187        if (content_length_remaining_ == 0) {
1188          parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1189          visitor_->MessageDone();
1190        }
1191        return;
1192      } else {
1193        last_error_ =
1194          BalsaFrameEnums::CALLED_BYTES_SPLICED_AND_EXCEEDED_SAFE_SPLICE_AMOUNT;
1195        goto error_exit;
1196      }
1197
1198    default:
1199      last_error_ = BalsaFrameEnums::CALLED_BYTES_SPLICED_WHEN_UNSAFE_TO_DO_SO;
1200      goto error_exit;
1201  }
1202
1203 error_exit:
1204  parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1205  visitor_->HandleBodyError(this);
1206};
1207
1208// You may note that the state-machine contained within this function has both
1209// switch and goto labels for nearly the same thing. For instance, the
1210// following two labels refer to the same code block:
1211//   label_reading_chunk_data:
1212//   case BalsaFrameEnums::READING_CHUNK_DATA:
1213// The 'case' statement is required for the switch statement which occurs when
1214// ProcessInput is invoked. The goto label is required as the state-machine
1215// does not use a computed goto in any subsequent operations.
1216//
1217// Since several states exit the state machine for various reasons, there is
1218// also one label at the bottom of the function. When it is appropriate to
1219// return from the function, that part of the state machine instead issues a
1220// goto bottom; This results in less code duplication, and makes debugging
1221// easier (as you can add a statement to a section of code which is guaranteed
1222// to be invoked when the function is exiting).
1223size_t BalsaFrame::ProcessInput(const char* input, size_t size) {
1224  const char* current = input;
1225  const char* on_entry = current;
1226  const char* end = current + size;
1227#if DEBUGFRAMER
1228  LOG(INFO) << "\n=============="
1229            << BalsaFrameEnums::ParseStateToString(parse_state_)
1230            << "===============\n";
1231#endif  // DEBUGFRAMER
1232
1233  DCHECK(headers_ != NULL);
1234  if (headers_ == NULL) return 0;
1235
1236  if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1237    const size_t header_length = headers_->GetReadableBytesFromHeaderStream();
1238    // Yes, we still have to check this here as the user can change the
1239    // max_header_length amount!
1240    // Also it is possible that we have reached the maximum allowed header size,
1241    // and we have more to consume (remember we are still inside
1242    // READING_HEADER_AND_FIRSTLINE) in which case we directly declare an error.
1243    if (header_length > max_header_length_ ||
1244        (header_length == max_header_length_ && size > 0)) {
1245      parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1246      last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1247      visitor_->HandleHeaderError(this);
1248      goto bottom;
1249    }
1250    size_t bytes_to_process = max_header_length_ - header_length;
1251    if (bytes_to_process > size) {
1252      bytes_to_process = size;
1253    }
1254    current += ProcessHeaders(input, bytes_to_process);
1255    // If we are still reading headers check if we have crossed the headers
1256    // limit. Note that we check for >= as opposed to >. This is because if
1257    // header_length_after equals max_header_length_ and we are still in the
1258    // parse_state_  BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE we know for
1259    // sure that the headers limit will be crossed later on
1260    if (parse_state_ == BalsaFrameEnums::READING_HEADER_AND_FIRSTLINE) {
1261      // Note that headers_ is valid only if we are still reading headers.
1262      const size_t header_length_after =
1263          headers_->GetReadableBytesFromHeaderStream();
1264      if (header_length_after >= max_header_length_) {
1265        parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1266        last_error_ = BalsaFrameEnums::HEADERS_TOO_LONG;
1267        visitor_->HandleHeaderError(this);
1268      }
1269    }
1270    goto bottom;
1271  } else if (parse_state_ == BalsaFrameEnums::MESSAGE_FULLY_READ ||
1272             parse_state_ == BalsaFrameEnums::PARSE_ERROR) {
1273    // Can do nothing more 'till we're reset.
1274    goto bottom;
1275  }
1276
1277  while (current < end) {
1278    switch (parse_state_) {
1279 label_reading_chunk_length:
1280      case BalsaFrameEnums::READING_CHUNK_LENGTH:
1281        // In this state we read the chunk length.
1282        // Note that once we hit a character which is not in:
1283        // [0-9;A-Fa-f\n], we transition to a different state.
1284        //
1285        {
1286          // If we used strtol, etc, we'd have to buffer this line.
1287          // This is more annoying than simply doing the conversion
1288          // here. This code accounts for overflow.
1289          static const signed char buf[] = {
1290            // %0  %1  %2  %3  %4  %5  %6  %7  %8  \t  \n  %b  %c  \r  %e  %f
1291               -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, -2, -1, -1, -2, -1, -1,
1292            // %10 %11 %12 %13 %14 %15 %16 %17 %18 %19 %1a %1b %1c %1d %1e %1f
1293               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1294            // ' ' %21 %22 %23 %24 %25 %26 %27 %28 %29 %2a %2b %2c %2d %2e %2f
1295               -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1296            // %30 %31 %32 %33 %34 %35 %36 %37 %38 %39 %3a ';' %3c %3d %3e %3f
1297                0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -2, -1, -1, -1, -1,
1298            // %40 'A' 'B' 'C' 'D' 'E' 'F' %47 %48 %49 %4a %4b %4c %4d %4e %4f
1299               -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1300            // %50 %51 %52 %53 %54 %55 %56 %57 %58 %59 %5a %5b %5c %5d %5e %5f
1301               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1302            // %60 'a' 'b' 'c' 'd' 'e' 'f' %67 %68 %69 %6a %6b %6c %6d %6e %6f
1303               -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1304            // %70 %71 %72 %73 %74 %75 %76 %77 %78 %79 %7a %7b %7c %7d %7e %7f
1305               -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
1306          };
1307          // valid cases:
1308          //  "09123\n"                      // -> 09123
1309          //  "09123\r\n"                    // -> 09123
1310          //  "09123  \n"                    // -> 09123
1311          //  "09123  \r\n"                  // -> 09123
1312          //  "09123  12312\n"               // -> 09123
1313          //  "09123  12312\r\n"             // -> 09123
1314          //  "09123; foo=bar\n"             // -> 09123
1315          //  "09123; foo=bar\r\n"           // -> 09123
1316          //  "FFFFFFFFFFFFFFFF\r\n"         // -> FFFFFFFFFFFFFFFF
1317          //  "FFFFFFFFFFFFFFFF 22\r\n"      // -> FFFFFFFFFFFFFFFF
1318          // invalid cases:
1319          // "[ \t]+[^\n]*\n"
1320          // "FFFFFFFFFFFFFFFFF\r\n"  (would overflow)
1321          // "\r\n"
1322          // "\n"
1323          while (current < end) {
1324            const char c = *current;
1325            ++current;
1326            const signed char addition = buf[static_cast<int>(c)];
1327            if (addition >= 0) {
1328              chunk_length_character_extracted_ = true;
1329              size_t length_x_16 = chunk_length_remaining_ * 16;
1330              const size_t kMaxDiv16 = std::numeric_limits<size_t>::max() / 16;
1331              if ((chunk_length_remaining_ > kMaxDiv16) ||
1332                  ((std::numeric_limits<size_t>::max() - length_x_16) <
1333                   static_cast<size_t>(addition))) {
1334                // overflow -- asked for a chunk-length greater than 2^64 - 1!!
1335                parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1336                last_error_ = BalsaFrameEnums::CHUNK_LENGTH_OVERFLOW;
1337                visitor_->ProcessBodyInput(on_entry, current - on_entry);
1338                visitor_->HandleChunkingError(this);
1339                goto bottom;
1340              }
1341              chunk_length_remaining_ = length_x_16 + addition;
1342              continue;
1343            }
1344
1345            if (!chunk_length_character_extracted_ || addition == -1) {
1346              // ^[0-9;A-Fa-f][ \t\n] -- was not matched, either because no
1347              // characters were converted, or an unexpected character was
1348              // seen.
1349              parse_state_ = BalsaFrameEnums::PARSE_ERROR;
1350              last_error_ = BalsaFrameEnums::INVALID_CHUNK_LENGTH;
1351              visitor_->ProcessBodyInput(on_entry, current - on_entry);
1352              visitor_->HandleChunkingError(this);
1353              goto bottom;
1354            }
1355
1356            --current;
1357            parse_state_ = BalsaFrameEnums::READING_CHUNK_EXTENSION;
1358            visitor_->ProcessChunkLength(chunk_length_remaining_);
1359            goto label_reading_chunk_extension;
1360          }
1361        }
1362        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1363        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_LENGTH
1364
1365 label_reading_chunk_extension:
1366      case BalsaFrameEnums::READING_CHUNK_EXTENSION:
1367        {
1368          // TODO(phython): Convert this scanning to be 16 bytes at a time if
1369          // there is data to be read.
1370          const char* extensions_start = current;
1371          size_t extensions_length = 0;
1372          while (current < end) {
1373            const char c = *current;
1374            if (c == '\r' || c == '\n') {
1375              extensions_length =
1376                  (extensions_start == current) ?
1377                  0 :
1378                  current - extensions_start - 1;
1379            }
1380
1381            ++current;
1382            if (c == '\n') {
1383              chunk_length_character_extracted_ = false;
1384              visitor_->ProcessChunkExtensions(
1385                  extensions_start, extensions_length);
1386              if (chunk_length_remaining_ != 0) {
1387                parse_state_ = BalsaFrameEnums::READING_CHUNK_DATA;
1388                goto label_reading_chunk_data;
1389              }
1390              HeaderFramingFound('\n');
1391              parse_state_ = BalsaFrameEnums::READING_LAST_CHUNK_TERM;
1392              goto label_reading_last_chunk_term;
1393            }
1394          }
1395          visitor_->ProcessChunkExtensions(
1396              extensions_start, extensions_length);
1397        }
1398
1399        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1400        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_EXTENSION
1401
1402 label_reading_chunk_data:
1403      case BalsaFrameEnums::READING_CHUNK_DATA:
1404        while (current < end) {
1405          if (chunk_length_remaining_ == 0) {
1406            break;
1407          }
1408          // read in the chunk
1409          size_t bytes_remaining = end - current;
1410          size_t consumed_bytes = (chunk_length_remaining_ < bytes_remaining) ?
1411            chunk_length_remaining_ : bytes_remaining;
1412          const char* tmp_current = current + consumed_bytes;
1413          visitor_->ProcessBodyInput(on_entry, tmp_current - on_entry);
1414          visitor_->ProcessBodyData(current, consumed_bytes);
1415          on_entry = current = tmp_current;
1416          chunk_length_remaining_ -= consumed_bytes;
1417        }
1418        if (chunk_length_remaining_ == 0) {
1419          parse_state_ = BalsaFrameEnums::READING_CHUNK_TERM;
1420          goto label_reading_chunk_term;
1421        }
1422        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1423        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_DATA
1424
1425 label_reading_chunk_term:
1426      case BalsaFrameEnums::READING_CHUNK_TERM:
1427        while (current < end) {
1428          const char c = *current;
1429          ++current;
1430
1431          if (c == '\n') {
1432            parse_state_ = BalsaFrameEnums::READING_CHUNK_LENGTH;
1433            goto label_reading_chunk_length;
1434          }
1435        }
1436        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1437        goto bottom;  // case BalsaFrameEnums::READING_CHUNK_TERM
1438
1439 label_reading_last_chunk_term:
1440      case BalsaFrameEnums::READING_LAST_CHUNK_TERM:
1441        while (current < end) {
1442          const char c = *current;
1443
1444          if (!HeaderFramingFound(c)) {
1445            // If not, however, since the spec only suggests that the
1446            // client SHOULD indicate the presence of trailers, we get to
1447            // *test* that they did or didn't.
1448            // If all of the bytes we've seen since:
1449            //   OPTIONAL_WS 0 OPTIONAL_STUFF CRLF
1450            // are either '\r', or '\n', then we can assume that we don't yet
1451            // know if we need to parse headers, or if the next byte will make
1452            // the HeaderFramingFound condition (above) true.
1453            if (HeaderFramingMayBeFound()) {
1454              // If true, then we have seen only characters '\r' or '\n'.
1455              ++current;
1456
1457              // Let's try again! There is no state change here.
1458              continue;
1459            } else {
1460              // If (!HeaderFramingMayBeFound()), then we know that we must be
1461              // reading the first non CRLF character of a trailer.
1462              parse_state_ = BalsaFrameEnums::READING_TRAILER;
1463              visitor_->ProcessBodyInput(on_entry, current - on_entry);
1464              on_entry = current;
1465              goto label_reading_trailer;
1466            }
1467          } else {
1468            // If we've found a "\r\n\r\n", then the message
1469            // is done.
1470            ++current;
1471            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1472            visitor_->ProcessBodyInput(on_entry, current - on_entry);
1473            visitor_->MessageDone();
1474            goto bottom;
1475          }
1476          break;  // from while loop
1477        }
1478        visitor_->ProcessBodyInput(on_entry, current - on_entry);
1479        goto bottom;  // case BalsaFrameEnums::READING_LAST_CHUNK_TERM
1480
1481 label_reading_trailer:
1482      case BalsaFrameEnums::READING_TRAILER:
1483        while (current < end) {
1484          const char c = *current;
1485          ++current;
1486          // TODO(fenix): If we ever care about trailers as part of framing,
1487          // deal with them here (see below for part of the 'solution')
1488          // if (LineFramingFound(c)) {
1489          // trailer_lines_.push_back(make_pair(start_of_line_,
1490          //                                   trailer_length_ - 1));
1491          // start_of_line_ = trailer_length_;
1492          // }
1493          if (HeaderFramingFound(c)) {
1494            // ProcessTrailers(visitor_, &trailers_);
1495            parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1496            visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1497            visitor_->MessageDone();
1498            goto bottom;
1499          }
1500        }
1501        visitor_->ProcessTrailerInput(on_entry, current - on_entry);
1502        break;  // case BalsaFrameEnums::READING_TRAILER
1503
1504        // Note that there is no label:
1505        //   'label_reading_until_close'
1506        // here. This is because the state-machine exits immediately after
1507        // reading the headers instead of transitioning here (as it would
1508        // do if it was consuming all the data it could, all the time).
1509      case BalsaFrameEnums::READING_UNTIL_CLOSE:
1510        {
1511          const size_t bytes_remaining = end - current;
1512          if (bytes_remaining > 0) {
1513            visitor_->ProcessBodyInput(current, bytes_remaining);
1514            visitor_->ProcessBodyData(current, bytes_remaining);
1515            current += bytes_remaining;
1516          }
1517        }
1518        goto bottom;  // case BalsaFrameEnums::READING_UNTIL_CLOSE
1519
1520        // label_reading_content:
1521      case BalsaFrameEnums::READING_CONTENT:
1522#if DEBUGFRAMER
1523        LOG(INFO) << "ReadingContent: " << content_length_remaining_;
1524#endif  // DEBUGFRAMER
1525        while (content_length_remaining_ && current < end) {
1526          // read in the content
1527          const size_t bytes_remaining = end - current;
1528          const size_t consumed_bytes =
1529            (content_length_remaining_ < bytes_remaining) ?
1530            content_length_remaining_ : bytes_remaining;
1531          visitor_->ProcessBodyInput(current, consumed_bytes);
1532          visitor_->ProcessBodyData(current, consumed_bytes);
1533          current += consumed_bytes;
1534          content_length_remaining_ -= consumed_bytes;
1535        }
1536        if (content_length_remaining_ == 0) {
1537          parse_state_ = BalsaFrameEnums::MESSAGE_FULLY_READ;
1538          visitor_->MessageDone();
1539        }
1540        goto bottom;  // case BalsaFrameEnums::READING_CONTENT
1541
1542      default:
1543        // The state-machine should never be in a state that isn't handled
1544        // above.  This is a glaring logic error, and we should do something
1545        // drastic to ensure that this gets looked-at and fixed.
1546        LOG(FATAL) << "Unknown state: " << parse_state_  // COV_NF_LINE
1547          << " memory corruption?!";                     // COV_NF_LINE
1548    }
1549  }
1550 bottom:
1551#if DEBUGFRAMER
1552  LOG(INFO) << "\n^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n\n"
1553    << std::string(input, current)
1554    << "\n$$$$$$$$$$$$$$"
1555    << BalsaFrameEnums::ParseStateToString(parse_state_)
1556    << "$$$$$$$$$$$$$$$"
1557    << " consumed: " << (current - input);
1558  if (Error()) {
1559    LOG(INFO) << BalsaFrameEnums::ErrorCodeToString(ErrorCode());
1560  }
1561#endif  // DEBUGFRAMER
1562  return current - input;
1563}
1564
1565}  // namespace net
1566