1// Protocol Buffers - Google's data interchange format
2// Copyright 2008 Google Inc.  All rights reserved.
3// http://code.google.com/p/protobuf/
4//
5// Redistribution and use in source and binary forms, with or without
6// modification, are permitted provided that the following conditions are
7// met:
8//
9//     * Redistributions of source code must retain the above copyright
10// notice, this list of conditions and the following disclaimer.
11//     * Redistributions in binary form must reproduce the above
12// copyright notice, this list of conditions and the following disclaimer
13// in the documentation and/or other materials provided with the
14// distribution.
15//     * Neither the name of Google Inc. nor the names of its
16// contributors may be used to endorse or promote products derived from
17// this software without specific prior written permission.
18//
19// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31// Author: kenton@google.com (Kenton Varda)
32//  Based on original Protocol Buffers design by
33//  Sanjay Ghemawat, Jeff Dean, and others.
34//
35// Here we have a hand-written lexer.  At first you might ask yourself,
36// "Hand-written text processing?  Is Kenton crazy?!"  Well, first of all,
37// yes I am crazy, but that's beside the point.  There are actually reasons
// why I ended up writing it this way.
39//
40// The traditional approach to lexing is to use lex to generate a lexer for
41// you.  Unfortunately, lex's output is ridiculously ugly and difficult to
42// integrate cleanly with C++ code, especially abstract code or code meant
43// as a library.  Better parser-generators exist but would add dependencies
44// which most users won't already have, which we'd like to avoid.  (GNU flex
45// has a C++ output option, but it's still ridiculously ugly, non-abstract,
46// and not library-friendly.)
47//
48// The next approach that any good software engineer should look at is to
49// use regular expressions.  And, indeed, I did.  I have code which
50// implements this same class using regular expressions.  It's about 200
51// lines shorter.  However:
52// - Rather than error messages telling you "This string has an invalid
53//   escape sequence at line 5, column 45", you get error messages like
54//   "Parse error on line 5".  Giving more precise errors requires adding
55//   a lot of code that ends up basically as complex as the hand-coded
56//   version anyway.
57// - The regular expression to match a string literal looks like this:
58//     kString  = new RE("(\"([^\"\\\\]|"              // non-escaped
59//                       "\\\\[abfnrtv?\"'\\\\0-7]|"   // normal escape
60//                       "\\\\x[0-9a-fA-F])*\"|"       // hex escape
61//                       "\'([^\'\\\\]|"        // Also support single-quotes.
62//                       "\\\\[abfnrtv?\"'\\\\0-7]|"
63//                       "\\\\x[0-9a-fA-F])*\')");
64//   Verifying the correctness of this line noise is actually harder than
65//   verifying the correctness of ConsumeString(), defined below.  I'm not
66//   even confident that the above is correct, after staring at it for some
67//   time.
68// - PCRE is fast, but there's still more overhead involved than the code
69//   below.
70// - Sadly, regular expressions are not part of the C standard library, so
71//   using them would require depending on some other library.  For the
72//   open source release, this could be really annoying.  Nobody likes
73//   downloading one piece of software just to find that they need to
74//   download something else to make it work, and in all likelihood
75//   people downloading Protocol Buffers will already be doing so just
76//   to make something else work.  We could include a copy of PCRE with
77//   our code, but that obligates us to keep it up-to-date and just seems
78//   like a big waste just to save 200 lines of code.
79//
80// On a similar but unrelated note, I'm even scared to use ctype.h.
81// Apparently functions like isalpha() are locale-dependent.  So, if we used
82// that, then if this code is being called from some program that doesn't
83// have its locale set to "C", it would behave strangely.  We can't just set
84// the locale to "C" ourselves since we might break the calling program that
85// way, particularly if it is multi-threaded.  WTF?  Someone please let me
86// (Kenton) know if I'm missing something here...
87//
88// I'd love to hear about other alternatives, though, as this code isn't
89// exactly pretty.
90
91#include <google/protobuf/io/tokenizer.h>
92#include <google/protobuf/stubs/common.h>
93#include <google/protobuf/stubs/stringprintf.h>
94#include <google/protobuf/io/zero_copy_stream.h>
95#include <google/protobuf/stubs/strutil.h>
96#include <google/protobuf/stubs/stl_util.h>
97
98namespace google {
99namespace protobuf {
100namespace io {
101namespace {
102
103// As mentioned above, I don't trust ctype.h due to the presence of "locales".
104// So, I have written replacement functions here.  Someone please smack me if
105// this is a bad idea or if there is some way around this.
106//
107// These "character classes" are designed to be used in template methods.
108// For instance, Tokenizer::ConsumeZeroOrMore<Whitespace>() will eat
109// whitespace.
110
111// Note:  No class is allowed to contain '\0', since this is used to mark end-
112//   of-input and is handled specially.
113
// Defines a stateless type NAME whose static InClass(c) reports whether the
// character c satisfies EXPRESSION.  EXPRESSION is written in terms of c.
#define CHARACTER_CLASS(NAME, EXPRESSION)                  \
  struct NAME {                                            \
    static bool InClass(char c) { return (EXPRESSION); }   \
  }

// All whitespace, and the same set without the line feed.
CHARACTER_CLASS(Whitespace, c == ' ' || c == '\t' || c == '\n' ||
                            c == '\v' || c == '\f' || c == '\r');
CHARACTER_CLASS(WhitespaceNoNewline, c == ' ' || c == '\t' ||
                                     c == '\v' || c == '\f' || c == '\r');

// Control characters below space.  '\0' is deliberately left out because it
// marks end-of-input.
CHARACTER_CLASS(Unprintable, '\0' < c && c < ' ');

CHARACTER_CLASS(Digit, c >= '0' && c <= '9');
CHARACTER_CLASS(OctalDigit, c >= '0' && c <= '7');
CHARACTER_CLASS(HexDigit, Digit::InClass(c) ||
                          (c >= 'a' && c <= 'f') ||
                          (c >= 'A' && c <= 'F'));

// Characters that may start an identifier.
CHARACTER_CLASS(Letter, c == '_' ||
                        (c >= 'a' && c <= 'z') ||
                        (c >= 'A' && c <= 'Z'));

// Characters that may continue an identifier.
CHARACTER_CLASS(Alphanumeric, Letter::InClass(c) || Digit::InClass(c));

// Characters that form a single-character escape when they follow a
// backslash.
CHARACTER_CLASS(Escape, c == '\\' || c == '?' || c == '\'' || c == '\"' ||
                        c == 'a' || c == 'b' || c == 'f' || c == 'n' ||
                        c == 'r' || c == 't' || c == 'v');

#undef CHARACTER_CLASS
149
// Maps a character to its numeric value as a digit: '0'-'9' give 0-9 and
// letters of either case give 10-35, which is enough for any base up to 36.
// Every other character gives -1.
inline int DigitValue(char digit) {
  if (digit >= '0' && digit <= '9') {
    return digit - '0';
  } else if (digit >= 'a' && digit <= 'z') {
    return 10 + (digit - 'a');
  } else if (digit >= 'A' && digit <= 'Z') {
    return 10 + (digit - 'A');
  }
  return -1;
}
158
// Returns the character denoted by the single-character escape "\c".
// Escape sequences are expected to have been validated separately, so an
// unrecognized c simply yields '?'.  Inline because there is only one caller.
inline char TranslateEscape(char c) {
  // Parallel tables: kNames[i] is the letter after the backslash and
  // kValues[i] is the character it stands for.
  static const char kNames[] = {
    'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', '?', '\'', '"'
  };
  static const char kValues[] = {
    '\a', '\b', '\f', '\n', '\r', '\t', '\v', '\\', '\?', '\'', '\"'
  };

  for (unsigned int i = 0; i < sizeof(kNames); ++i) {
    if (kNames[i] == c) return kValues[i];
  }
  return '?';
}
178
179}  // anonymous namespace
180
// Out-of-line definition of the ErrorCollector destructor.  The body is
// empty.
ErrorCollector::~ErrorCollector() {}
182
183// ===================================================================
184
185Tokenizer::Tokenizer(ZeroCopyInputStream* input,
186                     ErrorCollector* error_collector)
187  : input_(input),
188    error_collector_(error_collector),
189    buffer_(NULL),
190    buffer_size_(0),
191    buffer_pos_(0),
192    read_error_(false),
193    line_(0),
194    column_(0),
195    record_target_(NULL),
196    record_start_(-1),
197    allow_f_after_float_(false),
198    comment_style_(CPP_COMMENT_STYLE) {
199
200  current_.line = 0;
201  current_.column = 0;
202  current_.end_column = 0;
203  current_.type = TYPE_START;
204
205  Refresh();
206}
207
208Tokenizer::~Tokenizer() {
209  // If we had any buffer left unread, return it to the underlying stream
210  // so that someone else can read it.
211  if (buffer_size_ > buffer_pos_) {
212    input_->BackUp(buffer_size_ - buffer_pos_);
213  }
214}
215
216// -------------------------------------------------------------------
217// Internal helpers.
218
219void Tokenizer::NextChar() {
220  // Update our line and column counters based on the character being
221  // consumed.
222  if (current_char_ == '\n') {
223    ++line_;
224    column_ = 0;
225  } else if (current_char_ == '\t') {
226    column_ += kTabWidth - column_ % kTabWidth;
227  } else {
228    ++column_;
229  }
230
231  // Advance to the next character.
232  ++buffer_pos_;
233  if (buffer_pos_ < buffer_size_) {
234    current_char_ = buffer_[buffer_pos_];
235  } else {
236    Refresh();
237  }
238}
239
// Replaces the current buffer with the next non-empty buffer from input_ and
// loads its first byte into current_char_.  If the stream ends or fails,
// read_error_ is set and current_char_ becomes '\0'; once read_error_ is set,
// later calls only reset current_char_ to '\0' and never touch the stream.
void Tokenizer::Refresh() {
  if (read_error_) {
    current_char_ = '\0';
    return;
  }

  // If we're in a token, append the rest of the buffer to it.  record_start_
  // is then reset to 0 so that recording continues from the beginning of the
  // buffer fetched below.
  if (record_target_ != NULL && record_start_ < buffer_size_) {
    record_target_->append(buffer_ + record_start_, buffer_size_ - record_start_);
    record_start_ = 0;
  }

  const void* data = NULL;
  buffer_ = NULL;
  buffer_pos_ = 0;
  // Keep fetching until we get a buffer with at least one byte in it, or the
  // stream reports that nothing more is available.
  do {
    if (!input_->Next(&data, &buffer_size_)) {
      // end of stream (or read error)
      buffer_size_ = 0;
      read_error_ = true;
      current_char_ = '\0';
      return;
    }
  } while (buffer_size_ == 0);

  buffer_ = static_cast<const char*>(data);

  current_char_ = buffer_[0];
}
269
270inline void Tokenizer::RecordTo(string* target) {
271  record_target_ = target;
272  record_start_ = buffer_pos_;
273}
274
275inline void Tokenizer::StopRecording() {
276  // Note:  The if() is necessary because some STL implementations crash when
277  //   you call string::append(NULL, 0), presumably because they are trying to
278  //   be helpful by detecting the NULL pointer, even though there's nothing
279  //   wrong with reading zero bytes from NULL.
280  if (buffer_pos_ != record_start_) {
281    record_target_->append(buffer_ + record_start_, buffer_pos_ - record_start_);
282  }
283  record_target_ = NULL;
284  record_start_ = -1;
285}
286
287inline void Tokenizer::StartToken() {
288  current_.type = TYPE_START;    // Just for the sake of initializing it.
289  current_.text.clear();
290  current_.line = line_;
291  current_.column = column_;
292  RecordTo(&current_.text);
293}
294
295inline void Tokenizer::EndToken() {
296  StopRecording();
297  current_.end_column = column_;
298}
299
300// -------------------------------------------------------------------
301// Helper methods that consume characters.
302
303template<typename CharacterClass>
304inline bool Tokenizer::LookingAt() {
305  return CharacterClass::InClass(current_char_);
306}
307
308template<typename CharacterClass>
309inline bool Tokenizer::TryConsumeOne() {
310  if (CharacterClass::InClass(current_char_)) {
311    NextChar();
312    return true;
313  } else {
314    return false;
315  }
316}
317
318inline bool Tokenizer::TryConsume(char c) {
319  if (current_char_ == c) {
320    NextChar();
321    return true;
322  } else {
323    return false;
324  }
325}
326
327template<typename CharacterClass>
328inline void Tokenizer::ConsumeZeroOrMore() {
329  while (CharacterClass::InClass(current_char_)) {
330    NextChar();
331  }
332}
333
334template<typename CharacterClass>
335inline void Tokenizer::ConsumeOneOrMore(const char* error) {
336  if (!CharacterClass::InClass(current_char_)) {
337    AddError(error);
338  } else {
339    do {
340      NextChar();
341    } while (CharacterClass::InClass(current_char_));
342  }
343}
344
345// -------------------------------------------------------------------
346// Methods that read whole patterns matching certain kinds of tokens
347// or comments.
348
// Consumes the remainder of a string literal, up to and including the closing
// |delimiter|.  The opening quote is not handled here; Next() consumes it
// before calling this.  Escape sequences are only checked for well-formedness
// and are not translated.  Malformed escapes are reported through AddError()
// and scanning continues; a newline or '\0' inside the literal is reported
// and ends the scan without consuming that character.
void Tokenizer::ConsumeString(char delimiter) {
  while (true) {
    switch (current_char_) {
      case '\0':
      case '\n': {
        AddError("String literals cannot cross line boundaries.");
        return;
      }

      case '\\': {
        // An escape sequence.
        NextChar();
        if (TryConsumeOne<Escape>()) {
          // Valid escape sequence.
        } else if (TryConsumeOne<OctalDigit>()) {
          // Possibly followed by two more octal digits, but these will
          // just be consumed by the main loop anyway so we don't need
          // to do so explicitly here.
        } else if (TryConsume('x') || TryConsume('X')) {
          if (!TryConsumeOne<HexDigit>()) {
            AddError("Expected hex digits for escape sequence.");
          }
          // Possibly followed by another hex digit, but again we don't care.
        } else if (TryConsume('u')) {
          // Exactly four hex digits are required.  The || chain stops
          // consuming at the first character that is not a hex digit.
          if (!TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected four hex digits for \\u escape sequence.");
          }
        } else if (TryConsume('U')) {
          // We expect 8 hex digits; but only the range up to 0x10ffff is
          // legal.
          // This check requires "00", then '0' or '1', then five hex digits.
          if (!TryConsume('0') ||
              !TryConsume('0') ||
              !(TryConsume('0') || TryConsume('1')) ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>() ||
              !TryConsumeOne<HexDigit>()) {
            AddError("Expected eight hex digits up to 10ffff for \\U escape "
                     "sequence");
          }
        } else {
          AddError("Invalid escape sequence in string literal.");
        }
        break;
      }

      default: {
        // An ordinary character: consume it, and stop if it was the closing
        // delimiter.
        if (current_char_ == delimiter) {
          NextChar();
          return;
        }
        NextChar();
        break;
      }
    }
  }
}
410
// Consumes the rest of a numeric literal whose first character the caller has
// already consumed.
//
//   started_with_zero: the consumed first character was '0', so the literal
//                      may turn out to be hex ("0x...") or octal.
//   started_with_dot:  the literal began with '.'; Next() passes this after
//                      consuming the '.' and one digit.
//
// Returns TYPE_FLOAT if a decimal point, an exponent, or (when
// allow_f_after_float_ is set) an 'f'/'F' suffix was seen, and TYPE_INTEGER
// otherwise.  Malformed literals are reported through AddError(), but a token
// type is still returned.
Tokenizer::TokenType Tokenizer::ConsumeNumber(bool started_with_zero,
                                              bool started_with_dot) {
  bool is_float = false;

  if (started_with_zero && (TryConsume('x') || TryConsume('X'))) {
    // A hex number (started with "0x").
    ConsumeOneOrMore<HexDigit>("\"0x\" must be followed by hex digits.");

  } else if (started_with_zero && LookingAt<Digit>()) {
    // An octal number (had a leading zero).
    ConsumeZeroOrMore<OctalDigit>();
    if (LookingAt<Digit>()) {
      // An '8' or '9' stopped the octal run.  Report it, but swallow the
      // remaining digits so they stay part of this token.
      AddError("Numbers starting with leading zero must be in octal.");
      ConsumeZeroOrMore<Digit>();
    }

  } else {
    // A decimal number.
    if (started_with_dot) {
      is_float = true;
      ConsumeZeroOrMore<Digit>();
    } else {
      ConsumeZeroOrMore<Digit>();

      if (TryConsume('.')) {
        is_float = true;
        ConsumeZeroOrMore<Digit>();
      }
    }

    if (TryConsume('e') || TryConsume('E')) {
      is_float = true;
      // Optional exponent sign; only the side effect of consuming it matters.
      TryConsume('-') || TryConsume('+');
      ConsumeOneOrMore<Digit>("\"e\" must be followed by exponent.");
    }

    if (allow_f_after_float_ && (TryConsume('f') || TryConsume('F'))) {
      is_float = true;
    }
  }

  // Check what immediately follows the number.  The offending character is
  // only reported, not consumed.
  if (LookingAt<Letter>()) {
    AddError("Need space between number and identifier.");
  } else if (current_char_ == '.') {
    if (is_float) {
      AddError(
        "Already saw decimal point or exponent; can't have another one.");
    } else {
      AddError("Hex and octal numbers must be integers.");
    }
  }

  return is_float ? TYPE_FLOAT : TYPE_INTEGER;
}
465
466void Tokenizer::ConsumeLineComment(string* content) {
467  if (content != NULL) RecordTo(content);
468
469  while (current_char_ != '\0' && current_char_ != '\n') {
470    NextChar();
471  }
472  TryConsume('\n');
473
474  if (content != NULL) StopRecording();
475}
476
// Consumes the body of a block comment through the closing "*/".  The start
// position used for error reporting is computed as column_ - 2, i.e. the two
// characters before the current position are taken to be the opening "/*".
//
// When |content| is non-NULL, the comment text is appended to it.  On each
// continuation line the leading whitespace and a single leading '*' are left
// out, and the closing "*/" is not included.
//
// Reports an error for a nested "/*" and for end of input inside the comment.
void Tokenizer::ConsumeBlockComment(string* content) {
  int start_line = line_;
  int start_column = column_ - 2;

  if (content != NULL) RecordTo(content);

  while (true) {
    // Skip ahead to the next character that needs special handling.
    while (current_char_ != '\0' &&
           current_char_ != '*' &&
           current_char_ != '/' &&
           current_char_ != '\n') {
      NextChar();
    }

    if (TryConsume('\n')) {
      // Pause recording so the line's leading decoration is not captured.
      if (content != NULL) StopRecording();

      // Consume leading whitespace and asterisk;
      ConsumeZeroOrMore<WhitespaceNoNewline>();
      if (TryConsume('*')) {
        if (TryConsume('/')) {
          // End of comment.
          break;
        }
      }

      if (content != NULL) RecordTo(content);
    } else if (TryConsume('*') && TryConsume('/')) {
      // End of comment.
      if (content != NULL) {
        StopRecording();
        // Strip trailing "*/".
        content->erase(content->size() - 2);
      }
      break;
    } else if (TryConsume('/') && current_char_ == '*') {
      // Note:  We didn't consume the '*' because if there is a '/' after it
      //   we want to interpret that as the end of the comment.
      AddError(
        "\"/*\" inside block comment.  Block comments cannot be nested.");
    } else if (current_char_ == '\0') {
      AddError("End-of-file inside block comment.");
      error_collector_->AddError(
        start_line, start_column, "  Comment started here.");
      if (content != NULL) StopRecording();
      break;
    }
  }
}
526
527Tokenizer::NextCommentStatus Tokenizer::TryConsumeCommentStart() {
528  if (comment_style_ == CPP_COMMENT_STYLE && TryConsume('/')) {
529    if (TryConsume('/')) {
530      return LINE_COMMENT;
531    } else if (TryConsume('*')) {
532      return BLOCK_COMMENT;
533    } else {
534      // Oops, it was just a slash.  Return it.
535      current_.type = TYPE_SYMBOL;
536      current_.text = "/";
537      current_.line = line_;
538      current_.column = column_ - 1;
539      current_.end_column = column_;
540      return SLASH_NOT_COMMENT;
541    }
542  } else if (comment_style_ == SH_COMMENT_STYLE && TryConsume('#')) {
543    return LINE_COMMENT;
544  } else {
545    return NO_COMMENT;
546  }
547}
548
549// -------------------------------------------------------------------
550
// Advances to the next token, skipping whitespace and comments.  The token
// that was current before the call is saved in previous_.
//
// Returns true when a token was read into current_.  Returns false at end of
// input (or after a read error), in which case current_ is set to an empty
// TYPE_END token at the current position.
bool Tokenizer::Next() {
  previous_ = current_;

  while (!read_error_) {
    ConsumeZeroOrMore<Whitespace>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(NULL);
        continue;
      case BLOCK_COMMENT:
        ConsumeBlockComment(NULL);
        continue;
      case SLASH_NOT_COMMENT:
        // TryConsumeCommentStart() already filled in current_ for the '/'.
        return true;
      case NO_COMMENT:
        break;
    }

    // Check for EOF before continuing.
    if (read_error_) break;

    if (LookingAt<Unprintable>() || current_char_ == '\0') {
      AddError("Invalid control characters encountered in text.");
      NextChar();
      // Skip more unprintable characters, too.  But, remember that '\0' is
      // also what current_char_ is set to after EOF / read error.  We have
      // to be careful not to go into an infinite loop of trying to consume
      // it, so make sure to check read_error_ explicitly before consuming
      // '\0'.
      while (TryConsumeOne<Unprintable>() ||
             (!read_error_ && TryConsume('\0'))) {
        // Ignore.
      }

    } else {
      // Reading some sort of token.
      StartToken();

      if (TryConsumeOne<Letter>()) {
        ConsumeZeroOrMore<Alphanumeric>();
        current_.type = TYPE_IDENTIFIER;
      } else if (TryConsume('0')) {
        current_.type = ConsumeNumber(true, false);
      } else if (TryConsume('.')) {
        // This could be the beginning of a floating-point number, or it could
        // just be a '.' symbol.

        if (TryConsumeOne<Digit>()) {
          // It's a floating-point number.
          if (previous_.type == TYPE_IDENTIFIER &&
              current_.line == previous_.line &&
              current_.column == previous_.end_column) {
            // We don't accept syntax like "blah.123".
            // The '.' and one digit have been consumed, so column_ - 2 is
            // the column of the '.'.
            error_collector_->AddError(line_, column_ - 2,
              "Need space between identifier and decimal point.");
          }
          current_.type = ConsumeNumber(false, true);
        } else {
          current_.type = TYPE_SYMBOL;
        }
      } else if (TryConsumeOne<Digit>()) {
        current_.type = ConsumeNumber(false, false);
      } else if (TryConsume('\"')) {
        ConsumeString('\"');
        current_.type = TYPE_STRING;
      } else if (TryConsume('\'')) {
        ConsumeString('\'');
        current_.type = TYPE_STRING;
      } else {
        // Anything else is a single-character symbol.
        NextChar();
        current_.type = TYPE_SYMBOL;
      }

      EndToken();
      return true;
    }
  }

  // EOF
  current_.type = TYPE_END;
  current_.text.clear();
  current_.line = line_;
  current_.column = column_;
  current_.end_column = column_;
  return false;
}
638
639namespace {
640
641// Helper class for collecting comments and putting them in the right places.
642//
643// This basically just buffers the most recent comment until it can be decided
644// exactly where that comment should be placed.  When Flush() is called, the
645// current comment goes into either prev_trailing_comments or detached_comments.
646// When the CommentCollector is destroyed, the last buffered comment goes into
647// next_leading_comments.
648class CommentCollector {
649 public:
650  CommentCollector(string* prev_trailing_comments,
651                   vector<string>* detached_comments,
652                   string* next_leading_comments)
653      : prev_trailing_comments_(prev_trailing_comments),
654        detached_comments_(detached_comments),
655        next_leading_comments_(next_leading_comments),
656        has_comment_(false),
657        is_line_comment_(false),
658        can_attach_to_prev_(true) {
659    if (prev_trailing_comments != NULL) prev_trailing_comments->clear();
660    if (detached_comments != NULL) detached_comments->clear();
661    if (next_leading_comments != NULL) next_leading_comments->clear();
662  }
663
664  ~CommentCollector() {
665    // Whatever is in the buffer is a leading comment.
666    if (next_leading_comments_ != NULL && has_comment_) {
667      comment_buffer_.swap(*next_leading_comments_);
668    }
669  }
670
671  // About to read a line comment.  Get the comment buffer pointer in order to
672  // read into it.
673  string* GetBufferForLineComment() {
674    // We want to combine with previous line comments, but not block comments.
675    if (has_comment_ && !is_line_comment_) {
676      Flush();
677    }
678    has_comment_ = true;
679    is_line_comment_ = true;
680    return &comment_buffer_;
681  }
682
683  // About to read a block comment.  Get the comment buffer pointer in order to
684  // read into it.
685  string* GetBufferForBlockComment() {
686    if (has_comment_) {
687      Flush();
688    }
689    has_comment_ = true;
690    is_line_comment_ = false;
691    return &comment_buffer_;
692  }
693
694  void ClearBuffer() {
695    comment_buffer_.clear();
696    has_comment_ = false;
697  }
698
699  // Called once we know that the comment buffer is complete and is *not*
700  // connected to the next token.
701  void Flush() {
702    if (has_comment_) {
703      if (can_attach_to_prev_) {
704        if (prev_trailing_comments_ != NULL) {
705          prev_trailing_comments_->append(comment_buffer_);
706        }
707        can_attach_to_prev_ = false;
708      } else {
709        if (detached_comments_ != NULL) {
710          detached_comments_->push_back(comment_buffer_);
711        }
712      }
713      ClearBuffer();
714    }
715  }
716
717  void DetachFromPrev() {
718    can_attach_to_prev_ = false;
719  }
720
721 private:
722  string* prev_trailing_comments_;
723  vector<string>* detached_comments_;
724  string* next_leading_comments_;
725
726  string comment_buffer_;
727
728  // True if any comments were read into comment_buffer_.  This can be true even
729  // if comment_buffer_ is empty, namely if the comment was "/**/".
730  bool has_comment_;
731
732  // Is the comment in the comment buffer a line comment?
733  bool is_line_comment_;
734
735  // Is it still possible that we could be reading a comment attached to the
736  // previous token?
737  bool can_attach_to_prev_;
738};
739
740} // namespace
741
// Like Next(), but also gathers the comments found between the previous token
// and the next one, via a CommentCollector:
//   prev_trailing_comments - comment text judged to belong to the previous
//                            token.
//   detached_comments      - comments attached to neither token.
//   next_leading_comments  - the comment immediately preceding the next token.
// Each output is cleared first and any of them may be NULL.
//
// Returns true if a token was read and false at end of input, exactly as
// Next() does.
bool Tokenizer::NextWithComments(string* prev_trailing_comments,
                                 vector<string>* detached_comments,
                                 string* next_leading_comments) {
  CommentCollector collector(prev_trailing_comments, detached_comments,
                             next_leading_comments);

  if (current_.type == TYPE_START) {
    // No token has been read yet, so there is nothing for a comment to trail.
    collector.DetachFromPrev();
  } else {
    // A comment appearing on the same line must be attached to the previous
    // declaration.
    ConsumeZeroOrMore<WhitespaceNoNewline>();
    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        ConsumeZeroOrMore<WhitespaceNoNewline>();
        if (!TryConsume('\n')) {
          // Oops, the next token is on the same line.  If we recorded a comment
          // we really have no idea which token it should be attached to.
          collector.ClearBuffer();
          return Next();
        }

        // Don't allow comments on subsequent lines to be attached to a trailing
        // comment.
        collector.Flush();
        break;
      case SLASH_NOT_COMMENT:
        // TryConsumeCommentStart() already set current_ to the '/' symbol.
        return true;
      case NO_COMMENT:
        if (!TryConsume('\n')) {
          // The next token is on the same line.  There are no comments.
          return Next();
        }
        break;
    }
  }

  // OK, we are now on the line *after* the previous token.
  while (true) {
    ConsumeZeroOrMore<WhitespaceNoNewline>();

    switch (TryConsumeCommentStart()) {
      case LINE_COMMENT:
        ConsumeLineComment(collector.GetBufferForLineComment());
        break;
      case BLOCK_COMMENT:
        ConsumeBlockComment(collector.GetBufferForBlockComment());

        // Consume the rest of the line so that we don't interpret it as a
        // blank line the next time around the loop.
        ConsumeZeroOrMore<WhitespaceNoNewline>();
        TryConsume('\n');
        break;
      case SLASH_NOT_COMMENT:
        // TryConsumeCommentStart() already set current_ to the '/' symbol.
        return true;
      case NO_COMMENT:
        if (TryConsume('\n')) {
          // Completely blank line.
          collector.Flush();
          collector.DetachFromPrev();
        } else {
          bool result = Next();
          if (!result ||
              current_.text == "}" ||
              current_.text == "]" ||
              current_.text == ")") {
            // It looks like we're at the end of a scope.  In this case it
            // makes no sense to attach a comment to the following token.
            collector.Flush();
          }
          // Any comment still buffered is handed to next_leading_comments
          // when |collector| is destroyed on return.
          return result;
        }
        break;
    }
  }
}
827
828// -------------------------------------------------------------------
829// Token-parsing helpers.  Remember that these don't need to report
830// errors since any errors should already have been reported while
831// tokenizing.  Also, these can assume that whatever text they
832// are given is text that the tokenizer actually parsed as a token
833// of the given type.
834
835bool Tokenizer::ParseInteger(const string& text, uint64 max_value,
836                             uint64* output) {
837  // Sadly, we can't just use strtoul() since it is only 32-bit and strtoull()
838  // is non-standard.  I hate the C standard library.  :(
839
840//  return strtoull(text.c_str(), NULL, 0);
841
842  const char* ptr = text.c_str();
843  int base = 10;
844  if (ptr[0] == '0') {
845    if (ptr[1] == 'x' || ptr[1] == 'X') {
846      // This is hex.
847      base = 16;
848      ptr += 2;
849    } else {
850      // This is octal.
851      base = 8;
852    }
853  }
854
855  uint64 result = 0;
856  for (; *ptr != '\0'; ptr++) {
857    int digit = DigitValue(*ptr);
858    GOOGLE_LOG_IF(DFATAL, digit < 0 || digit >= base)
859      << " Tokenizer::ParseInteger() passed text that could not have been"
860         " tokenized as an integer: " << CEscape(text);
861    if (digit > max_value || result > (max_value - digit) / base) {
862      // Overflow.
863      return false;
864    }
865    result = result * base + digit;
866  }
867
868  *output = result;
869  return true;
870}
871
872double Tokenizer::ParseFloat(const string& text) {
873  const char* start = text.c_str();
874  char* end;
875  double result = NoLocaleStrtod(start, &end);
876
877  // "1e" is not a valid float, but if the tokenizer reads it, it will
878  // report an error but still return it as a valid token.  We need to
879  // accept anything the tokenizer could possibly return, error or not.
880  if (*end == 'e' || *end == 'E') {
881    ++end;
882    if (*end == '-' || *end == '+') ++end;
883  }
884
885  // If the Tokenizer had allow_f_after_float_ enabled, the float may be
886  // suffixed with the letter 'f'.
887  if (*end == 'f' || *end == 'F') {
888    ++end;
889  }
890
891  GOOGLE_LOG_IF(DFATAL, end - start != text.size() || *start == '-')
892    << " Tokenizer::ParseFloat() passed text that could not have been"
893       " tokenized as a float: " << CEscape(text);
894  return result;
895}
896
897// Helper to append a Unicode code point to a string as UTF8, without bringing
898// in any external dependencies.
899static void AppendUTF8(uint32 code_point, string* output) {
900  uint32 tmp = 0;
901  int len = 0;
902  if (code_point <= 0x7f) {
903    tmp = code_point;
904    len = 1;
905  } else if (code_point <= 0x07ff) {
906    tmp = 0x0000c080 |
907        ((code_point & 0x07c0) << 2) |
908        (code_point & 0x003f);
909    len = 2;
910  } else if (code_point <= 0xffff) {
911    tmp = 0x00e08080 |
912        ((code_point & 0xf000) << 4) |
913        ((code_point & 0x0fc0) << 2) |
914        (code_point & 0x003f);
915    len = 3;
916  } else if (code_point <= 0x1fffff) {
917    tmp = 0xf0808080 |
918        ((code_point & 0x1c0000) << 6) |
919        ((code_point & 0x03f000) << 4) |
920        ((code_point & 0x000fc0) << 2) |
921        (code_point & 0x003f);
922    len = 4;
923  } else {
924    // UTF-16 is only defined for code points up to 0x10FFFF, and UTF-8 is
925    // normally only defined up to there as well.
926    StringAppendF(output, "\\U%08x", code_point);
927    return;
928  }
929  tmp = ghtonl(tmp);
930  output->append(reinterpret_cast<const char*>(&tmp) + sizeof(tmp) - len, len);
931}
932
933// Try to read <len> hex digits from ptr, and stuff the numeric result into
934// *result. Returns true if that many digits were successfully consumed.
935static bool ReadHexDigits(const char* ptr, int len, uint32* result) {
936  *result = 0;
937  if (len == 0) return false;
938  for (const char* end = ptr + len; ptr < end; ++ptr) {
939    if (*ptr == '\0') return false;
940    *result = (*result << 4) + DigitValue(*ptr);
941  }
942  return true;
943}
944
945// Handling UTF-16 surrogate pairs. UTF-16 encodes code points in the range
946// 0x10000...0x10ffff as a pair of numbers, a head surrogate followed by a trail
947// surrogate. These numbers are in a reserved range of Unicode code points, so
948// if we encounter such a pair we know how to parse it and convert it into a
949// single code point.
950static const uint32 kMinHeadSurrogate = 0xd800;
951static const uint32 kMaxHeadSurrogate = 0xdc00;
952static const uint32 kMinTrailSurrogate = 0xdc00;
953static const uint32 kMaxTrailSurrogate = 0xe000;
954
955static inline bool IsHeadSurrogate(uint32 code_point) {
956  return (code_point >= kMinHeadSurrogate) && (code_point < kMaxHeadSurrogate);
957}
958
959static inline bool IsTrailSurrogate(uint32 code_point) {
960  return (code_point >= kMinTrailSurrogate) &&
961      (code_point < kMaxTrailSurrogate);
962}
963
964// Combine a head and trail surrogate into a single Unicode code point.
965static uint32 AssembleUTF16(uint32 head_surrogate, uint32 trail_surrogate) {
966  GOOGLE_DCHECK(IsHeadSurrogate(head_surrogate));
967  GOOGLE_DCHECK(IsTrailSurrogate(trail_surrogate));
968  return 0x10000 + (((head_surrogate - kMinHeadSurrogate) << 10) |
969      (trail_surrogate - kMinTrailSurrogate));
970}
971
// Maps the letter that introduces a Unicode escape to the number of hex
// digits expected after it: 4 for 'u', 8 for 'U', and 0 for anything else.
static inline int UnicodeLength(char key) {
  switch (key) {
    case 'u':
      return 4;
    case 'U':
      return 8;
    default:
      return 0;
  }
}
978
979// Given a pointer to the 'u' or 'U' starting a Unicode escape sequence, attempt
980// to parse that sequence. On success, returns a pointer to the first char
981// beyond that sequence, and fills in *code_point. On failure, returns ptr
982// itself.
983static const char* FetchUnicodePoint(const char* ptr, uint32* code_point) {
984  const char* p = ptr;
985  // Fetch the code point.
986  const int len = UnicodeLength(*p++);
987  if (!ReadHexDigits(p, len, code_point))
988    return ptr;
989  p += len;
990
991  // Check if the code point we read is a "head surrogate." If so, then we
992  // expect it to be immediately followed by another code point which is a valid
993  // "trail surrogate," and together they form a UTF-16 pair which decodes into
994  // a single Unicode point. Trail surrogates may only use \u, not \U.
995  if (IsHeadSurrogate(*code_point) && *p == '\\' && *(p + 1) == 'u') {
996    uint32 trail_surrogate;
997    if (ReadHexDigits(p + 2, 4, &trail_surrogate) &&
998        IsTrailSurrogate(trail_surrogate)) {
999      *code_point = AssembleUTF16(*code_point, trail_surrogate);
1000      p += 6;
1001    }
1002    // If this failed, then we just emit the head surrogate as a code point.
1003    // It's bogus, but so is the string.
1004  }
1005
1006  return p;
1007}
1008
1009// The text string must begin and end with single or double quote
1010// characters.
1011void Tokenizer::ParseStringAppend(const string& text, string* output) {
1012  // Reminder: text[0] is always a quote character.  (If text is
1013  // empty, it's invalid, so we'll just return).
1014  const size_t text_size = text.size();
1015  if (text_size == 0) {
1016    GOOGLE_LOG(DFATAL)
1017      << " Tokenizer::ParseStringAppend() passed text that could not"
1018         " have been tokenized as a string: " << CEscape(text);
1019    return;
1020  }
1021
1022  // Reserve room for new string. The branch is necessary because if
1023  // there is already space available the reserve() call might
1024  // downsize the output.
1025  const size_t new_len = text_size + output->size();
1026  if (new_len > output->capacity()) {
1027    output->reserve(new_len);
1028  }
1029
1030  // Loop through the string copying characters to "output" and
1031  // interpreting escape sequences.  Note that any invalid escape
1032  // sequences or other errors were already reported while tokenizing.
1033  // In this case we do not need to produce valid results.
1034  for (const char* ptr = text.c_str() + 1; *ptr != '\0'; ptr++) {
1035    if (*ptr == '\\' && ptr[1] != '\0') {
1036      // An escape sequence.
1037      ++ptr;
1038
1039      if (OctalDigit::InClass(*ptr)) {
1040        // An octal escape.  May one, two, or three digits.
1041        int code = DigitValue(*ptr);
1042        if (OctalDigit::InClass(ptr[1])) {
1043          ++ptr;
1044          code = code * 8 + DigitValue(*ptr);
1045        }
1046        if (OctalDigit::InClass(ptr[1])) {
1047          ++ptr;
1048          code = code * 8 + DigitValue(*ptr);
1049        }
1050        output->push_back(static_cast<char>(code));
1051
1052      } else if (*ptr == 'x') {
1053        // A hex escape.  May zero, one, or two digits.  (The zero case
1054        // will have been caught as an error earlier.)
1055        int code = 0;
1056        if (HexDigit::InClass(ptr[1])) {
1057          ++ptr;
1058          code = DigitValue(*ptr);
1059        }
1060        if (HexDigit::InClass(ptr[1])) {
1061          ++ptr;
1062          code = code * 16 + DigitValue(*ptr);
1063        }
1064        output->push_back(static_cast<char>(code));
1065
1066      } else if (*ptr == 'u' || *ptr == 'U') {
1067        uint32 unicode;
1068        const char* end = FetchUnicodePoint(ptr, &unicode);
1069        if (end == ptr) {
1070          // Failure: Just dump out what we saw, don't try to parse it.
1071          output->push_back(*ptr);
1072        } else {
1073          AppendUTF8(unicode, output);
1074          ptr = end - 1;  // Because we're about to ++ptr.
1075        }
1076      } else {
1077        // Some other escape code.
1078        output->push_back(TranslateEscape(*ptr));
1079      }
1080
1081    } else if (*ptr == text[0] && ptr[1] == '\0') {
1082      // Ignore final quote matching the starting quote.
1083    } else {
1084      output->push_back(*ptr);
1085    }
1086  }
1087}
1088
1089}  // namespace io
1090}  // namespace protobuf
1091}  // namespace google
1092