Lexer.cpp revision 515f43f9f23de50d155b481b8774ec40bdfd7ff2
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This file implements the Lexer and Token interfaces.
11//
12//===----------------------------------------------------------------------===//
13//
14// TODO: GCC Diagnostics emitted by the lexer:
15// PEDWARN: (form feed|vertical tab) in preprocessing directive
16//
17// Universal characters, unicode, char mapping:
18// WARNING: `%.*s' is not in NFKC
19// WARNING: `%.*s' is not in NFC
20//
21// Other:
22// TODO: Options to support:
23//    -fexec-charset,-fwide-exec-charset
24//
25//===----------------------------------------------------------------------===//
26
27#include "clang/Lex/Lexer.h"
28#include "clang/Lex/Preprocessor.h"
29#include "clang/Lex/LexDiagnostic.h"
30#include "clang/Basic/SourceManager.h"
31#include "llvm/Support/Compiler.h"
32#include "llvm/Support/MemoryBuffer.h"
33#include <cctype>
34using namespace clang;
35
36static void InitCharacterInfo();
37
38//===----------------------------------------------------------------------===//
39// Token Class Implementation
40//===----------------------------------------------------------------------===//
41
42/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
43bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
44  if (IdentifierInfo *II = getIdentifierInfo())
45    return II->getObjCKeywordID() == objcKey;
46  return false;
47}
48
49/// getObjCKeywordID - Return the ObjC keyword kind.
50tok::ObjCKeywordKind Token::getObjCKeywordID() const {
51  IdentifierInfo *specId = getIdentifierInfo();
52  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
53}
54
55
56//===----------------------------------------------------------------------===//
57// Lexer Class Implementation
58//===----------------------------------------------------------------------===//
59
60void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
61                      const char *BufEnd) {
62  InitCharacterInfo();
63
64  BufferStart = BufStart;
65  BufferPtr = BufPtr;
66  BufferEnd = BufEnd;
67
68  assert(BufEnd[0] == 0 &&
69         "We assume that the input buffer has a null character at the end"
70         " to simplify lexing!");
71
72  Is_PragmaLexer = false;
73  IsInConflictMarker = false;
74
75  // Start of the file is a start of line.
76  IsAtStartOfLine = true;
77
78  // We are not after parsing a #.
79  ParsingPreprocessorDirective = false;
80
81  // We are not after parsing #include.
82  ParsingFilename = false;
83
84  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
85  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
86  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
87  // or otherwise skipping over tokens.
88  LexingRawMode = false;
89
90  // Default to not keeping comments.
91  ExtendedTokenMode = 0;
92}
93
94/// Lexer constructor - Create a new lexer object for the specified buffer
95/// with the specified preprocessor managing the lexing process.  This lexer
96/// assumes that the associated file buffer and Preprocessor objects will
97/// outlive it, so it doesn't take ownership of either of them.
98Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
99  : PreprocessorLexer(&PP, FID),
100    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
101    Features(PP.getLangOptions()) {
102
103  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
104            InputFile->getBufferEnd());
105
106  // Default to keeping comments if the preprocessor wants them.
107  SetCommentRetentionState(PP.getCommentRetentionState());
108}
109
110/// Lexer constructor - Create a new raw lexer object.  This object is only
111/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
112/// range will outlive it, so it doesn't take ownership of it.
113Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
114             const char *BufStart, const char *BufPtr, const char *BufEnd)
115  : FileLoc(fileloc), Features(features) {
116
117  InitLexer(BufStart, BufPtr, BufEnd);
118
119  // We *are* in raw mode.
120  LexingRawMode = true;
121}
122
123/// Lexer constructor - Create a new raw lexer object.  This object is only
124/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
125/// range will outlive it, so it doesn't take ownership of it.
126Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
127             const SourceManager &SM, const LangOptions &features)
128  : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
129
130  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
131            FromFile->getBufferEnd());
132
133  // We *are* in raw mode.
134  LexingRawMode = true;
135}
136
137/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
138/// _Pragma expansion.  This has a variety of magic semantics that this method
139/// sets up.  It returns a new'd Lexer that must be delete'd when done.
140///
141/// On entrance to this routine, TokStartLoc is a macro location which has a
142/// spelling loc that indicates the bytes to be lexed for the token and an
143/// instantiation location that indicates where all lexed tokens should be
144/// "expanded from".
145///
146/// FIXME: It would really be nice to make _Pragma just be a wrapper around a
147/// normal lexer that remaps tokens as they fly by.  This would require making
148/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
149/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
150/// out of the critical path of the lexer!
151///
152Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
153                                 SourceLocation InstantiationLocStart,
154                                 SourceLocation InstantiationLocEnd,
155                                 unsigned TokLen, Preprocessor &PP) {
156  SourceManager &SM = PP.getSourceManager();
157
158  // Create the lexer as if we were going to lex the file normally.
159  FileID SpellingFID = SM.getFileID(SpellingLoc);
160  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
161  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
162
163  // Now that the lexer is created, change the start/end locations so that we
164  // just lex the subsection of the file that we want.  This is lexing from a
165  // scratch buffer.
166  const char *StrData = SM.getCharacterData(SpellingLoc);
167
168  L->BufferPtr = StrData;
169  L->BufferEnd = StrData+TokLen;
170  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
171
172  // Set the SourceLocation with the remapping information.  This ensures that
173  // GetMappedTokenLoc will remap the tokens as they are lexed.
174  L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
175                                         InstantiationLocStart,
176                                         InstantiationLocEnd, TokLen);
177
178  // Ensure that the lexer thinks it is inside a directive, so that end \n will
179  // return an EOM token.
180  L->ParsingPreprocessorDirective = true;
181
182  // This lexer really is for _Pragma.
183  L->Is_PragmaLexer = true;
184  return L;
185}
186
187
188/// Stringify - Convert the specified string into a C string, with surrounding
189/// ""'s, and with escaped \ and " characters.
190std::string Lexer::Stringify(const std::string &Str, bool Charify) {
191  std::string Result = Str;
192  char Quote = Charify ? '\'' : '"';
193  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
194    if (Result[i] == '\\' || Result[i] == Quote) {
195      Result.insert(Result.begin()+i, '\\');
196      ++i; ++e;
197    }
198  }
199  return Result;
200}
201
202/// Stringify - Convert the specified string into a C string by escaping '\'
203/// and " characters.  This does not add surrounding ""'s to the string.
204void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
205  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
206    if (Str[i] == '\\' || Str[i] == '"') {
207      Str.insert(Str.begin()+i, '\\');
208      ++i; ++e;
209    }
210  }
211}
212
213static bool isWhitespace(unsigned char c);
214
215/// MeasureTokenLength - Relex the token at the specified location and return
216/// its length in bytes in the input file.  If the token needs cleaning (e.g.
217/// includes a trigraph or an escaped newline) then this count includes bytes
218/// that are part of that.
219unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
220                                   const SourceManager &SM,
221                                   const LangOptions &LangOpts) {
222  // TODO: this could be special cased for common tokens like identifiers, ')',
223  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
224  // all obviously single-char tokens.  This could use
225  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
226  // something.
227
228  // If this comes from a macro expansion, we really do want the macro name, not
229  // the token this macro expanded to.
230  Loc = SM.getInstantiationLoc(Loc);
231  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
232  bool Invalid = false;
233  llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
234  if (Invalid)
235    return 0;
236
237  const char *StrData = Buffer.data()+LocInfo.second;
238
239  if (isWhitespace(StrData[0]))
240    return 0;
241
242  // Create a lexer starting at the beginning of this token.
243  Lexer TheLexer(Loc, LangOpts, Buffer.begin(), StrData, Buffer.end());
244  TheLexer.SetCommentRetentionState(true);
245  Token TheTok;
246  TheLexer.LexFromRawLexer(TheTok);
247  return TheTok.getLength();
248}
249
250//===----------------------------------------------------------------------===//
251// Character information.
252//===----------------------------------------------------------------------===//
253
// Character class bits stored in the CharInfo table.  Each class owns one bit
// so that several classes can be tested with a single mask.
enum {
  CHAR_HORZ_WS = 1 << 0,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS = 1 << 1,  // '\r', '\n'
  CHAR_LETTER  = 1 << 2,  // a-z,A-Z
  CHAR_NUMBER  = 1 << 3,  // 0-9
  CHAR_UNDER   = 1 << 4,  // _
  CHAR_PERIOD  = 1 << 5   // .
};
262
263// Statically initialize CharInfo table based on ASCII character set
264// Reference: FreeBSD 7.2 /usr/share/misc/ascii
265static const unsigned char CharInfo[256] =
266{
267// 0 NUL         1 SOH         2 STX         3 ETX
268// 4 EOT         5 ENQ         6 ACK         7 BEL
269   0           , 0           , 0           , 0           ,
270   0           , 0           , 0           , 0           ,
271// 8 BS          9 HT         10 NL         11 VT
272//12 NP         13 CR         14 SO         15 SI
273   0           , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
274   CHAR_HORZ_WS, CHAR_VERT_WS, 0           , 0           ,
275//16 DLE        17 DC1        18 DC2        19 DC3
276//20 DC4        21 NAK        22 SYN        23 ETB
277   0           , 0           , 0           , 0           ,
278   0           , 0           , 0           , 0           ,
279//24 CAN        25 EM         26 SUB        27 ESC
280//28 FS         29 GS         30 RS         31 US
281   0           , 0           , 0           , 0           ,
282   0           , 0           , 0           , 0           ,
283//32 SP         33  !         34  "         35  #
284//36  $         37  %         38  &         39  '
285   CHAR_HORZ_WS, 0           , 0           , 0           ,
286   0           , 0           , 0           , 0           ,
287//40  (         41  )         42  *         43  +
288//44  ,         45  -         46  .         47  /
289   0           , 0           , 0           , 0           ,
290   0           , 0           , CHAR_PERIOD , 0           ,
291//48  0         49  1         50  2         51  3
292//52  4         53  5         54  6         55  7
293   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
294   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
295//56  8         57  9         58  :         59  ;
296//60  <         61  =         62  >         63  ?
297   CHAR_NUMBER , CHAR_NUMBER , 0           , 0           ,
298   0           , 0           , 0           , 0           ,
299//64  @         65  A         66  B         67  C
300//68  D         69  E         70  F         71  G
301   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
302   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
303//72  H         73  I         74  J         75  K
304//76  L         77  M         78  N         79  O
305   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
306   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
307//80  P         81  Q         82  R         83  S
308//84  T         85  U         86  V         87  W
309   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
310   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
311//88  X         89  Y         90  Z         91  [
312//92  \         93  ]         94  ^         95  _
313   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0           ,
314   0           , 0           , 0           , CHAR_UNDER  ,
315//96  `         97  a         98  b         99  c
316//100  d       101  e        102  f        103  g
317   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
318   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
319//104  h       105  i        106  j        107  k
320//108  l       109  m        110  n        111  o
321   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
322   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
323//112  p       113  q        114  r        115  s
324//116  t       117  u        118  v        119  w
325   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
326   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
327//120  x       121  y        122  z        123  {
328//124  |        125  }        126  ~        127 DEL
329   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0           ,
330   0           , 0           , 0           , 0
331};
332
333static void InitCharacterInfo() {
334  static bool isInited = false;
335  if (isInited) return;
336  // check the statically-initialized CharInfo table
337  assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
338  assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
339  assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
340  assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
341  assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
342  assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
343  assert(CHAR_UNDER   == CharInfo[(int)'_']);
344  assert(CHAR_PERIOD  == CharInfo[(int)'.']);
345  for (unsigned i = 'a'; i <= 'z'; ++i) {
346    assert(CHAR_LETTER == CharInfo[i]);
347    assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
348  }
349  for (unsigned i = '0'; i <= '9'; ++i)
350    assert(CHAR_NUMBER == CharInfo[i]);
351
352  isInited = true;
353}
354
355
356/// isIdentifierBody - Return true if this is the body character of an
357/// identifier, which is [a-zA-Z0-9_].
358static inline bool isIdentifierBody(unsigned char c) {
359  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
360}
361
362/// isHorizontalWhitespace - Return true if this character is horizontal
363/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
364static inline bool isHorizontalWhitespace(unsigned char c) {
365  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
366}
367
368/// isWhitespace - Return true if this character is horizontal or vertical
369/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns false
370/// for '\0'.
371static inline bool isWhitespace(unsigned char c) {
372  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
373}
374
375/// isNumberBody - Return true if this is the body character of an
376/// preprocessing number, which is [a-zA-Z0-9_.].
377static inline bool isNumberBody(unsigned char c) {
378  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
379    true : false;
380}
381
382
383//===----------------------------------------------------------------------===//
384// Diagnostics forwarding code.
385//===----------------------------------------------------------------------===//
386
387/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
388/// lexer buffer was all instantiated at a single point, perform the mapping.
389/// This is currently only used for _Pragma implementation, so it is the slow
390/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
391static DISABLE_INLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP,
392                                                       SourceLocation FileLoc,
393                                                       unsigned CharNo,
394                                                       unsigned TokLen);
395static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
396                                        SourceLocation FileLoc,
397                                        unsigned CharNo, unsigned TokLen) {
398  assert(FileLoc.isMacroID() && "Must be an instantiation");
399
400  // Otherwise, we're lexing "mapped tokens".  This is used for things like
401  // _Pragma handling.  Combine the instantiation location of FileLoc with the
402  // spelling location.
403  SourceManager &SM = PP.getSourceManager();
404
405  // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
406  // characters come from spelling(FileLoc)+Offset.
407  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
408  SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);
409
410  // Figure out the expansion loc range, which is the range covered by the
411  // original _Pragma(...) sequence.
412  std::pair<SourceLocation,SourceLocation> II =
413    SM.getImmediateInstantiationRange(FileLoc);
414
415  return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen);
416}
417
418/// getSourceLocation - Return a source location identifier for the specified
419/// offset in the current file.
420SourceLocation Lexer::getSourceLocation(const char *Loc,
421                                        unsigned TokLen) const {
422  assert(Loc >= BufferStart && Loc <= BufferEnd &&
423         "Location out of range for this buffer!");
424
425  // In the normal case, we're just lexing from a simple file buffer, return
426  // the file id from FileLoc with the offset specified.
427  unsigned CharNo = Loc-BufferStart;
428  if (FileLoc.isFileID())
429    return FileLoc.getFileLocWithOffset(CharNo);
430
431  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
432  // tokens are lexed from where the _Pragma was defined.
433  assert(PP && "This doesn't work on raw lexers");
434  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
435}
436
437/// Diag - Forwarding function for diagnostics.  This translate a source
438/// position in the current buffer into a SourceLocation object for rendering.
439DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
440  return PP->Diag(getSourceLocation(Loc), DiagID);
441}
442
443//===----------------------------------------------------------------------===//
444// Trigraph and Escaped Newline Handling Code.
445//===----------------------------------------------------------------------===//
446
/// GetTrigraphCharForLetter - \p Letter is the character that follows a ??
/// pair.  Return the character the resulting trigraph stands for, or '\0' if
/// ??Letter is not a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  // Each row is { third trigraph character, replacement character }.
  static const char Trigraphs[][2] = {
    { '=',  '#'  },
    { ')',  ']'  },
    { '(',  '['  },
    { '!',  '|'  },
    { '\'', '^'  },
    { '>',  '}'  },
    { '/',  '\\' },
    { '<',  '{'  },
    { '-',  '~'  }
  };
  const unsigned NumTrigraphs = sizeof(Trigraphs) / sizeof(Trigraphs[0]);

  for (unsigned i = 0; i != NumTrigraphs; ++i)
    if (Trigraphs[i][0] == Letter)
      return Trigraphs[i][1];
  return 0;
}
463
464/// DecodeTrigraphChar - If the specified character is a legal trigraph when
465/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
466/// return the result character.  Finally, emit a warning about trigraph use
467/// whether trigraphs are enabled or not.
468static char DecodeTrigraphChar(const char *CP, Lexer *L) {
469  char Res = GetTrigraphCharForLetter(*CP);
470  if (!Res || !L) return Res;
471
472  if (!L->getFeatures().Trigraphs) {
473    if (!L->isLexingRawMode())
474      L->Diag(CP-2, diag::trigraph_ignored);
475    return 0;
476  }
477
478  if (!L->isLexingRawMode())
479    L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res;
480  return Res;
481}
482
483/// getEscapedNewLineSize - Return the size of the specified escaped newline,
484/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
485/// trigraph equivalent on entry to this function.
486unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
487  unsigned Size = 0;
488  while (isWhitespace(Ptr[Size])) {
489    ++Size;
490
491    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
492      continue;
493
494    // If this is a \r\n or \n\r, skip the other half.
495    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
496        Ptr[Size-1] != Ptr[Size])
497      ++Size;
498
499    return Size;
500  }
501
502  // Not an escaped newline, must be a \t or something else.
503  return 0;
504}
505
506/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
507/// them), skip over them and return the first non-escaped-newline found,
508/// otherwise return P.
509const char *Lexer::SkipEscapedNewLines(const char *P) {
510  while (1) {
511    const char *AfterEscape;
512    if (*P == '\\') {
513      AfterEscape = P+1;
514    } else if (*P == '?') {
515      // If not a trigraph for escape, bail out.
516      if (P[1] != '?' || P[2] != '/')
517        return P;
518      AfterEscape = P+3;
519    } else {
520      return P;
521    }
522
523    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
524    if (NewLineSize == 0) return P;
525    P = AfterEscape+NewLineSize;
526  }
527}
528
529
530/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
531/// get its size, and return it.  This is tricky in several cases:
532///   1. If currently at the start of a trigraph, we warn about the trigraph,
533///      then either return the trigraph (skipping 3 chars) or the '?',
534///      depending on whether trigraphs are enabled or not.
535///   2. If this is an escaped newline (potentially with whitespace between
536///      the backslash and newline), implicitly skip the newline and return
537///      the char after it.
538///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
539///
540/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
541/// know that we can accumulate into Size, and that we have already incremented
542/// Ptr by Size bytes.
543///
544/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
545/// be updated to match.
546///
547char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
548                               Token *Tok) {
549  // If we have a slash, look for an escaped newline.
550  if (Ptr[0] == '\\') {
551    ++Size;
552    ++Ptr;
553Slash:
554    // Common case, backslash-char where the char is not whitespace.
555    if (!isWhitespace(Ptr[0])) return '\\';
556
557    // See if we have optional whitespace characters between the slash and
558    // newline.
559    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
560      // Remember that this token needs to be cleaned.
561      if (Tok) Tok->setFlag(Token::NeedsCleaning);
562
563      // Warn if there was whitespace between the backslash and newline.
564      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
565        Diag(Ptr, diag::backslash_newline_space);
566
567      // Found backslash<whitespace><newline>.  Parse the char after it.
568      Size += EscapedNewLineSize;
569      Ptr  += EscapedNewLineSize;
570      // Use slow version to accumulate a correct size field.
571      return getCharAndSizeSlow(Ptr, Size, Tok);
572    }
573
574    // Otherwise, this is not an escaped newline, just return the slash.
575    return '\\';
576  }
577
578  // If this is a trigraph, process it.
579  if (Ptr[0] == '?' && Ptr[1] == '?') {
580    // If this is actually a legal trigraph (not something like "??x"), emit
581    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
582    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
583      // Remember that this token needs to be cleaned.
584      if (Tok) Tok->setFlag(Token::NeedsCleaning);
585
586      Ptr += 3;
587      Size += 3;
588      if (C == '\\') goto Slash;
589      return C;
590    }
591  }
592
593  // If this is neither, return a single character.
594  ++Size;
595  return *Ptr;
596}
597
598
599/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
600/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
601/// and that we have already incremented Ptr by Size bytes.
602///
603/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
604/// be updated to match.
605char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
606                                     const LangOptions &Features) {
607  // If we have a slash, look for an escaped newline.
608  if (Ptr[0] == '\\') {
609    ++Size;
610    ++Ptr;
611Slash:
612    // Common case, backslash-char where the char is not whitespace.
613    if (!isWhitespace(Ptr[0])) return '\\';
614
615    // See if we have optional whitespace characters followed by a newline.
616    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
617      // Found backslash<whitespace><newline>.  Parse the char after it.
618      Size += EscapedNewLineSize;
619      Ptr  += EscapedNewLineSize;
620
621      // Use slow version to accumulate a correct size field.
622      return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
623    }
624
625    // Otherwise, this is not an escaped newline, just return the slash.
626    return '\\';
627  }
628
629  // If this is a trigraph, process it.
630  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
631    // If this is actually a legal trigraph (not something like "??x"), return
632    // it.
633    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
634      Ptr += 3;
635      Size += 3;
636      if (C == '\\') goto Slash;
637      return C;
638    }
639  }
640
641  // If this is neither, return a single character.
642  ++Size;
643  return *Ptr;
644}
645
646//===----------------------------------------------------------------------===//
647// Helper methods for lexing.
648//===----------------------------------------------------------------------===//
649
650void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
651  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
652  unsigned Size;
653  unsigned char C = *CurPtr++;
654  while (isIdentifierBody(C))
655    C = *CurPtr++;
656
657  --CurPtr;   // Back up over the skipped character.
658
659  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
660  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
661  // FIXME: UCNs.
662  //
663  // TODO: Could merge these checks into a CharInfo flag to make the comparison
664  // cheaper
665  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
666FinishIdentifier:
667    const char *IdStart = BufferPtr;
668    FormTokenWithChars(Result, CurPtr, tok::identifier);
669
670    // If we are in raw mode, return this identifier raw.  There is no need to
671    // look up identifier information or attempt to macro expand it.
672    if (LexingRawMode) return;
673
674    // Fill in Result.IdentifierInfo, looking up the identifier in the
675    // identifier table.
676    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart);
677
678    // Change the kind of this identifier to the appropriate token kind, e.g.
679    // turning "for" into a keyword.
680    Result.setKind(II->getTokenID());
681
682    // Finally, now that we know we have an identifier, pass this off to the
683    // preprocessor, which may macro expand it or something.
684    if (II->isHandleIdentifierCase())
685      PP->HandleIdentifier(Result);
686    return;
687  }
688
689  // Otherwise, $,\,? in identifier found.  Enter slower path.
690
691  C = getCharAndSize(CurPtr, Size);
692  while (1) {
693    if (C == '$') {
694      // If we hit a $ and they are not supported in identifiers, we are done.
695      if (!Features.DollarIdents) goto FinishIdentifier;
696
697      // Otherwise, emit a diagnostic and continue.
698      if (!isLexingRawMode())
699        Diag(CurPtr, diag::ext_dollar_in_identifier);
700      CurPtr = ConsumeChar(CurPtr, Size, Result);
701      C = getCharAndSize(CurPtr, Size);
702      continue;
703    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
704      // Found end of identifier.
705      goto FinishIdentifier;
706    }
707
708    // Otherwise, this character is good, consume it.
709    CurPtr = ConsumeChar(CurPtr, Size, Result);
710
711    C = getCharAndSize(CurPtr, Size);
712    while (isIdentifierBody(C)) { // FIXME: UCNs.
713      CurPtr = ConsumeChar(CurPtr, Size, Result);
714      C = getCharAndSize(CurPtr, Size);
715    }
716  }
717}
718
719
720/// LexNumericConstant - Lex the remainder of a integer or floating point
721/// constant. From[-1] is the first character lexed.  Return the end of the
722/// constant.
723void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
724  unsigned Size;
725  char C = getCharAndSize(CurPtr, Size);
726  char PrevCh = 0;
727  while (isNumberBody(C)) { // FIXME: UCNs?
728    CurPtr = ConsumeChar(CurPtr, Size, Result);
729    PrevCh = C;
730    C = getCharAndSize(CurPtr, Size);
731  }
732
733  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
734  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
735    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
736
737  // If we have a hex FP constant, continue.
738  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
739      (!PP || !PP->getLangOptions().CPlusPlus0x))
740    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
741
742  // Update the location of token as well as BufferPtr.
743  const char *TokStart = BufferPtr;
744  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
745  Result.setLiteralData(TokStart);
746}
747
748/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
749/// either " or L".
750void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
751  const char *NulCharacter = 0; // Does this string contain the \0 character?
752
753  char C = getAndAdvanceChar(CurPtr, Result);
754  while (C != '"') {
755    // Skip escaped characters.
756    if (C == '\\') {
757      // Skip the escaped character.
758      C = getAndAdvanceChar(CurPtr, Result);
759    } else if (C == '\n' || C == '\r' ||             // Newline.
760               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
761      if (!isLexingRawMode() && !Features.AsmPreprocessor)
762        Diag(BufferPtr, diag::err_unterminated_string);
763      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
764      return;
765    } else if (C == 0) {
766      NulCharacter = CurPtr-1;
767    }
768    C = getAndAdvanceChar(CurPtr, Result);
769  }
770
771  // If a nul character existed in the string, warn about it.
772  if (NulCharacter && !isLexingRawMode())
773    Diag(NulCharacter, diag::null_in_string);
774
775  // Update the location of the token as well as the BufferPtr instance var.
776  const char *TokStart = BufferPtr;
777  FormTokenWithChars(Result, CurPtr,
778                     Wide ? tok::wide_string_literal : tok::string_literal);
779  Result.setLiteralData(TokStart);
780}
781
782/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
783/// after having lexed the '<' character.  This is used for #include filenames.
784void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
785  const char *NulCharacter = 0; // Does this string contain the \0 character?
786  const char *AfterLessPos = CurPtr;
787  char C = getAndAdvanceChar(CurPtr, Result);
788  while (C != '>') {
789    // Skip escaped characters.
790    if (C == '\\') {
791      // Skip the escaped character.
792      C = getAndAdvanceChar(CurPtr, Result);
793    } else if (C == '\n' || C == '\r' ||             // Newline.
794               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
795      // If the filename is unterminated, then it must just be a lone <
796      // character.  Return this as such.
797      FormTokenWithChars(Result, AfterLessPos, tok::less);
798      return;
799    } else if (C == 0) {
800      NulCharacter = CurPtr-1;
801    }
802    C = getAndAdvanceChar(CurPtr, Result);
803  }
804
805  // If a nul character existed in the string, warn about it.
806  if (NulCharacter && !isLexingRawMode())
807    Diag(NulCharacter, diag::null_in_string);
808
809  // Update the location of token as well as BufferPtr.
810  const char *TokStart = BufferPtr;
811  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
812  Result.setLiteralData(TokStart);
813}
814
815
816/// LexCharConstant - Lex the remainder of a character constant, after having
817/// lexed either ' or L'.
818void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
819  const char *NulCharacter = 0; // Does this character contain the \0 character?
820
821  // Handle the common case of 'x' and '\y' efficiently.
822  char C = getAndAdvanceChar(CurPtr, Result);
823  if (C == '\'') {
824    if (!isLexingRawMode() && !Features.AsmPreprocessor)
825      Diag(BufferPtr, diag::err_empty_character);
826    FormTokenWithChars(Result, CurPtr, tok::unknown);
827    return;
828  } else if (C == '\\') {
829    // Skip the escaped character.
830    // FIXME: UCN's.
831    C = getAndAdvanceChar(CurPtr, Result);
832  }
833
834  if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
835    ++CurPtr;
836  } else {
837    // Fall back on generic code for embedded nulls, newlines, wide chars.
838    do {
839      // Skip escaped characters.
840      if (C == '\\') {
841        // Skip the escaped character.
842        C = getAndAdvanceChar(CurPtr, Result);
843      } else if (C == '\n' || C == '\r' ||               // Newline.
844                 (C == 0 && CurPtr-1 == BufferEnd)) {    // End of file.
845        if (!isLexingRawMode() && !Features.AsmPreprocessor)
846          Diag(BufferPtr, diag::err_unterminated_char);
847        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
848        return;
849      } else if (C == 0) {
850        NulCharacter = CurPtr-1;
851      }
852      C = getAndAdvanceChar(CurPtr, Result);
853    } while (C != '\'');
854  }
855
856  if (NulCharacter && !isLexingRawMode())
857    Diag(NulCharacter, diag::null_in_char);
858
859  // Update the location of token as well as BufferPtr.
860  const char *TokStart = BufferPtr;
861  FormTokenWithChars(Result, CurPtr, tok::char_constant);
862  Result.setLiteralData(TokStart);
863}
864
865/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
866/// Update BufferPtr to point to the next non-whitespace character and return.
867///
868/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
869///
870bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
871  // Whitespace - Skip it, then return the token after the whitespace.
872  unsigned char Char = *CurPtr;  // Skip consequtive spaces efficiently.
873  while (1) {
874    // Skip horizontal whitespace very aggressively.
875    while (isHorizontalWhitespace(Char))
876      Char = *++CurPtr;
877
878    // Otherwise if we have something other than whitespace, we're done.
879    if (Char != '\n' && Char != '\r')
880      break;
881
882    if (ParsingPreprocessorDirective) {
883      // End of preprocessor directive line, let LexTokenInternal handle this.
884      BufferPtr = CurPtr;
885      return false;
886    }
887
888    // ok, but handle newline.
889    // The returned token is at the start of the line.
890    Result.setFlag(Token::StartOfLine);
891    // No leading whitespace seen so far.
892    Result.clearFlag(Token::LeadingSpace);
893    Char = *++CurPtr;
894  }
895
896  // If this isn't immediately after a newline, there is leading space.
897  char PrevChar = CurPtr[-1];
898  if (PrevChar != '\n' && PrevChar != '\r')
899    Result.setFlag(Token::LeadingSpace);
900
901  // If the client wants us to return whitespace, return it now.
902  if (isKeepWhitespaceMode()) {
903    FormTokenWithChars(Result, CurPtr, tok::unknown);
904    return true;
905  }
906
907  BufferPtr = CurPtr;
908  return false;
909}
910
911// SkipBCPLComment - We have just read the // characters from input.  Skip until
912// we find the newline character thats terminate the comment.  Then update
913/// BufferPtr and return.
914///
915/// If we're in KeepCommentMode or any CommentHandler has inserted
916/// some tokens, this will store the first token and return true.
917bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
918  // If BCPL comments aren't explicitly enabled for this language, emit an
919  // extension warning.
920  if (!Features.BCPLComment && !isLexingRawMode()) {
921    Diag(BufferPtr, diag::ext_bcpl_comment);
922
923    // Mark them enabled so we only emit one warning for this translation
924    // unit.
925    Features.BCPLComment = true;
926  }
927
928  // Scan over the body of the comment.  The common case, when scanning, is that
929  // the comment contains normal ascii characters with nothing interesting in
930  // them.  As such, optimize for this case with the inner loop.
931  char C;
932  do {
933    C = *CurPtr;
934    // FIXME: Speedup BCPL comment lexing.  Just scan for a \n or \r character.
935    // If we find a \n character, scan backwards, checking to see if it's an
936    // escaped newline, like we do for block comments.
937
938    // Skip over characters in the fast loop.
939    while (C != 0 &&                // Potentially EOF.
940           C != '\\' &&             // Potentially escaped newline.
941           C != '?' &&              // Potentially trigraph.
942           C != '\n' && C != '\r')  // Newline or DOS-style newline.
943      C = *++CurPtr;
944
945    // If this is a newline, we're done.
946    if (C == '\n' || C == '\r')
947      break;  // Found the newline? Break out!
948
949    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
950    // properly decode the character.  Read it in raw mode to avoid emitting
951    // diagnostics about things like trigraphs.  If we see an escaped newline,
952    // we'll handle it below.
953    const char *OldPtr = CurPtr;
954    bool OldRawMode = isLexingRawMode();
955    LexingRawMode = true;
956    C = getAndAdvanceChar(CurPtr, Result);
957    LexingRawMode = OldRawMode;
958
959    // If the char that we finally got was a \n, then we must have had something
960    // like \<newline><newline>.  We don't want to have consumed the second
961    // newline, we want CurPtr, to end up pointing to it down below.
962    if (C == '\n' || C == '\r') {
963      --CurPtr;
964      C = 'x'; // doesn't matter what this is.
965    }
966
967    // If we read multiple characters, and one of those characters was a \r or
968    // \n, then we had an escaped newline within the comment.  Emit diagnostic
969    // unless the next line is also a // comment.
970    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
971      for (; OldPtr != CurPtr; ++OldPtr)
972        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
973          // Okay, we found a // comment that ends in a newline, if the next
974          // line is also a // comment, but has spaces, don't emit a diagnostic.
975          if (isspace(C)) {
976            const char *ForwardPtr = CurPtr;
977            while (isspace(*ForwardPtr))  // Skip whitespace.
978              ++ForwardPtr;
979            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
980              break;
981          }
982
983          if (!isLexingRawMode())
984            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
985          break;
986        }
987    }
988
989    if (CurPtr == BufferEnd+1) { --CurPtr; break; }
990  } while (C != '\n' && C != '\r');
991
992  // Found but did not consume the newline.  Notify comment handlers about the
993  // comment unless we're in a #if 0 block.
994  if (PP && !isLexingRawMode() &&
995      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
996                                            getSourceLocation(CurPtr)))) {
997    BufferPtr = CurPtr;
998    return true; // A token has to be returned.
999  }
1000
1001  // If we are returning comments as tokens, return this comment as a token.
1002  if (inKeepCommentMode())
1003    return SaveBCPLComment(Result, CurPtr);
1004
1005  // If we are inside a preprocessor directive and we see the end of line,
1006  // return immediately, so that the lexer can return this as an EOM token.
1007  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
1008    BufferPtr = CurPtr;
1009    return false;
1010  }
1011
1012  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
1013  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
1014  // contribute to another token), it isn't needed for correctness.  Note that
1015  // this is ok even in KeepWhitespaceMode, because we would have returned the
1016  /// comment above in that mode.
1017  ++CurPtr;
1018
1019  // The next returned token is at the start of the line.
1020  Result.setFlag(Token::StartOfLine);
1021  // No leading whitespace seen so far.
1022  Result.clearFlag(Token::LeadingSpace);
1023  BufferPtr = CurPtr;
1024  return false;
1025}
1026
1027/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
1028/// an appropriate way and return it.
1029bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
1030  // If we're not in a preprocessor directive, just return the // comment
1031  // directly.
1032  FormTokenWithChars(Result, CurPtr, tok::comment);
1033
1034  if (!ParsingPreprocessorDirective)
1035    return true;
1036
1037  // If this BCPL-style comment is in a macro definition, transmogrify it into
1038  // a C-style block comment.
1039  bool Invalid = false;
1040  std::string Spelling = PP->getSpelling(Result, &Invalid);
1041  if (Invalid)
1042    return true;
1043
1044  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
1045  Spelling[1] = '*';   // Change prefix to "/*".
1046  Spelling += "*/";    // add suffix.
1047
1048  Result.setKind(tok::comment);
1049  PP->CreateString(&Spelling[0], Spelling.size(), Result,
1050                   Result.getLocation());
1051  return true;
1052}
1053
1054/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
1055/// character (either \n or \r) is part of an escaped newline sequence.  Issue a
1056/// diagnostic if so.  We know that the newline is inside of a block comment.
1057static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
1058                                                  Lexer *L) {
1059  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
1060
1061  // Back up off the newline.
1062  --CurPtr;
1063
1064  // If this is a two-character newline sequence, skip the other character.
1065  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
1066    // \n\n or \r\r -> not escaped newline.
1067    if (CurPtr[0] == CurPtr[1])
1068      return false;
1069    // \n\r or \r\n -> skip the newline.
1070    --CurPtr;
1071  }
1072
1073  // If we have horizontal whitespace, skip over it.  We allow whitespace
1074  // between the slash and newline.
1075  bool HasSpace = false;
1076  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
1077    --CurPtr;
1078    HasSpace = true;
1079  }
1080
1081  // If we have a slash, we know this is an escaped newline.
1082  if (*CurPtr == '\\') {
1083    if (CurPtr[-1] != '*') return false;
1084  } else {
1085    // It isn't a slash, is it the ?? / trigraph?
1086    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
1087        CurPtr[-3] != '*')
1088      return false;
1089
1090    // This is the trigraph ending the comment.  Emit a stern warning!
1091    CurPtr -= 2;
1092
1093    // If no trigraphs are enabled, warn that we ignored this trigraph and
1094    // ignore this * character.
1095    if (!L->getFeatures().Trigraphs) {
1096      if (!L->isLexingRawMode())
1097        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
1098      return false;
1099    }
1100    if (!L->isLexingRawMode())
1101      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
1102  }
1103
1104  // Warn about having an escaped newline between the */ characters.
1105  if (!L->isLexingRawMode())
1106    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
1107
1108  // If there was space between the backslash and newline, warn about it.
1109  if (HasSpace && !L->isLexingRawMode())
1110    L->Diag(CurPtr, diag::backslash_newline_space);
1111
1112  return true;
1113}
1114
1115#ifdef __SSE2__
1116#include <emmintrin.h>
1117#elif __ALTIVEC__
1118#include <altivec.h>
1119#undef bool
1120#endif
1121
/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
///
/// \param Result  Token whose flags are updated and which receives the comment
///                (or, for an unterminated comment in KeepWhitespaceMode, a
///                tok::unknown) token when one is returned.
/// \param CurPtr  Position just past the opening "/*".
/// \returns true if Result holds a token that must be returned to the caller,
///          false if lexing should simply continue at BufferPtr.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  // The comment opener was the last thing in the file.
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    // Step back onto the terminating nul.
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // From here on C is the character most recently read, i.e. CurPtr[-1],
  // except transiently inside the bulk scanning code below.
  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary.  The vector code below
      // dereferences CurPtr as a 16-byte vector.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Advance 16 bytes at a time while a whole chunk fits in the buffer and
      // none of its bytes compares equal to '/'.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      // Same idea using AltiVec: advance while no byte of the chunk is a '/'.
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      // CurPtr is one past the '/', so CurPtr[-2] is the character before it.
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      // The nul just read is the one that terminates the buffer, not one
      // embedded in the comment.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    }
    // Either a '/' that did not end the comment or an embedded nul: keep
    // scanning from the next character.
    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}
1276
1277//===----------------------------------------------------------------------===//
1278// Primary Lexing Entry Points
1279//===----------------------------------------------------------------------===//
1280
1281/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
1282/// uninterpreted string.  This switches the lexer out of directive mode.
1283std::string Lexer::ReadToEndOfLine() {
1284  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
1285         "Must be in a preprocessing directive!");
1286  std::string Result;
1287  Token Tmp;
1288
1289  // CurPtr - Cache BufferPtr in an automatic variable.
1290  const char *CurPtr = BufferPtr;
1291  while (1) {
1292    char Char = getAndAdvanceChar(CurPtr, Tmp);
1293    switch (Char) {
1294    default:
1295      Result += Char;
1296      break;
1297    case 0:  // Null.
1298      // Found end of file?
1299      if (CurPtr-1 != BufferEnd) {
1300        // Nope, normal character, continue.
1301        Result += Char;
1302        break;
1303      }
1304      // FALL THROUGH.
1305    case '\r':
1306    case '\n':
1307      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
1308      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
1309      BufferPtr = CurPtr-1;
1310
1311      // Next, lex the character, which should handle the EOM transition.
1312      Lex(Tmp);
1313      assert(Tmp.is(tok::eom) && "Unexpected token!");
1314
1315      // Finally, we're done, return the string we found.
1316      return Result;
1317    }
1318  }
1319}
1320
1321/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
1322/// condition, reporting diagnostics and handling other edge cases as required.
1323/// This returns true if Result contains a token, false if PP.Lex should be
1324/// called again.
1325bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
1326  // If we hit the end of the file while parsing a preprocessor directive,
1327  // end the preprocessor directive first.  The next token returned will
1328  // then be the end of file.
1329  if (ParsingPreprocessorDirective) {
1330    // Done parsing the "line".
1331    ParsingPreprocessorDirective = false;
1332    // Update the location of token as well as BufferPtr.
1333    FormTokenWithChars(Result, CurPtr, tok::eom);
1334
1335    // Restore comment saving mode, in case it was disabled for directive.
1336    SetCommentRetentionState(PP->getCommentRetentionState());
1337    return true;  // Have a token.
1338  }
1339
1340  // If we are in raw mode, return this event as an EOF token.  Let the caller
1341  // that put us in raw mode handle the event.
1342  if (isLexingRawMode()) {
1343    Result.startToken();
1344    BufferPtr = BufferEnd;
1345    FormTokenWithChars(Result, BufferEnd, tok::eof);
1346    return true;
1347  }
1348
1349  // Otherwise, check if we are code-completing, then issue diagnostics for
1350  // unterminated #if and missing newline.
1351
1352  if (PP && PP->isCodeCompletionFile(FileLoc)) {
1353    // We're at the end of the file, but we've been asked to consider the
1354    // end of the file to be a code-completion token. Return the
1355    // code-completion token.
1356    Result.startToken();
1357    FormTokenWithChars(Result, CurPtr, tok::code_completion);
1358
1359    // Only do the eof -> code_completion translation once.
1360    PP->SetCodeCompletionPoint(0, 0, 0);
1361    return true;
1362  }
1363
1364  // If we are in a #if directive, emit an error.
1365  while (!ConditionalStack.empty()) {
1366    PP->Diag(ConditionalStack.back().IfLoc,
1367             diag::err_pp_unterminated_conditional);
1368    ConditionalStack.pop_back();
1369  }
1370
1371  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
1372  // a pedwarn.
1373  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
1374    Diag(BufferEnd, diag::ext_no_newline_eof)
1375      << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");
1376
1377  BufferPtr = CurPtr;
1378
1379  // Finally, let the preprocessor handle this.
1380  return PP->HandleEndOfFile(Result);
1381}
1382
1383/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
1384/// the specified lexer will return a tok::l_paren token, 0 if it is something
1385/// else and 2 if there are no more tokens in the buffer controlled by the
1386/// lexer.
1387unsigned Lexer::isNextPPTokenLParen() {
1388  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
1389
1390  // Switch to 'skipping' mode.  This will ensure that we can lex a token
1391  // without emitting diagnostics, disables macro expansion, and will cause EOF
1392  // to return an EOF token instead of popping the include stack.
1393  LexingRawMode = true;
1394
1395  // Save state that can be changed while lexing so that we can restore it.
1396  const char *TmpBufferPtr = BufferPtr;
1397  bool inPPDirectiveMode = ParsingPreprocessorDirective;
1398
1399  Token Tok;
1400  Tok.startToken();
1401  LexTokenInternal(Tok);
1402
1403  // Restore state that may have changed.
1404  BufferPtr = TmpBufferPtr;
1405  ParsingPreprocessorDirective = inPPDirectiveMode;
1406
1407  // Restore the lexer back to non-skipping mode.
1408  LexingRawMode = false;
1409
1410  if (Tok.is(tok::eof))
1411    return 2;
1412  return Tok.is(tok::l_paren);
1413}
1414
1415/// FindConflictEnd - Find the end of a version control conflict marker.
1416static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd) {
1417  llvm::StringRef RestOfBuffer(CurPtr+7, BufferEnd-CurPtr-7);
1418  size_t Pos = RestOfBuffer.find(">>>>>>>");
1419  while (Pos != llvm::StringRef::npos) {
1420    // Must occur at start of line.
1421    if (RestOfBuffer[Pos-1] != '\r' &&
1422        RestOfBuffer[Pos-1] != '\n') {
1423      RestOfBuffer = RestOfBuffer.substr(Pos+7);
1424      continue;
1425    }
1426    return RestOfBuffer.data()+Pos;
1427  }
1428  return 0;
1429}
1430
1431/// IsStartOfConflictMarker - If the specified pointer is the start of a version
1432/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
1433/// and recover nicely.  This returns true if it is a conflict marker and false
1434/// if not.
1435bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
1436  // Only a conflict marker if it starts at the beginning of a line.
1437  if (CurPtr != BufferStart &&
1438      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
1439    return false;
1440
1441  // Check to see if we have <<<<<<<.
1442  if (BufferEnd-CurPtr < 8 ||
1443      llvm::StringRef(CurPtr, 7) != "<<<<<<<")
1444    return false;
1445
1446  // If we have a situation where we don't care about conflict markers, ignore
1447  // it.
1448  if (IsInConflictMarker || isLexingRawMode())
1449    return false;
1450
1451  // Check to see if there is a >>>>>>> somewhere in the buffer at the start of
1452  // a line to terminate this conflict marker.
1453  if (FindConflictEnd(CurPtr+7, BufferEnd)) {
1454    // We found a match.  We are really in a conflict marker.
1455    // Diagnose this, and ignore to the end of line.
1456    Diag(CurPtr, diag::err_conflict_marker);
1457    IsInConflictMarker = true;
1458
1459    // Skip ahead to the end of line.  We know this exists because the
1460    // end-of-conflict marker starts with \r or \n.
1461    while (*CurPtr != '\r' && *CurPtr != '\n') {
1462      assert(CurPtr != BufferEnd && "Didn't find end of line");
1463      ++CurPtr;
1464    }
1465    BufferPtr = CurPtr;
1466    return true;
1467  }
1468
1469  // No end of conflict marker found.
1470  return false;
1471}
1472
1473
1474/// HandleEndOfConflictMarker - If this is a '=======' or '|||||||' or '>>>>>>>'
1475/// marker, then it is the end of a conflict marker.  Handle it by ignoring up
1476/// until the end of the line.  This returns true if it is a conflict marker and
1477/// false if not.
1478bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
1479  // Only a conflict marker if it starts at the beginning of a line.
1480  if (CurPtr != BufferStart &&
1481      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
1482    return false;
1483
1484  // If we have a situation where we don't care about conflict markers, ignore
1485  // it.
1486  if (!IsInConflictMarker || isLexingRawMode())
1487    return false;
1488
1489  // Check to see if we have the marker (7 characters in a row).
1490  for (unsigned i = 1; i != 7; ++i)
1491    if (CurPtr[i] != CurPtr[0])
1492      return false;
1493
1494  // If we do have it, search for the end of the conflict marker.  This could
1495  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
1496  // be the end of conflict marker.
1497  if (const char *End = FindConflictEnd(CurPtr, BufferEnd)) {
1498    CurPtr = End;
1499
1500    // Skip ahead to the end of line.
1501    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
1502      ++CurPtr;
1503
1504    BufferPtr = CurPtr;
1505
1506    // No longer in the conflict marker.
1507    IsInConflictMarker = false;
1508    return true;
1509  }
1510
1511  return false;
1512}
1513
1514
1515/// LexTokenInternal - This implements a simple C family lexer.  It is an
1516/// extremely performance critical piece of code.  This assumes that the buffer
1517/// has a null character at the end of the file.  This returns a preprocessing
1518/// token, not a normal token, as such, it is an internal interface.  It assumes
1519/// that the Flags of result have been cleared before calling this.
1520void Lexer::LexTokenInternal(Token &Result) {
1521LexNextToken:
1522  // New token, can't need cleaning yet.
1523  Result.clearFlag(Token::NeedsCleaning);
1524  Result.setIdentifierInfo(0);
1525
1526  // CurPtr - Cache BufferPtr in an automatic variable.
1527  const char *CurPtr = BufferPtr;
1528
1529  // Small amounts of horizontal whitespace is very common between tokens.
1530  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
1531    ++CurPtr;
1532    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
1533      ++CurPtr;
1534
1535    // If we are keeping whitespace and other tokens, just return what we just
1536    // skipped.  The next lexer invocation will return the token after the
1537    // whitespace.
1538    if (isKeepWhitespaceMode()) {
1539      FormTokenWithChars(Result, CurPtr, tok::unknown);
1540      return;
1541    }
1542
1543    BufferPtr = CurPtr;
1544    Result.setFlag(Token::LeadingSpace);
1545  }
1546
1547  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
1548
1549  // Read a character, advancing over it.
1550  char Char = getAndAdvanceChar(CurPtr, Result);
1551  tok::TokenKind Kind;
1552
1553  switch (Char) {
1554  case 0:  // Null.
1555    // Found end of file?
1556    if (CurPtr-1 == BufferEnd) {
1557      // Read the PP instance variable into an automatic variable, because
1558      // LexEndOfFile will often delete 'this'.
1559      Preprocessor *PPCache = PP;
1560      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
1561        return;   // Got a token to return.
1562      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
1563      return PPCache->Lex(Result);
1564    }
1565
1566    if (!isLexingRawMode())
1567      Diag(CurPtr-1, diag::null_in_file);
1568    Result.setFlag(Token::LeadingSpace);
1569    if (SkipWhitespace(Result, CurPtr))
1570      return; // KeepWhitespaceMode
1571
1572    goto LexNextToken;   // GCC isn't tail call eliminating.
1573
1574  case 26:  // DOS & CP/M EOF: "^Z".
1575    // If we're in Microsoft extensions mode, treat this as end of file.
1576    if (Features.Microsoft) {
1577      // Read the PP instance variable into an automatic variable, because
1578      // LexEndOfFile will often delete 'this'.
1579      Preprocessor *PPCache = PP;
1580      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
1581        return;   // Got a token to return.
1582      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
1583      return PPCache->Lex(Result);
1584    }
1585    // If Microsoft extensions are disabled, this is just random garbage.
1586    Kind = tok::unknown;
1587    break;
1588
1589  case '\n':
1590  case '\r':
1591    // If we are inside a preprocessor directive and we see the end of line,
1592    // we know we are done with the directive, so return an EOM token.
1593    if (ParsingPreprocessorDirective) {
1594      // Done parsing the "line".
1595      ParsingPreprocessorDirective = false;
1596
1597      // Restore comment saving mode, in case it was disabled for directive.
1598      SetCommentRetentionState(PP->getCommentRetentionState());
1599
1600      // Since we consumed a newline, we are back at the start of a line.
1601      IsAtStartOfLine = true;
1602
1603      Kind = tok::eom;
1604      break;
1605    }
1606    // The returned token is at the start of the line.
1607    Result.setFlag(Token::StartOfLine);
1608    // No leading whitespace seen so far.
1609    Result.clearFlag(Token::LeadingSpace);
1610
1611    if (SkipWhitespace(Result, CurPtr))
1612      return; // KeepWhitespaceMode
1613    goto LexNextToken;   // GCC isn't tail call eliminating.
1614  case ' ':
1615  case '\t':
1616  case '\f':
1617  case '\v':
1618  SkipHorizontalWhitespace:
1619    Result.setFlag(Token::LeadingSpace);
1620    if (SkipWhitespace(Result, CurPtr))
1621      return; // KeepWhitespaceMode
1622
1623  SkipIgnoredUnits:
1624    CurPtr = BufferPtr;
1625
1626    // If the next token is obviously a // or /* */ comment, skip it efficiently
1627    // too (without going through the big switch stmt).
1628    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
1629        Features.BCPLComment) {
1630      if (SkipBCPLComment(Result, CurPtr+2))
1631        return; // There is a token to return.
1632      goto SkipIgnoredUnits;
1633    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
1634      if (SkipBlockComment(Result, CurPtr+2))
1635        return; // There is a token to return.
1636      goto SkipIgnoredUnits;
1637    } else if (isHorizontalWhitespace(*CurPtr)) {
1638      goto SkipHorizontalWhitespace;
1639    }
1640    goto LexNextToken;   // GCC isn't tail call eliminating.
1641
1642  // C99 6.4.4.1: Integer Constants.
1643  // C99 6.4.4.2: Floating Constants.
1644  case '0': case '1': case '2': case '3': case '4':
1645  case '5': case '6': case '7': case '8': case '9':
1646    // Notify MIOpt that we read a non-whitespace/non-comment token.
1647    MIOpt.ReadToken();
1648    return LexNumericConstant(Result, CurPtr);
1649
1650  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
1651    // Notify MIOpt that we read a non-whitespace/non-comment token.
1652    MIOpt.ReadToken();
1653    Char = getCharAndSize(CurPtr, SizeTmp);
1654
1655    // Wide string literal.
1656    if (Char == '"')
1657      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
1658                              true);
1659
1660    // Wide character constant.
1661    if (Char == '\'')
1662      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1663    // FALL THROUGH, treating L like the start of an identifier.
1664
1665  // C99 6.4.2: Identifiers.
1666  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1667  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
1668  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1669  case 'V': case 'W': case 'X': case 'Y': case 'Z':
1670  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1671  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1672  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1673  case 'v': case 'w': case 'x': case 'y': case 'z':
1674  case '_':
1675    // Notify MIOpt that we read a non-whitespace/non-comment token.
1676    MIOpt.ReadToken();
1677    return LexIdentifier(Result, CurPtr);
1678
1679  case '$':   // $ in identifiers.
1680    if (Features.DollarIdents) {
1681      if (!isLexingRawMode())
1682        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
1683      // Notify MIOpt that we read a non-whitespace/non-comment token.
1684      MIOpt.ReadToken();
1685      return LexIdentifier(Result, CurPtr);
1686    }
1687
1688    Kind = tok::unknown;
1689    break;
1690
1691  // C99 6.4.4: Character Constants.
1692  case '\'':
1693    // Notify MIOpt that we read a non-whitespace/non-comment token.
1694    MIOpt.ReadToken();
1695    return LexCharConstant(Result, CurPtr);
1696
1697  // C99 6.4.5: String Literals.
1698  case '"':
1699    // Notify MIOpt that we read a non-whitespace/non-comment token.
1700    MIOpt.ReadToken();
1701    return LexStringLiteral(Result, CurPtr, false);
1702
1703  // C99 6.4.6: Punctuators.
1704  case '?':
1705    Kind = tok::question;
1706    break;
1707  case '[':
1708    Kind = tok::l_square;
1709    break;
1710  case ']':
1711    Kind = tok::r_square;
1712    break;
1713  case '(':
1714    Kind = tok::l_paren;
1715    break;
1716  case ')':
1717    Kind = tok::r_paren;
1718    break;
1719  case '{':
1720    Kind = tok::l_brace;
1721    break;
1722  case '}':
1723    Kind = tok::r_brace;
1724    break;
1725  case '.':
1726    Char = getCharAndSize(CurPtr, SizeTmp);
1727    if (Char >= '0' && Char <= '9') {
1728      // Notify MIOpt that we read a non-whitespace/non-comment token.
1729      MIOpt.ReadToken();
1730
1731      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1732    } else if (Features.CPlusPlus && Char == '*') {
1733      Kind = tok::periodstar;
1734      CurPtr += SizeTmp;
1735    } else if (Char == '.' &&
1736               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
1737      Kind = tok::ellipsis;
1738      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1739                           SizeTmp2, Result);
1740    } else {
1741      Kind = tok::period;
1742    }
1743    break;
1744  case '&':
1745    Char = getCharAndSize(CurPtr, SizeTmp);
1746    if (Char == '&') {
1747      Kind = tok::ampamp;
1748      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1749    } else if (Char == '=') {
1750      Kind = tok::ampequal;
1751      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1752    } else {
1753      Kind = tok::amp;
1754    }
1755    break;
1756  case '*':
1757    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1758      Kind = tok::starequal;
1759      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1760    } else {
1761      Kind = tok::star;
1762    }
1763    break;
1764  case '+':
1765    Char = getCharAndSize(CurPtr, SizeTmp);
1766    if (Char == '+') {
1767      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1768      Kind = tok::plusplus;
1769    } else if (Char == '=') {
1770      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1771      Kind = tok::plusequal;
1772    } else {
1773      Kind = tok::plus;
1774    }
1775    break;
1776  case '-':
1777    Char = getCharAndSize(CurPtr, SizeTmp);
1778    if (Char == '-') {      // --
1779      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1780      Kind = tok::minusminus;
1781    } else if (Char == '>' && Features.CPlusPlus &&
1782               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
1783      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1784                           SizeTmp2, Result);
1785      Kind = tok::arrowstar;
1786    } else if (Char == '>') {   // ->
1787      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1788      Kind = tok::arrow;
1789    } else if (Char == '=') {   // -=
1790      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1791      Kind = tok::minusequal;
1792    } else {
1793      Kind = tok::minus;
1794    }
1795    break;
1796  case '~':
1797    Kind = tok::tilde;
1798    break;
1799  case '!':
1800    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1801      Kind = tok::exclaimequal;
1802      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1803    } else {
1804      Kind = tok::exclaim;
1805    }
1806    break;
1807  case '/':
1808    // 6.4.9: Comments
1809    Char = getCharAndSize(CurPtr, SizeTmp);
1810    if (Char == '/') {         // BCPL comment.
1811      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
1812      // want to lex this as a comment.  There is one problem with this though,
1813      // that in one particular corner case, this can change the behavior of the
1814      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
1815      // this as "foo / bar" and langauges with BCPL comments would lex it as
1816      // "foo".  Check to see if the character after the second slash is a '*'.
1817      // If so, we will lex that as a "/" instead of the start of a comment.
1818      if (Features.BCPLComment ||
1819          getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
1820        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
1821          return; // There is a token to return.
1822
1823        // It is common for the tokens immediately after a // comment to be
1824        // whitespace (indentation for the next line).  Instead of going through
1825        // the big switch, handle it efficiently now.
1826        goto SkipIgnoredUnits;
1827      }
1828    }
1829
1830    if (Char == '*') {  // /**/ comment.
1831      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
1832        return; // There is a token to return.
1833      goto LexNextToken;   // GCC isn't tail call eliminating.
1834    }
1835
1836    if (Char == '=') {
1837      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1838      Kind = tok::slashequal;
1839    } else {
1840      Kind = tok::slash;
1841    }
1842    break;
1843  case '%':
1844    Char = getCharAndSize(CurPtr, SizeTmp);
1845    if (Char == '=') {
1846      Kind = tok::percentequal;
1847      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1848    } else if (Features.Digraphs && Char == '>') {
1849      Kind = tok::r_brace;                             // '%>' -> '}'
1850      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1851    } else if (Features.Digraphs && Char == ':') {
1852      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1853      Char = getCharAndSize(CurPtr, SizeTmp);
1854      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
1855        Kind = tok::hashhash;                          // '%:%:' -> '##'
1856        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1857                             SizeTmp2, Result);
1858      } else if (Char == '@' && Features.Microsoft) {  // %:@ -> #@ -> Charize
1859        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1860        if (!isLexingRawMode())
1861          Diag(BufferPtr, diag::charize_microsoft_ext);
1862        Kind = tok::hashat;
1863      } else {                                         // '%:' -> '#'
1864        // We parsed a # character.  If this occurs at the start of the line,
1865        // it's actually the start of a preprocessing directive.  Callback to
1866        // the preprocessor to handle it.
1867        // FIXME: -fpreprocessed mode??
1868        if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
1869          FormTokenWithChars(Result, CurPtr, tok::hash);
1870          PP->HandleDirective(Result);
1871
1872          // As an optimization, if the preprocessor didn't switch lexers, tail
1873          // recurse.
1874          if (PP->isCurrentLexer(this)) {
1875            // Start a new token. If this is a #include or something, the PP may
1876            // want us starting at the beginning of the line again.  If so, set
1877            // the StartOfLine flag and clear LeadingSpace.
1878            if (IsAtStartOfLine) {
1879              Result.setFlag(Token::StartOfLine);
1880              Result.clearFlag(Token::LeadingSpace);
1881              IsAtStartOfLine = false;
1882            }
1883            goto LexNextToken;   // GCC isn't tail call eliminating.
1884          }
1885
1886          return PP->Lex(Result);
1887        }
1888
1889        Kind = tok::hash;
1890      }
1891    } else {
1892      Kind = tok::percent;
1893    }
1894    break;
1895  case '<':
1896    Char = getCharAndSize(CurPtr, SizeTmp);
1897    if (ParsingFilename) {
1898      return LexAngledStringLiteral(Result, CurPtr);
1899    } else if (Char == '<') {
1900      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
1901      if (After == '=') {
1902        Kind = tok::lesslessequal;
1903        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1904                             SizeTmp2, Result);
1905      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
1906        // If this is actually a '<<<<<<<' version control conflict marker,
1907        // recognize it as such and recover nicely.
1908        goto LexNextToken;
1909      } else {
1910        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1911        Kind = tok::lessless;
1912      }
1913    } else if (Char == '=') {
1914      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1915      Kind = tok::lessequal;
1916    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
1917      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1918      Kind = tok::l_square;
1919    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
1920      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1921      Kind = tok::l_brace;
1922    } else {
1923      Kind = tok::less;
1924    }
1925    break;
1926  case '>':
1927    Char = getCharAndSize(CurPtr, SizeTmp);
1928    if (Char == '=') {
1929      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1930      Kind = tok::greaterequal;
1931    } else if (Char == '>') {
1932      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
1933      if (After == '=') {
1934        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1935                             SizeTmp2, Result);
1936        Kind = tok::greatergreaterequal;
1937      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
1938        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
1939        goto LexNextToken;
1940      } else {
1941        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1942        Kind = tok::greatergreater;
1943      }
1944
1945    } else {
1946      Kind = tok::greater;
1947    }
1948    break;
1949  case '^':
1950    Char = getCharAndSize(CurPtr, SizeTmp);
1951    if (Char == '=') {
1952      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1953      Kind = tok::caretequal;
1954    } else {
1955      Kind = tok::caret;
1956    }
1957    break;
1958  case '|':
1959    Char = getCharAndSize(CurPtr, SizeTmp);
1960    if (Char == '=') {
1961      Kind = tok::pipeequal;
1962      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1963    } else if (Char == '|') {
1964      // If this is '|||||||' and we're in a conflict marker, ignore it.
1965      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
1966        goto LexNextToken;
1967      Kind = tok::pipepipe;
1968      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1969    } else {
1970      Kind = tok::pipe;
1971    }
1972    break;
1973  case ':':
1974    Char = getCharAndSize(CurPtr, SizeTmp);
1975    if (Features.Digraphs && Char == '>') {
1976      Kind = tok::r_square; // ':>' -> ']'
1977      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1978    } else if (Features.CPlusPlus && Char == ':') {
1979      Kind = tok::coloncolon;
1980      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1981    } else {
1982      Kind = tok::colon;
1983    }
1984    break;
1985  case ';':
1986    Kind = tok::semi;
1987    break;
1988  case '=':
1989    Char = getCharAndSize(CurPtr, SizeTmp);
1990    if (Char == '=') {
1991      // If this is '=======' and we're in a conflict marker, ignore it.
1992      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
1993        goto LexNextToken;
1994
1995      Kind = tok::equalequal;
1996      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1997    } else {
1998      Kind = tok::equal;
1999    }
2000    break;
2001  case ',':
2002    Kind = tok::comma;
2003    break;
2004  case '#':
2005    Char = getCharAndSize(CurPtr, SizeTmp);
2006    if (Char == '#') {
2007      Kind = tok::hashhash;
2008      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2009    } else if (Char == '@' && Features.Microsoft) {  // #@ -> Charize
2010      Kind = tok::hashat;
2011      if (!isLexingRawMode())
2012        Diag(BufferPtr, diag::charize_microsoft_ext);
2013      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2014    } else {
2015      // We parsed a # character.  If this occurs at the start of the line,
2016      // it's actually the start of a preprocessing directive.  Callback to
2017      // the preprocessor to handle it.
2018      // FIXME: -fpreprocessed mode??
2019      if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
2020        FormTokenWithChars(Result, CurPtr, tok::hash);
2021        PP->HandleDirective(Result);
2022
2023        // As an optimization, if the preprocessor didn't switch lexers, tail
2024        // recurse.
2025        if (PP->isCurrentLexer(this)) {
2026          // Start a new token.  If this is a #include or something, the PP may
2027          // want us starting at the beginning of the line again.  If so, set
2028          // the StartOfLine flag and clear LeadingSpace.
2029          if (IsAtStartOfLine) {
2030            Result.setFlag(Token::StartOfLine);
2031            Result.clearFlag(Token::LeadingSpace);
2032            IsAtStartOfLine = false;
2033          }
2034          goto LexNextToken;   // GCC isn't tail call eliminating.
2035        }
2036        return PP->Lex(Result);
2037      }
2038
2039      Kind = tok::hash;
2040    }
2041    break;
2042
2043  case '@':
2044    // Objective C support.
2045    if (CurPtr[-1] == '@' && Features.ObjC1)
2046      Kind = tok::at;
2047    else
2048      Kind = tok::unknown;
2049    break;
2050
2051  case '\\':
2052    // FIXME: UCN's.
2053    // FALL THROUGH.
2054  default:
2055    Kind = tok::unknown;
2056    break;
2057  }
2058
2059  // Notify MIOpt that we read a non-whitespace/non-comment token.
2060  MIOpt.ReadToken();
2061
2062  // Update the location of token as well as BufferPtr.
2063  FormTokenWithChars(Result, CurPtr, Kind);
2064}
2065