Lexer.cpp revision 2d804c4325fcf3893386e16970b82fd0f9af1d7c
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This file implements the Lexer and Token interfaces.
11//
12//===----------------------------------------------------------------------===//
13//
14// TODO: GCC Diagnostics emitted by the lexer:
15// PEDWARN: (form feed|vertical tab) in preprocessing directive
16//
17// Universal characters, unicode, char mapping:
18// WARNING: `%.*s' is not in NFKC
19// WARNING: `%.*s' is not in NFC
20//
21// Other:
22// TODO: Options to support:
23//    -fexec-charset,-fwide-exec-charset
24//
25//===----------------------------------------------------------------------===//
26
27#include "clang/Lex/Lexer.h"
28#include "clang/Lex/Preprocessor.h"
29#include "clang/Lex/LexDiagnostic.h"
30#include "clang/Lex/CodeCompletionHandler.h"
31#include "clang/Basic/SourceManager.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/Support/Compiler.h"
34#include "llvm/Support/MemoryBuffer.h"
35#include <cctype>
36using namespace clang;
37
38static void InitCharacterInfo();
39
40//===----------------------------------------------------------------------===//
41// Token Class Implementation
42//===----------------------------------------------------------------------===//
43
44/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
45bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
46  if (IdentifierInfo *II = getIdentifierInfo())
47    return II->getObjCKeywordID() == objcKey;
48  return false;
49}
50
51/// getObjCKeywordID - Return the ObjC keyword kind.
52tok::ObjCKeywordKind Token::getObjCKeywordID() const {
53  IdentifierInfo *specId = getIdentifierInfo();
54  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
55}
56
57
58//===----------------------------------------------------------------------===//
59// Lexer Class Implementation
60//===----------------------------------------------------------------------===//
61
62void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
63                      const char *BufEnd) {
64  InitCharacterInfo();
65
66  BufferStart = BufStart;
67  BufferPtr = BufPtr;
68  BufferEnd = BufEnd;
69
70  assert(BufEnd[0] == 0 &&
71         "We assume that the input buffer has a null character at the end"
72         " to simplify lexing!");
73
74  Is_PragmaLexer = false;
75  IsInConflictMarker = false;
76
77  // Start of the file is a start of line.
78  IsAtStartOfLine = true;
79
80  // We are not after parsing a #.
81  ParsingPreprocessorDirective = false;
82
83  // We are not after parsing #include.
84  ParsingFilename = false;
85
86  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
87  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
88  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
89  // or otherwise skipping over tokens.
90  LexingRawMode = false;
91
92  // Default to not keeping comments.
93  ExtendedTokenMode = 0;
94}
95
96/// Lexer constructor - Create a new lexer object for the specified buffer
97/// with the specified preprocessor managing the lexing process.  This lexer
98/// assumes that the associated file buffer and Preprocessor objects will
99/// outlive it, so it doesn't take ownership of either of them.
100Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
101  : PreprocessorLexer(&PP, FID),
102    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
103    Features(PP.getLangOptions()) {
104
105  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
106            InputFile->getBufferEnd());
107
108  // Default to keeping comments if the preprocessor wants them.
109  SetCommentRetentionState(PP.getCommentRetentionState());
110}
111
112/// Lexer constructor - Create a new raw lexer object.  This object is only
113/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
114/// range will outlive it, so it doesn't take ownership of it.
115Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
116             const char *BufStart, const char *BufPtr, const char *BufEnd)
117  : FileLoc(fileloc), Features(features) {
118
119  InitLexer(BufStart, BufPtr, BufEnd);
120
121  // We *are* in raw mode.
122  LexingRawMode = true;
123}
124
125/// Lexer constructor - Create a new raw lexer object.  This object is only
126/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
127/// range will outlive it, so it doesn't take ownership of it.
128Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
129             const SourceManager &SM, const LangOptions &features)
130  : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
131
132  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
133            FromFile->getBufferEnd());
134
135  // We *are* in raw mode.
136  LexingRawMode = true;
137}
138
139/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
140/// _Pragma expansion.  This has a variety of magic semantics that this method
141/// sets up.  It returns a new'd Lexer that must be delete'd when done.
142///
143/// On entrance to this routine, TokStartLoc is a macro location which has a
144/// spelling loc that indicates the bytes to be lexed for the token and an
145/// instantiation location that indicates where all lexed tokens should be
146/// "expanded from".
147///
148/// FIXME: It would really be nice to make _Pragma just be a wrapper around a
149/// normal lexer that remaps tokens as they fly by.  This would require making
150/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
151/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
152/// out of the critical path of the lexer!
153///
154Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
155                                 SourceLocation InstantiationLocStart,
156                                 SourceLocation InstantiationLocEnd,
157                                 unsigned TokLen, Preprocessor &PP) {
158  SourceManager &SM = PP.getSourceManager();
159
160  // Create the lexer as if we were going to lex the file normally.
161  FileID SpellingFID = SM.getFileID(SpellingLoc);
162  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
163  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
164
165  // Now that the lexer is created, change the start/end locations so that we
166  // just lex the subsection of the file that we want.  This is lexing from a
167  // scratch buffer.
168  const char *StrData = SM.getCharacterData(SpellingLoc);
169
170  L->BufferPtr = StrData;
171  L->BufferEnd = StrData+TokLen;
172  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
173
174  // Set the SourceLocation with the remapping information.  This ensures that
175  // GetMappedTokenLoc will remap the tokens as they are lexed.
176  L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
177                                         InstantiationLocStart,
178                                         InstantiationLocEnd, TokLen);
179
180  // Ensure that the lexer thinks it is inside a directive, so that end \n will
181  // return an EOM token.
182  L->ParsingPreprocessorDirective = true;
183
184  // This lexer really is for _Pragma.
185  L->Is_PragmaLexer = true;
186  return L;
187}
188
189
190/// Stringify - Convert the specified string into a C string, with surrounding
191/// ""'s, and with escaped \ and " characters.
192std::string Lexer::Stringify(const std::string &Str, bool Charify) {
193  std::string Result = Str;
194  char Quote = Charify ? '\'' : '"';
195  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
196    if (Result[i] == '\\' || Result[i] == Quote) {
197      Result.insert(Result.begin()+i, '\\');
198      ++i; ++e;
199    }
200  }
201  return Result;
202}
203
204/// Stringify - Convert the specified string into a C string by escaping '\'
205/// and " characters.  This does not add surrounding ""'s to the string.
206void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
207  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
208    if (Str[i] == '\\' || Str[i] == '"') {
209      Str.insert(Str.begin()+i, '\\');
210      ++i; ++e;
211    }
212  }
213}
214
215static bool isWhitespace(unsigned char c);
216
217/// MeasureTokenLength - Relex the token at the specified location and return
218/// its length in bytes in the input file.  If the token needs cleaning (e.g.
219/// includes a trigraph or an escaped newline) then this count includes bytes
220/// that are part of that.
221unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
222                                   const SourceManager &SM,
223                                   const LangOptions &LangOpts) {
224  // TODO: this could be special cased for common tokens like identifiers, ')',
225  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
226  // all obviously single-char tokens.  This could use
227  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
228  // something.
229
230  // If this comes from a macro expansion, we really do want the macro name, not
231  // the token this macro expanded to.
232  Loc = SM.getInstantiationLoc(Loc);
233  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
234  bool Invalid = false;
235  llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
236  if (Invalid)
237    return 0;
238
239  const char *StrData = Buffer.data()+LocInfo.second;
240
241  if (isWhitespace(StrData[0]))
242    return 0;
243
244  // Create a lexer starting at the beginning of this token.
245  Lexer TheLexer(Loc, LangOpts, Buffer.begin(), StrData, Buffer.end());
246  TheLexer.SetCommentRetentionState(true);
247  Token TheTok;
248  TheLexer.LexFromRawLexer(TheTok);
249  return TheTok.getLength();
250}
251
252SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
253                                          const SourceManager &SM,
254                                          const LangOptions &LangOpts) {
255  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
256  bool Invalid = false;
257  llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
258  if (Invalid)
259    return Loc;
260
261  // Back up from the current location until we hit the beginning of a line
262  // (or the buffer). We'll relex from that point.
263  const char *BufStart = Buffer.data();
264  const char *StrData = BufStart+LocInfo.second;
265  if (StrData[0] == '\n' || StrData[0] == '\r')
266    return Loc;
267
268  const char *LexStart = StrData;
269  while (LexStart != BufStart) {
270    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
271      ++LexStart;
272      break;
273    }
274
275    --LexStart;
276  }
277
278  // Create a lexer starting at the beginning of this token.
279  SourceLocation LexerStartLoc = Loc.getFileLocWithOffset(-LocInfo.second);
280  Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
281  TheLexer.SetCommentRetentionState(true);
282
283  // Lex tokens until we find the token that contains the source location.
284  Token TheTok;
285  do {
286    TheLexer.LexFromRawLexer(TheTok);
287
288    if (TheLexer.getBufferLocation() > StrData) {
289      // Lexing this token has taken the lexer past the source location we're
290      // looking for. If the current token encompasses our source location,
291      // return the beginning of that token.
292      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
293        return TheTok.getLocation();
294
295      // We ended up skipping over the source location entirely, which means
296      // that it points into whitespace. We're done here.
297      break;
298    }
299  } while (TheTok.getKind() != tok::eof);
300
301  // We've passed our source location; just return the original source location.
302  return Loc;
303}
304
namespace {
  /// PreambleDirectiveKind - How Lexer::ComputePreamble classifies the
  /// directive name that follows a '#'.
  enum PreambleDirectiveKind {
    /// A directive that is simply skipped over (e.g. include, define).
    PDK_Skipped,
    /// A directive that opens a conditional: if, ifdef or ifndef.
    PDK_StartIf,
    /// The endif directive, which closes a conditional.
    PDK_EndIf,
    /// Any directive name that is not recognized.
    PDK_Unknown
  };
}
313
314std::pair<unsigned, bool>
315Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) {
316  // Create a lexer starting at the beginning of the file. Note that we use a
317  // "fake" file source location at offset 1 so that the lexer will track our
318  // position within the file.
319  const unsigned StartOffset = 1;
320  SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset);
321  LangOptions LangOpts;
322  Lexer TheLexer(StartLoc, LangOpts, Buffer->getBufferStart(),
323                 Buffer->getBufferStart(), Buffer->getBufferEnd());
324
325  bool InPreprocessorDirective = false;
326  Token TheTok;
327  Token IfStartTok;
328  unsigned IfCount = 0;
329  unsigned Line = 0;
330
331  do {
332    TheLexer.LexFromRawLexer(TheTok);
333
334    if (InPreprocessorDirective) {
335      // If we've hit the end of the file, we're done.
336      if (TheTok.getKind() == tok::eof) {
337        InPreprocessorDirective = false;
338        break;
339      }
340
341      // If we haven't hit the end of the preprocessor directive, skip this
342      // token.
343      if (!TheTok.isAtStartOfLine())
344        continue;
345
346      // We've passed the end of the preprocessor directive, and will look
347      // at this token again below.
348      InPreprocessorDirective = false;
349    }
350
351    // Keep track of the # of lines in the preamble.
352    if (TheTok.isAtStartOfLine()) {
353      ++Line;
354
355      // If we were asked to limit the number of lines in the preamble,
356      // and we're about to exceed that limit, we're done.
357      if (MaxLines && Line >= MaxLines)
358        break;
359    }
360
361    // Comments are okay; skip over them.
362    if (TheTok.getKind() == tok::comment)
363      continue;
364
365    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
366      // This is the start of a preprocessor directive.
367      Token HashTok = TheTok;
368      InPreprocessorDirective = true;
369
370      // Figure out which direective this is. Since we're lexing raw tokens,
371      // we don't have an identifier table available. Instead, just look at
372      // the raw identifier to recognize and categorize preprocessor directives.
373      TheLexer.LexFromRawLexer(TheTok);
374      if (TheTok.getKind() == tok::identifier && !TheTok.needsCleaning()) {
375        const char *IdStart = Buffer->getBufferStart()
376                            + TheTok.getLocation().getRawEncoding() - 1;
377        llvm::StringRef Keyword(IdStart, TheTok.getLength());
378        PreambleDirectiveKind PDK
379          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
380              .Case("include", PDK_Skipped)
381              .Case("__include_macros", PDK_Skipped)
382              .Case("define", PDK_Skipped)
383              .Case("undef", PDK_Skipped)
384              .Case("line", PDK_Skipped)
385              .Case("error", PDK_Skipped)
386              .Case("pragma", PDK_Skipped)
387              .Case("import", PDK_Skipped)
388              .Case("include_next", PDK_Skipped)
389              .Case("warning", PDK_Skipped)
390              .Case("ident", PDK_Skipped)
391              .Case("sccs", PDK_Skipped)
392              .Case("assert", PDK_Skipped)
393              .Case("unassert", PDK_Skipped)
394              .Case("if", PDK_StartIf)
395              .Case("ifdef", PDK_StartIf)
396              .Case("ifndef", PDK_StartIf)
397              .Case("elif", PDK_Skipped)
398              .Case("else", PDK_Skipped)
399              .Case("endif", PDK_EndIf)
400              .Default(PDK_Unknown);
401
402        switch (PDK) {
403        case PDK_Skipped:
404          continue;
405
406        case PDK_StartIf:
407          if (IfCount == 0)
408            IfStartTok = HashTok;
409
410          ++IfCount;
411          continue;
412
413        case PDK_EndIf:
414          // Mismatched #endif. The preamble ends here.
415          if (IfCount == 0)
416            break;
417
418          --IfCount;
419          continue;
420
421        case PDK_Unknown:
422          // We don't know what this directive is; stop at the '#'.
423          break;
424        }
425      }
426
427      // We only end up here if we didn't recognize the preprocessor
428      // directive or it was one that can't occur in the preamble at this
429      // point. Roll back the current token to the location of the '#'.
430      InPreprocessorDirective = false;
431      TheTok = HashTok;
432    }
433
434    // We hit a token that we don't recognize as being in the
435    // "preprocessing only" part of the file, so we're no longer in
436    // the preamble.
437    break;
438  } while (true);
439
440  SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation();
441  return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
442                        IfCount? IfStartTok.isAtStartOfLine()
443                               : TheTok.isAtStartOfLine());
444}
445
446//===----------------------------------------------------------------------===//
447// Character information.
448//===----------------------------------------------------------------------===//
449
// Character class flags stored in the CharInfo table, one bit per class.
enum {
  CHAR_HORZ_WS  = 1 << 0,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS  = 1 << 1,  // '\r', '\n'
  CHAR_LETTER   = 1 << 2,  // a-z,A-Z
  CHAR_NUMBER   = 1 << 3,  // 0-9
  CHAR_UNDER    = 1 << 4,  // _
  CHAR_PERIOD   = 1 << 5   // .
};
458
459// Statically initialize CharInfo table based on ASCII character set
460// Reference: FreeBSD 7.2 /usr/share/misc/ascii
461static const unsigned char CharInfo[256] =
462{
463// 0 NUL         1 SOH         2 STX         3 ETX
464// 4 EOT         5 ENQ         6 ACK         7 BEL
465   0           , 0           , 0           , 0           ,
466   0           , 0           , 0           , 0           ,
467// 8 BS          9 HT         10 NL         11 VT
468//12 NP         13 CR         14 SO         15 SI
469   0           , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
470   CHAR_HORZ_WS, CHAR_VERT_WS, 0           , 0           ,
471//16 DLE        17 DC1        18 DC2        19 DC3
472//20 DC4        21 NAK        22 SYN        23 ETB
473   0           , 0           , 0           , 0           ,
474   0           , 0           , 0           , 0           ,
475//24 CAN        25 EM         26 SUB        27 ESC
476//28 FS         29 GS         30 RS         31 US
477   0           , 0           , 0           , 0           ,
478   0           , 0           , 0           , 0           ,
479//32 SP         33  !         34  "         35  #
480//36  $         37  %         38  &         39  '
481   CHAR_HORZ_WS, 0           , 0           , 0           ,
482   0           , 0           , 0           , 0           ,
483//40  (         41  )         42  *         43  +
484//44  ,         45  -         46  .         47  /
485   0           , 0           , 0           , 0           ,
486   0           , 0           , CHAR_PERIOD , 0           ,
487//48  0         49  1         50  2         51  3
488//52  4         53  5         54  6         55  7
489   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
490   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
491//56  8         57  9         58  :         59  ;
492//60  <         61  =         62  >         63  ?
493   CHAR_NUMBER , CHAR_NUMBER , 0           , 0           ,
494   0           , 0           , 0           , 0           ,
495//64  @         65  A         66  B         67  C
496//68  D         69  E         70  F         71  G
497   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
498   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
499//72  H         73  I         74  J         75  K
500//76  L         77  M         78  N         79  O
501   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
502   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
503//80  P         81  Q         82  R         83  S
504//84  T         85  U         86  V         87  W
505   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
506   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
507//88  X         89  Y         90  Z         91  [
508//92  \         93  ]         94  ^         95  _
509   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0           ,
510   0           , 0           , 0           , CHAR_UNDER  ,
511//96  `         97  a         98  b         99  c
512//100  d       101  e        102  f        103  g
513   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
514   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
515//104  h       105  i        106  j        107  k
516//108  l       109  m        110  n        111  o
517   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
518   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
519//112  p       113  q        114  r        115  s
520//116  t       117  u        118  v        119  w
521   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
522   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
523//120  x       121  y        122  z        123  {
524//124  |        125  }        126  ~        127 DEL
525   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0           ,
526   0           , 0           , 0           , 0
527};
528
529static void InitCharacterInfo() {
530  static bool isInited = false;
531  if (isInited) return;
532  // check the statically-initialized CharInfo table
533  assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
534  assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
535  assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
536  assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
537  assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
538  assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
539  assert(CHAR_UNDER   == CharInfo[(int)'_']);
540  assert(CHAR_PERIOD  == CharInfo[(int)'.']);
541  for (unsigned i = 'a'; i <= 'z'; ++i) {
542    assert(CHAR_LETTER == CharInfo[i]);
543    assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
544  }
545  for (unsigned i = '0'; i <= '9'; ++i)
546    assert(CHAR_NUMBER == CharInfo[i]);
547
548  isInited = true;
549}
550
551
552/// isIdentifierBody - Return true if this is the body character of an
553/// identifier, which is [a-zA-Z0-9_].
554static inline bool isIdentifierBody(unsigned char c) {
555  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
556}
557
558/// isHorizontalWhitespace - Return true if this character is horizontal
559/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
560static inline bool isHorizontalWhitespace(unsigned char c) {
561  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
562}
563
564/// isWhitespace - Return true if this character is horizontal or vertical
565/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns false
566/// for '\0'.
567static inline bool isWhitespace(unsigned char c) {
568  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
569}
570
571/// isNumberBody - Return true if this is the body character of an
572/// preprocessing number, which is [a-zA-Z0-9_.].
573static inline bool isNumberBody(unsigned char c) {
574  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
575    true : false;
576}
577
578
579//===----------------------------------------------------------------------===//
580// Diagnostics forwarding code.
581//===----------------------------------------------------------------------===//
582
583/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
584/// lexer buffer was all instantiated at a single point, perform the mapping.
585/// This is currently only used for _Pragma implementation, so it is the slow
586/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
587static DISABLE_INLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP,
588                                                       SourceLocation FileLoc,
589                                                       unsigned CharNo,
590                                                       unsigned TokLen);
591static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
592                                        SourceLocation FileLoc,
593                                        unsigned CharNo, unsigned TokLen) {
594  assert(FileLoc.isMacroID() && "Must be an instantiation");
595
596  // Otherwise, we're lexing "mapped tokens".  This is used for things like
597  // _Pragma handling.  Combine the instantiation location of FileLoc with the
598  // spelling location.
599  SourceManager &SM = PP.getSourceManager();
600
601  // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
602  // characters come from spelling(FileLoc)+Offset.
603  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
604  SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);
605
606  // Figure out the expansion loc range, which is the range covered by the
607  // original _Pragma(...) sequence.
608  std::pair<SourceLocation,SourceLocation> II =
609    SM.getImmediateInstantiationRange(FileLoc);
610
611  return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen);
612}
613
614/// getSourceLocation - Return a source location identifier for the specified
615/// offset in the current file.
616SourceLocation Lexer::getSourceLocation(const char *Loc,
617                                        unsigned TokLen) const {
618  assert(Loc >= BufferStart && Loc <= BufferEnd &&
619         "Location out of range for this buffer!");
620
621  // In the normal case, we're just lexing from a simple file buffer, return
622  // the file id from FileLoc with the offset specified.
623  unsigned CharNo = Loc-BufferStart;
624  if (FileLoc.isFileID())
625    return FileLoc.getFileLocWithOffset(CharNo);
626
627  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
628  // tokens are lexed from where the _Pragma was defined.
629  assert(PP && "This doesn't work on raw lexers");
630  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
631}
632
633/// Diag - Forwarding function for diagnostics.  This translate a source
634/// position in the current buffer into a SourceLocation object for rendering.
635DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
636  return PP->Diag(getSourceLocation(Loc), DiagID);
637}
638
639//===----------------------------------------------------------------------===//
640// Trigraph and Escaped Newline Handling Code.
641//===----------------------------------------------------------------------===//
642
/// GetTrigraphCharForLetter - Given the character that follows a ?? pair,
/// return the character the resulting trigraph stands for, or '\0' if the
/// three characters do not form a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Suffixes[i] is the third character of a trigraph and
  // Decoded[i] is the character that trigraph denotes.
  static const char Suffixes[] = "=)(!'>/<-";
  static const char Decoded[]  = "#][|^}\\{~";

  // The scan stops at the terminating nul, so a nul Letter matches nothing.
  for (unsigned i = 0; Suffixes[i] != '\0'; ++i)
    if (Suffixes[i] == Letter)
      return Decoded[i];
  return 0;
}
659
660/// DecodeTrigraphChar - If the specified character is a legal trigraph when
661/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
662/// return the result character.  Finally, emit a warning about trigraph use
663/// whether trigraphs are enabled or not.
664static char DecodeTrigraphChar(const char *CP, Lexer *L) {
665  char Res = GetTrigraphCharForLetter(*CP);
666  if (!Res || !L) return Res;
667
668  if (!L->getFeatures().Trigraphs) {
669    if (!L->isLexingRawMode())
670      L->Diag(CP-2, diag::trigraph_ignored);
671    return 0;
672  }
673
674  if (!L->isLexingRawMode())
675    L->Diag(CP-2, diag::trigraph_converted) << llvm::StringRef(&Res, 1);
676  return Res;
677}
678
679/// getEscapedNewLineSize - Return the size of the specified escaped newline,
680/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
681/// trigraph equivalent on entry to this function.
682unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
683  unsigned Size = 0;
684  while (isWhitespace(Ptr[Size])) {
685    ++Size;
686
687    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
688      continue;
689
690    // If this is a \r\n or \n\r, skip the other half.
691    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
692        Ptr[Size-1] != Ptr[Size])
693      ++Size;
694
695    return Size;
696  }
697
698  // Not an escaped newline, must be a \t or something else.
699  return 0;
700}
701
702/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
703/// them), skip over them and return the first non-escaped-newline found,
704/// otherwise return P.
705const char *Lexer::SkipEscapedNewLines(const char *P) {
706  while (1) {
707    const char *AfterEscape;
708    if (*P == '\\') {
709      AfterEscape = P+1;
710    } else if (*P == '?') {
711      // If not a trigraph for escape, bail out.
712      if (P[1] != '?' || P[2] != '/')
713        return P;
714      AfterEscape = P+3;
715    } else {
716      return P;
717    }
718
719    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
720    if (NewLineSize == 0) return P;
721    P = AfterEscape+NewLineSize;
722  }
723}
724
725
726/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
727/// get its size, and return it.  This is tricky in several cases:
728///   1. If currently at the start of a trigraph, we warn about the trigraph,
729///      then either return the trigraph (skipping 3 chars) or the '?',
730///      depending on whether trigraphs are enabled or not.
731///   2. If this is an escaped newline (potentially with whitespace between
732///      the backslash and newline), implicitly skip the newline and return
733///      the char after it.
734///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
735///
736/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
737/// know that we can accumulate into Size, and that we have already incremented
738/// Ptr by Size bytes.
739///
740/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
741/// be updated to match.
742///
743char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
744                               Token *Tok) {
745  // If we have a slash, look for an escaped newline.
746  if (Ptr[0] == '\\') {
747    ++Size;
748    ++Ptr;
749Slash:
750    // Common case, backslash-char where the char is not whitespace.
751    if (!isWhitespace(Ptr[0])) return '\\';
752
753    // See if we have optional whitespace characters between the slash and
754    // newline.
755    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
756      // Remember that this token needs to be cleaned.
757      if (Tok) Tok->setFlag(Token::NeedsCleaning);
758
759      // Warn if there was whitespace between the backslash and newline.
760      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
761        Diag(Ptr, diag::backslash_newline_space);
762
763      // Found backslash<whitespace><newline>.  Parse the char after it.
764      Size += EscapedNewLineSize;
765      Ptr  += EscapedNewLineSize;
766      // Use slow version to accumulate a correct size field.
767      return getCharAndSizeSlow(Ptr, Size, Tok);
768    }
769
770    // Otherwise, this is not an escaped newline, just return the slash.
771    return '\\';
772  }
773
774  // If this is a trigraph, process it.
775  if (Ptr[0] == '?' && Ptr[1] == '?') {
776    // If this is actually a legal trigraph (not something like "??x"), emit
777    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
778    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
779      // Remember that this token needs to be cleaned.
780      if (Tok) Tok->setFlag(Token::NeedsCleaning);
781
782      Ptr += 3;
783      Size += 3;
784      if (C == '\\') goto Slash;
785      return C;
786    }
787  }
788
789  // If this is neither, return a single character.
790  ++Size;
791  return *Ptr;
792}
793
794
795/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
796/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
797/// and that we have already incremented Ptr by Size bytes.
798///
799/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
800/// be updated to match.
801char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
802                                     const LangOptions &Features) {
803  // If we have a slash, look for an escaped newline.
804  if (Ptr[0] == '\\') {
805    ++Size;
806    ++Ptr;
807Slash:
808    // Common case, backslash-char where the char is not whitespace.
809    if (!isWhitespace(Ptr[0])) return '\\';
810
811    // See if we have optional whitespace characters followed by a newline.
812    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
813      // Found backslash<whitespace><newline>.  Parse the char after it.
814      Size += EscapedNewLineSize;
815      Ptr  += EscapedNewLineSize;
816
817      // Use slow version to accumulate a correct size field.
818      return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
819    }
820
821    // Otherwise, this is not an escaped newline, just return the slash.
822    return '\\';
823  }
824
825  // If this is a trigraph, process it.
826  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
827    // If this is actually a legal trigraph (not something like "??x"), return
828    // it.
829    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
830      Ptr += 3;
831      Size += 3;
832      if (C == '\\') goto Slash;
833      return C;
834    }
835  }
836
837  // If this is neither, return a single character.
838  ++Size;
839  return *Ptr;
840}
841
842//===----------------------------------------------------------------------===//
843// Helper methods for lexing.
844//===----------------------------------------------------------------------===//
845
846/// \brief Routine that indiscriminately skips bytes in the source file.
847void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
848  BufferPtr += Bytes;
849  if (BufferPtr > BufferEnd)
850    BufferPtr = BufferEnd;
851  IsAtStartOfLine = StartOfLine;
852}
853
854void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
855  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
856  unsigned Size;
857  unsigned char C = *CurPtr++;
858  while (isIdentifierBody(C))
859    C = *CurPtr++;
860
861  --CurPtr;   // Back up over the skipped character.
862
863  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
864  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
865  // FIXME: UCNs.
866  //
867  // TODO: Could merge these checks into a CharInfo flag to make the comparison
868  // cheaper
869  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
870FinishIdentifier:
871    const char *IdStart = BufferPtr;
872    FormTokenWithChars(Result, CurPtr, tok::identifier);
873
874    // If we are in raw mode, return this identifier raw.  There is no need to
875    // look up identifier information or attempt to macro expand it.
876    if (LexingRawMode) return;
877
878    // Fill in Result.IdentifierInfo, looking up the identifier in the
879    // identifier table.
880    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart);
881
882    // Change the kind of this identifier to the appropriate token kind, e.g.
883    // turning "for" into a keyword.
884    Result.setKind(II->getTokenID());
885
886    // Finally, now that we know we have an identifier, pass this off to the
887    // preprocessor, which may macro expand it or something.
888    if (II->isHandleIdentifierCase())
889      PP->HandleIdentifier(Result);
890    return;
891  }
892
893  // Otherwise, $,\,? in identifier found.  Enter slower path.
894
895  C = getCharAndSize(CurPtr, Size);
896  while (1) {
897    if (C == '$') {
898      // If we hit a $ and they are not supported in identifiers, we are done.
899      if (!Features.DollarIdents) goto FinishIdentifier;
900
901      // Otherwise, emit a diagnostic and continue.
902      if (!isLexingRawMode())
903        Diag(CurPtr, diag::ext_dollar_in_identifier);
904      CurPtr = ConsumeChar(CurPtr, Size, Result);
905      C = getCharAndSize(CurPtr, Size);
906      continue;
907    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
908      // Found end of identifier.
909      goto FinishIdentifier;
910    }
911
912    // Otherwise, this character is good, consume it.
913    CurPtr = ConsumeChar(CurPtr, Size, Result);
914
915    C = getCharAndSize(CurPtr, Size);
916    while (isIdentifierBody(C)) { // FIXME: UCNs.
917      CurPtr = ConsumeChar(CurPtr, Size, Result);
918      C = getCharAndSize(CurPtr, Size);
919    }
920  }
921}
922
/// isHexaLiteral - Return true if [Start, End) is longer than two characters
/// and begins with "0x" or "0X".
/// FIXME: This isn't correct, it will mislex:
///     0\       <- escaped newline.
///     x1234e+1
/// in microsoft mode (where this is supposed to be several different tokens).
static inline bool isHexaLiteral(const char *Start, const char *End) {
  // The prefix alone is not enough; at least one more character is required.
  if (End - Start <= 2)
    return false;

  if (Start[0] != '0')
    return false;

  const char Marker = Start[1];
  return Marker == 'x' || Marker == 'X';
}
932
933/// LexNumericConstant - Lex the remainder of a integer or floating point
934/// constant. From[-1] is the first character lexed.  Return the end of the
935/// constant.
936void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
937  unsigned Size;
938  char C = getCharAndSize(CurPtr, Size);
939  char PrevCh = 0;
940  while (isNumberBody(C)) { // FIXME: UCNs?
941    CurPtr = ConsumeChar(CurPtr, Size, Result);
942    PrevCh = C;
943    C = getCharAndSize(CurPtr, Size);
944  }
945
946  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
947  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
948    // If we are in Microsoft mode, don't continue if the constant is hex.
949    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
950    if (!Features.Microsoft || !isHexaLiteral(BufferPtr, CurPtr))
951      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
952  }
953
954  // If we have a hex FP constant, continue.
955  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
956      !Features.CPlusPlus0x)
957    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
958
959  // Update the location of token as well as BufferPtr.
960  const char *TokStart = BufferPtr;
961  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
962  Result.setLiteralData(TokStart);
963}
964
965/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
966/// either " or L".
967void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
968  const char *NulCharacter = 0; // Does this string contain the \0 character?
969
970  char C = getAndAdvanceChar(CurPtr, Result);
971  while (C != '"') {
972    // Skip escaped characters.  Escaped newlines will already be processed by
973    // getAndAdvanceChar.
974    if (C == '\\')
975      C = getAndAdvanceChar(CurPtr, Result);
976
977    if (C == '\n' || C == '\r' ||             // Newline.
978        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
979      if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
980        PP->CodeCompleteNaturalLanguage();
981      else if (!isLexingRawMode() && !Features.AsmPreprocessor)
982        Diag(BufferPtr, diag::err_unterminated_string);
983      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
984      return;
985    }
986
987    if (C == 0)
988      NulCharacter = CurPtr-1;
989    C = getAndAdvanceChar(CurPtr, Result);
990  }
991
992  // If a nul character existed in the string, warn about it.
993  if (NulCharacter && !isLexingRawMode())
994    Diag(NulCharacter, diag::null_in_string);
995
996  // Update the location of the token as well as the BufferPtr instance var.
997  const char *TokStart = BufferPtr;
998  FormTokenWithChars(Result, CurPtr,
999                     Wide ? tok::wide_string_literal : tok::string_literal);
1000  Result.setLiteralData(TokStart);
1001}
1002
1003/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
1004/// after having lexed the '<' character.  This is used for #include filenames.
1005void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
1006  const char *NulCharacter = 0; // Does this string contain the \0 character?
1007  const char *AfterLessPos = CurPtr;
1008  char C = getAndAdvanceChar(CurPtr, Result);
1009  while (C != '>') {
1010    // Skip escaped characters.
1011    if (C == '\\') {
1012      // Skip the escaped character.
1013      C = getAndAdvanceChar(CurPtr, Result);
1014    } else if (C == '\n' || C == '\r' ||             // Newline.
1015               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
1016      // If the filename is unterminated, then it must just be a lone <
1017      // character.  Return this as such.
1018      FormTokenWithChars(Result, AfterLessPos, tok::less);
1019      return;
1020    } else if (C == 0) {
1021      NulCharacter = CurPtr-1;
1022    }
1023    C = getAndAdvanceChar(CurPtr, Result);
1024  }
1025
1026  // If a nul character existed in the string, warn about it.
1027  if (NulCharacter && !isLexingRawMode())
1028    Diag(NulCharacter, diag::null_in_string);
1029
1030  // Update the location of token as well as BufferPtr.
1031  const char *TokStart = BufferPtr;
1032  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
1033  Result.setLiteralData(TokStart);
1034}
1035
1036
1037/// LexCharConstant - Lex the remainder of a character constant, after having
1038/// lexed either ' or L'.
1039void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
1040  const char *NulCharacter = 0; // Does this character contain the \0 character?
1041
1042  char C = getAndAdvanceChar(CurPtr, Result);
1043  if (C == '\'') {
1044    if (!isLexingRawMode() && !Features.AsmPreprocessor)
1045      Diag(BufferPtr, diag::err_empty_character);
1046    FormTokenWithChars(Result, CurPtr, tok::unknown);
1047    return;
1048  }
1049
1050  while (C != '\'') {
1051    // Skip escaped characters.
1052    if (C == '\\') {
1053      // Skip the escaped character.
1054      // FIXME: UCN's
1055      C = getAndAdvanceChar(CurPtr, Result);
1056    } else if (C == '\n' || C == '\r' ||             // Newline.
1057               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
1058      if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
1059        PP->CodeCompleteNaturalLanguage();
1060      else if (!isLexingRawMode() && !Features.AsmPreprocessor)
1061        Diag(BufferPtr, diag::err_unterminated_char);
1062      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1063      return;
1064    } else if (C == 0) {
1065      NulCharacter = CurPtr-1;
1066    }
1067    C = getAndAdvanceChar(CurPtr, Result);
1068  }
1069
1070  // If a nul character existed in the character, warn about it.
1071  if (NulCharacter && !isLexingRawMode())
1072    Diag(NulCharacter, diag::null_in_char);
1073
1074  // Update the location of token as well as BufferPtr.
1075  const char *TokStart = BufferPtr;
1076  FormTokenWithChars(Result, CurPtr, tok::char_constant);
1077  Result.setLiteralData(TokStart);
1078}
1079
1080/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
1081/// Update BufferPtr to point to the next non-whitespace character and return.
1082///
1083/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
1084///
1085bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
1086  // Whitespace - Skip it, then return the token after the whitespace.
1087  unsigned char Char = *CurPtr;  // Skip consequtive spaces efficiently.
1088  while (1) {
1089    // Skip horizontal whitespace very aggressively.
1090    while (isHorizontalWhitespace(Char))
1091      Char = *++CurPtr;
1092
1093    // Otherwise if we have something other than whitespace, we're done.
1094    if (Char != '\n' && Char != '\r')
1095      break;
1096
1097    if (ParsingPreprocessorDirective) {
1098      // End of preprocessor directive line, let LexTokenInternal handle this.
1099      BufferPtr = CurPtr;
1100      return false;
1101    }
1102
1103    // ok, but handle newline.
1104    // The returned token is at the start of the line.
1105    Result.setFlag(Token::StartOfLine);
1106    // No leading whitespace seen so far.
1107    Result.clearFlag(Token::LeadingSpace);
1108    Char = *++CurPtr;
1109  }
1110
1111  // If this isn't immediately after a newline, there is leading space.
1112  char PrevChar = CurPtr[-1];
1113  if (PrevChar != '\n' && PrevChar != '\r')
1114    Result.setFlag(Token::LeadingSpace);
1115
1116  // If the client wants us to return whitespace, return it now.
1117  if (isKeepWhitespaceMode()) {
1118    FormTokenWithChars(Result, CurPtr, tok::unknown);
1119    return true;
1120  }
1121
1122  BufferPtr = CurPtr;
1123  return false;
1124}
1125
1126// SkipBCPLComment - We have just read the // characters from input.  Skip until
1127// we find the newline character thats terminate the comment.  Then update
1128/// BufferPtr and return.
1129///
1130/// If we're in KeepCommentMode or any CommentHandler has inserted
1131/// some tokens, this will store the first token and return true.
1132bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
1133  // If BCPL comments aren't explicitly enabled for this language, emit an
1134  // extension warning.
1135  if (!Features.BCPLComment && !isLexingRawMode()) {
1136    Diag(BufferPtr, diag::ext_bcpl_comment);
1137
1138    // Mark them enabled so we only emit one warning for this translation
1139    // unit.
1140    Features.BCPLComment = true;
1141  }
1142
1143  // Scan over the body of the comment.  The common case, when scanning, is that
1144  // the comment contains normal ascii characters with nothing interesting in
1145  // them.  As such, optimize for this case with the inner loop.
1146  char C;
1147  do {
1148    C = *CurPtr;
1149    // FIXME: Speedup BCPL comment lexing.  Just scan for a \n or \r character.
1150    // If we find a \n character, scan backwards, checking to see if it's an
1151    // escaped newline, like we do for block comments.
1152
1153    // Skip over characters in the fast loop.
1154    while (C != 0 &&                // Potentially EOF.
1155           C != '\\' &&             // Potentially escaped newline.
1156           C != '?' &&              // Potentially trigraph.
1157           C != '\n' && C != '\r')  // Newline or DOS-style newline.
1158      C = *++CurPtr;
1159
1160    // If this is a newline, we're done.
1161    if (C == '\n' || C == '\r')
1162      break;  // Found the newline? Break out!
1163
1164    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
1165    // properly decode the character.  Read it in raw mode to avoid emitting
1166    // diagnostics about things like trigraphs.  If we see an escaped newline,
1167    // we'll handle it below.
1168    const char *OldPtr = CurPtr;
1169    bool OldRawMode = isLexingRawMode();
1170    LexingRawMode = true;
1171    C = getAndAdvanceChar(CurPtr, Result);
1172    LexingRawMode = OldRawMode;
1173
1174    // If the char that we finally got was a \n, then we must have had something
1175    // like \<newline><newline>.  We don't want to have consumed the second
1176    // newline, we want CurPtr, to end up pointing to it down below.
1177    if (C == '\n' || C == '\r') {
1178      --CurPtr;
1179      C = 'x'; // doesn't matter what this is.
1180    }
1181
1182    // If we read multiple characters, and one of those characters was a \r or
1183    // \n, then we had an escaped newline within the comment.  Emit diagnostic
1184    // unless the next line is also a // comment.
1185    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
1186      for (; OldPtr != CurPtr; ++OldPtr)
1187        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
1188          // Okay, we found a // comment that ends in a newline, if the next
1189          // line is also a // comment, but has spaces, don't emit a diagnostic.
1190          if (isspace(C)) {
1191            const char *ForwardPtr = CurPtr;
1192            while (isspace(*ForwardPtr))  // Skip whitespace.
1193              ++ForwardPtr;
1194            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
1195              break;
1196          }
1197
1198          if (!isLexingRawMode())
1199            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
1200          break;
1201        }
1202    }
1203
1204    if (CurPtr == BufferEnd+1) {
1205      if (PP && PP->isCodeCompletionFile(FileLoc))
1206        PP->CodeCompleteNaturalLanguage();
1207
1208      --CurPtr;
1209      break;
1210    }
1211  } while (C != '\n' && C != '\r');
1212
1213  // Found but did not consume the newline.  Notify comment handlers about the
1214  // comment unless we're in a #if 0 block.
1215  if (PP && !isLexingRawMode() &&
1216      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
1217                                            getSourceLocation(CurPtr)))) {
1218    BufferPtr = CurPtr;
1219    return true; // A token has to be returned.
1220  }
1221
1222  // If we are returning comments as tokens, return this comment as a token.
1223  if (inKeepCommentMode())
1224    return SaveBCPLComment(Result, CurPtr);
1225
1226  // If we are inside a preprocessor directive and we see the end of line,
1227  // return immediately, so that the lexer can return this as an EOM token.
1228  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
1229    BufferPtr = CurPtr;
1230    return false;
1231  }
1232
1233  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
1234  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
1235  // contribute to another token), it isn't needed for correctness.  Note that
1236  // this is ok even in KeepWhitespaceMode, because we would have returned the
1237  /// comment above in that mode.
1238  ++CurPtr;
1239
1240  // The next returned token is at the start of the line.
1241  Result.setFlag(Token::StartOfLine);
1242  // No leading whitespace seen so far.
1243  Result.clearFlag(Token::LeadingSpace);
1244  BufferPtr = CurPtr;
1245  return false;
1246}
1247
1248/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
1249/// an appropriate way and return it.
1250bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
1251  // If we're not in a preprocessor directive, just return the // comment
1252  // directly.
1253  FormTokenWithChars(Result, CurPtr, tok::comment);
1254
1255  if (!ParsingPreprocessorDirective)
1256    return true;
1257
1258  // If this BCPL-style comment is in a macro definition, transmogrify it into
1259  // a C-style block comment.
1260  bool Invalid = false;
1261  std::string Spelling = PP->getSpelling(Result, &Invalid);
1262  if (Invalid)
1263    return true;
1264
1265  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
1266  Spelling[1] = '*';   // Change prefix to "/*".
1267  Spelling += "*/";    // add suffix.
1268
1269  Result.setKind(tok::comment);
1270  PP->CreateString(&Spelling[0], Spelling.size(), Result,
1271                   Result.getLocation());
1272  return true;
1273}
1274
1275/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
1276/// character (either \n or \r) is part of an escaped newline sequence.  Issue a
1277/// diagnostic if so.  We know that the newline is inside of a block comment.
1278static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
1279                                                  Lexer *L) {
1280  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
1281
1282  // Back up off the newline.
1283  --CurPtr;
1284
1285  // If this is a two-character newline sequence, skip the other character.
1286  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
1287    // \n\n or \r\r -> not escaped newline.
1288    if (CurPtr[0] == CurPtr[1])
1289      return false;
1290    // \n\r or \r\n -> skip the newline.
1291    --CurPtr;
1292  }
1293
1294  // If we have horizontal whitespace, skip over it.  We allow whitespace
1295  // between the slash and newline.
1296  bool HasSpace = false;
1297  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
1298    --CurPtr;
1299    HasSpace = true;
1300  }
1301
1302  // If we have a slash, we know this is an escaped newline.
1303  if (*CurPtr == '\\') {
1304    if (CurPtr[-1] != '*') return false;
1305  } else {
1306    // It isn't a slash, is it the ?? / trigraph?
1307    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
1308        CurPtr[-3] != '*')
1309      return false;
1310
1311    // This is the trigraph ending the comment.  Emit a stern warning!
1312    CurPtr -= 2;
1313
1314    // If no trigraphs are enabled, warn that we ignored this trigraph and
1315    // ignore this * character.
1316    if (!L->getFeatures().Trigraphs) {
1317      if (!L->isLexingRawMode())
1318        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
1319      return false;
1320    }
1321    if (!L->isLexingRawMode())
1322      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
1323  }
1324
1325  // Warn about having an escaped newline between the */ characters.
1326  if (!L->isLexingRawMode())
1327    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
1328
1329  // If there was space between the backslash and newline, warn about it.
1330  if (HasSpace && !L->isLexingRawMode())
1331    L->Diag(CurPtr, diag::backslash_newline_space);
1332
1333  return true;
1334}
1335
1336#ifdef __SSE2__
1337#include <emmintrin.h>
1338#elif __ALTIVEC__
1339#include <altivec.h>
1340#undef bool
1341#endif
1342
/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
///
/// An unterminated comment is diagnosed (outside raw mode); it is returned as
/// a tok::unknown token in KeepWhitespaceMode and otherwise skipped up to
/// the end of the buffer with a false return.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  // A nul read from BufferEnd itself means the file ended right after "/*".
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode() &&
        !PP->isCodeCompletionFile(FileLoc))
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    // Back up so CurPtr points at BufferEnd again.
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  // Invariant at the top of each iteration: C is the character at CurPtr[-1].
  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Test 16 bytes at a time for any '/'; stop at the first chunk that
      // contains one or when fewer than 16 bytes remain before BufferEnd.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      // Same 16-byte-at-a-time scan using AltiVec vector compares.
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      // CurPtr[-1] is the '/', so CurPtr[-2] is the character before it.
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      // The nul came from BufferEnd itself: end of file inside the comment.
      if (PP && PP->isCodeCompletionFile(FileLoc))
        PP->CodeCompleteNaturalLanguage();
      else if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    }
    // Either a '/' that did not end the comment or an embedded nul character:
    // keep scanning.
    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}
1500
1501//===----------------------------------------------------------------------===//
1502// Primary Lexing Entry Points
1503//===----------------------------------------------------------------------===//
1504
1505/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
1506/// uninterpreted string.  This switches the lexer out of directive mode.
1507std::string Lexer::ReadToEndOfLine() {
1508  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
1509         "Must be in a preprocessing directive!");
1510  std::string Result;
1511  Token Tmp;
1512
1513  // CurPtr - Cache BufferPtr in an automatic variable.
1514  const char *CurPtr = BufferPtr;
1515  while (1) {
1516    char Char = getAndAdvanceChar(CurPtr, Tmp);
1517    switch (Char) {
1518    default:
1519      Result += Char;
1520      break;
1521    case 0:  // Null.
1522      // Found end of file?
1523      if (CurPtr-1 != BufferEnd) {
1524        // Nope, normal character, continue.
1525        Result += Char;
1526        break;
1527      }
1528      // FALL THROUGH.
1529    case '\r':
1530    case '\n':
1531      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
1532      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
1533      BufferPtr = CurPtr-1;
1534
1535      // Next, lex the character, which should handle the EOM transition.
1536      Lex(Tmp);
1537      if (Tmp.is(tok::code_completion)) {
1538        if (PP && PP->getCodeCompletionHandler())
1539          PP->getCodeCompletionHandler()->CodeCompleteNaturalLanguage();
1540        Lex(Tmp);
1541      }
1542      assert(Tmp.is(tok::eom) && "Unexpected token!");
1543
1544      // Finally, we're done, return the string we found.
1545      return Result;
1546    }
1547  }
1548}
1549
1550/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
1551/// condition, reporting diagnostics and handling other edge cases as required.
1552/// This returns true if Result contains a token, false if PP.Lex should be
1553/// called again.
1554bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
1555  // Check if we are performing code completion.
1556  if (PP && PP->isCodeCompletionFile(FileLoc)) {
1557    // We're at the end of the file, but we've been asked to consider the
1558    // end of the file to be a code-completion token. Return the
1559    // code-completion token.
1560    Result.startToken();
1561    FormTokenWithChars(Result, CurPtr, tok::code_completion);
1562
1563    // Only do the eof -> code_completion translation once.
1564    PP->SetCodeCompletionPoint(0, 0, 0);
1565
1566    // Silence any diagnostics that occur once we hit the code-completion point.
1567    PP->getDiagnostics().setSuppressAllDiagnostics(true);
1568    return true;
1569  }
1570
1571  // If we hit the end of the file while parsing a preprocessor directive,
1572  // end the preprocessor directive first.  The next token returned will
1573  // then be the end of file.
1574  if (ParsingPreprocessorDirective) {
1575    // Done parsing the "line".
1576    ParsingPreprocessorDirective = false;
1577    // Update the location of token as well as BufferPtr.
1578    FormTokenWithChars(Result, CurPtr, tok::eom);
1579
1580    // Restore comment saving mode, in case it was disabled for directive.
1581    SetCommentRetentionState(PP->getCommentRetentionState());
1582    return true;  // Have a token.
1583  }
1584
1585  // If we are in raw mode, return this event as an EOF token.  Let the caller
1586  // that put us in raw mode handle the event.
1587  if (isLexingRawMode()) {
1588    Result.startToken();
1589    BufferPtr = BufferEnd;
1590    FormTokenWithChars(Result, BufferEnd, tok::eof);
1591    return true;
1592  }
1593
1594  // Issue diagnostics for unterminated #if and missing newline.
1595
1596  // If we are in a #if directive, emit an error.
1597  while (!ConditionalStack.empty()) {
1598    if (!PP->isCodeCompletionFile(FileLoc))
1599      PP->Diag(ConditionalStack.back().IfLoc,
1600               diag::err_pp_unterminated_conditional);
1601    ConditionalStack.pop_back();
1602  }
1603
1604  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
1605  // a pedwarn.
1606  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
1607    Diag(BufferEnd, diag::ext_no_newline_eof)
1608      << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");
1609
1610  BufferPtr = CurPtr;
1611
1612  // Finally, let the preprocessor handle this.
1613  return PP->HandleEndOfFile(Result);
1614}
1615
1616/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
1617/// the specified lexer will return a tok::l_paren token, 0 if it is something
1618/// else and 2 if there are no more tokens in the buffer controlled by the
1619/// lexer.
1620unsigned Lexer::isNextPPTokenLParen() {
1621  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
1622
1623  // Switch to 'skipping' mode.  This will ensure that we can lex a token
1624  // without emitting diagnostics, disables macro expansion, and will cause EOF
1625  // to return an EOF token instead of popping the include stack.
1626  LexingRawMode = true;
1627
1628  // Save state that can be changed while lexing so that we can restore it.
1629  const char *TmpBufferPtr = BufferPtr;
1630  bool inPPDirectiveMode = ParsingPreprocessorDirective;
1631
1632  Token Tok;
1633  Tok.startToken();
1634  LexTokenInternal(Tok);
1635
1636  // Restore state that may have changed.
1637  BufferPtr = TmpBufferPtr;
1638  ParsingPreprocessorDirective = inPPDirectiveMode;
1639
1640  // Restore the lexer back to non-skipping mode.
1641  LexingRawMode = false;
1642
1643  if (Tok.is(tok::eof))
1644    return 2;
1645  return Tok.is(tok::l_paren);
1646}
1647
1648/// FindConflictEnd - Find the end of a version control conflict marker.
1649static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd) {
1650  llvm::StringRef RestOfBuffer(CurPtr+7, BufferEnd-CurPtr-7);
1651  size_t Pos = RestOfBuffer.find(">>>>>>>");
1652  while (Pos != llvm::StringRef::npos) {
1653    // Must occur at start of line.
1654    if (RestOfBuffer[Pos-1] != '\r' &&
1655        RestOfBuffer[Pos-1] != '\n') {
1656      RestOfBuffer = RestOfBuffer.substr(Pos+7);
1657      Pos = RestOfBuffer.find(">>>>>>>");
1658      continue;
1659    }
1660    return RestOfBuffer.data()+Pos;
1661  }
1662  return 0;
1663}
1664
1665/// IsStartOfConflictMarker - If the specified pointer is the start of a version
1666/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
1667/// and recover nicely.  This returns true if it is a conflict marker and false
1668/// if not.
1669bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
1670  // Only a conflict marker if it starts at the beginning of a line.
1671  if (CurPtr != BufferStart &&
1672      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
1673    return false;
1674
1675  // Check to see if we have <<<<<<<.
1676  if (BufferEnd-CurPtr < 8 ||
1677      llvm::StringRef(CurPtr, 7) != "<<<<<<<")
1678    return false;
1679
1680  // If we have a situation where we don't care about conflict markers, ignore
1681  // it.
1682  if (IsInConflictMarker || isLexingRawMode())
1683    return false;
1684
1685  // Check to see if there is a >>>>>>> somewhere in the buffer at the start of
1686  // a line to terminate this conflict marker.
1687  if (FindConflictEnd(CurPtr, BufferEnd)) {
1688    // We found a match.  We are really in a conflict marker.
1689    // Diagnose this, and ignore to the end of line.
1690    Diag(CurPtr, diag::err_conflict_marker);
1691    IsInConflictMarker = true;
1692
1693    // Skip ahead to the end of line.  We know this exists because the
1694    // end-of-conflict marker starts with \r or \n.
1695    while (*CurPtr != '\r' && *CurPtr != '\n') {
1696      assert(CurPtr != BufferEnd && "Didn't find end of line");
1697      ++CurPtr;
1698    }
1699    BufferPtr = CurPtr;
1700    return true;
1701  }
1702
1703  // No end of conflict marker found.
1704  return false;
1705}
1706
1707
1708/// HandleEndOfConflictMarker - If this is a '=======' or '|||||||' or '>>>>>>>'
1709/// marker, then it is the end of a conflict marker.  Handle it by ignoring up
1710/// until the end of the line.  This returns true if it is a conflict marker and
1711/// false if not.
1712bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
1713  // Only a conflict marker if it starts at the beginning of a line.
1714  if (CurPtr != BufferStart &&
1715      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
1716    return false;
1717
1718  // If we have a situation where we don't care about conflict markers, ignore
1719  // it.
1720  if (!IsInConflictMarker || isLexingRawMode())
1721    return false;
1722
1723  // Check to see if we have the marker (7 characters in a row).
1724  for (unsigned i = 1; i != 7; ++i)
1725    if (CurPtr[i] != CurPtr[0])
1726      return false;
1727
1728  // If we do have it, search for the end of the conflict marker.  This could
1729  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
1730  // be the end of conflict marker.
1731  if (const char *End = FindConflictEnd(CurPtr, BufferEnd)) {
1732    CurPtr = End;
1733
1734    // Skip ahead to the end of line.
1735    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
1736      ++CurPtr;
1737
1738    BufferPtr = CurPtr;
1739
1740    // No longer in the conflict marker.
1741    IsInConflictMarker = false;
1742    return true;
1743  }
1744
1745  return false;
1746}
1747
1748
1749/// LexTokenInternal - This implements a simple C family lexer.  It is an
1750/// extremely performance critical piece of code.  This assumes that the buffer
1751/// has a null character at the end of the file.  This returns a preprocessing
1752/// token, not a normal token, as such, it is an internal interface.  It assumes
1753/// that the Flags of result have been cleared before calling this.
1754void Lexer::LexTokenInternal(Token &Result) {
1755LexNextToken:
1756  // New token, can't need cleaning yet.
1757  Result.clearFlag(Token::NeedsCleaning);
1758  Result.setIdentifierInfo(0);
1759
1760  // CurPtr - Cache BufferPtr in an automatic variable.
1761  const char *CurPtr = BufferPtr;
1762
1763  // Small amounts of horizontal whitespace is very common between tokens.
1764  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
1765    ++CurPtr;
1766    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
1767      ++CurPtr;
1768
1769    // If we are keeping whitespace and other tokens, just return what we just
1770    // skipped.  The next lexer invocation will return the token after the
1771    // whitespace.
1772    if (isKeepWhitespaceMode()) {
1773      FormTokenWithChars(Result, CurPtr, tok::unknown);
1774      return;
1775    }
1776
1777    BufferPtr = CurPtr;
1778    Result.setFlag(Token::LeadingSpace);
1779  }
1780
1781  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
1782
1783  // Read a character, advancing over it.
1784  char Char = getAndAdvanceChar(CurPtr, Result);
1785  tok::TokenKind Kind;
1786
1787  switch (Char) {
1788  case 0:  // Null.
1789    // Found end of file?
1790    if (CurPtr-1 == BufferEnd) {
1791      // Read the PP instance variable into an automatic variable, because
1792      // LexEndOfFile will often delete 'this'.
1793      Preprocessor *PPCache = PP;
1794      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
1795        return;   // Got a token to return.
1796      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
1797      return PPCache->Lex(Result);
1798    }
1799
1800    if (!isLexingRawMode())
1801      Diag(CurPtr-1, diag::null_in_file);
1802    Result.setFlag(Token::LeadingSpace);
1803    if (SkipWhitespace(Result, CurPtr))
1804      return; // KeepWhitespaceMode
1805
1806    goto LexNextToken;   // GCC isn't tail call eliminating.
1807
1808  case 26:  // DOS & CP/M EOF: "^Z".
1809    // If we're in Microsoft extensions mode, treat this as end of file.
1810    if (Features.Microsoft) {
1811      // Read the PP instance variable into an automatic variable, because
1812      // LexEndOfFile will often delete 'this'.
1813      Preprocessor *PPCache = PP;
1814      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
1815        return;   // Got a token to return.
1816      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
1817      return PPCache->Lex(Result);
1818    }
1819    // If Microsoft extensions are disabled, this is just random garbage.
1820    Kind = tok::unknown;
1821    break;
1822
1823  case '\n':
1824  case '\r':
1825    // If we are inside a preprocessor directive and we see the end of line,
1826    // we know we are done with the directive, so return an EOM token.
1827    if (ParsingPreprocessorDirective) {
1828      // Done parsing the "line".
1829      ParsingPreprocessorDirective = false;
1830
1831      // Restore comment saving mode, in case it was disabled for directive.
1832      SetCommentRetentionState(PP->getCommentRetentionState());
1833
1834      // Since we consumed a newline, we are back at the start of a line.
1835      IsAtStartOfLine = true;
1836
1837      Kind = tok::eom;
1838      break;
1839    }
1840    // The returned token is at the start of the line.
1841    Result.setFlag(Token::StartOfLine);
1842    // No leading whitespace seen so far.
1843    Result.clearFlag(Token::LeadingSpace);
1844
1845    if (SkipWhitespace(Result, CurPtr))
1846      return; // KeepWhitespaceMode
1847    goto LexNextToken;   // GCC isn't tail call eliminating.
1848  case ' ':
1849  case '\t':
1850  case '\f':
1851  case '\v':
1852  SkipHorizontalWhitespace:
1853    Result.setFlag(Token::LeadingSpace);
1854    if (SkipWhitespace(Result, CurPtr))
1855      return; // KeepWhitespaceMode
1856
1857  SkipIgnoredUnits:
1858    CurPtr = BufferPtr;
1859
1860    // If the next token is obviously a // or /* */ comment, skip it efficiently
1861    // too (without going through the big switch stmt).
1862    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
1863        Features.BCPLComment) {
1864      if (SkipBCPLComment(Result, CurPtr+2))
1865        return; // There is a token to return.
1866      goto SkipIgnoredUnits;
1867    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
1868      if (SkipBlockComment(Result, CurPtr+2))
1869        return; // There is a token to return.
1870      goto SkipIgnoredUnits;
1871    } else if (isHorizontalWhitespace(*CurPtr)) {
1872      goto SkipHorizontalWhitespace;
1873    }
1874    goto LexNextToken;   // GCC isn't tail call eliminating.
1875
1876  // C99 6.4.4.1: Integer Constants.
1877  // C99 6.4.4.2: Floating Constants.
1878  case '0': case '1': case '2': case '3': case '4':
1879  case '5': case '6': case '7': case '8': case '9':
1880    // Notify MIOpt that we read a non-whitespace/non-comment token.
1881    MIOpt.ReadToken();
1882    return LexNumericConstant(Result, CurPtr);
1883
1884  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
1885    // Notify MIOpt that we read a non-whitespace/non-comment token.
1886    MIOpt.ReadToken();
1887    Char = getCharAndSize(CurPtr, SizeTmp);
1888
1889    // Wide string literal.
1890    if (Char == '"')
1891      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
1892                              true);
1893
1894    // Wide character constant.
1895    if (Char == '\'')
1896      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1897    // FALL THROUGH, treating L like the start of an identifier.
1898
1899  // C99 6.4.2: Identifiers.
1900  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1901  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
1902  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1903  case 'V': case 'W': case 'X': case 'Y': case 'Z':
1904  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1905  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1906  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1907  case 'v': case 'w': case 'x': case 'y': case 'z':
1908  case '_':
1909    // Notify MIOpt that we read a non-whitespace/non-comment token.
1910    MIOpt.ReadToken();
1911    return LexIdentifier(Result, CurPtr);
1912
1913  case '$':   // $ in identifiers.
1914    if (Features.DollarIdents) {
1915      if (!isLexingRawMode())
1916        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
1917      // Notify MIOpt that we read a non-whitespace/non-comment token.
1918      MIOpt.ReadToken();
1919      return LexIdentifier(Result, CurPtr);
1920    }
1921
1922    Kind = tok::unknown;
1923    break;
1924
1925  // C99 6.4.4: Character Constants.
1926  case '\'':
1927    // Notify MIOpt that we read a non-whitespace/non-comment token.
1928    MIOpt.ReadToken();
1929    return LexCharConstant(Result, CurPtr);
1930
1931  // C99 6.4.5: String Literals.
1932  case '"':
1933    // Notify MIOpt that we read a non-whitespace/non-comment token.
1934    MIOpt.ReadToken();
1935    return LexStringLiteral(Result, CurPtr, false);
1936
1937  // C99 6.4.6: Punctuators.
1938  case '?':
1939    Kind = tok::question;
1940    break;
1941  case '[':
1942    Kind = tok::l_square;
1943    break;
1944  case ']':
1945    Kind = tok::r_square;
1946    break;
1947  case '(':
1948    Kind = tok::l_paren;
1949    break;
1950  case ')':
1951    Kind = tok::r_paren;
1952    break;
1953  case '{':
1954    Kind = tok::l_brace;
1955    break;
1956  case '}':
1957    Kind = tok::r_brace;
1958    break;
1959  case '.':
1960    Char = getCharAndSize(CurPtr, SizeTmp);
1961    if (Char >= '0' && Char <= '9') {
1962      // Notify MIOpt that we read a non-whitespace/non-comment token.
1963      MIOpt.ReadToken();
1964
1965      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1966    } else if (Features.CPlusPlus && Char == '*') {
1967      Kind = tok::periodstar;
1968      CurPtr += SizeTmp;
1969    } else if (Char == '.' &&
1970               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
1971      Kind = tok::ellipsis;
1972      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1973                           SizeTmp2, Result);
1974    } else {
1975      Kind = tok::period;
1976    }
1977    break;
1978  case '&':
1979    Char = getCharAndSize(CurPtr, SizeTmp);
1980    if (Char == '&') {
1981      Kind = tok::ampamp;
1982      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1983    } else if (Char == '=') {
1984      Kind = tok::ampequal;
1985      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1986    } else {
1987      Kind = tok::amp;
1988    }
1989    break;
1990  case '*':
1991    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1992      Kind = tok::starequal;
1993      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1994    } else {
1995      Kind = tok::star;
1996    }
1997    break;
1998  case '+':
1999    Char = getCharAndSize(CurPtr, SizeTmp);
2000    if (Char == '+') {
2001      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2002      Kind = tok::plusplus;
2003    } else if (Char == '=') {
2004      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2005      Kind = tok::plusequal;
2006    } else {
2007      Kind = tok::plus;
2008    }
2009    break;
2010  case '-':
2011    Char = getCharAndSize(CurPtr, SizeTmp);
2012    if (Char == '-') {      // --
2013      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2014      Kind = tok::minusminus;
2015    } else if (Char == '>' && Features.CPlusPlus &&
2016               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
2017      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2018                           SizeTmp2, Result);
2019      Kind = tok::arrowstar;
2020    } else if (Char == '>') {   // ->
2021      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2022      Kind = tok::arrow;
2023    } else if (Char == '=') {   // -=
2024      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2025      Kind = tok::minusequal;
2026    } else {
2027      Kind = tok::minus;
2028    }
2029    break;
2030  case '~':
2031    Kind = tok::tilde;
2032    break;
2033  case '!':
2034    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
2035      Kind = tok::exclaimequal;
2036      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2037    } else {
2038      Kind = tok::exclaim;
2039    }
2040    break;
2041  case '/':
2042    // 6.4.9: Comments
2043    Char = getCharAndSize(CurPtr, SizeTmp);
2044    if (Char == '/') {         // BCPL comment.
2045      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
2046      // want to lex this as a comment.  There is one problem with this though,
2047      // that in one particular corner case, this can change the behavior of the
2048      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
2049      // this as "foo / bar" and langauges with BCPL comments would lex it as
2050      // "foo".  Check to see if the character after the second slash is a '*'.
2051      // If so, we will lex that as a "/" instead of the start of a comment.
2052      if (Features.BCPLComment ||
2053          getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
2054        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
2055          return; // There is a token to return.
2056
2057        // It is common for the tokens immediately after a // comment to be
2058        // whitespace (indentation for the next line).  Instead of going through
2059        // the big switch, handle it efficiently now.
2060        goto SkipIgnoredUnits;
2061      }
2062    }
2063
2064    if (Char == '*') {  // /**/ comment.
2065      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
2066        return; // There is a token to return.
2067      goto LexNextToken;   // GCC isn't tail call eliminating.
2068    }
2069
2070    if (Char == '=') {
2071      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2072      Kind = tok::slashequal;
2073    } else {
2074      Kind = tok::slash;
2075    }
2076    break;
2077  case '%':
2078    Char = getCharAndSize(CurPtr, SizeTmp);
2079    if (Char == '=') {
2080      Kind = tok::percentequal;
2081      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2082    } else if (Features.Digraphs && Char == '>') {
2083      Kind = tok::r_brace;                             // '%>' -> '}'
2084      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2085    } else if (Features.Digraphs && Char == ':') {
2086      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2087      Char = getCharAndSize(CurPtr, SizeTmp);
2088      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
2089        Kind = tok::hashhash;                          // '%:%:' -> '##'
2090        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2091                             SizeTmp2, Result);
2092      } else if (Char == '@' && Features.Microsoft) {  // %:@ -> #@ -> Charize
2093        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2094        if (!isLexingRawMode())
2095          Diag(BufferPtr, diag::charize_microsoft_ext);
2096        Kind = tok::hashat;
2097      } else {                                         // '%:' -> '#'
2098        // We parsed a # character.  If this occurs at the start of the line,
2099        // it's actually the start of a preprocessing directive.  Callback to
2100        // the preprocessor to handle it.
2101        // FIXME: -fpreprocessed mode??
2102        if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
2103          FormTokenWithChars(Result, CurPtr, tok::hash);
2104          PP->HandleDirective(Result);
2105
2106          // As an optimization, if the preprocessor didn't switch lexers, tail
2107          // recurse.
2108          if (PP->isCurrentLexer(this)) {
2109            // Start a new token. If this is a #include or something, the PP may
2110            // want us starting at the beginning of the line again.  If so, set
2111            // the StartOfLine flag and clear LeadingSpace.
2112            if (IsAtStartOfLine) {
2113              Result.setFlag(Token::StartOfLine);
2114              Result.clearFlag(Token::LeadingSpace);
2115              IsAtStartOfLine = false;
2116            }
2117            goto LexNextToken;   // GCC isn't tail call eliminating.
2118          }
2119
2120          return PP->Lex(Result);
2121        }
2122
2123        Kind = tok::hash;
2124      }
2125    } else {
2126      Kind = tok::percent;
2127    }
2128    break;
2129  case '<':
2130    Char = getCharAndSize(CurPtr, SizeTmp);
2131    if (ParsingFilename) {
2132      return LexAngledStringLiteral(Result, CurPtr);
2133    } else if (Char == '<') {
2134      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
2135      if (After == '=') {
2136        Kind = tok::lesslessequal;
2137        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2138                             SizeTmp2, Result);
2139      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
2140        // If this is actually a '<<<<<<<' version control conflict marker,
2141        // recognize it as such and recover nicely.
2142        goto LexNextToken;
2143      } else {
2144        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2145        Kind = tok::lessless;
2146      }
2147    } else if (Char == '=') {
2148      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2149      Kind = tok::lessequal;
2150    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
2151      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2152      Kind = tok::l_square;
2153    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
2154      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2155      Kind = tok::l_brace;
2156    } else {
2157      Kind = tok::less;
2158    }
2159    break;
2160  case '>':
2161    Char = getCharAndSize(CurPtr, SizeTmp);
2162    if (Char == '=') {
2163      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2164      Kind = tok::greaterequal;
2165    } else if (Char == '>') {
2166      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
2167      if (After == '=') {
2168        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2169                             SizeTmp2, Result);
2170        Kind = tok::greatergreaterequal;
2171      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
2172        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
2173        goto LexNextToken;
2174      } else {
2175        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2176        Kind = tok::greatergreater;
2177      }
2178
2179    } else {
2180      Kind = tok::greater;
2181    }
2182    break;
2183  case '^':
2184    Char = getCharAndSize(CurPtr, SizeTmp);
2185    if (Char == '=') {
2186      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2187      Kind = tok::caretequal;
2188    } else {
2189      Kind = tok::caret;
2190    }
2191    break;
2192  case '|':
2193    Char = getCharAndSize(CurPtr, SizeTmp);
2194    if (Char == '=') {
2195      Kind = tok::pipeequal;
2196      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2197    } else if (Char == '|') {
2198      // If this is '|||||||' and we're in a conflict marker, ignore it.
2199      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
2200        goto LexNextToken;
2201      Kind = tok::pipepipe;
2202      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2203    } else {
2204      Kind = tok::pipe;
2205    }
2206    break;
2207  case ':':
2208    Char = getCharAndSize(CurPtr, SizeTmp);
2209    if (Features.Digraphs && Char == '>') {
2210      Kind = tok::r_square; // ':>' -> ']'
2211      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2212    } else if (Features.CPlusPlus && Char == ':') {
2213      Kind = tok::coloncolon;
2214      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2215    } else {
2216      Kind = tok::colon;
2217    }
2218    break;
2219  case ';':
2220    Kind = tok::semi;
2221    break;
2222  case '=':
2223    Char = getCharAndSize(CurPtr, SizeTmp);
2224    if (Char == '=') {
2225      // If this is '=======' and we're in a conflict marker, ignore it.
2226      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
2227        goto LexNextToken;
2228
2229      Kind = tok::equalequal;
2230      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2231    } else {
2232      Kind = tok::equal;
2233    }
2234    break;
2235  case ',':
2236    Kind = tok::comma;
2237    break;
2238  case '#':
2239    Char = getCharAndSize(CurPtr, SizeTmp);
2240    if (Char == '#') {
2241      Kind = tok::hashhash;
2242      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2243    } else if (Char == '@' && Features.Microsoft) {  // #@ -> Charize
2244      Kind = tok::hashat;
2245      if (!isLexingRawMode())
2246        Diag(BufferPtr, diag::charize_microsoft_ext);
2247      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2248    } else {
2249      // We parsed a # character.  If this occurs at the start of the line,
2250      // it's actually the start of a preprocessing directive.  Callback to
2251      // the preprocessor to handle it.
2252      // FIXME: -fpreprocessed mode??
2253      if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
2254        FormTokenWithChars(Result, CurPtr, tok::hash);
2255        PP->HandleDirective(Result);
2256
2257        // As an optimization, if the preprocessor didn't switch lexers, tail
2258        // recurse.
2259        if (PP->isCurrentLexer(this)) {
2260          // Start a new token.  If this is a #include or something, the PP may
2261          // want us starting at the beginning of the line again.  If so, set
2262          // the StartOfLine flag and clear LeadingSpace.
2263          if (IsAtStartOfLine) {
2264            Result.setFlag(Token::StartOfLine);
2265            Result.clearFlag(Token::LeadingSpace);
2266            IsAtStartOfLine = false;
2267          }
2268          goto LexNextToken;   // GCC isn't tail call eliminating.
2269        }
2270        return PP->Lex(Result);
2271      }
2272
2273      Kind = tok::hash;
2274    }
2275    break;
2276
2277  case '@':
2278    // Objective C support.
2279    if (CurPtr[-1] == '@' && Features.ObjC1)
2280      Kind = tok::at;
2281    else
2282      Kind = tok::unknown;
2283    break;
2284
2285  case '\\':
2286    // FIXME: UCN's.
2287    // FALL THROUGH.
2288  default:
2289    Kind = tok::unknown;
2290    break;
2291  }
2292
2293  // Notify MIOpt that we read a non-whitespace/non-comment token.
2294  MIOpt.ReadToken();
2295
2296  // Update the location of token as well as BufferPtr.
2297  FormTokenWithChars(Result, CurPtr, Kind);
2298}
2299