Lexer.cpp revision 5cc2c6eb67b6e5361bbe96f79b519fd62ec666d6
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This file implements the Lexer and Token interfaces.
11//
12//===----------------------------------------------------------------------===//
13//
14// TODO: GCC Diagnostics emitted by the lexer:
15// PEDWARN: (form feed|vertical tab) in preprocessing directive
16//
17// Universal characters, unicode, char mapping:
18// WARNING: `%.*s' is not in NFKC
19// WARNING: `%.*s' is not in NFC
20//
21// Other:
22// TODO: Options to support:
23//    -fexec-charset,-fwide-exec-charset
24//
25//===----------------------------------------------------------------------===//
26
27#include "clang/Lex/Lexer.h"
28#include "clang/Lex/Preprocessor.h"
29#include "clang/Lex/LexDiagnostic.h"
30#include "clang/Lex/CodeCompletionHandler.h"
31#include "clang/Basic/SourceManager.h"
32#include "llvm/ADT/StringSwitch.h"
33#include "llvm/ADT/STLExtras.h"
34#include "llvm/Support/Compiler.h"
35#include "llvm/Support/MemoryBuffer.h"
36#include <cstring>
37using namespace clang;
38
39static void InitCharacterInfo();
40
41//===----------------------------------------------------------------------===//
42// Token Class Implementation
43//===----------------------------------------------------------------------===//
44
45/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
46bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
47  if (IdentifierInfo *II = getIdentifierInfo())
48    return II->getObjCKeywordID() == objcKey;
49  return false;
50}
51
52/// getObjCKeywordID - Return the ObjC keyword kind.
53tok::ObjCKeywordKind Token::getObjCKeywordID() const {
54  IdentifierInfo *specId = getIdentifierInfo();
55  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
56}
57
58
59//===----------------------------------------------------------------------===//
60// Lexer Class Implementation
61//===----------------------------------------------------------------------===//
62
63void Lexer::anchor() { }
64
65void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
66                      const char *BufEnd) {
67  InitCharacterInfo();
68
69  BufferStart = BufStart;
70  BufferPtr = BufPtr;
71  BufferEnd = BufEnd;
72
73  assert(BufEnd[0] == 0 &&
74         "We assume that the input buffer has a null character at the end"
75         " to simplify lexing!");
76
77  // Check whether we have a BOM in the beginning of the buffer. If yes - act
78  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
79  // skip the UTF-8 BOM if it's present.
80  if (BufferStart == BufferPtr) {
81    // Determine the size of the BOM.
82    StringRef Buf(BufferStart, BufferEnd - BufferStart);
83    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
84      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
85      .Default(0);
86
87    // Skip the BOM.
88    BufferPtr += BOMLength;
89  }
90
91  Is_PragmaLexer = false;
92  CurrentConflictMarkerState = CMK_None;
93
94  // Start of the file is a start of line.
95  IsAtStartOfLine = true;
96
97  // We are not after parsing a #.
98  ParsingPreprocessorDirective = false;
99
100  // We are not after parsing #include.
101  ParsingFilename = false;
102
103  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
104  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
105  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
106  // or otherwise skipping over tokens.
107  LexingRawMode = false;
108
109  // Default to not keeping comments.
110  ExtendedTokenMode = 0;
111}
112
113/// Lexer constructor - Create a new lexer object for the specified buffer
114/// with the specified preprocessor managing the lexing process.  This lexer
115/// assumes that the associated file buffer and Preprocessor objects will
116/// outlive it, so it doesn't take ownership of either of them.
117Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
118  : PreprocessorLexer(&PP, FID),
119    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
120    Features(PP.getLangOptions()) {
121
122  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
123            InputFile->getBufferEnd());
124
125  // Default to keeping comments if the preprocessor wants them.
126  SetCommentRetentionState(PP.getCommentRetentionState());
127}
128
129/// Lexer constructor - Create a new raw lexer object.  This object is only
130/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
131/// range will outlive it, so it doesn't take ownership of it.
132Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
133             const char *BufStart, const char *BufPtr, const char *BufEnd)
134  : FileLoc(fileloc), Features(features) {
135
136  InitLexer(BufStart, BufPtr, BufEnd);
137
138  // We *are* in raw mode.
139  LexingRawMode = true;
140}
141
142/// Lexer constructor - Create a new raw lexer object.  This object is only
143/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
144/// range will outlive it, so it doesn't take ownership of it.
145Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
146             const SourceManager &SM, const LangOptions &features)
147  : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
148
149  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
150            FromFile->getBufferEnd());
151
152  // We *are* in raw mode.
153  LexingRawMode = true;
154}
155
156/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
157/// _Pragma expansion.  This has a variety of magic semantics that this method
158/// sets up.  It returns a new'd Lexer that must be delete'd when done.
159///
160/// On entrance to this routine, TokStartLoc is a macro location which has a
161/// spelling loc that indicates the bytes to be lexed for the token and an
162/// expansion location that indicates where all lexed tokens should be
163/// "expanded from".
164///
165/// FIXME: It would really be nice to make _Pragma just be a wrapper around a
166/// normal lexer that remaps tokens as they fly by.  This would require making
167/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
168/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
169/// out of the critical path of the lexer!
170///
171Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
172                                 SourceLocation ExpansionLocStart,
173                                 SourceLocation ExpansionLocEnd,
174                                 unsigned TokLen, Preprocessor &PP) {
175  SourceManager &SM = PP.getSourceManager();
176
177  // Create the lexer as if we were going to lex the file normally.
178  FileID SpellingFID = SM.getFileID(SpellingLoc);
179  const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID);
180  Lexer *L = new Lexer(SpellingFID, InputFile, PP);
181
182  // Now that the lexer is created, change the start/end locations so that we
183  // just lex the subsection of the file that we want.  This is lexing from a
184  // scratch buffer.
185  const char *StrData = SM.getCharacterData(SpellingLoc);
186
187  L->BufferPtr = StrData;
188  L->BufferEnd = StrData+TokLen;
189  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
190
191  // Set the SourceLocation with the remapping information.  This ensures that
192  // GetMappedTokenLoc will remap the tokens as they are lexed.
193  L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
194                                     ExpansionLocStart,
195                                     ExpansionLocEnd, TokLen);
196
197  // Ensure that the lexer thinks it is inside a directive, so that end \n will
198  // return an EOD token.
199  L->ParsingPreprocessorDirective = true;
200
201  // This lexer really is for _Pragma.
202  L->Is_PragmaLexer = true;
203  return L;
204}
205
206
207/// Stringify - Convert the specified string into a C string, with surrounding
208/// ""'s, and with escaped \ and " characters.
209std::string Lexer::Stringify(const std::string &Str, bool Charify) {
210  std::string Result = Str;
211  char Quote = Charify ? '\'' : '"';
212  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
213    if (Result[i] == '\\' || Result[i] == Quote) {
214      Result.insert(Result.begin()+i, '\\');
215      ++i; ++e;
216    }
217  }
218  return Result;
219}
220
221/// Stringify - Convert the specified string into a C string by escaping '\'
222/// and " characters.  This does not add surrounding ""'s to the string.
223void Lexer::Stringify(SmallVectorImpl<char> &Str) {
224  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
225    if (Str[i] == '\\' || Str[i] == '"') {
226      Str.insert(Str.begin()+i, '\\');
227      ++i; ++e;
228    }
229  }
230}
231
232//===----------------------------------------------------------------------===//
233// Token Spelling
234//===----------------------------------------------------------------------===//
235
236/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
237/// token are the characters used to represent the token in the source file
238/// after trigraph expansion and escaped-newline folding.  In particular, this
239/// wants to get the true, uncanonicalized, spelling of things like digraphs
240/// UCNs, etc.
241StringRef Lexer::getSpelling(SourceLocation loc,
242                                   SmallVectorImpl<char> &buffer,
243                                   const SourceManager &SM,
244                                   const LangOptions &options,
245                                   bool *invalid) {
246  // Break down the source location.
247  std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
248
249  // Try to the load the file buffer.
250  bool invalidTemp = false;
251  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
252  if (invalidTemp) {
253    if (invalid) *invalid = true;
254    return StringRef();
255  }
256
257  const char *tokenBegin = file.data() + locInfo.second;
258
259  // Lex from the start of the given location.
260  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
261              file.begin(), tokenBegin, file.end());
262  Token token;
263  lexer.LexFromRawLexer(token);
264
265  unsigned length = token.getLength();
266
267  // Common case:  no need for cleaning.
268  if (!token.needsCleaning())
269    return StringRef(tokenBegin, length);
270
271  // Hard case, we need to relex the characters into the string.
272  buffer.clear();
273  buffer.reserve(length);
274
275  for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) {
276    unsigned charSize;
277    buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options));
278    ti += charSize;
279  }
280
281  return StringRef(buffer.data(), buffer.size());
282}
283
284/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
285/// token are the characters used to represent the token in the source file
286/// after trigraph expansion and escaped-newline folding.  In particular, this
287/// wants to get the true, uncanonicalized, spelling of things like digraphs
288/// UCNs, etc.
289std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
290                               const LangOptions &Features, bool *Invalid) {
291  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
292
293  // If this token contains nothing interesting, return it directly.
294  bool CharDataInvalid = false;
295  const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
296                                                    &CharDataInvalid);
297  if (Invalid)
298    *Invalid = CharDataInvalid;
299  if (CharDataInvalid)
300    return std::string();
301
302  if (!Tok.needsCleaning())
303    return std::string(TokStart, TokStart+Tok.getLength());
304
305  std::string Result;
306  Result.reserve(Tok.getLength());
307
308  // Otherwise, hard case, relex the characters into the string.
309  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
310       Ptr != End; ) {
311    unsigned CharSize;
312    Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features));
313    Ptr += CharSize;
314  }
315  assert(Result.size() != unsigned(Tok.getLength()) &&
316         "NeedsCleaning flag set on something that didn't need cleaning!");
317  return Result;
318}
319
320/// getSpelling - This method is used to get the spelling of a token into a
321/// preallocated buffer, instead of as an std::string.  The caller is required
322/// to allocate enough space for the token, which is guaranteed to be at least
323/// Tok.getLength() bytes long.  The actual length of the token is returned.
324///
325/// Note that this method may do two possible things: it may either fill in
326/// the buffer specified with characters, or it may *change the input pointer*
327/// to point to a constant buffer with the data already in it (avoiding a
328/// copy).  The caller is not allowed to modify the returned buffer pointer
329/// if an internal buffer is returned.
330unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
331                            const SourceManager &SourceMgr,
332                            const LangOptions &Features, bool *Invalid) {
333  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
334
335  const char *TokStart = 0;
336  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
337  if (Tok.is(tok::raw_identifier))
338    TokStart = Tok.getRawIdentifierData();
339  else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
340    // Just return the string from the identifier table, which is very quick.
341    Buffer = II->getNameStart();
342    return II->getLength();
343  }
344
345  // NOTE: this can be checked even after testing for an IdentifierInfo.
346  if (Tok.isLiteral())
347    TokStart = Tok.getLiteralData();
348
349  if (TokStart == 0) {
350    // Compute the start of the token in the input lexer buffer.
351    bool CharDataInvalid = false;
352    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
353    if (Invalid)
354      *Invalid = CharDataInvalid;
355    if (CharDataInvalid) {
356      Buffer = "";
357      return 0;
358    }
359  }
360
361  // If this token contains nothing interesting, return it directly.
362  if (!Tok.needsCleaning()) {
363    Buffer = TokStart;
364    return Tok.getLength();
365  }
366
367  // Otherwise, hard case, relex the characters into the string.
368  char *OutBuf = const_cast<char*>(Buffer);
369  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
370       Ptr != End; ) {
371    unsigned CharSize;
372    *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features);
373    Ptr += CharSize;
374  }
375  assert(unsigned(OutBuf-Buffer) != Tok.getLength() &&
376         "NeedsCleaning flag set on something that didn't need cleaning!");
377
378  return OutBuf-Buffer;
379}
380
381
382
383static bool isWhitespace(unsigned char c);
384
385/// MeasureTokenLength - Relex the token at the specified location and return
386/// its length in bytes in the input file.  If the token needs cleaning (e.g.
387/// includes a trigraph or an escaped newline) then this count includes bytes
388/// that are part of that.
389unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
390                                   const SourceManager &SM,
391                                   const LangOptions &LangOpts) {
392  // TODO: this could be special cased for common tokens like identifiers, ')',
393  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
394  // all obviously single-char tokens.  This could use
395  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
396  // something.
397
398  // If this comes from a macro expansion, we really do want the macro name, not
399  // the token this macro expanded to.
400  Loc = SM.getExpansionLoc(Loc);
401  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
402  bool Invalid = false;
403  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
404  if (Invalid)
405    return 0;
406
407  const char *StrData = Buffer.data()+LocInfo.second;
408
409  if (isWhitespace(StrData[0]))
410    return 0;
411
412  // Create a lexer starting at the beginning of this token.
413  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
414                 Buffer.begin(), StrData, Buffer.end());
415  TheLexer.SetCommentRetentionState(true);
416  Token TheTok;
417  TheLexer.LexFromRawLexer(TheTok);
418  return TheTok.getLength();
419}
420
421static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
422                                              const SourceManager &SM,
423                                              const LangOptions &LangOpts) {
424  assert(Loc.isFileID());
425  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
426  if (LocInfo.first.isInvalid())
427    return Loc;
428
429  bool Invalid = false;
430  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
431  if (Invalid)
432    return Loc;
433
434  // Back up from the current location until we hit the beginning of a line
435  // (or the buffer). We'll relex from that point.
436  const char *BufStart = Buffer.data();
437  if (LocInfo.second >= Buffer.size())
438    return Loc;
439
440  const char *StrData = BufStart+LocInfo.second;
441  if (StrData[0] == '\n' || StrData[0] == '\r')
442    return Loc;
443
444  const char *LexStart = StrData;
445  while (LexStart != BufStart) {
446    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
447      ++LexStart;
448      break;
449    }
450
451    --LexStart;
452  }
453
454  // Create a lexer starting at the beginning of this token.
455  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
456  Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
457  TheLexer.SetCommentRetentionState(true);
458
459  // Lex tokens until we find the token that contains the source location.
460  Token TheTok;
461  do {
462    TheLexer.LexFromRawLexer(TheTok);
463
464    if (TheLexer.getBufferLocation() > StrData) {
465      // Lexing this token has taken the lexer past the source location we're
466      // looking for. If the current token encompasses our source location,
467      // return the beginning of that token.
468      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
469        return TheTok.getLocation();
470
471      // We ended up skipping over the source location entirely, which means
472      // that it points into whitespace. We're done here.
473      break;
474    }
475  } while (TheTok.getKind() != tok::eof);
476
477  // We've passed our source location; just return the original source location.
478  return Loc;
479}
480
481SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
482                                          const SourceManager &SM,
483                                          const LangOptions &LangOpts) {
484 if (Loc.isFileID())
485   return getBeginningOfFileToken(Loc, SM, LangOpts);
486
487 if (!SM.isMacroArgExpansion(Loc))
488   return Loc;
489
490 SourceLocation FileLoc = SM.getSpellingLoc(Loc);
491 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
492 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
493 std::pair<FileID, unsigned> BeginFileLocInfo
494   = SM.getDecomposedLoc(BeginFileLoc);
495 assert(FileLocInfo.first == BeginFileLocInfo.first &&
496        FileLocInfo.second >= BeginFileLocInfo.second);
497 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
498}
499
namespace {
  /// PreambleDirectiveKind - How Lexer::ComputePreamble classifies the
  /// keyword that follows a '#' at the start of a line.
  enum PreambleDirectiveKind {
    /// A recognized directive that is simply skipped over.
    PDK_Skipped,
    /// A directive that opens a conditional block (if / ifdef / ifndef).
    PDK_StartIf,
    /// The directive that closes a conditional block (endif).
    PDK_EndIf,
    /// Any keyword that is not recognized; the preamble ends at its '#'.
    PDK_Unknown
  };
}
508
509std::pair<unsigned, bool>
510Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer,
511                       const LangOptions &Features, unsigned MaxLines) {
512  // Create a lexer starting at the beginning of the file. Note that we use a
513  // "fake" file source location at offset 1 so that the lexer will track our
514  // position within the file.
515  const unsigned StartOffset = 1;
516  SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset);
517  Lexer TheLexer(StartLoc, Features, Buffer->getBufferStart(),
518                 Buffer->getBufferStart(), Buffer->getBufferEnd());
519
520  bool InPreprocessorDirective = false;
521  Token TheTok;
522  Token IfStartTok;
523  unsigned IfCount = 0;
524
525  unsigned MaxLineOffset = 0;
526  if (MaxLines) {
527    const char *CurPtr = Buffer->getBufferStart();
528    unsigned CurLine = 0;
529    while (CurPtr != Buffer->getBufferEnd()) {
530      char ch = *CurPtr++;
531      if (ch == '\n') {
532        ++CurLine;
533        if (CurLine == MaxLines)
534          break;
535      }
536    }
537    if (CurPtr != Buffer->getBufferEnd())
538      MaxLineOffset = CurPtr - Buffer->getBufferStart();
539  }
540
541  do {
542    TheLexer.LexFromRawLexer(TheTok);
543
544    if (InPreprocessorDirective) {
545      // If we've hit the end of the file, we're done.
546      if (TheTok.getKind() == tok::eof) {
547        InPreprocessorDirective = false;
548        break;
549      }
550
551      // If we haven't hit the end of the preprocessor directive, skip this
552      // token.
553      if (!TheTok.isAtStartOfLine())
554        continue;
555
556      // We've passed the end of the preprocessor directive, and will look
557      // at this token again below.
558      InPreprocessorDirective = false;
559    }
560
561    // Keep track of the # of lines in the preamble.
562    if (TheTok.isAtStartOfLine()) {
563      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
564
565      // If we were asked to limit the number of lines in the preamble,
566      // and we're about to exceed that limit, we're done.
567      if (MaxLineOffset && TokOffset >= MaxLineOffset)
568        break;
569    }
570
571    // Comments are okay; skip over them.
572    if (TheTok.getKind() == tok::comment)
573      continue;
574
575    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
576      // This is the start of a preprocessor directive.
577      Token HashTok = TheTok;
578      InPreprocessorDirective = true;
579
580      // Figure out which directive this is. Since we're lexing raw tokens,
581      // we don't have an identifier table available. Instead, just look at
582      // the raw identifier to recognize and categorize preprocessor directives.
583      TheLexer.LexFromRawLexer(TheTok);
584      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
585        StringRef Keyword(TheTok.getRawIdentifierData(),
586                                TheTok.getLength());
587        PreambleDirectiveKind PDK
588          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
589              .Case("include", PDK_Skipped)
590              .Case("__include_macros", PDK_Skipped)
591              .Case("define", PDK_Skipped)
592              .Case("undef", PDK_Skipped)
593              .Case("line", PDK_Skipped)
594              .Case("error", PDK_Skipped)
595              .Case("pragma", PDK_Skipped)
596              .Case("import", PDK_Skipped)
597              .Case("include_next", PDK_Skipped)
598              .Case("warning", PDK_Skipped)
599              .Case("ident", PDK_Skipped)
600              .Case("sccs", PDK_Skipped)
601              .Case("assert", PDK_Skipped)
602              .Case("unassert", PDK_Skipped)
603              .Case("if", PDK_StartIf)
604              .Case("ifdef", PDK_StartIf)
605              .Case("ifndef", PDK_StartIf)
606              .Case("elif", PDK_Skipped)
607              .Case("else", PDK_Skipped)
608              .Case("endif", PDK_EndIf)
609              .Default(PDK_Unknown);
610
611        switch (PDK) {
612        case PDK_Skipped:
613          continue;
614
615        case PDK_StartIf:
616          if (IfCount == 0)
617            IfStartTok = HashTok;
618
619          ++IfCount;
620          continue;
621
622        case PDK_EndIf:
623          // Mismatched #endif. The preamble ends here.
624          if (IfCount == 0)
625            break;
626
627          --IfCount;
628          continue;
629
630        case PDK_Unknown:
631          // We don't know what this directive is; stop at the '#'.
632          break;
633        }
634      }
635
636      // We only end up here if we didn't recognize the preprocessor
637      // directive or it was one that can't occur in the preamble at this
638      // point. Roll back the current token to the location of the '#'.
639      InPreprocessorDirective = false;
640      TheTok = HashTok;
641    }
642
643    // We hit a token that we don't recognize as being in the
644    // "preprocessing only" part of the file, so we're no longer in
645    // the preamble.
646    break;
647  } while (true);
648
649  SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation();
650  return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
651                        IfCount? IfStartTok.isAtStartOfLine()
652                               : TheTok.isAtStartOfLine());
653}
654
655
656/// AdvanceToTokenCharacter - Given a location that specifies the start of a
657/// token, return a new location that specifies a character within the token.
658SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
659                                              unsigned CharNo,
660                                              const SourceManager &SM,
661                                              const LangOptions &Features) {
662  // Figure out how many physical characters away the specified expansion
663  // character is.  This needs to take into consideration newlines and
664  // trigraphs.
665  bool Invalid = false;
666  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
667
668  // If they request the first char of the token, we're trivially done.
669  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
670    return TokStart;
671
672  unsigned PhysOffset = 0;
673
674  // The usual case is that tokens don't contain anything interesting.  Skip
675  // over the uninteresting characters.  If a token only consists of simple
676  // chars, this method is extremely fast.
677  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
678    if (CharNo == 0)
679      return TokStart.getLocWithOffset(PhysOffset);
680    ++TokPtr, --CharNo, ++PhysOffset;
681  }
682
683  // If we have a character that may be a trigraph or escaped newline, use a
684  // lexer to parse it correctly.
685  for (; CharNo; --CharNo) {
686    unsigned Size;
687    Lexer::getCharAndSizeNoWarn(TokPtr, Size, Features);
688    TokPtr += Size;
689    PhysOffset += Size;
690  }
691
692  // Final detail: if we end up on an escaped newline, we want to return the
693  // location of the actual byte of the token.  For example foo\<newline>bar
694  // advanced by 3 should return the location of b, not of \\.  One compounding
695  // detail of this is that the escape may be made by a trigraph.
696  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
697    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
698
699  return TokStart.getLocWithOffset(PhysOffset);
700}
701
702/// \brief Computes the source location just past the end of the
703/// token at this source location.
704///
705/// This routine can be used to produce a source location that
706/// points just past the end of the token referenced by \p Loc, and
707/// is generally used when a diagnostic needs to point just after a
708/// token where it expected something different that it received. If
709/// the returned source location would not be meaningful (e.g., if
710/// it points into a macro), this routine returns an invalid
711/// source location.
712///
713/// \param Offset an offset from the end of the token, where the source
714/// location should refer to. The default offset (0) produces a source
715/// location pointing just past the end of the token; an offset of 1 produces
716/// a source location pointing to the last character in the token, etc.
717SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
718                                          const SourceManager &SM,
719                                          const LangOptions &Features) {
720  if (Loc.isInvalid())
721    return SourceLocation();
722
723  if (Loc.isMacroID()) {
724    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, Features, &Loc))
725      return SourceLocation(); // Points inside the macro expansion.
726  }
727
728  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, Features);
729  if (Len > Offset)
730    Len = Len - Offset;
731  else
732    return Loc;
733
734  return Loc.getLocWithOffset(Len);
735}
736
737/// \brief Returns true if the given MacroID location points at the first
738/// token of the macro expansion.
739bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
740                                      const SourceManager &SM,
741                                      const LangOptions &LangOpts,
742                                      SourceLocation *MacroBegin) {
743  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
744
745  std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc);
746  // FIXME: If the token comes from the macro token paste operator ('##')
747  // this function will always return false;
748  if (infoLoc.second > 0)
749    return false; // Does not point at the start of token.
750
751  SourceLocation expansionLoc =
752    SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart();
753  if (expansionLoc.isFileID()) {
754    // No other macro expansions, this is the first.
755    if (MacroBegin)
756      *MacroBegin = expansionLoc;
757    return true;
758  }
759
760  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
761}
762
763/// \brief Returns true if the given MacroID location points at the last
764/// token of the macro expansion.
765bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
766                                    const SourceManager &SM,
767                                    const LangOptions &LangOpts,
768                                    SourceLocation *MacroEnd) {
769  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
770
771  SourceLocation spellLoc = SM.getSpellingLoc(loc);
772  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
773  if (tokLen == 0)
774    return false;
775
776  FileID FID = SM.getFileID(loc);
777  SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1);
778  if (SM.isInFileID(afterLoc, FID))
779    return false; // Still in the same FileID, does not point to the last token.
780
781  // FIXME: If the token comes from the macro token paste operator ('##')
782  // or the stringify operator ('#') this function will always return false;
783
784  SourceLocation expansionLoc =
785    SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd();
786  if (expansionLoc.isFileID()) {
787    // No other macro expansions.
788    if (MacroEnd)
789      *MacroEnd = expansionLoc;
790    return true;
791  }
792
793  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
794}
795
796static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
797                                             const SourceManager &SM,
798                                             const LangOptions &LangOpts) {
799  SourceLocation Begin = Range.getBegin();
800  SourceLocation End = Range.getEnd();
801  assert(Begin.isFileID() && End.isFileID());
802  if (Range.isTokenRange()) {
803    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
804    if (End.isInvalid())
805      return CharSourceRange();
806  }
807
808  // Break down the source locations.
809  FileID FID;
810  unsigned BeginOffs;
811  llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
812  if (FID.isInvalid())
813    return CharSourceRange();
814
815  unsigned EndOffs;
816  if (!SM.isInFileID(End, FID, &EndOffs) ||
817      BeginOffs > EndOffs)
818    return CharSourceRange();
819
820  return CharSourceRange::getCharRange(Begin, End);
821}
822
/// \brief Accepts a range and returns a character range with file locations.
///
/// Endpoints that are macro locations are accepted only in these situations:
///   - the begin is at the start of a macro expansion, and/or the end is at
///     the end of one (token range) or at the start of one (char range); the
///     endpoint is then replaced by the file location of that expansion;
///   - both endpoints lie in the same macro-argument expansion whose spelling
///     location is a file location; the range is then mapped onto the
///     spelled argument text.
///
/// Returns a null range in every other case: an invalid endpoint, a part of
/// the range residing inside a macro expansion, or a range that does not
/// reside on the same FileID.
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return CharSourceRange();

  // Case 1: nothing comes from a macro.
  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // Case 2: only the begin is a macro location.  On success the call below
  // overwrites Begin with the file location where the expansion starts.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return CharSourceRange();
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Case 3: only the end is a macro location.  A token range names its last
  // token, so that token must end the expansion; a char range's end is the
  // position after the last character, so it must start the expansion.
  // Either way End is overwritten with the matching file location.
  if (Begin.isFileID() && End.isMacroID()) {
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                          &End)) ||
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                           &End)))
      return CharSourceRange();
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Case 4: both endpoints are macro locations.  First try to widen the range
  // to the file locations of the surrounding expansion(s).
  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Otherwise both endpoints must at least share one FileID, in order.
  FileID FID;
  unsigned BeginOffs;
  llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return CharSourceRange();

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return CharSourceRange();

  // If that shared FileID is a macro-argument expansion spelled directly in a
  // file, apply the same offsets to the spelling location to obtain file
  // locations for both endpoints.
  const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
  const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
  if (Expansion.isMacroArgExpansion() &&
      Expansion.getSpellingLoc().isFileID()) {
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    Range.setBegin(SpellLoc.getLocWithOffset(BeginOffs));
    Range.setEnd(SpellLoc.getLocWithOffset(EndOffs));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  return CharSourceRange();
}
890
891StringRef Lexer::getSourceText(CharSourceRange Range,
892                               const SourceManager &SM,
893                               const LangOptions &LangOpts,
894                               bool *Invalid) {
895  Range = makeFileCharRange(Range, SM, LangOpts);
896  if (Range.isInvalid()) {
897    if (Invalid) *Invalid = true;
898    return StringRef();
899  }
900
901  // Break down the source location.
902  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
903  if (beginInfo.first.isInvalid()) {
904    if (Invalid) *Invalid = true;
905    return StringRef();
906  }
907
908  unsigned EndOffs;
909  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
910      beginInfo.second > EndOffs) {
911    if (Invalid) *Invalid = true;
912    return StringRef();
913  }
914
915  // Try to the load the file buffer.
916  bool invalidTemp = false;
917  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
918  if (invalidTemp) {
919    if (Invalid) *Invalid = true;
920    return StringRef();
921  }
922
923  if (Invalid) *Invalid = false;
924  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
925}
926
/// \brief Returns the spelling of the macro name whose expansion immediately
/// produced the token at \p Loc.
///
/// \p Loc must be a macro location (asserted).  The function walks from
/// \p Loc to the start of the expansion that produced it, looking through
/// macro-argument expansions as described in the loop below, then measures
/// the token at the spelling location of that start and returns its text
/// from the buffer that holds it.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonble to call this on macros");

  // Find the location of the immediate macro expansion.
  while (1) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    // An ordinary (non-argument) expansion: Loc now points at where the
    // macro was expanded, which is what we are looking for.
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).first;
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    // Restart the search from where the argument was spelled.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}
973
974//===----------------------------------------------------------------------===//
975// Character information.
976//===----------------------------------------------------------------------===//
977
// Character-class bits.  Each constant is a distinct bit so that several
// classes can be or'ed together into one mask and tested at once.
enum {
  CHAR_HORZ_WS  = 1 << 0,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS  = 1 << 1,  // '\r', '\n'
  CHAR_LETTER   = 1 << 2,  // a-z,A-Z
  CHAR_NUMBER   = 1 << 3,  // 0-9
  CHAR_UNDER    = 1 << 4,  // _
  CHAR_PERIOD   = 1 << 5,  // .
  CHAR_RAWDEL   = 1 << 6   // {}[]#<>%:;?*+-/^&|~!=,"'
};
987
// Statically initialize CharInfo table based on ASCII character set
// Reference: FreeBSD 7.2 /usr/share/misc/ascii
//
// CharInfo[c] holds the CHAR_* class bit for the character with code c, or 0
// if the character belongs to none of the classes.  Only codes 0-127 are
// listed; the array has 256 elements, so codes 128-255 carry no initializer
// and are zero.  The comment rows above each pair of data rows name the
// characters the entries stand for.  Printable characters that are left at 0
// are '$', '(', ')', '@', '\' and '`'.
static const unsigned char CharInfo[256] =
{
// 0 NUL         1 SOH         2 STX         3 ETX
// 4 EOT         5 ENQ         6 ACK         7 BEL
   0           , 0           , 0           , 0           ,
   0           , 0           , 0           , 0           ,
// 8 BS          9 HT         10 NL         11 VT
//12 NP         13 CR         14 SO         15 SI
   0           , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
   CHAR_HORZ_WS, CHAR_VERT_WS, 0           , 0           ,
//16 DLE        17 DC1        18 DC2        19 DC3
//20 DC4        21 NAK        22 SYN        23 ETB
   0           , 0           , 0           , 0           ,
   0           , 0           , 0           , 0           ,
//24 CAN        25 EM         26 SUB        27 ESC
//28 FS         29 GS         30 RS         31 US
   0           , 0           , 0           , 0           ,
   0           , 0           , 0           , 0           ,
//32 SP         33  !         34  "         35  #
//36  $         37  %         38  &         39  '
   CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//40  (         41  )         42  *         43  +
//44  ,         45  -         46  .         47  /
   0           , 0           , CHAR_RAWDEL , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
//48  0         49  1         50  2         51  3
//52  4         53  5         54  6         55  7
   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
//56  8         57  9         58  :         59  ;
//60  <         61  =         62  >         63  ?
   CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//64  @         65  A         66  B         67  C
//68  D         69  E         70  F         71  G
   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//72  H         73  I         74  J         75  K
//76  L         77  M         78  N         79  O
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//80  P         81  Q         82  R         83  S
//84  T         85  U         86  V         87  W
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//88  X         89  Y         90  Z         91  [
//92  \         93  ]         94  ^         95  _
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
//96  `         97  a         98  b         99  c
//100  d       101  e        102  f        103  g
   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//104  h       105  i        106  j        107  k
//108  l       109  m        110  n        111  o
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//112  p       113  q        114  r        115  s
//116  t       117  u        118  v        119  w
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//120  x       121  y        122  z        123  {
//124  |       125  }        126  ~        127 DEL
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
};
1057
1058static void InitCharacterInfo() {
1059  static bool isInited = false;
1060  if (isInited) return;
1061  // check the statically-initialized CharInfo table
1062  assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
1063  assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
1064  assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
1065  assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
1066  assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
1067  assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
1068  assert(CHAR_UNDER   == CharInfo[(int)'_']);
1069  assert(CHAR_PERIOD  == CharInfo[(int)'.']);
1070  for (unsigned i = 'a'; i <= 'z'; ++i) {
1071    assert(CHAR_LETTER == CharInfo[i]);
1072    assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
1073  }
1074  for (unsigned i = '0'; i <= '9'; ++i)
1075    assert(CHAR_NUMBER == CharInfo[i]);
1076
1077  isInited = true;
1078}
1079
1080
1081/// isIdentifierHead - Return true if this is the first character of an
1082/// identifier, which is [a-zA-Z_].
1083static inline bool isIdentifierHead(unsigned char c) {
1084  return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false;
1085}
1086
1087/// isIdentifierBody - Return true if this is the body character of an
1088/// identifier, which is [a-zA-Z0-9_].
1089static inline bool isIdentifierBody(unsigned char c) {
1090  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
1091}
1092
1093/// isHorizontalWhitespace - Return true if this character is horizontal
1094/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
1095static inline bool isHorizontalWhitespace(unsigned char c) {
1096  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
1097}
1098
1099/// isVerticalWhitespace - Return true if this character is vertical
1100/// whitespace: '\n', '\r'.  Note that this returns false for '\0'.
1101static inline bool isVerticalWhitespace(unsigned char c) {
1102  return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
1103}
1104
1105/// isWhitespace - Return true if this character is horizontal or vertical
1106/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns false
1107/// for '\0'.
1108static inline bool isWhitespace(unsigned char c) {
1109  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
1110}
1111
1112/// isNumberBody - Return true if this is the body character of an
1113/// preprocessing number, which is [a-zA-Z0-9_.].
1114static inline bool isNumberBody(unsigned char c) {
1115  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
1116    true : false;
1117}
1118
1119/// isRawStringDelimBody - Return true if this is the body character of a
1120/// raw string delimiter.
1121static inline bool isRawStringDelimBody(unsigned char c) {
1122  return (CharInfo[c] &
1123          (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
1124    true : false;
1125}
1126
1127
1128//===----------------------------------------------------------------------===//
1129// Diagnostics forwarding code.
1130//===----------------------------------------------------------------------===//
1131
1132/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1133/// lexer buffer was all expanded at a single point, perform the mapping.
1134/// This is currently only used for _Pragma implementation, so it is the slow
1135/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
1136static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1137    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
1138static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1139                                        SourceLocation FileLoc,
1140                                        unsigned CharNo, unsigned TokLen) {
1141  assert(FileLoc.isMacroID() && "Must be a macro expansion");
1142
1143  // Otherwise, we're lexing "mapped tokens".  This is used for things like
1144  // _Pragma handling.  Combine the expansion location of FileLoc with the
1145  // spelling location.
1146  SourceManager &SM = PP.getSourceManager();
1147
1148  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1149  // characters come from spelling(FileLoc)+Offset.
1150  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1151  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1152
1153  // Figure out the expansion loc range, which is the range covered by the
1154  // original _Pragma(...) sequence.
1155  std::pair<SourceLocation,SourceLocation> II =
1156    SM.getImmediateExpansionRange(FileLoc);
1157
1158  return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
1159}
1160
1161/// getSourceLocation - Return a source location identifier for the specified
1162/// offset in the current file.
1163SourceLocation Lexer::getSourceLocation(const char *Loc,
1164                                        unsigned TokLen) const {
1165  assert(Loc >= BufferStart && Loc <= BufferEnd &&
1166         "Location out of range for this buffer!");
1167
1168  // In the normal case, we're just lexing from a simple file buffer, return
1169  // the file id from FileLoc with the offset specified.
1170  unsigned CharNo = Loc-BufferStart;
1171  if (FileLoc.isFileID())
1172    return FileLoc.getLocWithOffset(CharNo);
1173
1174  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1175  // tokens are lexed from where the _Pragma was defined.
1176  assert(PP && "This doesn't work on raw lexers");
1177  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1178}
1179
1180/// Diag - Forwarding function for diagnostics.  This translate a source
1181/// position in the current buffer into a SourceLocation object for rendering.
1182DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1183  return PP->Diag(getSourceLocation(Loc), DiagID);
1184}
1185
1186//===----------------------------------------------------------------------===//
1187// Trigraph and Escaped Newline Handling Code.
1188//===----------------------------------------------------------------------===//
1189
/// GetTrigraphCharForLetter - Map the third character of a trigraph (the one
/// that follows "??") to the character the trigraph stands for.  Returns
/// '\0' if the character does not complete a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Decoded[i] is the meaning of "??" followed by
  // Suffixes[i].
  static const char Suffixes[] = "=)(!'>/<-";
  static const char Decoded[]  = "#][|^}\\{~";

  // The terminating NUL stops the scan, so a '\0' Letter never matches.
  for (unsigned Idx = 0; Suffixes[Idx] != '\0'; ++Idx)
    if (Suffixes[Idx] == Letter)
      return Decoded[Idx];

  return 0;
}
1206
1207/// DecodeTrigraphChar - If the specified character is a legal trigraph when
1208/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
1209/// return the result character.  Finally, emit a warning about trigraph use
1210/// whether trigraphs are enabled or not.
1211static char DecodeTrigraphChar(const char *CP, Lexer *L) {
1212  char Res = GetTrigraphCharForLetter(*CP);
1213  if (!Res || !L) return Res;
1214
1215  if (!L->getFeatures().Trigraphs) {
1216    if (!L->isLexingRawMode())
1217      L->Diag(CP-2, diag::trigraph_ignored);
1218    return 0;
1219  }
1220
1221  if (!L->isLexingRawMode())
1222    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1223  return Res;
1224}
1225
1226/// getEscapedNewLineSize - Return the size of the specified escaped newline,
1227/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1228/// trigraph equivalent on entry to this function.
1229unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1230  unsigned Size = 0;
1231  while (isWhitespace(Ptr[Size])) {
1232    ++Size;
1233
1234    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1235      continue;
1236
1237    // If this is a \r\n or \n\r, skip the other half.
1238    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1239        Ptr[Size-1] != Ptr[Size])
1240      ++Size;
1241
1242    return Size;
1243  }
1244
1245  // Not an escaped newline, must be a \t or something else.
1246  return 0;
1247}
1248
1249/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1250/// them), skip over them and return the first non-escaped-newline found,
1251/// otherwise return P.
1252const char *Lexer::SkipEscapedNewLines(const char *P) {
1253  while (1) {
1254    const char *AfterEscape;
1255    if (*P == '\\') {
1256      AfterEscape = P+1;
1257    } else if (*P == '?') {
1258      // If not a trigraph for escape, bail out.
1259      if (P[1] != '?' || P[2] != '/')
1260        return P;
1261      AfterEscape = P+3;
1262    } else {
1263      return P;
1264    }
1265
1266    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1267    if (NewLineSize == 0) return P;
1268    P = AfterEscape+NewLineSize;
1269  }
1270}
1271
1272/// \brief Checks that the given token is the first token that occurs after the
1273/// given location (this excludes comments and whitespace). Returns the location
1274/// immediately after the specified token. If the token is not found or the
1275/// location is inside a macro, the returned source location will be invalid.
1276SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
1277                                        tok::TokenKind TKind,
1278                                        const SourceManager &SM,
1279                                        const LangOptions &LangOpts,
1280                                        bool SkipTrailingWhitespaceAndNewLine) {
1281  if (Loc.isMacroID()) {
1282    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1283      return SourceLocation();
1284  }
1285  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1286
1287  // Break down the source location.
1288  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1289
1290  // Try to load the file buffer.
1291  bool InvalidTemp = false;
1292  llvm::StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1293  if (InvalidTemp)
1294    return SourceLocation();
1295
1296  const char *TokenBegin = File.data() + LocInfo.second;
1297
1298  // Lex from the start of the given location.
1299  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1300                                      TokenBegin, File.end());
1301  // Find the token.
1302  Token Tok;
1303  lexer.LexFromRawLexer(Tok);
1304  if (Tok.isNot(TKind))
1305    return SourceLocation();
1306  SourceLocation TokenLoc = Tok.getLocation();
1307
1308  // Calculate how much whitespace needs to be skipped if any.
1309  unsigned NumWhitespaceChars = 0;
1310  if (SkipTrailingWhitespaceAndNewLine) {
1311    const char *TokenEnd = SM.getCharacterData(TokenLoc) +
1312                           Tok.getLength();
1313    unsigned char C = *TokenEnd;
1314    while (isHorizontalWhitespace(C)) {
1315      C = *(++TokenEnd);
1316      NumWhitespaceChars++;
1317    }
1318    if (isVerticalWhitespace(C))
1319      NumWhitespaceChars++;
1320  }
1321
1322  return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
1323}
1324
/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
///      (NOTE: no UCN-specific handling is present in this function.)
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// \param Ptr   Position in the buffer to read from.
/// \param Size  Incremented by the number of bytes that make up the returned
///              character (it is added to, not reset).
/// \param Tok   Optional token being formed.  When non-null it is flagged
///              NeedsCleaning if a trigraph or escaped newline is skipped,
///              and diagnostics may be emitted.  When null no diagnostics are
///              emitted.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
    // The trigraph code below jumps here after decoding "??/" into a
    // backslash; at this point Ptr and Size are already past the backslash.
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // If the char that we finally got was a \n, then we must have had
      // something like \<newline><newline>.  We don't want to consume the
      // second newline.
      // The same applies to a NUL character.  A space is returned in place
      // of the character that was not consumed; Size covers only the escaped
      // newline.
      if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
        return ' ';

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    // When Tok is null a null lexer is passed, in which case
    // DecodeTrigraphChar neither diagnoses nor checks the Trigraphs option.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // "??/" is a backslash, which may in turn start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
1399
1400
/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// This performs the same decoding of escaped newlines and trigraphs as
/// getCharAndSizeSlow, but takes no token, emits no diagnostics, and decodes
/// trigraphs only when Features.Trigraphs is set.
///
/// \param Ptr       Position in the buffer to read from.
/// \param Size      Incremented by the number of bytes that make up the
///                  returned character (it is added to, not reset).
/// \param Features  Language options; only Trigraphs is read here.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &Features) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
    // The trigraph code below jumps here after decoding "??/" into a
    // backslash; at this point Ptr and Size are already past the backslash.
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // If the char that we finally got was a \n, then we must have had
      // something like \<newline><newline>.  We don't want to consume the
      // second newline.
      // The same applies to a NUL character.  A space is returned in place
      // of the character that was not consumed; Size covers only the escaped
      // newline.
      if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
        return ' ';

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // "??/" is a backslash, which may in turn start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}
1453
1454//===----------------------------------------------------------------------===//
1455// Helper methods for lexing.
1456//===----------------------------------------------------------------------===//
1457
1458/// \brief Routine that indiscriminately skips bytes in the source file.
1459void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
1460  BufferPtr += Bytes;
1461  if (BufferPtr > BufferEnd)
1462    BufferPtr = BufferEnd;
1463  IsAtStartOfLine = StartOfLine;
1464}
1465
/// LexIdentifier - Lex the remainder of an identifier whose first character
/// has already been consumed; \p CurPtr points just past it.  \p Result is
/// formed as a tok::raw_identifier.  Outside of raw mode the identifier is
/// then looked up through the preprocessor and, if its IdentifierInfo asks
/// for it, handed to Preprocessor::HandleIdentifier.
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  // FIXME: UCNs.
  //
  // TODO: Could merge these checks into a CharInfo flag to make the comparison
  // cheaper
  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
    // The slow path below also jumps here once it has found the end of the
    // identifier; CurPtr then points one past its last character.
FinishIdentifier:
    // Remember the start before forming the token.
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      PP->HandleIdentifier(Result);

    return;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  // From here on characters are read with getCharAndSize and consumed with
  // ConsumeChar rather than by plain pointer increments.
  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!Features.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
      // Found end of identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume the following run of ordinary identifier characters before
    // going back to the '$' / end-of-identifier checks above.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) { // FIXME: UCNs.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}
1533
1534/// isHexaLiteral - Return true if Start points to a hex constant.
1535/// in microsoft mode (where this is supposed to be several different tokens).
1536static bool isHexaLiteral(const char *Start, const LangOptions &Features) {
1537  unsigned Size;
1538  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, Features);
1539  if (C1 != '0')
1540    return false;
1541  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, Features);
1542  return (C2 == 'x' || C2 == 'X');
1543}
1544
1545/// LexNumericConstant - Lex the remainder of a integer or floating point
1546/// constant. From[-1] is the first character lexed.  Return the end of the
1547/// constant.
1548void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1549  unsigned Size;
1550  char C = getCharAndSize(CurPtr, Size);
1551  char PrevCh = 0;
1552  while (isNumberBody(C)) { // FIXME: UCNs.
1553    CurPtr = ConsumeChar(CurPtr, Size, Result);
1554    PrevCh = C;
1555    C = getCharAndSize(CurPtr, Size);
1556  }
1557
1558  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
1559  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
1560    // If we are in Microsoft mode, don't continue if the constant is hex.
1561    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
1562    if (!Features.MicrosoftExt || !isHexaLiteral(BufferPtr, Features))
1563      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1564  }
1565
1566  // If we have a hex FP constant, continue.
1567  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p'))
1568    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
1569
1570  // Update the location of token as well as BufferPtr.
1571  const char *TokStart = BufferPtr;
1572  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
1573  Result.setLiteralData(TokStart);
1574}
1575
1576/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
1577/// in C++11.
1578const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) {
1579  assert(getFeatures().CPlusPlus0x && "ud-suffix only exists in C++11");
1580
1581  // Maximally munch an identifier. FIXME: UCNs.
1582  unsigned Size;
1583  char C = getCharAndSize(CurPtr, Size);
1584  if (isIdentifierHead(C)) {
1585    do {
1586      CurPtr = ConsumeChar(CurPtr, Size, Result);
1587      C = getCharAndSize(CurPtr, Size);
1588    } while (isIdentifierBody(C));
1589  }
1590  return CurPtr;
1591}
1592
1593/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
1594/// either " or L" or u8" or u" or U".
1595void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
1596                             tok::TokenKind Kind) {
1597  const char *NulCharacter = 0; // Does this string contain the \0 character?
1598
1599  if (!isLexingRawMode() &&
1600      (Kind == tok::utf8_string_literal ||
1601       Kind == tok::utf16_string_literal ||
1602       Kind == tok::utf32_string_literal))
1603    Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);
1604
1605  char C = getAndAdvanceChar(CurPtr, Result);
1606  while (C != '"') {
1607    // Skip escaped characters.  Escaped newlines will already be processed by
1608    // getAndAdvanceChar.
1609    if (C == '\\')
1610      C = getAndAdvanceChar(CurPtr, Result);
1611
1612    if (C == '\n' || C == '\r' ||             // Newline.
1613        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
1614      if (!isLexingRawMode() && !Features.AsmPreprocessor)
1615        Diag(BufferPtr, diag::warn_unterminated_string);
1616      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1617      return;
1618    }
1619
1620    if (C == 0) {
1621      if (isCodeCompletionPoint(CurPtr-1)) {
1622        PP->CodeCompleteNaturalLanguage();
1623        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1624        return cutOffLexing();
1625      }
1626
1627      NulCharacter = CurPtr-1;
1628    }
1629    C = getAndAdvanceChar(CurPtr, Result);
1630  }
1631
1632  // If we are in C++11, lex the optional ud-suffix.
1633  if (getFeatures().CPlusPlus0x)
1634    CurPtr = LexUDSuffix(Result, CurPtr);
1635
1636  // If a nul character existed in the string, warn about it.
1637  if (NulCharacter && !isLexingRawMode())
1638    Diag(NulCharacter, diag::null_in_string);
1639
1640  // Update the location of the token as well as the BufferPtr instance var.
1641  const char *TokStart = BufferPtr;
1642  FormTokenWithChars(Result, CurPtr, Kind);
1643  Result.setLiteralData(TokStart);
1644}
1645
1646/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
1647/// having lexed R", LR", u8R", uR", or UR".
1648void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
1649                                tok::TokenKind Kind) {
1650  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
1651  //  Between the initial and final double quote characters of the raw string,
1652  //  any transformations performed in phases 1 and 2 (trigraphs,
1653  //  universal-character-names, and line splicing) are reverted.
1654
1655  if (!isLexingRawMode())
1656    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
1657
1658  unsigned PrefixLen = 0;
1659
1660  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
1661    ++PrefixLen;
1662
1663  // If the last character was not a '(', then we didn't lex a valid delimiter.
1664  if (CurPtr[PrefixLen] != '(') {
1665    if (!isLexingRawMode()) {
1666      const char *PrefixEnd = &CurPtr[PrefixLen];
1667      if (PrefixLen == 16) {
1668        Diag(PrefixEnd, diag::err_raw_delim_too_long);
1669      } else {
1670        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
1671          << StringRef(PrefixEnd, 1);
1672      }
1673    }
1674
1675    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
1676    // it's possible the '"' was intended to be part of the raw string, but
1677    // there's not much we can do about that.
1678    while (1) {
1679      char C = *CurPtr++;
1680
1681      if (C == '"')
1682        break;
1683      if (C == 0 && CurPtr-1 == BufferEnd) {
1684        --CurPtr;
1685        break;
1686      }
1687    }
1688
1689    FormTokenWithChars(Result, CurPtr, tok::unknown);
1690    return;
1691  }
1692
1693  // Save prefix and move CurPtr past it
1694  const char *Prefix = CurPtr;
1695  CurPtr += PrefixLen + 1; // skip over prefix and '('
1696
1697  while (1) {
1698    char C = *CurPtr++;
1699
1700    if (C == ')') {
1701      // Check for prefix match and closing quote.
1702      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
1703        CurPtr += PrefixLen + 1; // skip over prefix and '"'
1704        break;
1705      }
1706    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
1707      if (!isLexingRawMode())
1708        Diag(BufferPtr, diag::err_unterminated_raw_string)
1709          << StringRef(Prefix, PrefixLen);
1710      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1711      return;
1712    }
1713  }
1714
1715  // If we are in C++11, lex the optional ud-suffix.
1716  if (getFeatures().CPlusPlus0x)
1717    CurPtr = LexUDSuffix(Result, CurPtr);
1718
1719  // Update the location of token as well as BufferPtr.
1720  const char *TokStart = BufferPtr;
1721  FormTokenWithChars(Result, CurPtr, Kind);
1722  Result.setLiteralData(TokStart);
1723}
1724
1725/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
1726/// after having lexed the '<' character.  This is used for #include filenames.
1727void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
1728  const char *NulCharacter = 0; // Does this string contain the \0 character?
1729  const char *AfterLessPos = CurPtr;
1730  char C = getAndAdvanceChar(CurPtr, Result);
1731  while (C != '>') {
1732    // Skip escaped characters.
1733    if (C == '\\') {
1734      // Skip the escaped character.
1735      C = getAndAdvanceChar(CurPtr, Result);
1736    } else if (C == '\n' || C == '\r' ||             // Newline.
1737               (C == 0 && (CurPtr-1 == BufferEnd ||  // End of file.
1738                           isCodeCompletionPoint(CurPtr-1)))) {
1739      // If the filename is unterminated, then it must just be a lone <
1740      // character.  Return this as such.
1741      FormTokenWithChars(Result, AfterLessPos, tok::less);
1742      return;
1743    } else if (C == 0) {
1744      NulCharacter = CurPtr-1;
1745    }
1746    C = getAndAdvanceChar(CurPtr, Result);
1747  }
1748
1749  // If a nul character existed in the string, warn about it.
1750  if (NulCharacter && !isLexingRawMode())
1751    Diag(NulCharacter, diag::null_in_string);
1752
1753  // Update the location of token as well as BufferPtr.
1754  const char *TokStart = BufferPtr;
1755  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
1756  Result.setLiteralData(TokStart);
1757}
1758
1759
1760/// LexCharConstant - Lex the remainder of a character constant, after having
1761/// lexed either ' or L' or u' or U'.
1762void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
1763                            tok::TokenKind Kind) {
1764  const char *NulCharacter = 0; // Does this character contain the \0 character?
1765
1766  if (!isLexingRawMode() &&
1767      (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant))
1768    Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);
1769
1770  char C = getAndAdvanceChar(CurPtr, Result);
1771  if (C == '\'') {
1772    if (!isLexingRawMode() && !Features.AsmPreprocessor)
1773      Diag(BufferPtr, diag::err_empty_character);
1774    FormTokenWithChars(Result, CurPtr, tok::unknown);
1775    return;
1776  }
1777
1778  while (C != '\'') {
1779    // Skip escaped characters.
1780    if (C == '\\') {
1781      // Skip the escaped character.
1782      // FIXME: UCN's
1783      C = getAndAdvanceChar(CurPtr, Result);
1784    } else if (C == '\n' || C == '\r' ||             // Newline.
1785               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
1786      if (!isLexingRawMode() && !Features.AsmPreprocessor)
1787        Diag(BufferPtr, diag::warn_unterminated_char);
1788      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1789      return;
1790    } else if (C == 0) {
1791      if (isCodeCompletionPoint(CurPtr-1)) {
1792        PP->CodeCompleteNaturalLanguage();
1793        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
1794        return cutOffLexing();
1795      }
1796
1797      NulCharacter = CurPtr-1;
1798    }
1799    C = getAndAdvanceChar(CurPtr, Result);
1800  }
1801
1802  // If we are in C++11, lex the optional ud-suffix.
1803  if (getFeatures().CPlusPlus0x)
1804    CurPtr = LexUDSuffix(Result, CurPtr);
1805
1806  // If a nul character existed in the character, warn about it.
1807  if (NulCharacter && !isLexingRawMode())
1808    Diag(NulCharacter, diag::null_in_char);
1809
1810  // Update the location of token as well as BufferPtr.
1811  const char *TokStart = BufferPtr;
1812  FormTokenWithChars(Result, CurPtr, Kind);
1813  Result.setLiteralData(TokStart);
1814}
1815
1816/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
1817/// Update BufferPtr to point to the next non-whitespace character and return.
1818///
1819/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
1820///
1821bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
1822  // Whitespace - Skip it, then return the token after the whitespace.
1823  unsigned char Char = *CurPtr;  // Skip consequtive spaces efficiently.
1824  while (1) {
1825    // Skip horizontal whitespace very aggressively.
1826    while (isHorizontalWhitespace(Char))
1827      Char = *++CurPtr;
1828
1829    // Otherwise if we have something other than whitespace, we're done.
1830    if (Char != '\n' && Char != '\r')
1831      break;
1832
1833    if (ParsingPreprocessorDirective) {
1834      // End of preprocessor directive line, let LexTokenInternal handle this.
1835      BufferPtr = CurPtr;
1836      return false;
1837    }
1838
1839    // ok, but handle newline.
1840    // The returned token is at the start of the line.
1841    Result.setFlag(Token::StartOfLine);
1842    // No leading whitespace seen so far.
1843    Result.clearFlag(Token::LeadingSpace);
1844    Char = *++CurPtr;
1845  }
1846
1847  // If this isn't immediately after a newline, there is leading space.
1848  char PrevChar = CurPtr[-1];
1849  if (PrevChar != '\n' && PrevChar != '\r')
1850    Result.setFlag(Token::LeadingSpace);
1851
1852  // If the client wants us to return whitespace, return it now.
1853  if (isKeepWhitespaceMode()) {
1854    FormTokenWithChars(Result, CurPtr, tok::unknown);
1855    return true;
1856  }
1857
1858  BufferPtr = CurPtr;
1859  return false;
1860}
1861
1862// SkipBCPLComment - We have just read the // characters from input.  Skip until
1863// we find the newline character thats terminate the comment.  Then update
1864/// BufferPtr and return.
1865///
1866/// If we're in KeepCommentMode or any CommentHandler has inserted
1867/// some tokens, this will store the first token and return true.
1868bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
1869  // If BCPL comments aren't explicitly enabled for this language, emit an
1870  // extension warning.
1871  if (!Features.BCPLComment && !isLexingRawMode()) {
1872    Diag(BufferPtr, diag::ext_bcpl_comment);
1873
1874    // Mark them enabled so we only emit one warning for this translation
1875    // unit.
1876    Features.BCPLComment = true;
1877  }
1878
1879  // Scan over the body of the comment.  The common case, when scanning, is that
1880  // the comment contains normal ascii characters with nothing interesting in
1881  // them.  As such, optimize for this case with the inner loop.
1882  char C;
1883  do {
1884    C = *CurPtr;
1885    // Skip over characters in the fast loop.
1886    while (C != 0 &&                // Potentially EOF.
1887           C != '\n' && C != '\r')  // Newline or DOS-style newline.
1888      C = *++CurPtr;
1889
1890    const char *NextLine = CurPtr;
1891    if (C != 0) {
1892      // We found a newline, see if it's escaped.
1893      const char *EscapePtr = CurPtr-1;
1894      while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
1895        --EscapePtr;
1896
1897      if (*EscapePtr == '\\') // Escaped newline.
1898        CurPtr = EscapePtr;
1899      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
1900               EscapePtr[-2] == '?') // Trigraph-escaped newline.
1901        CurPtr = EscapePtr-2;
1902      else
1903        break; // This is a newline, we're done.
1904
1905      C = *CurPtr;
1906    }
1907
1908    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
1909    // properly decode the character.  Read it in raw mode to avoid emitting
1910    // diagnostics about things like trigraphs.  If we see an escaped newline,
1911    // we'll handle it below.
1912    const char *OldPtr = CurPtr;
1913    bool OldRawMode = isLexingRawMode();
1914    LexingRawMode = true;
1915    C = getAndAdvanceChar(CurPtr, Result);
1916    LexingRawMode = OldRawMode;
1917
1918    // If we only read only one character, then no special handling is needed.
1919    // We're done and can skip forward to the newline.
1920    if (C != 0 && CurPtr == OldPtr+1) {
1921      CurPtr = NextLine;
1922      break;
1923    }
1924
1925    // If we read multiple characters, and one of those characters was a \r or
1926    // \n, then we had an escaped newline within the comment.  Emit diagnostic
1927    // unless the next line is also a // comment.
1928    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
1929      for (; OldPtr != CurPtr; ++OldPtr)
1930        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
1931          // Okay, we found a // comment that ends in a newline, if the next
1932          // line is also a // comment, but has spaces, don't emit a diagnostic.
1933          if (isWhitespace(C)) {
1934            const char *ForwardPtr = CurPtr;
1935            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
1936              ++ForwardPtr;
1937            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
1938              break;
1939          }
1940
1941          if (!isLexingRawMode())
1942            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
1943          break;
1944        }
1945    }
1946
1947    if (CurPtr == BufferEnd+1) {
1948      --CurPtr;
1949      break;
1950    }
1951
1952    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
1953      PP->CodeCompleteNaturalLanguage();
1954      cutOffLexing();
1955      return false;
1956    }
1957
1958  } while (C != '\n' && C != '\r');
1959
1960  // Found but did not consume the newline.  Notify comment handlers about the
1961  // comment unless we're in a #if 0 block.
1962  if (PP && !isLexingRawMode() &&
1963      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
1964                                            getSourceLocation(CurPtr)))) {
1965    BufferPtr = CurPtr;
1966    return true; // A token has to be returned.
1967  }
1968
1969  // If we are returning comments as tokens, return this comment as a token.
1970  if (inKeepCommentMode())
1971    return SaveBCPLComment(Result, CurPtr);
1972
1973  // If we are inside a preprocessor directive and we see the end of line,
1974  // return immediately, so that the lexer can return this as an EOD token.
1975  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
1976    BufferPtr = CurPtr;
1977    return false;
1978  }
1979
1980  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
1981  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
1982  // contribute to another token), it isn't needed for correctness.  Note that
1983  // this is ok even in KeepWhitespaceMode, because we would have returned the
1984  /// comment above in that mode.
1985  ++CurPtr;
1986
1987  // The next returned token is at the start of the line.
1988  Result.setFlag(Token::StartOfLine);
1989  // No leading whitespace seen so far.
1990  Result.clearFlag(Token::LeadingSpace);
1991  BufferPtr = CurPtr;
1992  return false;
1993}
1994
1995/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
1996/// an appropriate way and return it.
1997bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
1998  // If we're not in a preprocessor directive, just return the // comment
1999  // directly.
2000  FormTokenWithChars(Result, CurPtr, tok::comment);
2001
2002  if (!ParsingPreprocessorDirective)
2003    return true;
2004
2005  // If this BCPL-style comment is in a macro definition, transmogrify it into
2006  // a C-style block comment.
2007  bool Invalid = false;
2008  std::string Spelling = PP->getSpelling(Result, &Invalid);
2009  if (Invalid)
2010    return true;
2011
2012  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
2013  Spelling[1] = '*';   // Change prefix to "/*".
2014  Spelling += "*/";    // add suffix.
2015
2016  Result.setKind(tok::comment);
2017  PP->CreateString(&Spelling[0], Spelling.size(), Result,
2018                   Result.getLocation(), Result.getLocation());
2019  return true;
2020}
2021
2022/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
2023/// character (either \n or \r) is part of an escaped newline sequence.  Issue a
2024/// diagnostic if so.  We know that the newline is inside of a block comment.
2025static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
2026                                                  Lexer *L) {
2027  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
2028
2029  // Back up off the newline.
2030  --CurPtr;
2031
2032  // If this is a two-character newline sequence, skip the other character.
2033  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
2034    // \n\n or \r\r -> not escaped newline.
2035    if (CurPtr[0] == CurPtr[1])
2036      return false;
2037    // \n\r or \r\n -> skip the newline.
2038    --CurPtr;
2039  }
2040
2041  // If we have horizontal whitespace, skip over it.  We allow whitespace
2042  // between the slash and newline.
2043  bool HasSpace = false;
2044  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
2045    --CurPtr;
2046    HasSpace = true;
2047  }
2048
2049  // If we have a slash, we know this is an escaped newline.
2050  if (*CurPtr == '\\') {
2051    if (CurPtr[-1] != '*') return false;
2052  } else {
2053    // It isn't a slash, is it the ?? / trigraph?
2054    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
2055        CurPtr[-3] != '*')
2056      return false;
2057
2058    // This is the trigraph ending the comment.  Emit a stern warning!
2059    CurPtr -= 2;
2060
2061    // If no trigraphs are enabled, warn that we ignored this trigraph and
2062    // ignore this * character.
2063    if (!L->getFeatures().Trigraphs) {
2064      if (!L->isLexingRawMode())
2065        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
2066      return false;
2067    }
2068    if (!L->isLexingRawMode())
2069      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
2070  }
2071
2072  // Warn about having an escaped newline between the */ characters.
2073  if (!L->isLexingRawMode())
2074    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
2075
2076  // If there was space between the backslash and newline, warn about it.
2077  if (HasSpace && !L->isLexingRawMode())
2078    L->Diag(CurPtr, diag::backslash_newline_space);
2079
2080  return true;
2081}
2082
2083#ifdef __SSE2__
2084#include <emmintrin.h>
2085#elif __ALTIVEC__
2086#include <altivec.h>
2087#undef bool
2088#endif
2089
2090/// SkipBlockComment - We have just read the /* characters from input.  Read
2091/// until we find the */ characters that terminate the comment.  Note that we
2092/// don't bother decoding trigraphs or escaped newlines in block comments,
2093/// because they cannot cause the comment to end.  The only thing that can
2094/// happen is the comment could end with an escaped newline between the */ end
2095/// of comment.
2096///
2097/// If we're in KeepCommentMode or any CommentHandler has inserted
2098/// some tokens, this will store the first token and return true.
2099bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
2100  // Scan one character past where we should, looking for a '/' character.  Once
2101  // we find it, check to see if it was preceded by a *.  This common
2102  // optimization helps people who like to put a lot of * characters in their
2103  // comments.
2104
2105  // The first character we get with newlines and trigraphs skipped to handle
2106  // the degenerate /*/ case below correctly if the * has an escaped newline
2107  // after it.
2108  unsigned CharSize;
2109  unsigned char C = getCharAndSize(CurPtr, CharSize);
2110  CurPtr += CharSize;
2111  if (C == 0 && CurPtr == BufferEnd+1) {
2112    if (!isLexingRawMode())
2113      Diag(BufferPtr, diag::err_unterminated_block_comment);
2114    --CurPtr;
2115
2116    // KeepWhitespaceMode should return this broken comment as a token.  Since
2117    // it isn't a well formed comment, just return it as an 'unknown' token.
2118    if (isKeepWhitespaceMode()) {
2119      FormTokenWithChars(Result, CurPtr, tok::unknown);
2120      return true;
2121    }
2122
2123    BufferPtr = CurPtr;
2124    return false;
2125  }
2126
2127  // Check to see if the first character after the '/*' is another /.  If so,
2128  // then this slash does not end the block comment, it is part of it.
2129  if (C == '/')
2130    C = *CurPtr++;
2131
2132  while (1) {
2133    // Skip over all non-interesting characters until we find end of buffer or a
2134    // (probably ending) '/' character.
2135    if (CurPtr + 24 < BufferEnd &&
2136        // If there is a code-completion point avoid the fast scan because it
2137        // doesn't check for '\0'.
2138        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2139      // While not aligned to a 16-byte boundary.
2140      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
2141        C = *CurPtr++;
2142
2143      if (C == '/') goto FoundSlash;
2144
2145#ifdef __SSE2__
2146      __m128i Slashes = _mm_set1_epi8('/');
2147      while (CurPtr+16 <= BufferEnd) {
2148        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes));
2149        if (cmp != 0) {
2150          // Adjust the pointer to point directly after the first slash. It's
2151          // not necessary to set C here, it will be overwritten at the end of
2152          // the outer loop.
2153          CurPtr += llvm::CountTrailingZeros_32(cmp) + 1;
2154          goto FoundSlash;
2155        }
2156        CurPtr += 16;
2157      }
2158#elif __ALTIVEC__
2159      __vector unsigned char Slashes = {
2160        '/', '/', '/', '/',  '/', '/', '/', '/',
2161        '/', '/', '/', '/',  '/', '/', '/', '/'
2162      };
2163      while (CurPtr+16 <= BufferEnd &&
2164             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
2165        CurPtr += 16;
2166#else
2167      // Scan for '/' quickly.  Many block comments are very large.
2168      while (CurPtr[0] != '/' &&
2169             CurPtr[1] != '/' &&
2170             CurPtr[2] != '/' &&
2171             CurPtr[3] != '/' &&
2172             CurPtr+4 < BufferEnd) {
2173        CurPtr += 4;
2174      }
2175#endif
2176
2177      // It has to be one of the bytes scanned, increment to it and read one.
2178      C = *CurPtr++;
2179    }
2180
2181    // Loop to scan the remainder.
2182    while (C != '/' && C != '\0')
2183      C = *CurPtr++;
2184
2185    if (C == '/') {
2186  FoundSlash:
2187      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
2188        break;
2189
2190      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2191        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
2192          // We found the final */, though it had an escaped newline between the
2193          // * and /.  We're done!
2194          break;
2195        }
2196      }
2197      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2198        // If this is a /* inside of the comment, emit a warning.  Don't do this
2199        // if this is a /*/, which will end the comment.  This misses cases with
2200        // embedded escaped newlines, but oh well.
2201        if (!isLexingRawMode())
2202          Diag(CurPtr-1, diag::warn_nested_block_comment);
2203      }
2204    } else if (C == 0 && CurPtr == BufferEnd+1) {
2205      if (!isLexingRawMode())
2206        Diag(BufferPtr, diag::err_unterminated_block_comment);
2207      // Note: the user probably forgot a */.  We could continue immediately
2208      // after the /*, but this would involve lexing a lot of what really is the
2209      // comment, which surely would confuse the parser.
2210      --CurPtr;
2211
2212      // KeepWhitespaceMode should return this broken comment as a token.  Since
2213      // it isn't a well formed comment, just return it as an 'unknown' token.
2214      if (isKeepWhitespaceMode()) {
2215        FormTokenWithChars(Result, CurPtr, tok::unknown);
2216        return true;
2217      }
2218
2219      BufferPtr = CurPtr;
2220      return false;
2221    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2222      PP->CodeCompleteNaturalLanguage();
2223      cutOffLexing();
2224      return false;
2225    }
2226
2227    C = *CurPtr++;
2228  }
2229
2230  // Notify comment handlers about the comment unless we're in a #if 0 block.
2231  if (PP && !isLexingRawMode() &&
2232      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2233                                            getSourceLocation(CurPtr)))) {
2234    BufferPtr = CurPtr;
2235    return true; // A token has to be returned.
2236  }
2237
2238  // If we are returning comments as tokens, return this comment as a token.
2239  if (inKeepCommentMode()) {
2240    FormTokenWithChars(Result, CurPtr, tok::comment);
2241    return true;
2242  }
2243
2244  // It is common for the tokens immediately after a /**/ comment to be
2245  // whitespace.  Instead of going through the big switch, handle it
2246  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
2247  // have already returned above with the comment as a token.
2248  if (isHorizontalWhitespace(*CurPtr)) {
2249    Result.setFlag(Token::LeadingSpace);
2250    SkipWhitespace(Result, CurPtr+1);
2251    return false;
2252  }
2253
2254  // Otherwise, just return so that the next character will be lexed as a token.
2255  BufferPtr = CurPtr;
2256  Result.setFlag(Token::LeadingSpace);
2257  return false;
2258}
2259
2260//===----------------------------------------------------------------------===//
2261// Primary Lexing Entry Points
2262//===----------------------------------------------------------------------===//
2263
2264/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
2265/// uninterpreted string.  This switches the lexer out of directive mode.
2266std::string Lexer::ReadToEndOfLine() {
2267  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
2268         "Must be in a preprocessing directive!");
2269  std::string Result;
2270  Token Tmp;
2271
2272  // CurPtr - Cache BufferPtr in an automatic variable.
2273  const char *CurPtr = BufferPtr;
2274  while (1) {
2275    char Char = getAndAdvanceChar(CurPtr, Tmp);
2276    switch (Char) {
2277    default:
2278      Result += Char;
2279      break;
2280    case 0:  // Null.
2281      // Found end of file?
2282      if (CurPtr-1 != BufferEnd) {
2283        if (isCodeCompletionPoint(CurPtr-1)) {
2284          PP->CodeCompleteNaturalLanguage();
2285          cutOffLexing();
2286          return Result;
2287        }
2288
2289        // Nope, normal character, continue.
2290        Result += Char;
2291        break;
2292      }
2293      // FALL THROUGH.
2294    case '\r':
2295    case '\n':
2296      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
2297      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
2298      BufferPtr = CurPtr-1;
2299
2300      // Next, lex the character, which should handle the EOD transition.
2301      Lex(Tmp);
2302      if (Tmp.is(tok::code_completion)) {
2303        if (PP)
2304          PP->CodeCompleteNaturalLanguage();
2305        Lex(Tmp);
2306      }
2307      assert(Tmp.is(tok::eod) && "Unexpected token!");
2308
2309      // Finally, we're done, return the string we found.
2310      return Result;
2311    }
2312  }
2313}
2314
2315/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
2316/// condition, reporting diagnostics and handling other edge cases as required.
2317/// This returns true if Result contains a token, false if PP.Lex should be
2318/// called again.
2319bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
2320  // If we hit the end of the file while parsing a preprocessor directive,
2321  // end the preprocessor directive first.  The next token returned will
2322  // then be the end of file.
2323  if (ParsingPreprocessorDirective) {
2324    // Done parsing the "line".
2325    ParsingPreprocessorDirective = false;
2326    // Update the location of token as well as BufferPtr.
2327    FormTokenWithChars(Result, CurPtr, tok::eod);
2328
2329    // Restore comment saving mode, in case it was disabled for directive.
2330    SetCommentRetentionState(PP->getCommentRetentionState());
2331    return true;  // Have a token.
2332  }
2333
2334  // If we are in raw mode, return this event as an EOF token.  Let the caller
2335  // that put us in raw mode handle the event.
2336  if (isLexingRawMode()) {
2337    Result.startToken();
2338    BufferPtr = BufferEnd;
2339    FormTokenWithChars(Result, BufferEnd, tok::eof);
2340    return true;
2341  }
2342
2343  // Issue diagnostics for unterminated #if and missing newline.
2344
2345  // If we are in a #if directive, emit an error.
2346  while (!ConditionalStack.empty()) {
2347    if (PP->getCodeCompletionFileLoc() != FileLoc)
2348      PP->Diag(ConditionalStack.back().IfLoc,
2349               diag::err_pp_unterminated_conditional);
2350    ConditionalStack.pop_back();
2351  }
2352
2353  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
2354  // a pedwarn.
2355  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
2356    Diag(BufferEnd, diag::ext_no_newline_eof)
2357      << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");
2358
2359  BufferPtr = CurPtr;
2360
2361  // Finally, let the preprocessor handle this.
2362  return PP->HandleEndOfFile(Result);
2363}
2364
2365/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
2366/// the specified lexer will return a tok::l_paren token, 0 if it is something
2367/// else and 2 if there are no more tokens in the buffer controlled by the
2368/// lexer.
2369unsigned Lexer::isNextPPTokenLParen() {
2370  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
2371
2372  // Switch to 'skipping' mode.  This will ensure that we can lex a token
2373  // without emitting diagnostics, disables macro expansion, and will cause EOF
2374  // to return an EOF token instead of popping the include stack.
2375  LexingRawMode = true;
2376
2377  // Save state that can be changed while lexing so that we can restore it.
2378  const char *TmpBufferPtr = BufferPtr;
2379  bool inPPDirectiveMode = ParsingPreprocessorDirective;
2380
2381  Token Tok;
2382  Tok.startToken();
2383  LexTokenInternal(Tok);
2384
2385  // Restore state that may have changed.
2386  BufferPtr = TmpBufferPtr;
2387  ParsingPreprocessorDirective = inPPDirectiveMode;
2388
2389  // Restore the lexer back to non-skipping mode.
2390  LexingRawMode = false;
2391
2392  if (Tok.is(tok::eof))
2393    return 2;
2394  return Tok.is(tok::l_paren);
2395}
2396
2397/// FindConflictEnd - Find the end of a version control conflict marker.
2398static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
2399                                   ConflictMarkerKind CMK) {
2400  const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
2401  size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
2402  StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen);
2403  size_t Pos = RestOfBuffer.find(Terminator);
2404  while (Pos != StringRef::npos) {
2405    // Must occur at start of line.
2406    if (RestOfBuffer[Pos-1] != '\r' &&
2407        RestOfBuffer[Pos-1] != '\n') {
2408      RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
2409      Pos = RestOfBuffer.find(Terminator);
2410      continue;
2411    }
2412    return RestOfBuffer.data()+Pos;
2413  }
2414  return 0;
2415}
2416
2417/// IsStartOfConflictMarker - If the specified pointer is the start of a version
2418/// control conflict marker like '<<<<<<<', recognize it as such, emit an error
2419/// and recover nicely.  This returns true if it is a conflict marker and false
2420/// if not.
2421bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
2422  // Only a conflict marker if it starts at the beginning of a line.
2423  if (CurPtr != BufferStart &&
2424      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2425    return false;
2426
2427  // Check to see if we have <<<<<<< or >>>>.
2428  if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") &&
2429      (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> "))
2430    return false;
2431
2432  // If we have a situation where we don't care about conflict markers, ignore
2433  // it.
2434  if (CurrentConflictMarkerState || isLexingRawMode())
2435    return false;
2436
2437  ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
2438
2439  // Check to see if there is an ending marker somewhere in the buffer at the
2440  // start of a line to terminate this conflict marker.
2441  if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
2442    // We found a match.  We are really in a conflict marker.
2443    // Diagnose this, and ignore to the end of line.
2444    Diag(CurPtr, diag::err_conflict_marker);
2445    CurrentConflictMarkerState = Kind;
2446
2447    // Skip ahead to the end of line.  We know this exists because the
2448    // end-of-conflict marker starts with \r or \n.
2449    while (*CurPtr != '\r' && *CurPtr != '\n') {
2450      assert(CurPtr != BufferEnd && "Didn't find end of line");
2451      ++CurPtr;
2452    }
2453    BufferPtr = CurPtr;
2454    return true;
2455  }
2456
2457  // No end of conflict marker found.
2458  return false;
2459}
2460
2461
2462/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
2463/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
2464/// is the end of a conflict marker.  Handle it by ignoring up until the end of
2465/// the line.  This returns true if it is a conflict marker and false if not.
2466bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
2467  // Only a conflict marker if it starts at the beginning of a line.
2468  if (CurPtr != BufferStart &&
2469      CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
2470    return false;
2471
2472  // If we have a situation where we don't care about conflict markers, ignore
2473  // it.
2474  if (!CurrentConflictMarkerState || isLexingRawMode())
2475    return false;
2476
2477  // Check to see if we have the marker (4 characters in a row).
2478  for (unsigned i = 1; i != 4; ++i)
2479    if (CurPtr[i] != CurPtr[0])
2480      return false;
2481
2482  // If we do have it, search for the end of the conflict marker.  This could
2483  // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
2484  // be the end of conflict marker.
2485  if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
2486                                        CurrentConflictMarkerState)) {
2487    CurPtr = End;
2488
2489    // Skip ahead to the end of line.
2490    while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
2491      ++CurPtr;
2492
2493    BufferPtr = CurPtr;
2494
2495    // No longer in the conflict marker.
2496    CurrentConflictMarkerState = CMK_None;
2497    return true;
2498  }
2499
2500  return false;
2501}
2502
2503bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
2504  if (PP && PP->isCodeCompletionEnabled()) {
2505    SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
2506    return Loc == PP->getCodeCompletionLoc();
2507  }
2508
2509  return false;
2510}
2511
2512
2513/// LexTokenInternal - This implements a simple C family lexer.  It is an
2514/// extremely performance critical piece of code.  This assumes that the buffer
2515/// has a null character at the end of the file.  This returns a preprocessing
2516/// token, not a normal token, as such, it is an internal interface.  It assumes
2517/// that the Flags of result have been cleared before calling this.
2518void Lexer::LexTokenInternal(Token &Result) {
2519LexNextToken:
2520  // New token, can't need cleaning yet.
2521  Result.clearFlag(Token::NeedsCleaning);
2522  Result.setIdentifierInfo(0);
2523
2524  // CurPtr - Cache BufferPtr in an automatic variable.
2525  const char *CurPtr = BufferPtr;
2526
2527  // Small amounts of horizontal whitespace is very common between tokens.
2528  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
2529    ++CurPtr;
2530    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
2531      ++CurPtr;
2532
2533    // If we are keeping whitespace and other tokens, just return what we just
2534    // skipped.  The next lexer invocation will return the token after the
2535    // whitespace.
2536    if (isKeepWhitespaceMode()) {
2537      FormTokenWithChars(Result, CurPtr, tok::unknown);
2538      return;
2539    }
2540
2541    BufferPtr = CurPtr;
2542    Result.setFlag(Token::LeadingSpace);
2543  }
2544
2545  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
2546
2547  // Read a character, advancing over it.
2548  char Char = getAndAdvanceChar(CurPtr, Result);
2549  tok::TokenKind Kind;
2550
2551  switch (Char) {
2552  case 0:  // Null.
2553    // Found end of file?
2554    if (CurPtr-1 == BufferEnd) {
2555      // Read the PP instance variable into an automatic variable, because
2556      // LexEndOfFile will often delete 'this'.
2557      Preprocessor *PPCache = PP;
2558      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
2559        return;   // Got a token to return.
2560      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
2561      return PPCache->Lex(Result);
2562    }
2563
2564    // Check if we are performing code completion.
2565    if (isCodeCompletionPoint(CurPtr-1)) {
2566      // Return the code-completion token.
2567      Result.startToken();
2568      FormTokenWithChars(Result, CurPtr, tok::code_completion);
2569      return;
2570    }
2571
2572    if (!isLexingRawMode())
2573      Diag(CurPtr-1, diag::null_in_file);
2574    Result.setFlag(Token::LeadingSpace);
2575    if (SkipWhitespace(Result, CurPtr))
2576      return; // KeepWhitespaceMode
2577
2578    goto LexNextToken;   // GCC isn't tail call eliminating.
2579
2580  case 26:  // DOS & CP/M EOF: "^Z".
2581    // If we're in Microsoft extensions mode, treat this as end of file.
2582    if (Features.MicrosoftExt) {
2583      // Read the PP instance variable into an automatic variable, because
2584      // LexEndOfFile will often delete 'this'.
2585      Preprocessor *PPCache = PP;
2586      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
2587        return;   // Got a token to return.
2588      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
2589      return PPCache->Lex(Result);
2590    }
2591    // If Microsoft extensions are disabled, this is just random garbage.
2592    Kind = tok::unknown;
2593    break;
2594
2595  case '\n':
2596  case '\r':
2597    // If we are inside a preprocessor directive and we see the end of line,
2598    // we know we are done with the directive, so return an EOD token.
2599    if (ParsingPreprocessorDirective) {
2600      // Done parsing the "line".
2601      ParsingPreprocessorDirective = false;
2602
2603      // Restore comment saving mode, in case it was disabled for directive.
2604      SetCommentRetentionState(PP->getCommentRetentionState());
2605
2606      // Since we consumed a newline, we are back at the start of a line.
2607      IsAtStartOfLine = true;
2608
2609      Kind = tok::eod;
2610      break;
2611    }
2612    // The returned token is at the start of the line.
2613    Result.setFlag(Token::StartOfLine);
2614    // No leading whitespace seen so far.
2615    Result.clearFlag(Token::LeadingSpace);
2616
2617    if (SkipWhitespace(Result, CurPtr))
2618      return; // KeepWhitespaceMode
2619    goto LexNextToken;   // GCC isn't tail call eliminating.
2620  case ' ':
2621  case '\t':
2622  case '\f':
2623  case '\v':
2624  SkipHorizontalWhitespace:
2625    Result.setFlag(Token::LeadingSpace);
2626    if (SkipWhitespace(Result, CurPtr))
2627      return; // KeepWhitespaceMode
2628
2629  SkipIgnoredUnits:
2630    CurPtr = BufferPtr;
2631
2632    // If the next token is obviously a // or /* */ comment, skip it efficiently
2633    // too (without going through the big switch stmt).
2634    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
2635        Features.BCPLComment && !Features.TraditionalCPP) {
2636      if (SkipBCPLComment(Result, CurPtr+2))
2637        return; // There is a token to return.
2638      goto SkipIgnoredUnits;
2639    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
2640      if (SkipBlockComment(Result, CurPtr+2))
2641        return; // There is a token to return.
2642      goto SkipIgnoredUnits;
2643    } else if (isHorizontalWhitespace(*CurPtr)) {
2644      goto SkipHorizontalWhitespace;
2645    }
2646    goto LexNextToken;   // GCC isn't tail call eliminating.
2647
2648  // C99 6.4.4.1: Integer Constants.
2649  // C99 6.4.4.2: Floating Constants.
2650  case '0': case '1': case '2': case '3': case '4':
2651  case '5': case '6': case '7': case '8': case '9':
2652    // Notify MIOpt that we read a non-whitespace/non-comment token.
2653    MIOpt.ReadToken();
2654    return LexNumericConstant(Result, CurPtr);
2655
2656  case 'u':   // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal
2657    // Notify MIOpt that we read a non-whitespace/non-comment token.
2658    MIOpt.ReadToken();
2659
2660    if (Features.CPlusPlus0x) {
2661      Char = getCharAndSize(CurPtr, SizeTmp);
2662
2663      // UTF-16 string literal
2664      if (Char == '"')
2665        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
2666                                tok::utf16_string_literal);
2667
2668      // UTF-16 character constant
2669      if (Char == '\'')
2670        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
2671                               tok::utf16_char_constant);
2672
2673      // UTF-16 raw string literal
2674      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
2675        return LexRawStringLiteral(Result,
2676                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2677                                           SizeTmp2, Result),
2678                               tok::utf16_string_literal);
2679
2680      if (Char == '8') {
2681        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
2682
2683        // UTF-8 string literal
2684        if (Char2 == '"')
2685          return LexStringLiteral(Result,
2686                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2687                                           SizeTmp2, Result),
2688                               tok::utf8_string_literal);
2689
2690        if (Char2 == 'R') {
2691          unsigned SizeTmp3;
2692          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
2693          // UTF-8 raw string literal
2694          if (Char3 == '"') {
2695            return LexRawStringLiteral(Result,
2696                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2697                                           SizeTmp2, Result),
2698                               SizeTmp3, Result),
2699                   tok::utf8_string_literal);
2700          }
2701        }
2702      }
2703    }
2704
2705    // treat u like the start of an identifier.
2706    return LexIdentifier(Result, CurPtr);
2707
2708  case 'U':   // Identifier (Uber) or C++0x UTF-32 string literal
2709    // Notify MIOpt that we read a non-whitespace/non-comment token.
2710    MIOpt.ReadToken();
2711
2712    if (Features.CPlusPlus0x) {
2713      Char = getCharAndSize(CurPtr, SizeTmp);
2714
2715      // UTF-32 string literal
2716      if (Char == '"')
2717        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
2718                                tok::utf32_string_literal);
2719
2720      // UTF-32 character constant
2721      if (Char == '\'')
2722        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
2723                               tok::utf32_char_constant);
2724
2725      // UTF-32 raw string literal
2726      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
2727        return LexRawStringLiteral(Result,
2728                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2729                                           SizeTmp2, Result),
2730                               tok::utf32_string_literal);
2731    }
2732
2733    // treat U like the start of an identifier.
2734    return LexIdentifier(Result, CurPtr);
2735
2736  case 'R': // Identifier or C++0x raw string literal
2737    // Notify MIOpt that we read a non-whitespace/non-comment token.
2738    MIOpt.ReadToken();
2739
2740    if (Features.CPlusPlus0x) {
2741      Char = getCharAndSize(CurPtr, SizeTmp);
2742
2743      if (Char == '"')
2744        return LexRawStringLiteral(Result,
2745                                   ConsumeChar(CurPtr, SizeTmp, Result),
2746                                   tok::string_literal);
2747    }
2748
2749    // treat R like the start of an identifier.
2750    return LexIdentifier(Result, CurPtr);
2751
2752  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
2753    // Notify MIOpt that we read a non-whitespace/non-comment token.
2754    MIOpt.ReadToken();
2755    Char = getCharAndSize(CurPtr, SizeTmp);
2756
2757    // Wide string literal.
2758    if (Char == '"')
2759      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
2760                              tok::wide_string_literal);
2761
2762    // Wide raw string literal.
2763    if (Features.CPlusPlus0x && Char == 'R' &&
2764        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
2765      return LexRawStringLiteral(Result,
2766                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2767                                           SizeTmp2, Result),
2768                               tok::wide_string_literal);
2769
2770    // Wide character constant.
2771    if (Char == '\'')
2772      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
2773                             tok::wide_char_constant);
2774    // FALL THROUGH, treating L like the start of an identifier.
2775
2776  // C99 6.4.2: Identifiers.
2777  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
2778  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
2779  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
2780  case 'V': case 'W': case 'X': case 'Y': case 'Z':
2781  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
2782  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
2783  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
2784  case 'v': case 'w': case 'x': case 'y': case 'z':
2785  case '_':
2786    // Notify MIOpt that we read a non-whitespace/non-comment token.
2787    MIOpt.ReadToken();
2788    return LexIdentifier(Result, CurPtr);
2789
2790  case '$':   // $ in identifiers.
2791    if (Features.DollarIdents) {
2792      if (!isLexingRawMode())
2793        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
2794      // Notify MIOpt that we read a non-whitespace/non-comment token.
2795      MIOpt.ReadToken();
2796      return LexIdentifier(Result, CurPtr);
2797    }
2798
2799    Kind = tok::unknown;
2800    break;
2801
2802  // C99 6.4.4: Character Constants.
2803  case '\'':
2804    // Notify MIOpt that we read a non-whitespace/non-comment token.
2805    MIOpt.ReadToken();
2806    return LexCharConstant(Result, CurPtr, tok::char_constant);
2807
2808  // C99 6.4.5: String Literals.
2809  case '"':
2810    // Notify MIOpt that we read a non-whitespace/non-comment token.
2811    MIOpt.ReadToken();
2812    return LexStringLiteral(Result, CurPtr, tok::string_literal);
2813
2814  // C99 6.4.6: Punctuators.
2815  case '?':
2816    Kind = tok::question;
2817    break;
2818  case '[':
2819    Kind = tok::l_square;
2820    break;
2821  case ']':
2822    Kind = tok::r_square;
2823    break;
2824  case '(':
2825    Kind = tok::l_paren;
2826    break;
2827  case ')':
2828    Kind = tok::r_paren;
2829    break;
2830  case '{':
2831    Kind = tok::l_brace;
2832    break;
2833  case '}':
2834    Kind = tok::r_brace;
2835    break;
2836  case '.':
2837    Char = getCharAndSize(CurPtr, SizeTmp);
2838    if (Char >= '0' && Char <= '9') {
2839      // Notify MIOpt that we read a non-whitespace/non-comment token.
2840      MIOpt.ReadToken();
2841
2842      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
2843    } else if (Features.CPlusPlus && Char == '*') {
2844      Kind = tok::periodstar;
2845      CurPtr += SizeTmp;
2846    } else if (Char == '.' &&
2847               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
2848      Kind = tok::ellipsis;
2849      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2850                           SizeTmp2, Result);
2851    } else {
2852      Kind = tok::period;
2853    }
2854    break;
2855  case '&':
2856    Char = getCharAndSize(CurPtr, SizeTmp);
2857    if (Char == '&') {
2858      Kind = tok::ampamp;
2859      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2860    } else if (Char == '=') {
2861      Kind = tok::ampequal;
2862      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2863    } else {
2864      Kind = tok::amp;
2865    }
2866    break;
2867  case '*':
2868    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
2869      Kind = tok::starequal;
2870      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2871    } else {
2872      Kind = tok::star;
2873    }
2874    break;
2875  case '+':
2876    Char = getCharAndSize(CurPtr, SizeTmp);
2877    if (Char == '+') {
2878      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2879      Kind = tok::plusplus;
2880    } else if (Char == '=') {
2881      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2882      Kind = tok::plusequal;
2883    } else {
2884      Kind = tok::plus;
2885    }
2886    break;
2887  case '-':
2888    Char = getCharAndSize(CurPtr, SizeTmp);
2889    if (Char == '-') {      // --
2890      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2891      Kind = tok::minusminus;
2892    } else if (Char == '>' && Features.CPlusPlus &&
2893               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
2894      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2895                           SizeTmp2, Result);
2896      Kind = tok::arrowstar;
2897    } else if (Char == '>') {   // ->
2898      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2899      Kind = tok::arrow;
2900    } else if (Char == '=') {   // -=
2901      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2902      Kind = tok::minusequal;
2903    } else {
2904      Kind = tok::minus;
2905    }
2906    break;
2907  case '~':
2908    Kind = tok::tilde;
2909    break;
2910  case '!':
2911    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
2912      Kind = tok::exclaimequal;
2913      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2914    } else {
2915      Kind = tok::exclaim;
2916    }
2917    break;
2918  case '/':
2919    // 6.4.9: Comments
2920    Char = getCharAndSize(CurPtr, SizeTmp);
2921    if (Char == '/') {         // BCPL comment.
2922      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
2923      // want to lex this as a comment.  There is one problem with this though,
2924      // that in one particular corner case, this can change the behavior of the
2925      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
2926      // this as "foo / bar" and langauges with BCPL comments would lex it as
2927      // "foo".  Check to see if the character after the second slash is a '*'.
2928      // If so, we will lex that as a "/" instead of the start of a comment.
2929      // However, we never do this in -traditional-cpp mode.
2930      if ((Features.BCPLComment ||
2931           getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') &&
2932          !Features.TraditionalCPP) {
2933        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
2934          return; // There is a token to return.
2935
2936        // It is common for the tokens immediately after a // comment to be
2937        // whitespace (indentation for the next line).  Instead of going through
2938        // the big switch, handle it efficiently now.
2939        goto SkipIgnoredUnits;
2940      }
2941    }
2942
2943    if (Char == '*') {  // /**/ comment.
2944      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
2945        return; // There is a token to return.
2946      goto LexNextToken;   // GCC isn't tail call eliminating.
2947    }
2948
2949    if (Char == '=') {
2950      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2951      Kind = tok::slashequal;
2952    } else {
2953      Kind = tok::slash;
2954    }
2955    break;
2956  case '%':
2957    Char = getCharAndSize(CurPtr, SizeTmp);
2958    if (Char == '=') {
2959      Kind = tok::percentequal;
2960      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2961    } else if (Features.Digraphs && Char == '>') {
2962      Kind = tok::r_brace;                             // '%>' -> '}'
2963      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2964    } else if (Features.Digraphs && Char == ':') {
2965      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2966      Char = getCharAndSize(CurPtr, SizeTmp);
2967      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
2968        Kind = tok::hashhash;                          // '%:%:' -> '##'
2969        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
2970                             SizeTmp2, Result);
2971      } else if (Char == '@' && Features.MicrosoftExt) {// %:@ -> #@ -> Charize
2972        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
2973        if (!isLexingRawMode())
2974          Diag(BufferPtr, diag::ext_charize_microsoft);
2975        Kind = tok::hashat;
2976      } else {                                         // '%:' -> '#'
2977        // We parsed a # character.  If this occurs at the start of the line,
2978        // it's actually the start of a preprocessing directive.  Callback to
2979        // the preprocessor to handle it.
2980        // FIXME: -fpreprocessed mode??
2981        if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
2982          FormTokenWithChars(Result, CurPtr, tok::hash);
2983          PP->HandleDirective(Result);
2984
2985          // As an optimization, if the preprocessor didn't switch lexers, tail
2986          // recurse.
2987          if (PP->isCurrentLexer(this)) {
2988            // Start a new token. If this is a #include or something, the PP may
2989            // want us starting at the beginning of the line again.  If so, set
2990            // the StartOfLine flag and clear LeadingSpace.
2991            if (IsAtStartOfLine) {
2992              Result.setFlag(Token::StartOfLine);
2993              Result.clearFlag(Token::LeadingSpace);
2994              IsAtStartOfLine = false;
2995            }
2996            goto LexNextToken;   // GCC isn't tail call eliminating.
2997          }
2998
2999          return PP->Lex(Result);
3000        }
3001
3002        Kind = tok::hash;
3003      }
3004    } else {
3005      Kind = tok::percent;
3006    }
3007    break;
3008  case '<':
3009    Char = getCharAndSize(CurPtr, SizeTmp);
3010    if (ParsingFilename) {
3011      return LexAngledStringLiteral(Result, CurPtr);
3012    } else if (Char == '<') {
3013      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3014      if (After == '=') {
3015        Kind = tok::lesslessequal;
3016        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3017                             SizeTmp2, Result);
3018      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
3019        // If this is actually a '<<<<<<<' version control conflict marker,
3020        // recognize it as such and recover nicely.
3021        goto LexNextToken;
3022      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
3023        // If this is '<<<<' and we're in a Perforce-style conflict marker,
3024        // ignore it.
3025        goto LexNextToken;
3026      } else if (Features.CUDA && After == '<') {
3027        Kind = tok::lesslessless;
3028        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3029                             SizeTmp2, Result);
3030      } else {
3031        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3032        Kind = tok::lessless;
3033      }
3034    } else if (Char == '=') {
3035      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3036      Kind = tok::lessequal;
3037    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
3038      if (Features.CPlusPlus0x &&
3039          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
3040        // C++0x [lex.pptoken]p3:
3041        //  Otherwise, if the next three characters are <:: and the subsequent
3042        //  character is neither : nor >, the < is treated as a preprocessor
3043        //  token by itself and not as the first character of the alternative
3044        //  token <:.
3045        unsigned SizeTmp3;
3046        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3047        if (After != ':' && After != '>') {
3048          Kind = tok::less;
3049          if (!isLexingRawMode())
3050            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
3051          break;
3052        }
3053      }
3054
3055      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3056      Kind = tok::l_square;
3057    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
3058      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3059      Kind = tok::l_brace;
3060    } else {
3061      Kind = tok::less;
3062    }
3063    break;
3064  case '>':
3065    Char = getCharAndSize(CurPtr, SizeTmp);
3066    if (Char == '=') {
3067      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3068      Kind = tok::greaterequal;
3069    } else if (Char == '>') {
3070      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
3071      if (After == '=') {
3072        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3073                             SizeTmp2, Result);
3074        Kind = tok::greatergreaterequal;
3075      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
3076        // If this is actually a '>>>>' conflict marker, recognize it as such
3077        // and recover nicely.
3078        goto LexNextToken;
3079      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
3080        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
3081        goto LexNextToken;
3082      } else if (Features.CUDA && After == '>') {
3083        Kind = tok::greatergreatergreater;
3084        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3085                             SizeTmp2, Result);
3086      } else {
3087        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3088        Kind = tok::greatergreater;
3089      }
3090
3091    } else {
3092      Kind = tok::greater;
3093    }
3094    break;
3095  case '^':
3096    Char = getCharAndSize(CurPtr, SizeTmp);
3097    if (Char == '=') {
3098      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3099      Kind = tok::caretequal;
3100    } else {
3101      Kind = tok::caret;
3102    }
3103    break;
3104  case '|':
3105    Char = getCharAndSize(CurPtr, SizeTmp);
3106    if (Char == '=') {
3107      Kind = tok::pipeequal;
3108      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3109    } else if (Char == '|') {
3110      // If this is '|||||||' and we're in a conflict marker, ignore it.
3111      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
3112        goto LexNextToken;
3113      Kind = tok::pipepipe;
3114      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3115    } else {
3116      Kind = tok::pipe;
3117    }
3118    break;
3119  case ':':
3120    Char = getCharAndSize(CurPtr, SizeTmp);
3121    if (Features.Digraphs && Char == '>') {
3122      Kind = tok::r_square; // ':>' -> ']'
3123      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3124    } else if (Features.CPlusPlus && Char == ':') {
3125      Kind = tok::coloncolon;
3126      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3127    } else {
3128      Kind = tok::colon;
3129    }
3130    break;
3131  case ';':
3132    Kind = tok::semi;
3133    break;
3134  case '=':
3135    Char = getCharAndSize(CurPtr, SizeTmp);
3136    if (Char == '=') {
3137      // If this is '====' and we're in a conflict marker, ignore it.
3138      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
3139        goto LexNextToken;
3140
3141      Kind = tok::equalequal;
3142      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3143    } else {
3144      Kind = tok::equal;
3145    }
3146    break;
3147  case ',':
3148    Kind = tok::comma;
3149    break;
3150  case '#':
3151    Char = getCharAndSize(CurPtr, SizeTmp);
3152    if (Char == '#') {
3153      Kind = tok::hashhash;
3154      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3155    } else if (Char == '@' && Features.MicrosoftExt) {  // #@ -> Charize
3156      Kind = tok::hashat;
3157      if (!isLexingRawMode())
3158        Diag(BufferPtr, diag::ext_charize_microsoft);
3159      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
3160    } else {
3161      // We parsed a # character.  If this occurs at the start of the line,
3162      // it's actually the start of a preprocessing directive.  Callback to
3163      // the preprocessor to handle it.
3164      // FIXME: -fpreprocessed mode??
3165      if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) {
3166        FormTokenWithChars(Result, CurPtr, tok::hash);
3167        PP->HandleDirective(Result);
3168
3169        // As an optimization, if the preprocessor didn't switch lexers, tail
3170        // recurse.
3171        if (PP->isCurrentLexer(this)) {
3172          // Start a new token.  If this is a #include or something, the PP may
3173          // want us starting at the beginning of the line again.  If so, set
3174          // the StartOfLine flag and clear LeadingSpace.
3175          if (IsAtStartOfLine) {
3176            Result.setFlag(Token::StartOfLine);
3177            Result.clearFlag(Token::LeadingSpace);
3178            IsAtStartOfLine = false;
3179          }
3180          goto LexNextToken;   // GCC isn't tail call eliminating.
3181        }
3182        return PP->Lex(Result);
3183      }
3184
3185      Kind = tok::hash;
3186    }
3187    break;
3188
3189  case '@':
3190    // Objective C support.
3191    if (CurPtr[-1] == '@' && Features.ObjC1)
3192      Kind = tok::at;
3193    else
3194      Kind = tok::unknown;
3195    break;
3196
3197  case '\\':
3198    // FIXME: UCN's.
3199    // FALL THROUGH.
3200  default:
3201    Kind = tok::unknown;
3202    break;
3203  }
3204
3205  // Notify MIOpt that we read a non-whitespace/non-comment token.
3206  MIOpt.ReadToken();
3207
3208  // Update the location of token as well as BufferPtr.
3209  FormTokenWithChars(Result, CurPtr, Kind);
3210}
3211