Lexer.cpp revision d1186fa38166a581b51975f0382a45fc3a0733d0
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This file implements the Lexer and Token interfaces.
11//
12//===----------------------------------------------------------------------===//
13//
14// TODO: GCC Diagnostics emitted by the lexer:
15// PEDWARN: (form feed|vertical tab) in preprocessing directive
16//
17// Universal characters, unicode, char mapping:
18// WARNING: `%.*s' is not in NFKC
19// WARNING: `%.*s' is not in NFC
20//
21// Other:
22// TODO: Options to support:
23//    -fexec-charset,-fwide-exec-charset
24//
25//===----------------------------------------------------------------------===//
26
27#include "clang/Lex/Lexer.h"
28#include "clang/Lex/Preprocessor.h"
29#include "clang/Basic/Diagnostic.h"
30#include "clang/Basic/SourceManager.h"
31#include "llvm/Support/Compiler.h"
32#include "llvm/Support/MemoryBuffer.h"
33#include <cctype>
34using namespace clang;
35
36static void InitCharacterInfo();
37
38//===----------------------------------------------------------------------===//
39// Token Class Implementation
40//===----------------------------------------------------------------------===//
41
42/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
43bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
44  if (IdentifierInfo *II = getIdentifierInfo())
45    return II->getObjCKeywordID() == objcKey;
46  return false;
47}
48
49/// getObjCKeywordID - Return the ObjC keyword kind.
50tok::ObjCKeywordKind Token::getObjCKeywordID() const {
51  IdentifierInfo *specId = getIdentifierInfo();
52  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
53}
54
55
56//===----------------------------------------------------------------------===//
57// Lexer Class Implementation
58//===----------------------------------------------------------------------===//
59
60void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
61                      const char *BufEnd) {
62  InitCharacterInfo();
63
64  BufferStart = BufStart;
65  BufferPtr = BufPtr;
66  BufferEnd = BufEnd;
67
68  assert(BufEnd[0] == 0 &&
69         "We assume that the input buffer has a null character at the end"
70         " to simplify lexing!");
71
72  Is_PragmaLexer = false;
73
74  // Start of the file is a start of line.
75  IsAtStartOfLine = true;
76
77  // We are not after parsing a #.
78  ParsingPreprocessorDirective = false;
79
80  // We are not after parsing #include.
81  ParsingFilename = false;
82
83  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
84  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
85  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
86  // or otherwise skipping over tokens.
87  LexingRawMode = false;
88
89  // Default to not keeping comments.
90  ExtendedTokenMode = 0;
91}
92
93/// Lexer constructor - Create a new lexer object for the specified buffer
94/// with the specified preprocessor managing the lexing process.  This lexer
95/// assumes that the associated file buffer and Preprocessor objects will
96/// outlive it, so it doesn't take ownership of either of them.
97Lexer::Lexer(FileID FID, Preprocessor &PP)
98  : PreprocessorLexer(&PP, FID),
99    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
100    Features(PP.getLangOptions()) {
101
102  const llvm::MemoryBuffer *InputFile = PP.getSourceManager().getBuffer(FID);
103
104  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
105            InputFile->getBufferEnd());
106
107  // Default to keeping comments if the preprocessor wants them.
108  SetCommentRetentionState(PP.getCommentRetentionState());
109}
110
111/// Lexer constructor - Create a new raw lexer object.  This object is only
112/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
113/// range will outlive it, so it doesn't take ownership of it.
114Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
115             const char *BufStart, const char *BufPtr, const char *BufEnd)
116  : FileLoc(fileloc), Features(features) {
117
118  InitLexer(BufStart, BufPtr, BufEnd);
119
120  // We *are* in raw mode.
121  LexingRawMode = true;
122}
123
124/// Lexer constructor - Create a new raw lexer object.  This object is only
125/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
126/// range will outlive it, so it doesn't take ownership of it.
127Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features)
128  : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
129  const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID);
130
131  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
132            FromFile->getBufferEnd());
133
134  // We *are* in raw mode.
135  LexingRawMode = true;
136}
137
138/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
139/// _Pragma expansion.  This has a variety of magic semantics that this method
140/// sets up.  It returns a new'd Lexer that must be delete'd when done.
141///
142/// On entrance to this routine, TokStartLoc is a macro location which has a
143/// spelling loc that indicates the bytes to be lexed for the token and an
144/// instantiation location that indicates where all lexed tokens should be
145/// "expanded from".
146///
147/// FIXME: It would really be nice to make _Pragma just be a wrapper around a
148/// normal lexer that remaps tokens as they fly by.  This would require making
149/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
150/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
151/// out of the critical path of the lexer!
152///
153Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
154                                 SourceLocation InstantiationLoc,
155                                 unsigned TokLen, Preprocessor &PP) {
156  SourceManager &SM = PP.getSourceManager();
157
158  // Create the lexer as if we were going to lex the file normally.
159  FileID SpellingFID = SM.getFileID(SpellingLoc);
160  Lexer *L = new Lexer(SpellingFID, PP);
161
162  // Now that the lexer is created, change the start/end locations so that we
163  // just lex the subsection of the file that we want.  This is lexing from a
164  // scratch buffer.
165  const char *StrData = SM.getCharacterData(SpellingLoc);
166
167  L->BufferPtr = StrData;
168  L->BufferEnd = StrData+TokLen;
169
170  // Set the SourceLocation with the remapping information.  This ensures that
171  // GetMappedTokenLoc will remap the tokens as they are lexed.
172  L->FileLoc = SM.getInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
173                                      InstantiationLoc);
174
175  // Ensure that the lexer thinks it is inside a directive, so that end \n will
176  // return an EOM token.
177  L->ParsingPreprocessorDirective = true;
178
179  // This lexer really is for _Pragma.
180  L->Is_PragmaLexer = true;
181  return L;
182}
183
184
185/// Stringify - Convert the specified string into a C string, with surrounding
186/// ""'s, and with escaped \ and " characters.
187std::string Lexer::Stringify(const std::string &Str, bool Charify) {
188  std::string Result = Str;
189  char Quote = Charify ? '\'' : '"';
190  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
191    if (Result[i] == '\\' || Result[i] == Quote) {
192      Result.insert(Result.begin()+i, '\\');
193      ++i; ++e;
194    }
195  }
196  return Result;
197}
198
199/// Stringify - Convert the specified string into a C string by escaping '\'
200/// and " characters.  This does not add surrounding ""'s to the string.
201void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
202  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
203    if (Str[i] == '\\' || Str[i] == '"') {
204      Str.insert(Str.begin()+i, '\\');
205      ++i; ++e;
206    }
207  }
208}
209
210
211/// MeasureTokenLength - Relex the token at the specified location and return
212/// its length in bytes in the input file.  If the token needs cleaning (e.g.
213/// includes a trigraph or an escaped newline) then this count includes bytes
214/// that are part of that.
215unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
216                                   const SourceManager &SM) {
217  // If this comes from a macro expansion, we really do want the macro name, not
218  // the token this macro expanded to.
219  Loc = SM.getInstantiationLoc(Loc);
220
221  // TODO: this could be special cased for common tokens like identifiers, ')',
222  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
223  // all obviously single-char tokens.  This could use
224  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
225  // something.
226  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedFileLoc(Loc);
227  std::pair<const char *,const char *> Buffer = SM.getBufferData(LocInfo.first);
228  const char *StrData = Buffer.first+LocInfo.second;
229
230  // Create a langops struct and enable trigraphs.  This is sufficient for
231  // measuring tokens.
232  LangOptions LangOpts;
233  LangOpts.Trigraphs = true;
234
235  // Create a lexer starting at the beginning of this token.
236  Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second);
237  Token TheTok;
238  TheLexer.LexFromRawLexer(TheTok);
239  return TheTok.getLength();
240}
241
242//===----------------------------------------------------------------------===//
243// Character information.
244//===----------------------------------------------------------------------===//
245
// CharInfo - Classification table indexed by unsigned character value.  It is
// filled in by InitCharacterInfo with the CHAR_* bits below; entries for all
// other characters stay zero.
static unsigned char CharInfo[256];

enum {
  CHAR_HORZ_WS  = 1 << 0,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS  = 1 << 1,  // '\r', '\n'
  CHAR_LETTER   = 1 << 2,  // a-z,A-Z
  CHAR_NUMBER   = 1 << 3,  // 0-9
  CHAR_UNDER    = 1 << 4,  // _
  CHAR_PERIOD   = 1 << 5   // .
};
256
257static void InitCharacterInfo() {
258  static bool isInited = false;
259  if (isInited) return;
260  isInited = true;
261
262  // Intiialize the CharInfo table.
263  // TODO: statically initialize this.
264  CharInfo[(int)' '] = CharInfo[(int)'\t'] =
265  CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
266  CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;
267
268  CharInfo[(int)'_'] = CHAR_UNDER;
269  CharInfo[(int)'.'] = CHAR_PERIOD;
270  for (unsigned i = 'a'; i <= 'z'; ++i)
271    CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
272  for (unsigned i = '0'; i <= '9'; ++i)
273    CharInfo[i] = CHAR_NUMBER;
274}
275
276/// isIdentifierBody - Return true if this is the body character of an
277/// identifier, which is [a-zA-Z0-9_].
278static inline bool isIdentifierBody(unsigned char c) {
279  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
280}
281
282/// isHorizontalWhitespace - Return true if this character is horizontal
283/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
284static inline bool isHorizontalWhitespace(unsigned char c) {
285  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
286}
287
288/// isWhitespace - Return true if this character is horizontal or vertical
289/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns false
290/// for '\0'.
291static inline bool isWhitespace(unsigned char c) {
292  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
293}
294
295/// isNumberBody - Return true if this is the body character of an
296/// preprocessing number, which is [a-zA-Z0-9_.].
297static inline bool isNumberBody(unsigned char c) {
298  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
299    true : false;
300}
301
302
303//===----------------------------------------------------------------------===//
304// Diagnostics forwarding code.
305//===----------------------------------------------------------------------===//
306
307/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
308/// lexer buffer was all instantiated at a single point, perform the mapping.
309/// This is currently only used for _Pragma implementation, so it is the slow
310/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
311static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
312                                        SourceLocation FileLoc,
313                                        unsigned CharNo) DISABLE_INLINE;
314static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
315                                        SourceLocation FileLoc,
316                                        unsigned CharNo) {
317  // Otherwise, we're lexing "mapped tokens".  This is used for things like
318  // _Pragma handling.  Combine the instantiation location of FileLoc with the
319  // spelling location.
320  SourceManager &SourceMgr = PP.getSourceManager();
321
322  // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
323  // characters come from spelling(FileLoc)+Offset.
324  SourceLocation InstLoc = SourceMgr.getInstantiationLoc(FileLoc);
325  SourceLocation SpellingLoc = SourceMgr.getSpellingLoc(FileLoc);
326  SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);
327  return SourceMgr.getInstantiationLoc(SpellingLoc, InstLoc);
328}
329
330/// getSourceLocation - Return a source location identifier for the specified
331/// offset in the current file.
332SourceLocation Lexer::getSourceLocation(const char *Loc) const {
333  assert(Loc >= BufferStart && Loc <= BufferEnd &&
334         "Location out of range for this buffer!");
335
336  // In the normal case, we're just lexing from a simple file buffer, return
337  // the file id from FileLoc with the offset specified.
338  unsigned CharNo = Loc-BufferStart;
339  if (FileLoc.isFileID())
340    return FileLoc.getFileLocWithOffset(CharNo);
341
342  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
343  // tokens are lexed from where the _Pragma was defined.
344  assert(PP && "This doesn't work on raw lexers");
345  return GetMappedTokenLoc(*PP, FileLoc, CharNo);
346}
347
348/// Diag - Forwarding function for diagnostics.  This translate a source
349/// position in the current buffer into a SourceLocation object for rendering.
350DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
351  return PP->Diag(getSourceLocation(Loc), DiagID);
352}
353
354//===----------------------------------------------------------------------===//
355// Trigraph and Escaped Newline Handling Code.
356//===----------------------------------------------------------------------===//
357
/// GetTrigraphCharForLetter - Given the character that follows a ?? pair,
/// return the character the trigraph stands for, or '\0' if the three
/// characters do not form a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Letters[i] after "??" decodes to Decoded[i].
  static const char Letters[] = { '=', ')', '(', '!', '\'', '>', '/',  '<', '-' };
  static const char Decoded[] = { '#', ']', '[', '|', '^',  '}', '\\', '{', '~' };

  for (unsigned i = 0; i != sizeof(Letters); ++i)
    if (Letters[i] == Letter)
      return Decoded[i];
  return 0;
}
374
375/// DecodeTrigraphChar - If the specified character is a legal trigraph when
376/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
377/// return the result character.  Finally, emit a warning about trigraph use
378/// whether trigraphs are enabled or not.
379static char DecodeTrigraphChar(const char *CP, Lexer *L) {
380  char Res = GetTrigraphCharForLetter(*CP);
381  if (!Res || !L) return Res;
382
383  if (!L->getFeatures().Trigraphs) {
384    if (!L->isLexingRawMode())
385      L->Diag(CP-2, diag::trigraph_ignored);
386    return 0;
387  }
388
389  if (!L->isLexingRawMode())
390    L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res;
391  return Res;
392}
393
394/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
395/// get its size, and return it.  This is tricky in several cases:
396///   1. If currently at the start of a trigraph, we warn about the trigraph,
397///      then either return the trigraph (skipping 3 chars) or the '?',
398///      depending on whether trigraphs are enabled or not.
399///   2. If this is an escaped newline (potentially with whitespace between
400///      the backslash and newline), implicitly skip the newline and return
401///      the char after it.
402///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
403///
404/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
405/// know that we can accumulate into Size, and that we have already incremented
406/// Ptr by Size bytes.
407///
408/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
409/// be updated to match.
410///
411char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
412                               Token *Tok) {
413  // If we have a slash, look for an escaped newline.
414  if (Ptr[0] == '\\') {
415    ++Size;
416    ++Ptr;
417Slash:
418    // Common case, backslash-char where the char is not whitespace.
419    if (!isWhitespace(Ptr[0])) return '\\';
420
421    // See if we have optional whitespace characters followed by a newline.
422    {
423      unsigned SizeTmp = 0;
424      do {
425        ++SizeTmp;
426        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
427          // Remember that this token needs to be cleaned.
428          if (Tok) Tok->setFlag(Token::NeedsCleaning);
429
430          // Warn if there was whitespace between the backslash and newline.
431          if (SizeTmp != 1 && Tok && !isLexingRawMode())
432            Diag(Ptr, diag::backslash_newline_space);
433
434          // If this is a \r\n or \n\r, skip the newlines.
435          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
436              Ptr[SizeTmp-1] != Ptr[SizeTmp])
437            ++SizeTmp;
438
439          // Found backslash<whitespace><newline>.  Parse the char after it.
440          Size += SizeTmp;
441          Ptr  += SizeTmp;
442          // Use slow version to accumulate a correct size field.
443          return getCharAndSizeSlow(Ptr, Size, Tok);
444        }
445      } while (isWhitespace(Ptr[SizeTmp]));
446    }
447
448    // Otherwise, this is not an escaped newline, just return the slash.
449    return '\\';
450  }
451
452  // If this is a trigraph, process it.
453  if (Ptr[0] == '?' && Ptr[1] == '?') {
454    // If this is actually a legal trigraph (not something like "??x"), emit
455    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
456    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
457      // Remember that this token needs to be cleaned.
458      if (Tok) Tok->setFlag(Token::NeedsCleaning);
459
460      Ptr += 3;
461      Size += 3;
462      if (C == '\\') goto Slash;
463      return C;
464    }
465  }
466
467  // If this is neither, return a single character.
468  ++Size;
469  return *Ptr;
470}
471
472
473/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
474/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
475/// and that we have already incremented Ptr by Size bytes.
476///
477/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
478/// be updated to match.
479char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
480                                     const LangOptions &Features) {
481  // If we have a slash, look for an escaped newline.
482  if (Ptr[0] == '\\') {
483    ++Size;
484    ++Ptr;
485Slash:
486    // Common case, backslash-char where the char is not whitespace.
487    if (!isWhitespace(Ptr[0])) return '\\';
488
489    // See if we have optional whitespace characters followed by a newline.
490    {
491      unsigned SizeTmp = 0;
492      do {
493        ++SizeTmp;
494        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
495
496          // If this is a \r\n or \n\r, skip the newlines.
497          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
498              Ptr[SizeTmp-1] != Ptr[SizeTmp])
499            ++SizeTmp;
500
501          // Found backslash<whitespace><newline>.  Parse the char after it.
502          Size += SizeTmp;
503          Ptr  += SizeTmp;
504
505          // Use slow version to accumulate a correct size field.
506          return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
507        }
508      } while (isWhitespace(Ptr[SizeTmp]));
509    }
510
511    // Otherwise, this is not an escaped newline, just return the slash.
512    return '\\';
513  }
514
515  // If this is a trigraph, process it.
516  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
517    // If this is actually a legal trigraph (not something like "??x"), return
518    // it.
519    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
520      Ptr += 3;
521      Size += 3;
522      if (C == '\\') goto Slash;
523      return C;
524    }
525  }
526
527  // If this is neither, return a single character.
528  ++Size;
529  return *Ptr;
530}
531
532//===----------------------------------------------------------------------===//
533// Helper methods for lexing.
534//===----------------------------------------------------------------------===//
535
536void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
537  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
538  unsigned Size;
539  unsigned char C = *CurPtr++;
540  while (isIdentifierBody(C)) {
541    C = *CurPtr++;
542  }
543  --CurPtr;   // Back up over the skipped character.
544
545  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
546  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
547  // FIXME: UCNs.
548  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
549FinishIdentifier:
550    const char *IdStart = BufferPtr;
551    FormTokenWithChars(Result, CurPtr, tok::identifier);
552
553    // If we are in raw mode, return this identifier raw.  There is no need to
554    // look up identifier information or attempt to macro expand it.
555    if (LexingRawMode) return;
556
557    // Fill in Result.IdentifierInfo, looking up the identifier in the
558    // identifier table.
559    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart);
560
561    // Finally, now that we know we have an identifier, pass this off to the
562    // preprocessor, which may macro expand it or something.
563    if (II->isHandleIdentifierCase())
564      PP->HandleIdentifier(Result);
565    return;
566  }
567
568  // Otherwise, $,\,? in identifier found.  Enter slower path.
569
570  C = getCharAndSize(CurPtr, Size);
571  while (1) {
572    if (C == '$') {
573      // If we hit a $ and they are not supported in identifiers, we are done.
574      if (!Features.DollarIdents) goto FinishIdentifier;
575
576      // Otherwise, emit a diagnostic and continue.
577      if (!isLexingRawMode())
578        Diag(CurPtr, diag::ext_dollar_in_identifier);
579      CurPtr = ConsumeChar(CurPtr, Size, Result);
580      C = getCharAndSize(CurPtr, Size);
581      continue;
582    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
583      // Found end of identifier.
584      goto FinishIdentifier;
585    }
586
587    // Otherwise, this character is good, consume it.
588    CurPtr = ConsumeChar(CurPtr, Size, Result);
589
590    C = getCharAndSize(CurPtr, Size);
591    while (isIdentifierBody(C)) { // FIXME: UCNs.
592      CurPtr = ConsumeChar(CurPtr, Size, Result);
593      C = getCharAndSize(CurPtr, Size);
594    }
595  }
596}
597
598
599/// LexNumericConstant - Lex the remainder of a integer or floating point
600/// constant. From[-1] is the first character lexed.  Return the end of the
601/// constant.
602void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
603  unsigned Size;
604  char C = getCharAndSize(CurPtr, Size);
605  char PrevCh = 0;
606  while (isNumberBody(C)) { // FIXME: UCNs?
607    CurPtr = ConsumeChar(CurPtr, Size, Result);
608    PrevCh = C;
609    C = getCharAndSize(CurPtr, Size);
610  }
611
612  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
613  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
614    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
615
616  // If we have a hex FP constant, continue.
617  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
618      (Features.HexFloats || !Features.NoExtensions))
619    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
620
621  // Update the location of token as well as BufferPtr.
622  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
623}
624
625/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
626/// either " or L".
627void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
628  const char *NulCharacter = 0; // Does this string contain the \0 character?
629
630  char C = getAndAdvanceChar(CurPtr, Result);
631  while (C != '"') {
632    // Skip escaped characters.
633    if (C == '\\') {
634      // Skip the escaped character.
635      C = getAndAdvanceChar(CurPtr, Result);
636    } else if (C == '\n' || C == '\r' ||             // Newline.
637               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
638      if (!isLexingRawMode())
639        Diag(BufferPtr, diag::err_unterminated_string);
640      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
641      return;
642    } else if (C == 0) {
643      NulCharacter = CurPtr-1;
644    }
645    C = getAndAdvanceChar(CurPtr, Result);
646  }
647
648  // If a nul character existed in the string, warn about it.
649  if (NulCharacter && !isLexingRawMode())
650    Diag(NulCharacter, diag::null_in_string);
651
652  // Update the location of the token as well as the BufferPtr instance var.
653  FormTokenWithChars(Result, CurPtr,
654                     Wide ? tok::wide_string_literal : tok::string_literal);
655}
656
657/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
658/// after having lexed the '<' character.  This is used for #include filenames.
659void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
660  const char *NulCharacter = 0; // Does this string contain the \0 character?
661
662  char C = getAndAdvanceChar(CurPtr, Result);
663  while (C != '>') {
664    // Skip escaped characters.
665    if (C == '\\') {
666      // Skip the escaped character.
667      C = getAndAdvanceChar(CurPtr, Result);
668    } else if (C == '\n' || C == '\r' ||             // Newline.
669               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
670      if (!isLexingRawMode())
671        Diag(BufferPtr, diag::err_unterminated_string);
672      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
673      return;
674    } else if (C == 0) {
675      NulCharacter = CurPtr-1;
676    }
677    C = getAndAdvanceChar(CurPtr, Result);
678  }
679
680  // If a nul character existed in the string, warn about it.
681  if (NulCharacter && !isLexingRawMode())
682    Diag(NulCharacter, diag::null_in_string);
683
684  // Update the location of token as well as BufferPtr.
685  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
686}
687
688
689/// LexCharConstant - Lex the remainder of a character constant, after having
690/// lexed either ' or L'.
691void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
692  const char *NulCharacter = 0; // Does this character contain the \0 character?
693
694  // Handle the common case of 'x' and '\y' efficiently.
695  char C = getAndAdvanceChar(CurPtr, Result);
696  if (C == '\'') {
697    if (!isLexingRawMode())
698      Diag(BufferPtr, diag::err_empty_character);
699    FormTokenWithChars(Result, CurPtr, tok::unknown);
700    return;
701  } else if (C == '\\') {
702    // Skip the escaped character.
703    // FIXME: UCN's.
704    C = getAndAdvanceChar(CurPtr, Result);
705  }
706
707  if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
708    ++CurPtr;
709  } else {
710    // Fall back on generic code for embedded nulls, newlines, wide chars.
711    do {
712      // Skip escaped characters.
713      if (C == '\\') {
714        // Skip the escaped character.
715        C = getAndAdvanceChar(CurPtr, Result);
716      } else if (C == '\n' || C == '\r' ||               // Newline.
717                 (C == 0 && CurPtr-1 == BufferEnd)) {    // End of file.
718        if (!isLexingRawMode())
719          Diag(BufferPtr, diag::err_unterminated_char);
720        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
721        return;
722      } else if (C == 0) {
723        NulCharacter = CurPtr-1;
724      }
725      C = getAndAdvanceChar(CurPtr, Result);
726    } while (C != '\'');
727  }
728
729  if (NulCharacter && !isLexingRawMode())
730    Diag(NulCharacter, diag::null_in_char);
731
732  // Update the location of token as well as BufferPtr.
733  FormTokenWithChars(Result, CurPtr, tok::char_constant);
734}
735
736/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
737/// Update BufferPtr to point to the next non-whitespace character and return.
738///
739/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
740///
741bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
742  // Whitespace - Skip it, then return the token after the whitespace.
743  unsigned char Char = *CurPtr;  // Skip consequtive spaces efficiently.
744  while (1) {
745    // Skip horizontal whitespace very aggressively.
746    while (isHorizontalWhitespace(Char))
747      Char = *++CurPtr;
748
749    // Otherwise if we have something other than whitespace, we're done.
750    if (Char != '\n' && Char != '\r')
751      break;
752
753    if (ParsingPreprocessorDirective) {
754      // End of preprocessor directive line, let LexTokenInternal handle this.
755      BufferPtr = CurPtr;
756      return false;
757    }
758
759    // ok, but handle newline.
760    // The returned token is at the start of the line.
761    Result.setFlag(Token::StartOfLine);
762    // No leading whitespace seen so far.
763    Result.clearFlag(Token::LeadingSpace);
764    Char = *++CurPtr;
765  }
766
767  // If this isn't immediately after a newline, there is leading space.
768  char PrevChar = CurPtr[-1];
769  if (PrevChar != '\n' && PrevChar != '\r')
770    Result.setFlag(Token::LeadingSpace);
771
772  // If the client wants us to return whitespace, return it now.
773  if (isKeepWhitespaceMode()) {
774    FormTokenWithChars(Result, CurPtr, tok::unknown);
775    return true;
776  }
777
778  BufferPtr = CurPtr;
779  return false;
780}
781
/// SkipBCPLComment - We have just read the // characters from input.  Skip
/// until we find the newline character that terminates the comment.  Then
/// update BufferPtr and return.  If we're in KeepCommentMode, this will form
/// the token and return true.
786bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
787  // If BCPL comments aren't explicitly enabled for this language, emit an
788  // extension warning.
789  if (!Features.BCPLComment && !isLexingRawMode()) {
790    Diag(BufferPtr, diag::ext_bcpl_comment);
791
792    // Mark them enabled so we only emit one warning for this translation
793    // unit.
794    Features.BCPLComment = true;
795  }
796
797  // Scan over the body of the comment.  The common case, when scanning, is that
798  // the comment contains normal ascii characters with nothing interesting in
799  // them.  As such, optimize for this case with the inner loop.
800  char C;
801  do {
802    C = *CurPtr;
803    // FIXME: Speedup BCPL comment lexing.  Just scan for a \n or \r character.
804    // If we find a \n character, scan backwards, checking to see if it's an
805    // escaped newline, like we do for block comments.
806
807    // Skip over characters in the fast loop.
808    while (C != 0 &&                // Potentially EOF.
809           C != '\\' &&             // Potentially escaped newline.
810           C != '?' &&              // Potentially trigraph.
811           C != '\n' && C != '\r')  // Newline or DOS-style newline.
812      C = *++CurPtr;
813
814    // If this is a newline, we're done.
815    if (C == '\n' || C == '\r')
816      break;  // Found the newline? Break out!
817
818    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
819    // properly decode the character.  Read it in raw mode to avoid emitting
820    // diagnostics about things like trigraphs.  If we see an escaped newline,
821    // we'll handle it below.
822    const char *OldPtr = CurPtr;
823    bool OldRawMode = isLexingRawMode();
824    LexingRawMode = true;
825    C = getAndAdvanceChar(CurPtr, Result);
826    LexingRawMode = OldRawMode;
827
828    // If we read multiple characters, and one of those characters was a \r or
829    // \n, then we had an escaped newline within the comment.  Emit diagnostic
830    // unless the next line is also a // comment.
831    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
832      for (; OldPtr != CurPtr; ++OldPtr)
833        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
834          // Okay, we found a // comment that ends in a newline, if the next
835          // line is also a // comment, but has spaces, don't emit a diagnostic.
836          if (isspace(C)) {
837            const char *ForwardPtr = CurPtr;
838            while (isspace(*ForwardPtr))  // Skip whitespace.
839              ++ForwardPtr;
840            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
841              break;
842          }
843
844          if (!isLexingRawMode())
845            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
846          break;
847        }
848    }
849
850    if (CurPtr == BufferEnd+1) { --CurPtr; break; }
851  } while (C != '\n' && C != '\r');
852
853  // Found but did not consume the newline.
854
855  // If we are returning comments as tokens, return this comment as a token.
856  if (inKeepCommentMode())
857    return SaveBCPLComment(Result, CurPtr);
858
859  // If we are inside a preprocessor directive and we see the end of line,
860  // return immediately, so that the lexer can return this as an EOM token.
861  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
862    BufferPtr = CurPtr;
863    return false;
864  }
865
866  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
867  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
868  // contribute to another token), it isn't needed for correctness.  Note that
869  // this is ok even in KeepWhitespaceMode, because we would have returned the
870  /// comment above in that mode.
871  ++CurPtr;
872
873  // The next returned token is at the start of the line.
874  Result.setFlag(Token::StartOfLine);
875  // No leading whitespace seen so far.
876  Result.clearFlag(Token::LeadingSpace);
877  BufferPtr = CurPtr;
878  return false;
879}
880
881/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
882/// an appropriate way and return it.
883bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
884  // If we're not in a preprocessor directive, just return the // comment
885  // directly.
886  FormTokenWithChars(Result, CurPtr, tok::comment);
887
888  if (!ParsingPreprocessorDirective)
889    return true;
890
891  // If this BCPL-style comment is in a macro definition, transmogrify it into
892  // a C-style block comment.
893  std::string Spelling = PP->getSpelling(Result);
894  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
895  Spelling[1] = '*';   // Change prefix to "/*".
896  Spelling += "*/";    // add suffix.
897
898  Result.setKind(tok::comment);
899  Result.setLocation(PP->CreateString(&Spelling[0], Spelling.size(),
900                                      Result.getLocation()));
901  Result.setLength(Spelling.size());
902  return true;
903}
904
905/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
906/// character (either \n or \r) is part of an escaped newline sequence.  Issue a
907/// diagnostic if so.  We know that the newline is inside of a block comment.
908static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
909                                                  Lexer *L) {
910  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
911
912  // Back up off the newline.
913  --CurPtr;
914
915  // If this is a two-character newline sequence, skip the other character.
916  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
917    // \n\n or \r\r -> not escaped newline.
918    if (CurPtr[0] == CurPtr[1])
919      return false;
920    // \n\r or \r\n -> skip the newline.
921    --CurPtr;
922  }
923
924  // If we have horizontal whitespace, skip over it.  We allow whitespace
925  // between the slash and newline.
926  bool HasSpace = false;
927  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
928    --CurPtr;
929    HasSpace = true;
930  }
931
932  // If we have a slash, we know this is an escaped newline.
933  if (*CurPtr == '\\') {
934    if (CurPtr[-1] != '*') return false;
935  } else {
936    // It isn't a slash, is it the ?? / trigraph?
937    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
938        CurPtr[-3] != '*')
939      return false;
940
941    // This is the trigraph ending the comment.  Emit a stern warning!
942    CurPtr -= 2;
943
944    // If no trigraphs are enabled, warn that we ignored this trigraph and
945    // ignore this * character.
946    if (!L->getFeatures().Trigraphs) {
947      if (!L->isLexingRawMode())
948        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
949      return false;
950    }
951    if (!L->isLexingRawMode())
952      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
953  }
954
955  // Warn about having an escaped newline between the */ characters.
956  if (!L->isLexingRawMode())
957    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
958
959  // If there was space between the backslash and newline, warn about it.
960  if (HasSpace && !L->isLexingRawMode())
961    L->Diag(CurPtr, diag::backslash_newline_space);
962
963  return true;
964}
965
966#ifdef __SSE2__
967#include <emmintrin.h>
968#elif __ALTIVEC__
969#include <altivec.h>
970#undef bool
971#endif
972
973/// SkipBlockComment - We have just read the /* characters from input.  Read
974/// until we find the */ characters that terminate the comment.  Note that we
975/// don't bother decoding trigraphs or escaped newlines in block comments,
976/// because they cannot cause the comment to end.  The only thing that can
977/// happen is the comment could end with an escaped newline between the */ end
978/// of comment.
979///
980/// If KeepCommentMode is enabled, this forms a token from the comment and
981/// returns true.
982bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
983  // Scan one character past where we should, looking for a '/' character.  Once
984  // we find it, check to see if it was preceeded by a *.  This common
985  // optimization helps people who like to put a lot of * characters in their
986  // comments.
987
988  // The first character we get with newlines and trigraphs skipped to handle
989  // the degenerate /*/ case below correctly if the * has an escaped newline
990  // after it.
991  unsigned CharSize;
992  unsigned char C = getCharAndSize(CurPtr, CharSize);
993  CurPtr += CharSize;
994  if (C == 0 && CurPtr == BufferEnd+1) {
995    if (!isLexingRawMode())
996      Diag(BufferPtr, diag::err_unterminated_block_comment);
997    --CurPtr;
998
999    // KeepWhitespaceMode should return this broken comment as a token.  Since
1000    // it isn't a well formed comment, just return it as an 'unknown' token.
1001    if (isKeepWhitespaceMode()) {
1002      FormTokenWithChars(Result, CurPtr, tok::unknown);
1003      return true;
1004    }
1005
1006    BufferPtr = CurPtr;
1007    return false;
1008  }
1009
1010  // Check to see if the first character after the '/*' is another /.  If so,
1011  // then this slash does not end the block comment, it is part of it.
1012  if (C == '/')
1013    C = *CurPtr++;
1014
1015  while (1) {
1016    // Skip over all non-interesting characters until we find end of buffer or a
1017    // (probably ending) '/' character.
1018    if (CurPtr + 24 < BufferEnd) {
1019      // While not aligned to a 16-byte boundary.
1020      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
1021        C = *CurPtr++;
1022
1023      if (C == '/') goto FoundSlash;
1024
1025#ifdef __SSE2__
1026      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
1027                                     '/', '/', '/', '/', '/', '/', '/', '/');
1028      while (CurPtr+16 <= BufferEnd &&
1029             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
1030        CurPtr += 16;
1031#elif __ALTIVEC__
1032      __vector unsigned char Slashes = {
1033        '/', '/', '/', '/',  '/', '/', '/', '/',
1034        '/', '/', '/', '/',  '/', '/', '/', '/'
1035      };
1036      while (CurPtr+16 <= BufferEnd &&
1037             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
1038        CurPtr += 16;
1039#else
1040      // Scan for '/' quickly.  Many block comments are very large.
1041      while (CurPtr[0] != '/' &&
1042             CurPtr[1] != '/' &&
1043             CurPtr[2] != '/' &&
1044             CurPtr[3] != '/' &&
1045             CurPtr+4 < BufferEnd) {
1046        CurPtr += 4;
1047      }
1048#endif
1049
1050      // It has to be one of the bytes scanned, increment to it and read one.
1051      C = *CurPtr++;
1052    }
1053
1054    // Loop to scan the remainder.
1055    while (C != '/' && C != '\0')
1056      C = *CurPtr++;
1057
1058  FoundSlash:
1059    if (C == '/') {
1060      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
1061        break;
1062
1063      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
1064        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
1065          // We found the final */, though it had an escaped newline between the
1066          // * and /.  We're done!
1067          break;
1068        }
1069      }
1070      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
1071        // If this is a /* inside of the comment, emit a warning.  Don't do this
1072        // if this is a /*/, which will end the comment.  This misses cases with
1073        // embedded escaped newlines, but oh well.
1074        if (!isLexingRawMode())
1075          Diag(CurPtr-1, diag::warn_nested_block_comment);
1076      }
1077    } else if (C == 0 && CurPtr == BufferEnd+1) {
1078      if (!isLexingRawMode())
1079        Diag(BufferPtr, diag::err_unterminated_block_comment);
1080      // Note: the user probably forgot a */.  We could continue immediately
1081      // after the /*, but this would involve lexing a lot of what really is the
1082      // comment, which surely would confuse the parser.
1083      --CurPtr;
1084
1085      // KeepWhitespaceMode should return this broken comment as a token.  Since
1086      // it isn't a well formed comment, just return it as an 'unknown' token.
1087      if (isKeepWhitespaceMode()) {
1088        FormTokenWithChars(Result, CurPtr, tok::unknown);
1089        return true;
1090      }
1091
1092      BufferPtr = CurPtr;
1093      return false;
1094    }
1095    C = *CurPtr++;
1096  }
1097
1098  // If we are returning comments as tokens, return this comment as a token.
1099  if (inKeepCommentMode()) {
1100    FormTokenWithChars(Result, CurPtr, tok::comment);
1101    return true;
1102  }
1103
1104  // It is common for the tokens immediately after a /**/ comment to be
1105  // whitespace.  Instead of going through the big switch, handle it
1106  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
1107  // have already returned above with the comment as a token.
1108  if (isHorizontalWhitespace(*CurPtr)) {
1109    Result.setFlag(Token::LeadingSpace);
1110    SkipWhitespace(Result, CurPtr+1);
1111    return false;
1112  }
1113
1114  // Otherwise, just return so that the next character will be lexed as a token.
1115  BufferPtr = CurPtr;
1116  Result.setFlag(Token::LeadingSpace);
1117  return false;
1118}
1119
1120//===----------------------------------------------------------------------===//
1121// Primary Lexing Entry Points
1122//===----------------------------------------------------------------------===//
1123
1124/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
1125/// uninterpreted string.  This switches the lexer out of directive mode.
1126std::string Lexer::ReadToEndOfLine() {
1127  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
1128         "Must be in a preprocessing directive!");
1129  std::string Result;
1130  Token Tmp;
1131
1132  // CurPtr - Cache BufferPtr in an automatic variable.
1133  const char *CurPtr = BufferPtr;
1134  while (1) {
1135    char Char = getAndAdvanceChar(CurPtr, Tmp);
1136    switch (Char) {
1137    default:
1138      Result += Char;
1139      break;
1140    case 0:  // Null.
1141      // Found end of file?
1142      if (CurPtr-1 != BufferEnd) {
1143        // Nope, normal character, continue.
1144        Result += Char;
1145        break;
1146      }
1147      // FALL THROUGH.
1148    case '\r':
1149    case '\n':
1150      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
1151      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
1152      BufferPtr = CurPtr-1;
1153
1154      // Next, lex the character, which should handle the EOM transition.
1155      Lex(Tmp);
1156      assert(Tmp.is(tok::eom) && "Unexpected token!");
1157
1158      // Finally, we're done, return the string we found.
1159      return Result;
1160    }
1161  }
1162}
1163
1164/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
1165/// condition, reporting diagnostics and handling other edge cases as required.
1166/// This returns true if Result contains a token, false if PP.Lex should be
1167/// called again.
1168bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
1169  // If we hit the end of the file while parsing a preprocessor directive,
1170  // end the preprocessor directive first.  The next token returned will
1171  // then be the end of file.
1172  if (ParsingPreprocessorDirective) {
1173    // Done parsing the "line".
1174    ParsingPreprocessorDirective = false;
1175    // Update the location of token as well as BufferPtr.
1176    FormTokenWithChars(Result, CurPtr, tok::eom);
1177
1178    // Restore comment saving mode, in case it was disabled for directive.
1179    SetCommentRetentionState(PP->getCommentRetentionState());
1180    return true;  // Have a token.
1181  }
1182
1183  // If we are in raw mode, return this event as an EOF token.  Let the caller
1184  // that put us in raw mode handle the event.
1185  if (isLexingRawMode()) {
1186    Result.startToken();
1187    BufferPtr = BufferEnd;
1188    FormTokenWithChars(Result, BufferEnd, tok::eof);
1189    return true;
1190  }
1191
1192  // Otherwise, issue diagnostics for unterminated #if and missing newline.
1193
1194  // If we are in a #if directive, emit an error.
1195  while (!ConditionalStack.empty()) {
1196    PP->Diag(ConditionalStack.back().IfLoc,
1197             diag::err_pp_unterminated_conditional);
1198    ConditionalStack.pop_back();
1199  }
1200
1201  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
1202  // a pedwarn.
1203  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
1204    Diag(BufferEnd, diag::ext_no_newline_eof);
1205
1206  BufferPtr = CurPtr;
1207
1208  // Finally, let the preprocessor handle this.
1209  return PP->HandleEndOfFile(Result);
1210}
1211
1212/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
1213/// the specified lexer will return a tok::l_paren token, 0 if it is something
1214/// else and 2 if there are no more tokens in the buffer controlled by the
1215/// lexer.
1216unsigned Lexer::isNextPPTokenLParen() {
1217  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
1218
1219  // Switch to 'skipping' mode.  This will ensure that we can lex a token
1220  // without emitting diagnostics, disables macro expansion, and will cause EOF
1221  // to return an EOF token instead of popping the include stack.
1222  LexingRawMode = true;
1223
1224  // Save state that can be changed while lexing so that we can restore it.
1225  const char *TmpBufferPtr = BufferPtr;
1226
1227  Token Tok;
1228  Tok.startToken();
1229  LexTokenInternal(Tok);
1230
1231  // Restore state that may have changed.
1232  BufferPtr = TmpBufferPtr;
1233
1234  // Restore the lexer back to non-skipping mode.
1235  LexingRawMode = false;
1236
1237  if (Tok.is(tok::eof))
1238    return 2;
1239  return Tok.is(tok::l_paren);
1240}
1241
1242
/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  The lexed token is returned
/// in Result; the function itself returns nothing.  The token produced is a
/// preprocessing token, not a normal token; as such, this is an internal
/// interface.  It assumes that the Flags of Result have been cleared before
/// calling this.
1250void Lexer::LexTokenInternal(Token &Result) {
1251LexNextToken:
1252  // New token, can't need cleaning yet.
1253  Result.clearFlag(Token::NeedsCleaning);
1254  Result.setIdentifierInfo(0);
1255
1256  // CurPtr - Cache BufferPtr in an automatic variable.
1257  const char *CurPtr = BufferPtr;
1258
1259  // Small amounts of horizontal whitespace is very common between tokens.
1260  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
1261    ++CurPtr;
1262    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
1263      ++CurPtr;
1264
1265    // If we are keeping whitespace and other tokens, just return what we just
1266    // skipped.  The next lexer invocation will return the token after the
1267    // whitespace.
1268    if (isKeepWhitespaceMode()) {
1269      FormTokenWithChars(Result, CurPtr, tok::unknown);
1270      return;
1271    }
1272
1273    BufferPtr = CurPtr;
1274    Result.setFlag(Token::LeadingSpace);
1275  }
1276
1277  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
1278
1279  // Read a character, advancing over it.
1280  char Char = getAndAdvanceChar(CurPtr, Result);
1281  tok::TokenKind Kind;
1282
1283  switch (Char) {
1284  case 0:  // Null.
1285    // Found end of file?
1286    if (CurPtr-1 == BufferEnd) {
1287      // Read the PP instance variable into an automatic variable, because
1288      // LexEndOfFile will often delete 'this'.
1289      Preprocessor *PPCache = PP;
1290      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
1291        return;   // Got a token to return.
1292      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
1293      return PPCache->Lex(Result);
1294    }
1295
1296    if (!isLexingRawMode())
1297      Diag(CurPtr-1, diag::null_in_file);
1298    Result.setFlag(Token::LeadingSpace);
1299    if (SkipWhitespace(Result, CurPtr))
1300      return; // KeepWhitespaceMode
1301
1302    goto LexNextToken;   // GCC isn't tail call eliminating.
1303  case '\n':
1304  case '\r':
1305    // If we are inside a preprocessor directive and we see the end of line,
1306    // we know we are done with the directive, so return an EOM token.
1307    if (ParsingPreprocessorDirective) {
1308      // Done parsing the "line".
1309      ParsingPreprocessorDirective = false;
1310
1311      // Restore comment saving mode, in case it was disabled for directive.
1312      SetCommentRetentionState(PP->getCommentRetentionState());
1313
1314      // Since we consumed a newline, we are back at the start of a line.
1315      IsAtStartOfLine = true;
1316
1317      Kind = tok::eom;
1318      break;
1319    }
1320    // The returned token is at the start of the line.
1321    Result.setFlag(Token::StartOfLine);
1322    // No leading whitespace seen so far.
1323    Result.clearFlag(Token::LeadingSpace);
1324
1325    if (SkipWhitespace(Result, CurPtr))
1326      return; // KeepWhitespaceMode
1327    goto LexNextToken;   // GCC isn't tail call eliminating.
1328  case ' ':
1329  case '\t':
1330  case '\f':
1331  case '\v':
1332  SkipHorizontalWhitespace:
1333    Result.setFlag(Token::LeadingSpace);
1334    if (SkipWhitespace(Result, CurPtr))
1335      return; // KeepWhitespaceMode
1336
1337  SkipIgnoredUnits:
1338    CurPtr = BufferPtr;
1339
1340    // If the next token is obviously a // or /* */ comment, skip it efficiently
1341    // too (without going through the big switch stmt).
1342    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
1343        Features.BCPLComment) {
1344      SkipBCPLComment(Result, CurPtr+2);
1345      goto SkipIgnoredUnits;
1346    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
1347      SkipBlockComment(Result, CurPtr+2);
1348      goto SkipIgnoredUnits;
1349    } else if (isHorizontalWhitespace(*CurPtr)) {
1350      goto SkipHorizontalWhitespace;
1351    }
1352    goto LexNextToken;   // GCC isn't tail call eliminating.
1353
1354  // C99 6.4.4.1: Integer Constants.
1355  // C99 6.4.4.2: Floating Constants.
1356  case '0': case '1': case '2': case '3': case '4':
1357  case '5': case '6': case '7': case '8': case '9':
1358    // Notify MIOpt that we read a non-whitespace/non-comment token.
1359    MIOpt.ReadToken();
1360    return LexNumericConstant(Result, CurPtr);
1361
1362  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
1363    // Notify MIOpt that we read a non-whitespace/non-comment token.
1364    MIOpt.ReadToken();
1365    Char = getCharAndSize(CurPtr, SizeTmp);
1366
1367    // Wide string literal.
1368    if (Char == '"')
1369      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
1370                              true);
1371
1372    // Wide character constant.
1373    if (Char == '\'')
1374      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1375    // FALL THROUGH, treating L like the start of an identifier.
1376
1377  // C99 6.4.2: Identifiers.
1378  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1379  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
1380  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1381  case 'V': case 'W': case 'X': case 'Y': case 'Z':
1382  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1383  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1384  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1385  case 'v': case 'w': case 'x': case 'y': case 'z':
1386  case '_':
1387    // Notify MIOpt that we read a non-whitespace/non-comment token.
1388    MIOpt.ReadToken();
1389    return LexIdentifier(Result, CurPtr);
1390
1391  case '$':   // $ in identifiers.
1392    if (Features.DollarIdents) {
1393      if (!isLexingRawMode())
1394        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
1395      // Notify MIOpt that we read a non-whitespace/non-comment token.
1396      MIOpt.ReadToken();
1397      return LexIdentifier(Result, CurPtr);
1398    }
1399
1400    Kind = tok::unknown;
1401    break;
1402
1403  // C99 6.4.4: Character Constants.
1404  case '\'':
1405    // Notify MIOpt that we read a non-whitespace/non-comment token.
1406    MIOpt.ReadToken();
1407    return LexCharConstant(Result, CurPtr);
1408
1409  // C99 6.4.5: String Literals.
1410  case '"':
1411    // Notify MIOpt that we read a non-whitespace/non-comment token.
1412    MIOpt.ReadToken();
1413    return LexStringLiteral(Result, CurPtr, false);
1414
1415  // C99 6.4.6: Punctuators.
1416  case '?':
1417    Kind = tok::question;
1418    break;
1419  case '[':
1420    Kind = tok::l_square;
1421    break;
1422  case ']':
1423    Kind = tok::r_square;
1424    break;
1425  case '(':
1426    Kind = tok::l_paren;
1427    break;
1428  case ')':
1429    Kind = tok::r_paren;
1430    break;
1431  case '{':
1432    Kind = tok::l_brace;
1433    break;
1434  case '}':
1435    Kind = tok::r_brace;
1436    break;
1437  case '.':
1438    Char = getCharAndSize(CurPtr, SizeTmp);
1439    if (Char >= '0' && Char <= '9') {
1440      // Notify MIOpt that we read a non-whitespace/non-comment token.
1441      MIOpt.ReadToken();
1442
1443      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1444    } else if (Features.CPlusPlus && Char == '*') {
1445      Kind = tok::periodstar;
1446      CurPtr += SizeTmp;
1447    } else if (Char == '.' &&
1448               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
1449      Kind = tok::ellipsis;
1450      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1451                           SizeTmp2, Result);
1452    } else {
1453      Kind = tok::period;
1454    }
1455    break;
1456  case '&':
1457    Char = getCharAndSize(CurPtr, SizeTmp);
1458    if (Char == '&') {
1459      Kind = tok::ampamp;
1460      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1461    } else if (Char == '=') {
1462      Kind = tok::ampequal;
1463      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1464    } else {
1465      Kind = tok::amp;
1466    }
1467    break;
1468  case '*':
1469    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1470      Kind = tok::starequal;
1471      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1472    } else {
1473      Kind = tok::star;
1474    }
1475    break;
1476  case '+':
1477    Char = getCharAndSize(CurPtr, SizeTmp);
1478    if (Char == '+') {
1479      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1480      Kind = tok::plusplus;
1481    } else if (Char == '=') {
1482      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1483      Kind = tok::plusequal;
1484    } else {
1485      Kind = tok::plus;
1486    }
1487    break;
1488  case '-':
1489    Char = getCharAndSize(CurPtr, SizeTmp);
1490    if (Char == '-') {      // --
1491      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1492      Kind = tok::minusminus;
1493    } else if (Char == '>' && Features.CPlusPlus &&
1494               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
1495      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1496                           SizeTmp2, Result);
1497      Kind = tok::arrowstar;
1498    } else if (Char == '>') {   // ->
1499      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1500      Kind = tok::arrow;
1501    } else if (Char == '=') {   // -=
1502      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1503      Kind = tok::minusequal;
1504    } else {
1505      Kind = tok::minus;
1506    }
1507    break;
1508  case '~':
1509    Kind = tok::tilde;
1510    break;
1511  case '!':
1512    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1513      Kind = tok::exclaimequal;
1514      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1515    } else {
1516      Kind = tok::exclaim;
1517    }
1518    break;
1519  case '/':
1520    // 6.4.9: Comments
1521    Char = getCharAndSize(CurPtr, SizeTmp);
1522    if (Char == '/') {         // BCPL comment.
1523      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
1524      // want to lex this as a comment.  There is one problem with this though,
1525      // that in one particular corner case, this can change the behavior of the
1526      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
1527      // this as "foo / bar" and langauges with BCPL comments would lex it as
1528      // "foo".  Check to see if the character after the second slash is a '*'.
1529      // If so, we will lex that as a "/" instead of the start of a comment.
1530      if (Features.BCPLComment ||
1531          getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
1532        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
1533          return; // KeepCommentMode
1534
1535        // It is common for the tokens immediately after a // comment to be
1536        // whitespace (indentation for the next line).  Instead of going through
1537        // the big switch, handle it efficiently now.
1538        goto SkipIgnoredUnits;
1539      }
1540    }
1541
1542    if (Char == '*') {  // /**/ comment.
1543      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
1544        return; // KeepCommentMode
1545      goto LexNextToken;   // GCC isn't tail call eliminating.
1546    }
1547
1548    if (Char == '=') {
1549      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1550      Kind = tok::slashequal;
1551    } else {
1552      Kind = tok::slash;
1553    }
1554    break;
1555  case '%':
1556    Char = getCharAndSize(CurPtr, SizeTmp);
1557    if (Char == '=') {
1558      Kind = tok::percentequal;
1559      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1560    } else if (Features.Digraphs && Char == '>') {
1561      Kind = tok::r_brace;                             // '%>' -> '}'
1562      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1563    } else if (Features.Digraphs && Char == ':') {
1564      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1565      Char = getCharAndSize(CurPtr, SizeTmp);
1566      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
1567        Kind = tok::hashhash;                          // '%:%:' -> '##'
1568        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1569                             SizeTmp2, Result);
1570      } else if (Char == '@' && Features.Microsoft) {  // %:@ -> #@ -> Charize
1571        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1572        if (!isLexingRawMode())
1573          Diag(BufferPtr, diag::charize_microsoft_ext);
1574        Kind = tok::hashat;
1575      } else {
1576        Kind = tok::hash;       // '%:' -> '#'
1577
1578        // We parsed a # character.  If this occurs at the start of the line,
1579        // it's actually the start of a preprocessing directive.  Callback to
1580        // the preprocessor to handle it.
1581        // FIXME: -fpreprocessed mode??
1582        if (Result.isAtStartOfLine() && !LexingRawMode) {
1583          BufferPtr = CurPtr;
1584          PP->HandleDirective(Result);
1585
1586          // As an optimization, if the preprocessor didn't switch lexers, tail
1587          // recurse.
1588          if (PP->isCurrentLexer(this)) {
1589            // Start a new token. If this is a #include or something, the PP may
1590            // want us starting at the beginning of the line again.  If so, set
1591            // the StartOfLine flag.
1592            if (IsAtStartOfLine) {
1593              Result.setFlag(Token::StartOfLine);
1594              IsAtStartOfLine = false;
1595            }
1596            goto LexNextToken;   // GCC isn't tail call eliminating.
1597          }
1598
1599          return PP->Lex(Result);
1600        }
1601      }
1602    } else {
1603      Kind = tok::percent;
1604    }
1605    break;
1606  case '<':
1607    Char = getCharAndSize(CurPtr, SizeTmp);
1608    if (ParsingFilename) {
1609      return LexAngledStringLiteral(Result, CurPtr+SizeTmp);
1610    } else if (Char == '<' &&
1611               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
1612      Kind = tok::lesslessequal;
1613      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1614                           SizeTmp2, Result);
1615    } else if (Char == '<') {
1616      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1617      Kind = tok::lessless;
1618    } else if (Char == '=') {
1619      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1620      Kind = tok::lessequal;
1621    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
1622      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1623      Kind = tok::l_square;
1624    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
1625      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1626      Kind = tok::l_brace;
1627    } else {
1628      Kind = tok::less;
1629    }
1630    break;
1631  case '>':
1632    Char = getCharAndSize(CurPtr, SizeTmp);
1633    if (Char == '=') {
1634      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1635      Kind = tok::greaterequal;
1636    } else if (Char == '>' &&
1637               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
1638      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1639                           SizeTmp2, Result);
1640      Kind = tok::greatergreaterequal;
1641    } else if (Char == '>') {
1642      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1643      Kind = tok::greatergreater;
1644    } else {
1645      Kind = tok::greater;
1646    }
1647    break;
1648  case '^':
1649    Char = getCharAndSize(CurPtr, SizeTmp);
1650    if (Char == '=') {
1651      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1652      Kind = tok::caretequal;
1653    } else {
1654      Kind = tok::caret;
1655    }
1656    break;
1657  case '|':
1658    Char = getCharAndSize(CurPtr, SizeTmp);
1659    if (Char == '=') {
1660      Kind = tok::pipeequal;
1661      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1662    } else if (Char == '|') {
1663      Kind = tok::pipepipe;
1664      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1665    } else {
1666      Kind = tok::pipe;
1667    }
1668    break;
1669  case ':':
1670    Char = getCharAndSize(CurPtr, SizeTmp);
1671    if (Features.Digraphs && Char == '>') {
1672      Kind = tok::r_square; // ':>' -> ']'
1673      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1674    } else if (Features.CPlusPlus && Char == ':') {
1675      Kind = tok::coloncolon;
1676      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1677    } else {
1678      Kind = tok::colon;
1679    }
1680    break;
1681  case ';':
1682    Kind = tok::semi;
1683    break;
1684  case '=':
1685    Char = getCharAndSize(CurPtr, SizeTmp);
1686    if (Char == '=') {
1687      Kind = tok::equalequal;
1688      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1689    } else {
1690      Kind = tok::equal;
1691    }
1692    break;
1693  case ',':
1694    Kind = tok::comma;
1695    break;
1696  case '#':
1697    Char = getCharAndSize(CurPtr, SizeTmp);
1698    if (Char == '#') {
1699      Kind = tok::hashhash;
1700      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1701    } else if (Char == '@' && Features.Microsoft) {  // #@ -> Charize
1702      Kind = tok::hashat;
1703      if (!isLexingRawMode())
1704        Diag(BufferPtr, diag::charize_microsoft_ext);
1705      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1706    } else {
1707      Kind = tok::hash;
1708      // We parsed a # character.  If this occurs at the start of the line,
1709      // it's actually the start of a preprocessing directive.  Callback to
1710      // the preprocessor to handle it.
1711      // FIXME: -fpreprocessed mode??
1712      if (Result.isAtStartOfLine() && !LexingRawMode) {
1713        BufferPtr = CurPtr;
1714        PP->HandleDirective(Result);
1715
1716        // As an optimization, if the preprocessor didn't switch lexers, tail
1717        // recurse.
1718        if (PP->isCurrentLexer(this)) {
1719          // Start a new token.  If this is a #include or something, the PP may
1720          // want us starting at the beginning of the line again.  If so, set
1721          // the StartOfLine flag.
1722          if (IsAtStartOfLine) {
1723            Result.setFlag(Token::StartOfLine);
1724            IsAtStartOfLine = false;
1725          }
1726          goto LexNextToken;   // GCC isn't tail call eliminating.
1727        }
1728        return PP->Lex(Result);
1729      }
1730    }
1731    break;
1732
1733  case '@':
1734    // Objective C support.
1735    if (CurPtr[-1] == '@' && Features.ObjC1)
1736      Kind = tok::at;
1737    else
1738      Kind = tok::unknown;
1739    break;
1740
1741  case '\\':
1742    // FIXME: UCN's.
1743    // FALL THROUGH.
1744  default:
1745    Kind = tok::unknown;
1746    break;
1747  }
1748
1749  // Notify MIOpt that we read a non-whitespace/non-comment token.
1750  MIOpt.ReadToken();
1751
1752  // Update the location of token as well as BufferPtr.
1753  FormTokenWithChars(Result, CurPtr, Kind);
1754}
1755