Lexer.cpp revision f01fdff97b245caac98100d232c760b4d0531411
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10//  This file implements the Lexer and Token interfaces.
11//
12//===----------------------------------------------------------------------===//
13//
14// TODO: GCC Diagnostics emitted by the lexer:
15// PEDWARN: (form feed|vertical tab) in preprocessing directive
16//
17// Universal characters, unicode, char mapping:
18// WARNING: `%.*s' is not in NFKC
19// WARNING: `%.*s' is not in NFC
20//
21// Other:
22// TODO: Options to support:
23//    -fexec-charset,-fwide-exec-charset
24//
25//===----------------------------------------------------------------------===//
26
27#include "clang/Lex/Lexer.h"
28#include "clang/Lex/Preprocessor.h"
29#include "clang/Lex/LexDiagnostic.h"
30#include "clang/Basic/SourceManager.h"
31#include "llvm/Support/Compiler.h"
32#include "llvm/Support/MemoryBuffer.h"
33#include <cctype>
34using namespace clang;
35
36static void InitCharacterInfo();
37
38//===----------------------------------------------------------------------===//
39// Token Class Implementation
40//===----------------------------------------------------------------------===//
41
42/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
43bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
44  if (IdentifierInfo *II = getIdentifierInfo())
45    return II->getObjCKeywordID() == objcKey;
46  return false;
47}
48
49/// getObjCKeywordID - Return the ObjC keyword kind.
50tok::ObjCKeywordKind Token::getObjCKeywordID() const {
51  IdentifierInfo *specId = getIdentifierInfo();
52  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
53}
54
55
56//===----------------------------------------------------------------------===//
57// Lexer Class Implementation
58//===----------------------------------------------------------------------===//
59
60void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
61                      const char *BufEnd) {
62  InitCharacterInfo();
63
64  BufferStart = BufStart;
65  BufferPtr = BufPtr;
66  BufferEnd = BufEnd;
67
68  assert(BufEnd[0] == 0 &&
69         "We assume that the input buffer has a null character at the end"
70         " to simplify lexing!");
71
72  Is_PragmaLexer = false;
73
74  // Start of the file is a start of line.
75  IsAtStartOfLine = true;
76
77  // We are not after parsing a #.
78  ParsingPreprocessorDirective = false;
79
80  // We are not after parsing #include.
81  ParsingFilename = false;
82
83  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
84  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
85  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
86  // or otherwise skipping over tokens.
87  LexingRawMode = false;
88
89  // Default to not keeping comments.
90  ExtendedTokenMode = 0;
91}
92
93/// Lexer constructor - Create a new lexer object for the specified buffer
94/// with the specified preprocessor managing the lexing process.  This lexer
95/// assumes that the associated file buffer and Preprocessor objects will
96/// outlive it, so it doesn't take ownership of either of them.
97Lexer::Lexer(FileID FID, Preprocessor &PP)
98  : PreprocessorLexer(&PP, FID),
99    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
100    Features(PP.getLangOptions()) {
101
102  const llvm::MemoryBuffer *InputFile = PP.getSourceManager().getBuffer(FID);
103
104  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
105            InputFile->getBufferEnd());
106
107  // Default to keeping comments if the preprocessor wants them.
108  SetCommentRetentionState(PP.getCommentRetentionState());
109}
110
111/// Lexer constructor - Create a new raw lexer object.  This object is only
112/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
113/// range will outlive it, so it doesn't take ownership of it.
114Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
115             const char *BufStart, const char *BufPtr, const char *BufEnd)
116  : FileLoc(fileloc), Features(features) {
117
118  InitLexer(BufStart, BufPtr, BufEnd);
119
120  // We *are* in raw mode.
121  LexingRawMode = true;
122}
123
124/// Lexer constructor - Create a new raw lexer object.  This object is only
125/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
126/// range will outlive it, so it doesn't take ownership of it.
127Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features)
128  : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
129  const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID);
130
131  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
132            FromFile->getBufferEnd());
133
134  // We *are* in raw mode.
135  LexingRawMode = true;
136}
137
138/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
139/// _Pragma expansion.  This has a variety of magic semantics that this method
140/// sets up.  It returns a new'd Lexer that must be delete'd when done.
141///
142/// On entrance to this routine, TokStartLoc is a macro location which has a
143/// spelling loc that indicates the bytes to be lexed for the token and an
144/// instantiation location that indicates where all lexed tokens should be
145/// "expanded from".
146///
147/// FIXME: It would really be nice to make _Pragma just be a wrapper around a
148/// normal lexer that remaps tokens as they fly by.  This would require making
149/// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
150/// interface that could handle this stuff.  This would pull GetMappedTokenLoc
151/// out of the critical path of the lexer!
152///
153Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
154                                 SourceLocation InstantiationLocStart,
155                                 SourceLocation InstantiationLocEnd,
156                                 unsigned TokLen, Preprocessor &PP) {
157  SourceManager &SM = PP.getSourceManager();
158
159  // Create the lexer as if we were going to lex the file normally.
160  FileID SpellingFID = SM.getFileID(SpellingLoc);
161  Lexer *L = new Lexer(SpellingFID, PP);
162
163  // Now that the lexer is created, change the start/end locations so that we
164  // just lex the subsection of the file that we want.  This is lexing from a
165  // scratch buffer.
166  const char *StrData = SM.getCharacterData(SpellingLoc);
167
168  L->BufferPtr = StrData;
169  L->BufferEnd = StrData+TokLen;
170  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
171
172  // Set the SourceLocation with the remapping information.  This ensures that
173  // GetMappedTokenLoc will remap the tokens as they are lexed.
174  L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
175                                         InstantiationLocStart,
176                                         InstantiationLocEnd, TokLen);
177
178  // Ensure that the lexer thinks it is inside a directive, so that end \n will
179  // return an EOM token.
180  L->ParsingPreprocessorDirective = true;
181
182  // This lexer really is for _Pragma.
183  L->Is_PragmaLexer = true;
184  return L;
185}
186
187
188/// Stringify - Convert the specified string into a C string, with surrounding
189/// ""'s, and with escaped \ and " characters.
190std::string Lexer::Stringify(const std::string &Str, bool Charify) {
191  std::string Result = Str;
192  char Quote = Charify ? '\'' : '"';
193  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
194    if (Result[i] == '\\' || Result[i] == Quote) {
195      Result.insert(Result.begin()+i, '\\');
196      ++i; ++e;
197    }
198  }
199  return Result;
200}
201
202/// Stringify - Convert the specified string into a C string by escaping '\'
203/// and " characters.  This does not add surrounding ""'s to the string.
204void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
205  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
206    if (Str[i] == '\\' || Str[i] == '"') {
207      Str.insert(Str.begin()+i, '\\');
208      ++i; ++e;
209    }
210  }
211}
212
213
214/// MeasureTokenLength - Relex the token at the specified location and return
215/// its length in bytes in the input file.  If the token needs cleaning (e.g.
216/// includes a trigraph or an escaped newline) then this count includes bytes
217/// that are part of that.
218unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
219                                   const SourceManager &SM,
220                                   const LangOptions &LangOpts) {
221  // TODO: this could be special cased for common tokens like identifiers, ')',
222  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
223  // all obviously single-char tokens.  This could use
224  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
225  // something.
226
227  // If this comes from a macro expansion, we really do want the macro name, not
228  // the token this macro expanded to.
229  Loc = SM.getInstantiationLoc(Loc);
230  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
231  std::pair<const char *,const char *> Buffer = SM.getBufferData(LocInfo.first);
232  const char *StrData = Buffer.first+LocInfo.second;
233
234  // Create a lexer starting at the beginning of this token.
235  Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second);
236  Token TheTok;
237  TheLexer.LexFromRawLexer(TheTok);
238  return TheTok.getLength();
239}
240
241//===----------------------------------------------------------------------===//
242// Character information.
243//===----------------------------------------------------------------------===//
244
/// CharInfo - Classification bits for each of the 256 possible byte values.
/// Every entry is a bitwise OR of the CHAR_* flags below.
static unsigned char CharInfo[256];

/// Flag bits stored in CharInfo.
enum {
  CHAR_HORZ_WS  = 1 << 0,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS  = 1 << 1,  // '\r', '\n'
  CHAR_LETTER   = 1 << 2,  // a-z,A-Z
  CHAR_NUMBER   = 1 << 3,  // 0-9
  CHAR_UNDER    = 1 << 4,  // _
  CHAR_PERIOD   = 1 << 5   // .
};
255
256static void InitCharacterInfo() {
257  static bool isInited = false;
258  if (isInited) return;
259  isInited = true;
260
261  // Intiialize the CharInfo table.
262  // TODO: statically initialize this.
263  CharInfo[(int)' '] = CharInfo[(int)'\t'] =
264  CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
265  CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;
266
267  CharInfo[(int)'_'] = CHAR_UNDER;
268  CharInfo[(int)'.'] = CHAR_PERIOD;
269  for (unsigned i = 'a'; i <= 'z'; ++i)
270    CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
271  for (unsigned i = '0'; i <= '9'; ++i)
272    CharInfo[i] = CHAR_NUMBER;
273}
274
275/// isIdentifierBody - Return true if this is the body character of an
276/// identifier, which is [a-zA-Z0-9_].
277static inline bool isIdentifierBody(unsigned char c) {
278  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
279}
280
281/// isHorizontalWhitespace - Return true if this character is horizontal
282/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
283static inline bool isHorizontalWhitespace(unsigned char c) {
284  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
285}
286
287/// isWhitespace - Return true if this character is horizontal or vertical
288/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns false
289/// for '\0'.
290static inline bool isWhitespace(unsigned char c) {
291  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
292}
293
294/// isNumberBody - Return true if this is the body character of an
295/// preprocessing number, which is [a-zA-Z0-9_.].
296static inline bool isNumberBody(unsigned char c) {
297  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
298    true : false;
299}
300
301
302//===----------------------------------------------------------------------===//
303// Diagnostics forwarding code.
304//===----------------------------------------------------------------------===//
305
306/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
307/// lexer buffer was all instantiated at a single point, perform the mapping.
308/// This is currently only used for _Pragma implementation, so it is the slow
309/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
310static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
311                                        SourceLocation FileLoc,
312                                        unsigned CharNo,
313                                        unsigned TokLen) DISABLE_INLINE;
314static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
315                                        SourceLocation FileLoc,
316                                        unsigned CharNo, unsigned TokLen) {
317  assert(FileLoc.isMacroID() && "Must be an instantiation");
318
319  // Otherwise, we're lexing "mapped tokens".  This is used for things like
320  // _Pragma handling.  Combine the instantiation location of FileLoc with the
321  // spelling location.
322  SourceManager &SM = PP.getSourceManager();
323
324  // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
325  // characters come from spelling(FileLoc)+Offset.
326  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
327  SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);
328
329  // Figure out the expansion loc range, which is the range covered by the
330  // original _Pragma(...) sequence.
331  std::pair<SourceLocation,SourceLocation> II =
332    SM.getImmediateInstantiationRange(FileLoc);
333
334  return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen);
335}
336
337/// getSourceLocation - Return a source location identifier for the specified
338/// offset in the current file.
339SourceLocation Lexer::getSourceLocation(const char *Loc,
340                                        unsigned TokLen) const {
341  assert(Loc >= BufferStart && Loc <= BufferEnd &&
342         "Location out of range for this buffer!");
343
344  // In the normal case, we're just lexing from a simple file buffer, return
345  // the file id from FileLoc with the offset specified.
346  unsigned CharNo = Loc-BufferStart;
347  if (FileLoc.isFileID())
348    return FileLoc.getFileLocWithOffset(CharNo);
349
350  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
351  // tokens are lexed from where the _Pragma was defined.
352  assert(PP && "This doesn't work on raw lexers");
353  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
354}
355
356/// Diag - Forwarding function for diagnostics.  This translate a source
357/// position in the current buffer into a SourceLocation object for rendering.
358DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
359  return PP->Diag(getSourceLocation(Loc), DiagID);
360}
361
362//===----------------------------------------------------------------------===//
363// Trigraph and Escaped Newline Handling Code.
364//===----------------------------------------------------------------------===//
365
/// GetTrigraphCharForLetter - Map the third character of a ??x sequence to
/// the character the trigraph stands for.  Returns '\0' when 'Letter' does not
/// complete a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  // Parallel tables: Suffixes[i] completes the trigraph for Meanings[i].
  static const char Suffixes[] = "=)(!'>/<-";
  static const char Meanings[] = "#][|^}\\{~";

  // Stop at the terminating nul so that a '\0' input can never match.
  for (unsigned Idx = 0; Suffixes[Idx] != 0; ++Idx)
    if (Suffixes[Idx] == Letter)
      return Meanings[Idx];
  return 0;
}
382
383/// DecodeTrigraphChar - If the specified character is a legal trigraph when
384/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
385/// return the result character.  Finally, emit a warning about trigraph use
386/// whether trigraphs are enabled or not.
387static char DecodeTrigraphChar(const char *CP, Lexer *L) {
388  char Res = GetTrigraphCharForLetter(*CP);
389  if (!Res || !L) return Res;
390
391  if (!L->getFeatures().Trigraphs) {
392    if (!L->isLexingRawMode())
393      L->Diag(CP-2, diag::trigraph_ignored);
394    return 0;
395  }
396
397  if (!L->isLexingRawMode())
398    L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res;
399  return Res;
400}
401
402/// getEscapedNewLineSize - Return the size of the specified escaped newline,
403/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
404/// trigraph equivalent on entry to this function.
405unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
406  unsigned Size = 0;
407  while (isWhitespace(Ptr[Size])) {
408    ++Size;
409
410    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
411      continue;
412
413    // If this is a \r\n or \n\r, skip the other half.
414    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
415        Ptr[Size-1] != Ptr[Size])
416      ++Size;
417
418    return Size;
419  }
420
421  // Not an escaped newline, must be a \t or something else.
422  return 0;
423}
424
425/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
426/// them), skip over them and return the first non-escaped-newline found,
427/// otherwise return P.
428const char *Lexer::SkipEscapedNewLines(const char *P) {
429  while (1) {
430    const char *AfterEscape;
431    if (*P == '\\') {
432      AfterEscape = P+1;
433    } else if (*P == '?') {
434      // If not a trigraph for escape, bail out.
435      if (P[1] != '?' || P[2] != '/')
436        return P;
437      AfterEscape = P+3;
438    } else {
439      return P;
440    }
441
442    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
443    if (NewLineSize == 0) return P;
444    P = AfterEscape+NewLineSize;
445  }
446}
447
448
449/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
450/// get its size, and return it.  This is tricky in several cases:
451///   1. If currently at the start of a trigraph, we warn about the trigraph,
452///      then either return the trigraph (skipping 3 chars) or the '?',
453///      depending on whether trigraphs are enabled or not.
454///   2. If this is an escaped newline (potentially with whitespace between
455///      the backslash and newline), implicitly skip the newline and return
456///      the char after it.
457///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
458///
459/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
460/// know that we can accumulate into Size, and that we have already incremented
461/// Ptr by Size bytes.
462///
463/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
464/// be updated to match.
465///
466char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
467                               Token *Tok) {
468  // If we have a slash, look for an escaped newline.
469  if (Ptr[0] == '\\') {
470    ++Size;
471    ++Ptr;
472Slash:
473    // Common case, backslash-char where the char is not whitespace.
474    if (!isWhitespace(Ptr[0])) return '\\';
475
476    // See if we have optional whitespace characters followed by a newline.
477    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
478      // Remember that this token needs to be cleaned.
479      if (Tok) Tok->setFlag(Token::NeedsCleaning);
480
481      // Warn if there was whitespace between the backslash and newline.
482      if (EscapedNewLineSize != 1 && Tok && !isLexingRawMode())
483        Diag(Ptr, diag::backslash_newline_space);
484
485      // Found backslash<whitespace><newline>.  Parse the char after it.
486      Size += EscapedNewLineSize;
487      Ptr  += EscapedNewLineSize;
488      // Use slow version to accumulate a correct size field.
489      return getCharAndSizeSlow(Ptr, Size, Tok);
490    }
491
492    // Otherwise, this is not an escaped newline, just return the slash.
493    return '\\';
494  }
495
496  // If this is a trigraph, process it.
497  if (Ptr[0] == '?' && Ptr[1] == '?') {
498    // If this is actually a legal trigraph (not something like "??x"), emit
499    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
500    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
501      // Remember that this token needs to be cleaned.
502      if (Tok) Tok->setFlag(Token::NeedsCleaning);
503
504      Ptr += 3;
505      Size += 3;
506      if (C == '\\') goto Slash;
507      return C;
508    }
509  }
510
511  // If this is neither, return a single character.
512  ++Size;
513  return *Ptr;
514}
515
516
517/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
518/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
519/// and that we have already incremented Ptr by Size bytes.
520///
521/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
522/// be updated to match.
523char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
524                                     const LangOptions &Features) {
525  // If we have a slash, look for an escaped newline.
526  if (Ptr[0] == '\\') {
527    ++Size;
528    ++Ptr;
529Slash:
530    // Common case, backslash-char where the char is not whitespace.
531    if (!isWhitespace(Ptr[0])) return '\\';
532
533    // See if we have optional whitespace characters followed by a newline.
534    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
535      // Found backslash<whitespace><newline>.  Parse the char after it.
536      Size += EscapedNewLineSize;
537      Ptr  += EscapedNewLineSize;
538
539      // Use slow version to accumulate a correct size field.
540      return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
541    }
542
543    // Otherwise, this is not an escaped newline, just return the slash.
544    return '\\';
545  }
546
547  // If this is a trigraph, process it.
548  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
549    // If this is actually a legal trigraph (not something like "??x"), return
550    // it.
551    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
552      Ptr += 3;
553      Size += 3;
554      if (C == '\\') goto Slash;
555      return C;
556    }
557  }
558
559  // If this is neither, return a single character.
560  ++Size;
561  return *Ptr;
562}
563
564//===----------------------------------------------------------------------===//
565// Helper methods for lexing.
566//===----------------------------------------------------------------------===//
567
568void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
569  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
570  unsigned Size;
571  unsigned char C = *CurPtr++;
572  while (isIdentifierBody(C)) {
573    C = *CurPtr++;
574  }
575  --CurPtr;   // Back up over the skipped character.
576
577  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
578  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
579  // FIXME: UCNs.
580  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
581FinishIdentifier:
582    const char *IdStart = BufferPtr;
583    FormTokenWithChars(Result, CurPtr, tok::identifier);
584
585    // If we are in raw mode, return this identifier raw.  There is no need to
586    // look up identifier information or attempt to macro expand it.
587    if (LexingRawMode) return;
588
589    // Fill in Result.IdentifierInfo, looking up the identifier in the
590    // identifier table.
591    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart);
592
593    // Change the kind of this identifier to the appropriate token kind, e.g.
594    // turning "for" into a keyword.
595    Result.setKind(II->getTokenID());
596
597    // Finally, now that we know we have an identifier, pass this off to the
598    // preprocessor, which may macro expand it or something.
599    if (II->isHandleIdentifierCase())
600      PP->HandleIdentifier(Result);
601    return;
602  }
603
604  // Otherwise, $,\,? in identifier found.  Enter slower path.
605
606  C = getCharAndSize(CurPtr, Size);
607  while (1) {
608    if (C == '$') {
609      // If we hit a $ and they are not supported in identifiers, we are done.
610      if (!Features.DollarIdents) goto FinishIdentifier;
611
612      // Otherwise, emit a diagnostic and continue.
613      if (!isLexingRawMode())
614        Diag(CurPtr, diag::ext_dollar_in_identifier);
615      CurPtr = ConsumeChar(CurPtr, Size, Result);
616      C = getCharAndSize(CurPtr, Size);
617      continue;
618    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
619      // Found end of identifier.
620      goto FinishIdentifier;
621    }
622
623    // Otherwise, this character is good, consume it.
624    CurPtr = ConsumeChar(CurPtr, Size, Result);
625
626    C = getCharAndSize(CurPtr, Size);
627    while (isIdentifierBody(C)) { // FIXME: UCNs.
628      CurPtr = ConsumeChar(CurPtr, Size, Result);
629      C = getCharAndSize(CurPtr, Size);
630    }
631  }
632}
633
634
635/// LexNumericConstant - Lex the remainder of a integer or floating point
636/// constant. From[-1] is the first character lexed.  Return the end of the
637/// constant.
638void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
639  unsigned Size;
640  char C = getCharAndSize(CurPtr, Size);
641  char PrevCh = 0;
642  while (isNumberBody(C)) { // FIXME: UCNs?
643    CurPtr = ConsumeChar(CurPtr, Size, Result);
644    PrevCh = C;
645    C = getCharAndSize(CurPtr, Size);
646  }
647
648  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
649  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
650    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
651
652  // If we have a hex FP constant, continue.
653  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p'))
654    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
655
656  // Update the location of token as well as BufferPtr.
657  const char *TokStart = BufferPtr;
658  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
659  Result.setLiteralData(TokStart);
660}
661
662/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
663/// either " or L".
664void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
665  const char *NulCharacter = 0; // Does this string contain the \0 character?
666
667  char C = getAndAdvanceChar(CurPtr, Result);
668  while (C != '"') {
669    // Skip escaped characters.
670    if (C == '\\') {
671      // Skip the escaped character.
672      C = getAndAdvanceChar(CurPtr, Result);
673    } else if (C == '\n' || C == '\r' ||             // Newline.
674               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
675      if (!isLexingRawMode() && !Features.AsmPreprocessor)
676        Diag(BufferPtr, diag::err_unterminated_string);
677      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
678      return;
679    } else if (C == 0) {
680      NulCharacter = CurPtr-1;
681    }
682    C = getAndAdvanceChar(CurPtr, Result);
683  }
684
685  // If a nul character existed in the string, warn about it.
686  if (NulCharacter && !isLexingRawMode())
687    Diag(NulCharacter, diag::null_in_string);
688
689  // Update the location of the token as well as the BufferPtr instance var.
690  const char *TokStart = BufferPtr;
691  FormTokenWithChars(Result, CurPtr,
692                     Wide ? tok::wide_string_literal : tok::string_literal);
693  Result.setLiteralData(TokStart);
694}
695
696/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
697/// after having lexed the '<' character.  This is used for #include filenames.
698void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
699  const char *NulCharacter = 0; // Does this string contain the \0 character?
700  const char *AfterLessPos = CurPtr;
701  char C = getAndAdvanceChar(CurPtr, Result);
702  while (C != '>') {
703    // Skip escaped characters.
704    if (C == '\\') {
705      // Skip the escaped character.
706      C = getAndAdvanceChar(CurPtr, Result);
707    } else if (C == '\n' || C == '\r' ||             // Newline.
708               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
709      // If the filename is unterminated, then it must just be a lone <
710      // character.  Return this as such.
711      FormTokenWithChars(Result, AfterLessPos, tok::less);
712      return;
713    } else if (C == 0) {
714      NulCharacter = CurPtr-1;
715    }
716    C = getAndAdvanceChar(CurPtr, Result);
717  }
718
719  // If a nul character existed in the string, warn about it.
720  if (NulCharacter && !isLexingRawMode())
721    Diag(NulCharacter, diag::null_in_string);
722
723  // Update the location of token as well as BufferPtr.
724  const char *TokStart = BufferPtr;
725  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
726  Result.setLiteralData(TokStart);
727}
728
729
730/// LexCharConstant - Lex the remainder of a character constant, after having
731/// lexed either ' or L'.
732void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
733  const char *NulCharacter = 0; // Does this character contain the \0 character?
734
735  // Handle the common case of 'x' and '\y' efficiently.
736  char C = getAndAdvanceChar(CurPtr, Result);
737  if (C == '\'') {
738    if (!isLexingRawMode() && !Features.AsmPreprocessor)
739      Diag(BufferPtr, diag::err_empty_character);
740    FormTokenWithChars(Result, CurPtr, tok::unknown);
741    return;
742  } else if (C == '\\') {
743    // Skip the escaped character.
744    // FIXME: UCN's.
745    C = getAndAdvanceChar(CurPtr, Result);
746  }
747
748  if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') {
749    ++CurPtr;
750  } else {
751    // Fall back on generic code for embedded nulls, newlines, wide chars.
752    do {
753      // Skip escaped characters.
754      if (C == '\\') {
755        // Skip the escaped character.
756        C = getAndAdvanceChar(CurPtr, Result);
757      } else if (C == '\n' || C == '\r' ||               // Newline.
758                 (C == 0 && CurPtr-1 == BufferEnd)) {    // End of file.
759        if (!isLexingRawMode() && !Features.AsmPreprocessor)
760          Diag(BufferPtr, diag::err_unterminated_char);
761        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
762        return;
763      } else if (C == 0) {
764        NulCharacter = CurPtr-1;
765      }
766      C = getAndAdvanceChar(CurPtr, Result);
767    } while (C != '\'');
768  }
769
770  if (NulCharacter && !isLexingRawMode())
771    Diag(NulCharacter, diag::null_in_char);
772
773  // Update the location of token as well as BufferPtr.
774  const char *TokStart = BufferPtr;
775  FormTokenWithChars(Result, CurPtr, tok::char_constant);
776  Result.setLiteralData(TokStart);
777}
778
779/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
780/// Update BufferPtr to point to the next non-whitespace character and return.
781///
782/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
783///
784bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
785  // Whitespace - Skip it, then return the token after the whitespace.
786  unsigned char Char = *CurPtr;  // Skip consequtive spaces efficiently.
787  while (1) {
788    // Skip horizontal whitespace very aggressively.
789    while (isHorizontalWhitespace(Char))
790      Char = *++CurPtr;
791
792    // Otherwise if we have something other than whitespace, we're done.
793    if (Char != '\n' && Char != '\r')
794      break;
795
796    if (ParsingPreprocessorDirective) {
797      // End of preprocessor directive line, let LexTokenInternal handle this.
798      BufferPtr = CurPtr;
799      return false;
800    }
801
802    // ok, but handle newline.
803    // The returned token is at the start of the line.
804    Result.setFlag(Token::StartOfLine);
805    // No leading whitespace seen so far.
806    Result.clearFlag(Token::LeadingSpace);
807    Char = *++CurPtr;
808  }
809
810  // If this isn't immediately after a newline, there is leading space.
811  char PrevChar = CurPtr[-1];
812  if (PrevChar != '\n' && PrevChar != '\r')
813    Result.setFlag(Token::LeadingSpace);
814
815  // If the client wants us to return whitespace, return it now.
816  if (isKeepWhitespaceMode()) {
817    FormTokenWithChars(Result, CurPtr, tok::unknown);
818    return true;
819  }
820
821  BufferPtr = CurPtr;
822  return false;
823}
824
825// SkipBCPLComment - We have just read the // characters from input.  Skip until
826// we find the newline character thats terminate the comment.  Then update
827/// BufferPtr and return.  If we're in KeepCommentMode, this will form the token
828/// and return true.
829bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
830  // If BCPL comments aren't explicitly enabled for this language, emit an
831  // extension warning.
832  if (!Features.BCPLComment && !isLexingRawMode()) {
833    Diag(BufferPtr, diag::ext_bcpl_comment);
834
835    // Mark them enabled so we only emit one warning for this translation
836    // unit.
837    Features.BCPLComment = true;
838  }
839
840  // Scan over the body of the comment.  The common case, when scanning, is that
841  // the comment contains normal ascii characters with nothing interesting in
842  // them.  As such, optimize for this case with the inner loop.
843  char C;
844  do {
845    C = *CurPtr;
846    // FIXME: Speedup BCPL comment lexing.  Just scan for a \n or \r character.
847    // If we find a \n character, scan backwards, checking to see if it's an
848    // escaped newline, like we do for block comments.
849
850    // Skip over characters in the fast loop.
851    while (C != 0 &&                // Potentially EOF.
852           C != '\\' &&             // Potentially escaped newline.
853           C != '?' &&              // Potentially trigraph.
854           C != '\n' && C != '\r')  // Newline or DOS-style newline.
855      C = *++CurPtr;
856
857    // If this is a newline, we're done.
858    if (C == '\n' || C == '\r')
859      break;  // Found the newline? Break out!
860
861    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
862    // properly decode the character.  Read it in raw mode to avoid emitting
863    // diagnostics about things like trigraphs.  If we see an escaped newline,
864    // we'll handle it below.
865    const char *OldPtr = CurPtr;
866    bool OldRawMode = isLexingRawMode();
867    LexingRawMode = true;
868    C = getAndAdvanceChar(CurPtr, Result);
869    LexingRawMode = OldRawMode;
870
871    // If the char that we finally got was a \n, then we must have had something
872    // like \<newline><newline>.  We don't want to have consumed the second
873    // newline, we want CurPtr, to end up pointing to it down below.
874    if (C == '\n' || C == '\r') {
875      --CurPtr;
876      C = 'x'; // doesn't matter what this is.
877    }
878
879    // If we read multiple characters, and one of those characters was a \r or
880    // \n, then we had an escaped newline within the comment.  Emit diagnostic
881    // unless the next line is also a // comment.
882    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
883      for (; OldPtr != CurPtr; ++OldPtr)
884        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
885          // Okay, we found a // comment that ends in a newline, if the next
886          // line is also a // comment, but has spaces, don't emit a diagnostic.
887          if (isspace(C)) {
888            const char *ForwardPtr = CurPtr;
889            while (isspace(*ForwardPtr))  // Skip whitespace.
890              ++ForwardPtr;
891            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
892              break;
893          }
894
895          if (!isLexingRawMode())
896            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
897          break;
898        }
899    }
900
901    if (CurPtr == BufferEnd+1) { --CurPtr; break; }
902  } while (C != '\n' && C != '\r');
903
904  // Found but did not consume the newline.
905
906  // If we are returning comments as tokens, return this comment as a token.
907  if (inKeepCommentMode())
908    return SaveBCPLComment(Result, CurPtr);
909
910  // If we are inside a preprocessor directive and we see the end of line,
911  // return immediately, so that the lexer can return this as an EOM token.
912  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
913    BufferPtr = CurPtr;
914    return false;
915  }
916
917  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
918  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
919  // contribute to another token), it isn't needed for correctness.  Note that
920  // this is ok even in KeepWhitespaceMode, because we would have returned the
921  /// comment above in that mode.
922  ++CurPtr;
923
924  // The next returned token is at the start of the line.
925  Result.setFlag(Token::StartOfLine);
926  // No leading whitespace seen so far.
927  Result.clearFlag(Token::LeadingSpace);
928  BufferPtr = CurPtr;
929  return false;
930}
931
932/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
933/// an appropriate way and return it.
934bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
935  // If we're not in a preprocessor directive, just return the // comment
936  // directly.
937  FormTokenWithChars(Result, CurPtr, tok::comment);
938
939  if (!ParsingPreprocessorDirective)
940    return true;
941
942  // If this BCPL-style comment is in a macro definition, transmogrify it into
943  // a C-style block comment.
944  std::string Spelling = PP->getSpelling(Result);
945  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
946  Spelling[1] = '*';   // Change prefix to "/*".
947  Spelling += "*/";    // add suffix.
948
949  Result.setKind(tok::comment);
950  PP->CreateString(&Spelling[0], Spelling.size(), Result,
951                   Result.getLocation());
952  return true;
953}
954
955/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
956/// character (either \n or \r) is part of an escaped newline sequence.  Issue a
957/// diagnostic if so.  We know that the newline is inside of a block comment.
958static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
959                                                  Lexer *L) {
960  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');
961
962  // Back up off the newline.
963  --CurPtr;
964
965  // If this is a two-character newline sequence, skip the other character.
966  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
967    // \n\n or \r\r -> not escaped newline.
968    if (CurPtr[0] == CurPtr[1])
969      return false;
970    // \n\r or \r\n -> skip the newline.
971    --CurPtr;
972  }
973
974  // If we have horizontal whitespace, skip over it.  We allow whitespace
975  // between the slash and newline.
976  bool HasSpace = false;
977  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
978    --CurPtr;
979    HasSpace = true;
980  }
981
982  // If we have a slash, we know this is an escaped newline.
983  if (*CurPtr == '\\') {
984    if (CurPtr[-1] != '*') return false;
985  } else {
986    // It isn't a slash, is it the ?? / trigraph?
987    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
988        CurPtr[-3] != '*')
989      return false;
990
991    // This is the trigraph ending the comment.  Emit a stern warning!
992    CurPtr -= 2;
993
994    // If no trigraphs are enabled, warn that we ignored this trigraph and
995    // ignore this * character.
996    if (!L->getFeatures().Trigraphs) {
997      if (!L->isLexingRawMode())
998        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
999      return false;
1000    }
1001    if (!L->isLexingRawMode())
1002      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
1003  }
1004
1005  // Warn about having an escaped newline between the */ characters.
1006  if (!L->isLexingRawMode())
1007    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);
1008
1009  // If there was space between the backslash and newline, warn about it.
1010  if (HasSpace && !L->isLexingRawMode())
1011    L->Diag(CurPtr, diag::backslash_newline_space);
1012
1013  return true;
1014}
1015
1016#ifdef __SSE2__
1017#include <emmintrin.h>
1018#elif __ALTIVEC__
1019#include <altivec.h>
1020#undef bool
1021#endif
1022
1023/// SkipBlockComment - We have just read the /* characters from input.  Read
1024/// until we find the */ characters that terminate the comment.  Note that we
1025/// don't bother decoding trigraphs or escaped newlines in block comments,
1026/// because they cannot cause the comment to end.  The only thing that can
1027/// happen is the comment could end with an escaped newline between the */ end
1028/// of comment.
1029///
1030/// If KeepCommentMode is enabled, this forms a token from the comment and
1031/// returns true.
1032bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
1033  // Scan one character past where we should, looking for a '/' character.  Once
1034  // we find it, check to see if it was preceeded by a *.  This common
1035  // optimization helps people who like to put a lot of * characters in their
1036  // comments.
1037
1038  // The first character we get with newlines and trigraphs skipped to handle
1039  // the degenerate /*/ case below correctly if the * has an escaped newline
1040  // after it.
1041  unsigned CharSize;
1042  unsigned char C = getCharAndSize(CurPtr, CharSize);
1043  CurPtr += CharSize;
1044  if (C == 0 && CurPtr == BufferEnd+1) {
1045    if (!isLexingRawMode())
1046      Diag(BufferPtr, diag::err_unterminated_block_comment);
1047    --CurPtr;
1048
1049    // KeepWhitespaceMode should return this broken comment as a token.  Since
1050    // it isn't a well formed comment, just return it as an 'unknown' token.
1051    if (isKeepWhitespaceMode()) {
1052      FormTokenWithChars(Result, CurPtr, tok::unknown);
1053      return true;
1054    }
1055
1056    BufferPtr = CurPtr;
1057    return false;
1058  }
1059
1060  // Check to see if the first character after the '/*' is another /.  If so,
1061  // then this slash does not end the block comment, it is part of it.
1062  if (C == '/')
1063    C = *CurPtr++;
1064
1065  while (1) {
1066    // Skip over all non-interesting characters until we find end of buffer or a
1067    // (probably ending) '/' character.
1068    if (CurPtr + 24 < BufferEnd) {
1069      // While not aligned to a 16-byte boundary.
1070      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
1071        C = *CurPtr++;
1072
1073      if (C == '/') goto FoundSlash;
1074
1075#ifdef __SSE2__
1076      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
1077                                     '/', '/', '/', '/', '/', '/', '/', '/');
1078      while (CurPtr+16 <= BufferEnd &&
1079             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
1080        CurPtr += 16;
1081#elif __ALTIVEC__
1082      __vector unsigned char Slashes = {
1083        '/', '/', '/', '/',  '/', '/', '/', '/',
1084        '/', '/', '/', '/',  '/', '/', '/', '/'
1085      };
1086      while (CurPtr+16 <= BufferEnd &&
1087             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
1088        CurPtr += 16;
1089#else
1090      // Scan for '/' quickly.  Many block comments are very large.
1091      while (CurPtr[0] != '/' &&
1092             CurPtr[1] != '/' &&
1093             CurPtr[2] != '/' &&
1094             CurPtr[3] != '/' &&
1095             CurPtr+4 < BufferEnd) {
1096        CurPtr += 4;
1097      }
1098#endif
1099
1100      // It has to be one of the bytes scanned, increment to it and read one.
1101      C = *CurPtr++;
1102    }
1103
1104    // Loop to scan the remainder.
1105    while (C != '/' && C != '\0')
1106      C = *CurPtr++;
1107
1108  FoundSlash:
1109    if (C == '/') {
1110      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
1111        break;
1112
1113      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
1114        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
1115          // We found the final */, though it had an escaped newline between the
1116          // * and /.  We're done!
1117          break;
1118        }
1119      }
1120      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
1121        // If this is a /* inside of the comment, emit a warning.  Don't do this
1122        // if this is a /*/, which will end the comment.  This misses cases with
1123        // embedded escaped newlines, but oh well.
1124        if (!isLexingRawMode())
1125          Diag(CurPtr-1, diag::warn_nested_block_comment);
1126      }
1127    } else if (C == 0 && CurPtr == BufferEnd+1) {
1128      if (!isLexingRawMode())
1129        Diag(BufferPtr, diag::err_unterminated_block_comment);
1130      // Note: the user probably forgot a */.  We could continue immediately
1131      // after the /*, but this would involve lexing a lot of what really is the
1132      // comment, which surely would confuse the parser.
1133      --CurPtr;
1134
1135      // KeepWhitespaceMode should return this broken comment as a token.  Since
1136      // it isn't a well formed comment, just return it as an 'unknown' token.
1137      if (isKeepWhitespaceMode()) {
1138        FormTokenWithChars(Result, CurPtr, tok::unknown);
1139        return true;
1140      }
1141
1142      BufferPtr = CurPtr;
1143      return false;
1144    }
1145    C = *CurPtr++;
1146  }
1147
1148  // If we are returning comments as tokens, return this comment as a token.
1149  if (inKeepCommentMode()) {
1150    FormTokenWithChars(Result, CurPtr, tok::comment);
1151    return true;
1152  }
1153
1154  // It is common for the tokens immediately after a /**/ comment to be
1155  // whitespace.  Instead of going through the big switch, handle it
1156  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
1157  // have already returned above with the comment as a token.
1158  if (isHorizontalWhitespace(*CurPtr)) {
1159    Result.setFlag(Token::LeadingSpace);
1160    SkipWhitespace(Result, CurPtr+1);
1161    return false;
1162  }
1163
1164  // Otherwise, just return so that the next character will be lexed as a token.
1165  BufferPtr = CurPtr;
1166  Result.setFlag(Token::LeadingSpace);
1167  return false;
1168}
1169
1170//===----------------------------------------------------------------------===//
1171// Primary Lexing Entry Points
1172//===----------------------------------------------------------------------===//
1173
1174/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
1175/// uninterpreted string.  This switches the lexer out of directive mode.
1176std::string Lexer::ReadToEndOfLine() {
1177  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
1178         "Must be in a preprocessing directive!");
1179  std::string Result;
1180  Token Tmp;
1181
1182  // CurPtr - Cache BufferPtr in an automatic variable.
1183  const char *CurPtr = BufferPtr;
1184  while (1) {
1185    char Char = getAndAdvanceChar(CurPtr, Tmp);
1186    switch (Char) {
1187    default:
1188      Result += Char;
1189      break;
1190    case 0:  // Null.
1191      // Found end of file?
1192      if (CurPtr-1 != BufferEnd) {
1193        // Nope, normal character, continue.
1194        Result += Char;
1195        break;
1196      }
1197      // FALL THROUGH.
1198    case '\r':
1199    case '\n':
1200      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
1201      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
1202      BufferPtr = CurPtr-1;
1203
1204      // Next, lex the character, which should handle the EOM transition.
1205      Lex(Tmp);
1206      assert(Tmp.is(tok::eom) && "Unexpected token!");
1207
1208      // Finally, we're done, return the string we found.
1209      return Result;
1210    }
1211  }
1212}
1213
1214/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
1215/// condition, reporting diagnostics and handling other edge cases as required.
1216/// This returns true if Result contains a token, false if PP.Lex should be
1217/// called again.
1218bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
1219  // If we hit the end of the file while parsing a preprocessor directive,
1220  // end the preprocessor directive first.  The next token returned will
1221  // then be the end of file.
1222  if (ParsingPreprocessorDirective) {
1223    // Done parsing the "line".
1224    ParsingPreprocessorDirective = false;
1225    // Update the location of token as well as BufferPtr.
1226    FormTokenWithChars(Result, CurPtr, tok::eom);
1227
1228    // Restore comment saving mode, in case it was disabled for directive.
1229    SetCommentRetentionState(PP->getCommentRetentionState());
1230    return true;  // Have a token.
1231  }
1232
1233  // If we are in raw mode, return this event as an EOF token.  Let the caller
1234  // that put us in raw mode handle the event.
1235  if (isLexingRawMode()) {
1236    Result.startToken();
1237    BufferPtr = BufferEnd;
1238    FormTokenWithChars(Result, BufferEnd, tok::eof);
1239    return true;
1240  }
1241
1242  // Otherwise, issue diagnostics for unterminated #if and missing newline.
1243
1244  // If we are in a #if directive, emit an error.
1245  while (!ConditionalStack.empty()) {
1246    PP->Diag(ConditionalStack.back().IfLoc,
1247             diag::err_pp_unterminated_conditional);
1248    ConditionalStack.pop_back();
1249  }
1250
1251  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
1252  // a pedwarn.
1253  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
1254    Diag(BufferEnd, diag::ext_no_newline_eof)
1255      << CodeModificationHint::CreateInsertion(getSourceLocation(BufferEnd),
1256                                               "\n");
1257
1258  BufferPtr = CurPtr;
1259
1260  // Finally, let the preprocessor handle this.
1261  return PP->HandleEndOfFile(Result);
1262}
1263
1264/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
1265/// the specified lexer will return a tok::l_paren token, 0 if it is something
1266/// else and 2 if there are no more tokens in the buffer controlled by the
1267/// lexer.
1268unsigned Lexer::isNextPPTokenLParen() {
1269  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
1270
1271  // Switch to 'skipping' mode.  This will ensure that we can lex a token
1272  // without emitting diagnostics, disables macro expansion, and will cause EOF
1273  // to return an EOF token instead of popping the include stack.
1274  LexingRawMode = true;
1275
1276  // Save state that can be changed while lexing so that we can restore it.
1277  const char *TmpBufferPtr = BufferPtr;
1278  bool inPPDirectiveMode = ParsingPreprocessorDirective;
1279
1280  Token Tok;
1281  Tok.startToken();
1282  LexTokenInternal(Tok);
1283
1284  // Restore state that may have changed.
1285  BufferPtr = TmpBufferPtr;
1286  ParsingPreprocessorDirective = inPPDirectiveMode;
1287
1288  // Restore the lexer back to non-skipping mode.
1289  LexingRawMode = false;
1290
1291  if (Tok.is(tok::eof))
1292    return 2;
1293  return Tok.is(tok::l_paren);
1294}
1295
1296
/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  The function itself returns
/// nothing; the lexed token is delivered through Result.  That token is a
/// preprocessing token, not a normal token; as such, this is an internal
/// interface.  It assumes that the Flags of Result have been cleared before
/// calling this.
1304void Lexer::LexTokenInternal(Token &Result) {
1305LexNextToken:
1306  // New token, can't need cleaning yet.
1307  Result.clearFlag(Token::NeedsCleaning);
1308  Result.setIdentifierInfo(0);
1309
1310  // CurPtr - Cache BufferPtr in an automatic variable.
1311  const char *CurPtr = BufferPtr;
1312
1313  // Small amounts of horizontal whitespace is very common between tokens.
1314  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
1315    ++CurPtr;
1316    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
1317      ++CurPtr;
1318
1319    // If we are keeping whitespace and other tokens, just return what we just
1320    // skipped.  The next lexer invocation will return the token after the
1321    // whitespace.
1322    if (isKeepWhitespaceMode()) {
1323      FormTokenWithChars(Result, CurPtr, tok::unknown);
1324      return;
1325    }
1326
1327    BufferPtr = CurPtr;
1328    Result.setFlag(Token::LeadingSpace);
1329  }
1330
1331  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
1332
1333  // Read a character, advancing over it.
1334  char Char = getAndAdvanceChar(CurPtr, Result);
1335  tok::TokenKind Kind;
1336
1337  switch (Char) {
1338  case 0:  // Null.
1339    // Found end of file?
1340    if (CurPtr-1 == BufferEnd) {
1341      // Read the PP instance variable into an automatic variable, because
1342      // LexEndOfFile will often delete 'this'.
1343      Preprocessor *PPCache = PP;
1344      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
1345        return;   // Got a token to return.
1346      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
1347      return PPCache->Lex(Result);
1348    }
1349
1350    if (!isLexingRawMode())
1351      Diag(CurPtr-1, diag::null_in_file);
1352    Result.setFlag(Token::LeadingSpace);
1353    if (SkipWhitespace(Result, CurPtr))
1354      return; // KeepWhitespaceMode
1355
1356    goto LexNextToken;   // GCC isn't tail call eliminating.
1357  case '\n':
1358  case '\r':
1359    // If we are inside a preprocessor directive and we see the end of line,
1360    // we know we are done with the directive, so return an EOM token.
1361    if (ParsingPreprocessorDirective) {
1362      // Done parsing the "line".
1363      ParsingPreprocessorDirective = false;
1364
1365      // Restore comment saving mode, in case it was disabled for directive.
1366      SetCommentRetentionState(PP->getCommentRetentionState());
1367
1368      // Since we consumed a newline, we are back at the start of a line.
1369      IsAtStartOfLine = true;
1370
1371      Kind = tok::eom;
1372      break;
1373    }
1374    // The returned token is at the start of the line.
1375    Result.setFlag(Token::StartOfLine);
1376    // No leading whitespace seen so far.
1377    Result.clearFlag(Token::LeadingSpace);
1378
1379    if (SkipWhitespace(Result, CurPtr))
1380      return; // KeepWhitespaceMode
1381    goto LexNextToken;   // GCC isn't tail call eliminating.
1382  case ' ':
1383  case '\t':
1384  case '\f':
1385  case '\v':
1386  SkipHorizontalWhitespace:
1387    Result.setFlag(Token::LeadingSpace);
1388    if (SkipWhitespace(Result, CurPtr))
1389      return; // KeepWhitespaceMode
1390
1391  SkipIgnoredUnits:
1392    CurPtr = BufferPtr;
1393
1394    // If the next token is obviously a // or /* */ comment, skip it efficiently
1395    // too (without going through the big switch stmt).
1396    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
1397        Features.BCPLComment) {
1398      SkipBCPLComment(Result, CurPtr+2);
1399      goto SkipIgnoredUnits;
1400    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
1401      SkipBlockComment(Result, CurPtr+2);
1402      goto SkipIgnoredUnits;
1403    } else if (isHorizontalWhitespace(*CurPtr)) {
1404      goto SkipHorizontalWhitespace;
1405    }
1406    goto LexNextToken;   // GCC isn't tail call eliminating.
1407
1408  // C99 6.4.4.1: Integer Constants.
1409  // C99 6.4.4.2: Floating Constants.
1410  case '0': case '1': case '2': case '3': case '4':
1411  case '5': case '6': case '7': case '8': case '9':
1412    // Notify MIOpt that we read a non-whitespace/non-comment token.
1413    MIOpt.ReadToken();
1414    return LexNumericConstant(Result, CurPtr);
1415
1416  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
1417    // Notify MIOpt that we read a non-whitespace/non-comment token.
1418    MIOpt.ReadToken();
1419    Char = getCharAndSize(CurPtr, SizeTmp);
1420
1421    // Wide string literal.
1422    if (Char == '"')
1423      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
1424                              true);
1425
1426    // Wide character constant.
1427    if (Char == '\'')
1428      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1429    // FALL THROUGH, treating L like the start of an identifier.
1430
1431  // C99 6.4.2: Identifiers.
1432  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
1433  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
1434  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
1435  case 'V': case 'W': case 'X': case 'Y': case 'Z':
1436  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
1437  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
1438  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
1439  case 'v': case 'w': case 'x': case 'y': case 'z':
1440  case '_':
1441    // Notify MIOpt that we read a non-whitespace/non-comment token.
1442    MIOpt.ReadToken();
1443    return LexIdentifier(Result, CurPtr);
1444
1445  case '$':   // $ in identifiers.
1446    if (Features.DollarIdents) {
1447      if (!isLexingRawMode())
1448        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
1449      // Notify MIOpt that we read a non-whitespace/non-comment token.
1450      MIOpt.ReadToken();
1451      return LexIdentifier(Result, CurPtr);
1452    }
1453
1454    Kind = tok::unknown;
1455    break;
1456
1457  // C99 6.4.4: Character Constants.
1458  case '\'':
1459    // Notify MIOpt that we read a non-whitespace/non-comment token.
1460    MIOpt.ReadToken();
1461    return LexCharConstant(Result, CurPtr);
1462
1463  // C99 6.4.5: String Literals.
1464  case '"':
1465    // Notify MIOpt that we read a non-whitespace/non-comment token.
1466    MIOpt.ReadToken();
1467    return LexStringLiteral(Result, CurPtr, false);
1468
1469  // C99 6.4.6: Punctuators.
1470  case '?':
1471    Kind = tok::question;
1472    break;
1473  case '[':
1474    Kind = tok::l_square;
1475    break;
1476  case ']':
1477    Kind = tok::r_square;
1478    break;
1479  case '(':
1480    Kind = tok::l_paren;
1481    break;
1482  case ')':
1483    Kind = tok::r_paren;
1484    break;
1485  case '{':
1486    Kind = tok::l_brace;
1487    break;
1488  case '}':
1489    Kind = tok::r_brace;
1490    break;
1491  case '.':
1492    Char = getCharAndSize(CurPtr, SizeTmp);
1493    if (Char >= '0' && Char <= '9') {
1494      // Notify MIOpt that we read a non-whitespace/non-comment token.
1495      MIOpt.ReadToken();
1496
1497      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
1498    } else if (Features.CPlusPlus && Char == '*') {
1499      Kind = tok::periodstar;
1500      CurPtr += SizeTmp;
1501    } else if (Char == '.' &&
1502               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
1503      Kind = tok::ellipsis;
1504      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1505                           SizeTmp2, Result);
1506    } else {
1507      Kind = tok::period;
1508    }
1509    break;
1510  case '&':
1511    Char = getCharAndSize(CurPtr, SizeTmp);
1512    if (Char == '&') {
1513      Kind = tok::ampamp;
1514      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1515    } else if (Char == '=') {
1516      Kind = tok::ampequal;
1517      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1518    } else {
1519      Kind = tok::amp;
1520    }
1521    break;
1522  case '*':
1523    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1524      Kind = tok::starequal;
1525      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1526    } else {
1527      Kind = tok::star;
1528    }
1529    break;
1530  case '+':
1531    Char = getCharAndSize(CurPtr, SizeTmp);
1532    if (Char == '+') {
1533      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1534      Kind = tok::plusplus;
1535    } else if (Char == '=') {
1536      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1537      Kind = tok::plusequal;
1538    } else {
1539      Kind = tok::plus;
1540    }
1541    break;
1542  case '-':
1543    Char = getCharAndSize(CurPtr, SizeTmp);
1544    if (Char == '-') {      // --
1545      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1546      Kind = tok::minusminus;
1547    } else if (Char == '>' && Features.CPlusPlus &&
1548               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
1549      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1550                           SizeTmp2, Result);
1551      Kind = tok::arrowstar;
1552    } else if (Char == '>') {   // ->
1553      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1554      Kind = tok::arrow;
1555    } else if (Char == '=') {   // -=
1556      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1557      Kind = tok::minusequal;
1558    } else {
1559      Kind = tok::minus;
1560    }
1561    break;
1562  case '~':
1563    Kind = tok::tilde;
1564    break;
1565  case '!':
1566    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
1567      Kind = tok::exclaimequal;
1568      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1569    } else {
1570      Kind = tok::exclaim;
1571    }
1572    break;
1573  case '/':
1574    // 6.4.9: Comments
1575    Char = getCharAndSize(CurPtr, SizeTmp);
1576    if (Char == '/') {         // BCPL comment.
1577      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
1578      // want to lex this as a comment.  There is one problem with this though,
1579      // that in one particular corner case, this can change the behavior of the
1580      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
1581      // this as "foo / bar" and langauges with BCPL comments would lex it as
1582      // "foo".  Check to see if the character after the second slash is a '*'.
1583      // If so, we will lex that as a "/" instead of the start of a comment.
1584      if (Features.BCPLComment ||
1585          getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
1586        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
1587          return; // KeepCommentMode
1588
1589        // It is common for the tokens immediately after a // comment to be
1590        // whitespace (indentation for the next line).  Instead of going through
1591        // the big switch, handle it efficiently now.
1592        goto SkipIgnoredUnits;
1593      }
1594    }
1595
1596    if (Char == '*') {  // /**/ comment.
1597      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
1598        return; // KeepCommentMode
1599      goto LexNextToken;   // GCC isn't tail call eliminating.
1600    }
1601
1602    if (Char == '=') {
1603      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1604      Kind = tok::slashequal;
1605    } else {
1606      Kind = tok::slash;
1607    }
1608    break;
1609  case '%':
1610    Char = getCharAndSize(CurPtr, SizeTmp);
1611    if (Char == '=') {
1612      Kind = tok::percentequal;
1613      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1614    } else if (Features.Digraphs && Char == '>') {
1615      Kind = tok::r_brace;                             // '%>' -> '}'
1616      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1617    } else if (Features.Digraphs && Char == ':') {
1618      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1619      Char = getCharAndSize(CurPtr, SizeTmp);
1620      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
1621        Kind = tok::hashhash;                          // '%:%:' -> '##'
1622        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1623                             SizeTmp2, Result);
1624      } else if (Char == '@' && Features.Microsoft) {  // %:@ -> #@ -> Charize
1625        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1626        if (!isLexingRawMode())
1627          Diag(BufferPtr, diag::charize_microsoft_ext);
1628        Kind = tok::hashat;
1629      } else {                                         // '%:' -> '#'
1630        // We parsed a # character.  If this occurs at the start of the line,
1631        // it's actually the start of a preprocessing directive.  Callback to
1632        // the preprocessor to handle it.
1633        // FIXME: -fpreprocessed mode??
1634        if (Result.isAtStartOfLine() && !LexingRawMode) {
1635          FormTokenWithChars(Result, CurPtr, tok::hash);
1636          PP->HandleDirective(Result);
1637
1638          // As an optimization, if the preprocessor didn't switch lexers, tail
1639          // recurse.
1640          if (PP->isCurrentLexer(this)) {
1641            // Start a new token. If this is a #include or something, the PP may
1642            // want us starting at the beginning of the line again.  If so, set
1643            // the StartOfLine flag.
1644            if (IsAtStartOfLine) {
1645              Result.setFlag(Token::StartOfLine);
1646              IsAtStartOfLine = false;
1647            }
1648            goto LexNextToken;   // GCC isn't tail call eliminating.
1649          }
1650
1651          return PP->Lex(Result);
1652        }
1653
1654        Kind = tok::hash;
1655      }
1656    } else {
1657      Kind = tok::percent;
1658    }
1659    break;
1660  case '<':
1661    Char = getCharAndSize(CurPtr, SizeTmp);
1662    if (ParsingFilename) {
1663      return LexAngledStringLiteral(Result, CurPtr);
1664    } else if (Char == '<' &&
1665               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
1666      Kind = tok::lesslessequal;
1667      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1668                           SizeTmp2, Result);
1669    } else if (Char == '<') {
1670      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1671      Kind = tok::lessless;
1672    } else if (Char == '=') {
1673      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1674      Kind = tok::lessequal;
1675    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
1676      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1677      Kind = tok::l_square;
1678    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
1679      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1680      Kind = tok::l_brace;
1681    } else {
1682      Kind = tok::less;
1683    }
1684    break;
1685  case '>':
1686    Char = getCharAndSize(CurPtr, SizeTmp);
1687    if (Char == '=') {
1688      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1689      Kind = tok::greaterequal;
1690    } else if (Char == '>' &&
1691               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
1692      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
1693                           SizeTmp2, Result);
1694      Kind = tok::greatergreaterequal;
1695    } else if (Char == '>') {
1696      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1697      Kind = tok::greatergreater;
1698    } else {
1699      Kind = tok::greater;
1700    }
1701    break;
1702  case '^':
1703    Char = getCharAndSize(CurPtr, SizeTmp);
1704    if (Char == '=') {
1705      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1706      Kind = tok::caretequal;
1707    } else {
1708      Kind = tok::caret;
1709    }
1710    break;
1711  case '|':
1712    Char = getCharAndSize(CurPtr, SizeTmp);
1713    if (Char == '=') {
1714      Kind = tok::pipeequal;
1715      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1716    } else if (Char == '|') {
1717      Kind = tok::pipepipe;
1718      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1719    } else {
1720      Kind = tok::pipe;
1721    }
1722    break;
1723  case ':':
1724    Char = getCharAndSize(CurPtr, SizeTmp);
1725    if (Features.Digraphs && Char == '>') {
1726      Kind = tok::r_square; // ':>' -> ']'
1727      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1728    } else if (Features.CPlusPlus && Char == ':') {
1729      Kind = tok::coloncolon;
1730      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1731    } else {
1732      Kind = tok::colon;
1733    }
1734    break;
1735  case ';':
1736    Kind = tok::semi;
1737    break;
1738  case '=':
1739    Char = getCharAndSize(CurPtr, SizeTmp);
1740    if (Char == '=') {
1741      Kind = tok::equalequal;
1742      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1743    } else {
1744      Kind = tok::equal;
1745    }
1746    break;
1747  case ',':
1748    Kind = tok::comma;
1749    break;
1750  case '#':
1751    Char = getCharAndSize(CurPtr, SizeTmp);
1752    if (Char == '#') {
1753      Kind = tok::hashhash;
1754      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1755    } else if (Char == '@' && Features.Microsoft) {  // #@ -> Charize
1756      Kind = tok::hashat;
1757      if (!isLexingRawMode())
1758        Diag(BufferPtr, diag::charize_microsoft_ext);
1759      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
1760    } else {
1761      // We parsed a # character.  If this occurs at the start of the line,
1762      // it's actually the start of a preprocessing directive.  Callback to
1763      // the preprocessor to handle it.
1764      // FIXME: -fpreprocessed mode??
1765      if (Result.isAtStartOfLine() && !LexingRawMode) {
1766        FormTokenWithChars(Result, CurPtr, tok::hash);
1767        PP->HandleDirective(Result);
1768
1769        // As an optimization, if the preprocessor didn't switch lexers, tail
1770        // recurse.
1771        if (PP->isCurrentLexer(this)) {
1772          // Start a new token.  If this is a #include or something, the PP may
1773          // want us starting at the beginning of the line again.  If so, set
1774          // the StartOfLine flag.
1775          if (IsAtStartOfLine) {
1776            Result.setFlag(Token::StartOfLine);
1777            IsAtStartOfLine = false;
1778          }
1779          goto LexNextToken;   // GCC isn't tail call eliminating.
1780        }
1781        return PP->Lex(Result);
1782      }
1783
1784      Kind = tok::hash;
1785    }
1786    break;
1787
1788  case '@':
1789    // Objective C support.
1790    if (CurPtr[-1] == '@' && Features.ObjC1)
1791      Kind = tok::at;
1792    else
1793      Kind = tok::unknown;
1794    break;
1795
1796  case '\\':
1797    // FIXME: UCN's.
1798    // FALL THROUGH.
1799  default:
1800    Kind = tok::unknown;
1801    break;
1802  }
1803
1804  // Notify MIOpt that we read a non-whitespace/non-comment token.
1805  MIOpt.ReadToken();
1806
1807  // Update the location of token as well as BufferPtr.
1808  FormTokenWithChars(Result, CurPtr, Kind);
1809}
1810