Lexer.cpp revision f01fdff97b245caac98100d232c760b4d0531411
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the Lexer and Token interfaces. 11// 12//===----------------------------------------------------------------------===// 13// 14// TODO: GCC Diagnostics emitted by the lexer: 15// PEDWARN: (form feed|vertical tab) in preprocessing directive 16// 17// Universal characters, unicode, char mapping: 18// WARNING: `%.*s' is not in NFKC 19// WARNING: `%.*s' is not in NFC 20// 21// Other: 22// TODO: Options to support: 23// -fexec-charset,-fwide-exec-charset 24// 25//===----------------------------------------------------------------------===// 26 27#include "clang/Lex/Lexer.h" 28#include "clang/Lex/Preprocessor.h" 29#include "clang/Lex/LexDiagnostic.h" 30#include "clang/Basic/SourceManager.h" 31#include "llvm/Support/Compiler.h" 32#include "llvm/Support/MemoryBuffer.h" 33#include <cctype> 34using namespace clang; 35 36static void InitCharacterInfo(); 37 38//===----------------------------------------------------------------------===// 39// Token Class Implementation 40//===----------------------------------------------------------------------===// 41 42/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 43bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 44 if (IdentifierInfo *II = getIdentifierInfo()) 45 return II->getObjCKeywordID() == objcKey; 46 return false; 47} 48 49/// getObjCKeywordID - Return the ObjC keyword kind. 50tok::ObjCKeywordKind Token::getObjCKeywordID() const { 51 IdentifierInfo *specId = getIdentifierInfo(); 52 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 53} 54 55 56//===----------------------------------------------------------------------===// 57// Lexer Class Implementation 58//===----------------------------------------------------------------------===// 59 60void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 61 const char *BufEnd) { 62 InitCharacterInfo(); 63 64 BufferStart = BufStart; 65 BufferPtr = BufPtr; 66 BufferEnd = BufEnd; 67 68 assert(BufEnd[0] == 0 && 69 "We assume that the input buffer has a null character at the end" 70 " to simplify lexing!"); 71 72 Is_PragmaLexer = false; 73 74 // Start of the file is a start of line. 75 IsAtStartOfLine = true; 76 77 // We are not after parsing a #. 78 ParsingPreprocessorDirective = false; 79 80 // We are not after parsing #include. 81 ParsingFilename = false; 82 83 // We are not in raw mode. Raw mode disables diagnostics and interpretation 84 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 85 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 86 // or otherwise skipping over tokens. 87 LexingRawMode = false; 88 89 // Default to not keeping comments. 90 ExtendedTokenMode = 0; 91} 92 93/// Lexer constructor - Create a new lexer object for the specified buffer 94/// with the specified preprocessor managing the lexing process. This lexer 95/// assumes that the associated file buffer and Preprocessor objects will 96/// outlive it, so it doesn't take ownership of either of them. 97Lexer::Lexer(FileID FID, Preprocessor &PP) 98 : PreprocessorLexer(&PP, FID), 99 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 100 Features(PP.getLangOptions()) { 101 102 const llvm::MemoryBuffer *InputFile = PP.getSourceManager().getBuffer(FID); 103 104 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 105 InputFile->getBufferEnd()); 106 107 // Default to keeping comments if the preprocessor wants them. 108 SetCommentRetentionState(PP.getCommentRetentionState()); 109} 110 111/// Lexer constructor - Create a new raw lexer object. This object is only 112/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 113/// range will outlive it, so it doesn't take ownership of it. 114Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, 115 const char *BufStart, const char *BufPtr, const char *BufEnd) 116 : FileLoc(fileloc), Features(features) { 117 118 InitLexer(BufStart, BufPtr, BufEnd); 119 120 // We *are* in raw mode. 121 LexingRawMode = true; 122} 123 124/// Lexer constructor - Create a new raw lexer object. This object is only 125/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 126/// range will outlive it, so it doesn't take ownership of it. 127Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features) 128 : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { 129 const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID); 130 131 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 132 FromFile->getBufferEnd()); 133 134 // We *are* in raw mode. 135 LexingRawMode = true; 136} 137 138/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 139/// _Pragma expansion. This has a variety of magic semantics that this method 140/// sets up. It returns a new'd Lexer that must be delete'd when done. 141/// 142/// On entrance to this routine, TokStartLoc is a macro location which has a 143/// spelling loc that indicates the bytes to be lexed for the token and an 144/// instantiation location that indicates where all lexed tokens should be 145/// "expanded from". 146/// 147/// FIXME: It would really be nice to make _Pragma just be a wrapper around a 148/// normal lexer that remaps tokens as they fly by. This would require making 149/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 150/// interface that could handle this stuff. This would pull GetMappedTokenLoc 151/// out of the critical path of the lexer! 152/// 153Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 154 SourceLocation InstantiationLocStart, 155 SourceLocation InstantiationLocEnd, 156 unsigned TokLen, Preprocessor &PP) { 157 SourceManager &SM = PP.getSourceManager(); 158 159 // Create the lexer as if we were going to lex the file normally. 160 FileID SpellingFID = SM.getFileID(SpellingLoc); 161 Lexer *L = new Lexer(SpellingFID, PP); 162 163 // Now that the lexer is created, change the start/end locations so that we 164 // just lex the subsection of the file that we want. This is lexing from a 165 // scratch buffer. 166 const char *StrData = SM.getCharacterData(SpellingLoc); 167 168 L->BufferPtr = StrData; 169 L->BufferEnd = StrData+TokLen; 170 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 171 172 // Set the SourceLocation with the remapping information. This ensures that 173 // GetMappedTokenLoc will remap the tokens as they are lexed. 174 L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID), 175 InstantiationLocStart, 176 InstantiationLocEnd, TokLen); 177 178 // Ensure that the lexer thinks it is inside a directive, so that end \n will 179 // return an EOM token. 180 L->ParsingPreprocessorDirective = true; 181 182 // This lexer really is for _Pragma. 183 L->Is_PragmaLexer = true; 184 return L; 185} 186 187 188/// Stringify - Convert the specified string into a C string, with surrounding 189/// ""'s, and with escaped \ and " characters. 190std::string Lexer::Stringify(const std::string &Str, bool Charify) { 191 std::string Result = Str; 192 char Quote = Charify ? '\'' : '"'; 193 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 194 if (Result[i] == '\\' || Result[i] == Quote) { 195 Result.insert(Result.begin()+i, '\\'); 196 ++i; ++e; 197 } 198 } 199 return Result; 200} 201 202/// Stringify - Convert the specified string into a C string by escaping '\' 203/// and " characters. This does not add surrounding ""'s to the string. 204void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) { 205 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 206 if (Str[i] == '\\' || Str[i] == '"') { 207 Str.insert(Str.begin()+i, '\\'); 208 ++i; ++e; 209 } 210 } 211} 212 213 214/// MeasureTokenLength - Relex the token at the specified location and return 215/// its length in bytes in the input file. If the token needs cleaning (e.g. 216/// includes a trigraph or an escaped newline) then this count includes bytes 217/// that are part of that. 218unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 219 const SourceManager &SM, 220 const LangOptions &LangOpts) { 221 // TODO: this could be special cased for common tokens like identifiers, ')', 222 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 223 // all obviously single-char tokens. This could use 224 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 225 // something. 226 227 // If this comes from a macro expansion, we really do want the macro name, not 228 // the token this macro expanded to. 229 Loc = SM.getInstantiationLoc(Loc); 230 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 231 std::pair<const char *,const char *> Buffer = SM.getBufferData(LocInfo.first); 232 const char *StrData = Buffer.first+LocInfo.second; 233 234 // Create a lexer starting at the beginning of this token. 235 Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second); 236 Token TheTok; 237 TheLexer.LexFromRawLexer(TheTok); 238 return TheTok.getLength(); 239} 240 241//===----------------------------------------------------------------------===// 242// Character information. 243//===----------------------------------------------------------------------===// 244 245static unsigned char CharInfo[256]; 246 247enum { 248 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 249 CHAR_VERT_WS = 0x02, // '\r', '\n' 250 CHAR_LETTER = 0x04, // a-z,A-Z 251 CHAR_NUMBER = 0x08, // 0-9 252 CHAR_UNDER = 0x10, // _ 253 CHAR_PERIOD = 0x20 // . 254}; 255 256static void InitCharacterInfo() { 257 static bool isInited = false; 258 if (isInited) return; 259 isInited = true; 260 261 // Intiialize the CharInfo table. 262 // TODO: statically initialize this. 263 CharInfo[(int)' '] = CharInfo[(int)'\t'] = 264 CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS; 265 CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS; 266 267 CharInfo[(int)'_'] = CHAR_UNDER; 268 CharInfo[(int)'.'] = CHAR_PERIOD; 269 for (unsigned i = 'a'; i <= 'z'; ++i) 270 CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER; 271 for (unsigned i = '0'; i <= '9'; ++i) 272 CharInfo[i] = CHAR_NUMBER; 273} 274 275/// isIdentifierBody - Return true if this is the body character of an 276/// identifier, which is [a-zA-Z0-9_]. 277static inline bool isIdentifierBody(unsigned char c) { 278 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 279} 280 281/// isHorizontalWhitespace - Return true if this character is horizontal 282/// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. 283static inline bool isHorizontalWhitespace(unsigned char c) { 284 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 285} 286 287/// isWhitespace - Return true if this character is horizontal or vertical 288/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false 289/// for '\0'. 290static inline bool isWhitespace(unsigned char c) { 291 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 292} 293 294/// isNumberBody - Return true if this is the body character of an 295/// preprocessing number, which is [a-zA-Z0-9_.]. 296static inline bool isNumberBody(unsigned char c) { 297 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 298 true : false; 299} 300 301 302//===----------------------------------------------------------------------===// 303// Diagnostics forwarding code. 304//===----------------------------------------------------------------------===// 305 306/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 307/// lexer buffer was all instantiated at a single point, perform the mapping. 308/// This is currently only used for _Pragma implementation, so it is the slow 309/// path of the hot getSourceLocation method. Do not allow it to be inlined. 310static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 311 SourceLocation FileLoc, 312 unsigned CharNo, 313 unsigned TokLen) DISABLE_INLINE; 314static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 315 SourceLocation FileLoc, 316 unsigned CharNo, unsigned TokLen) { 317 assert(FileLoc.isMacroID() && "Must be an instantiation"); 318 319 // Otherwise, we're lexing "mapped tokens". This is used for things like 320 // _Pragma handling. Combine the instantiation location of FileLoc with the 321 // spelling location. 322 SourceManager &SM = PP.getSourceManager(); 323 324 // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose 325 // characters come from spelling(FileLoc)+Offset. 326 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 327 SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo); 328 329 // Figure out the expansion loc range, which is the range covered by the 330 // original _Pragma(...) sequence. 331 std::pair<SourceLocation,SourceLocation> II = 332 SM.getImmediateInstantiationRange(FileLoc); 333 334 return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen); 335} 336 337/// getSourceLocation - Return a source location identifier for the specified 338/// offset in the current file. 339SourceLocation Lexer::getSourceLocation(const char *Loc, 340 unsigned TokLen) const { 341 assert(Loc >= BufferStart && Loc <= BufferEnd && 342 "Location out of range for this buffer!"); 343 344 // In the normal case, we're just lexing from a simple file buffer, return 345 // the file id from FileLoc with the offset specified. 346 unsigned CharNo = Loc-BufferStart; 347 if (FileLoc.isFileID()) 348 return FileLoc.getFileLocWithOffset(CharNo); 349 350 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 351 // tokens are lexed from where the _Pragma was defined. 352 assert(PP && "This doesn't work on raw lexers"); 353 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 354} 355 356/// Diag - Forwarding function for diagnostics. This translate a source 357/// position in the current buffer into a SourceLocation object for rendering. 358DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 359 return PP->Diag(getSourceLocation(Loc), DiagID); 360} 361 362//===----------------------------------------------------------------------===// 363// Trigraph and Escaped Newline Handling Code. 364//===----------------------------------------------------------------------===// 365 366/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 367/// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 368static char GetTrigraphCharForLetter(char Letter) { 369 switch (Letter) { 370 default: return 0; 371 case '=': return '#'; 372 case ')': return ']'; 373 case '(': return '['; 374 case '!': return '|'; 375 case '\'': return '^'; 376 case '>': return '}'; 377 case '/': return '\\'; 378 case '<': return '{'; 379 case '-': return '~'; 380 } 381} 382 383/// DecodeTrigraphChar - If the specified character is a legal trigraph when 384/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 385/// return the result character. Finally, emit a warning about trigraph use 386/// whether trigraphs are enabled or not. 387static char DecodeTrigraphChar(const char *CP, Lexer *L) { 388 char Res = GetTrigraphCharForLetter(*CP); 389 if (!Res || !L) return Res; 390 391 if (!L->getFeatures().Trigraphs) { 392 if (!L->isLexingRawMode()) 393 L->Diag(CP-2, diag::trigraph_ignored); 394 return 0; 395 } 396 397 if (!L->isLexingRawMode()) 398 L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res; 399 return Res; 400} 401 402/// getEscapedNewLineSize - Return the size of the specified escaped newline, 403/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 404/// trigraph equivalent on entry to this function. 405unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 406 unsigned Size = 0; 407 while (isWhitespace(Ptr[Size])) { 408 ++Size; 409 410 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 411 continue; 412 413 // If this is a \r\n or \n\r, skip the other half. 414 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 415 Ptr[Size-1] != Ptr[Size]) 416 ++Size; 417 418 return Size; 419 } 420 421 // Not an escaped newline, must be a \t or something else. 422 return 0; 423} 424 425/// SkipEscapedNewLines - If P points to an escaped newline (or a series of 426/// them), skip over them and return the first non-escaped-newline found, 427/// otherwise return P. 428const char *Lexer::SkipEscapedNewLines(const char *P) { 429 while (1) { 430 const char *AfterEscape; 431 if (*P == '\\') { 432 AfterEscape = P+1; 433 } else if (*P == '?') { 434 // If not a trigraph for escape, bail out. 435 if (P[1] != '?' || P[2] != '/') 436 return P; 437 AfterEscape = P+3; 438 } else { 439 return P; 440 } 441 442 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 443 if (NewLineSize == 0) return P; 444 P = AfterEscape+NewLineSize; 445 } 446} 447 448 449/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 450/// get its size, and return it. This is tricky in several cases: 451/// 1. If currently at the start of a trigraph, we warn about the trigraph, 452/// then either return the trigraph (skipping 3 chars) or the '?', 453/// depending on whether trigraphs are enabled or not. 454/// 2. If this is an escaped newline (potentially with whitespace between 455/// the backslash and newline), implicitly skip the newline and return 456/// the char after it. 457/// 3. If this is a UCN, return it. FIXME: C++ UCN's? 458/// 459/// This handles the slow/uncommon case of the getCharAndSize method. Here we 460/// know that we can accumulate into Size, and that we have already incremented 461/// Ptr by Size bytes. 462/// 463/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 464/// be updated to match. 465/// 466char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 467 Token *Tok) { 468 // If we have a slash, look for an escaped newline. 469 if (Ptr[0] == '\\') { 470 ++Size; 471 ++Ptr; 472Slash: 473 // Common case, backslash-char where the char is not whitespace. 474 if (!isWhitespace(Ptr[0])) return '\\'; 475 476 // See if we have optional whitespace characters followed by a newline. 477 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 478 // Remember that this token needs to be cleaned. 479 if (Tok) Tok->setFlag(Token::NeedsCleaning); 480 481 // Warn if there was whitespace between the backslash and newline. 482 if (EscapedNewLineSize != 1 && Tok && !isLexingRawMode()) 483 Diag(Ptr, diag::backslash_newline_space); 484 485 // Found backslash<whitespace><newline>. Parse the char after it. 486 Size += EscapedNewLineSize; 487 Ptr += EscapedNewLineSize; 488 // Use slow version to accumulate a correct size field. 489 return getCharAndSizeSlow(Ptr, Size, Tok); 490 } 491 492 // Otherwise, this is not an escaped newline, just return the slash. 493 return '\\'; 494 } 495 496 // If this is a trigraph, process it. 497 if (Ptr[0] == '?' && Ptr[1] == '?') { 498 // If this is actually a legal trigraph (not something like "??x"), emit 499 // a trigraph warning. If so, and if trigraphs are enabled, return it. 500 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { 501 // Remember that this token needs to be cleaned. 502 if (Tok) Tok->setFlag(Token::NeedsCleaning); 503 504 Ptr += 3; 505 Size += 3; 506 if (C == '\\') goto Slash; 507 return C; 508 } 509 } 510 511 // If this is neither, return a single character. 512 ++Size; 513 return *Ptr; 514} 515 516 517/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 518/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 519/// and that we have already incremented Ptr by Size bytes. 520/// 521/// NOTE: When this method is updated, getCharAndSizeSlow (above) should 522/// be updated to match. 523char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 524 const LangOptions &Features) { 525 // If we have a slash, look for an escaped newline. 526 if (Ptr[0] == '\\') { 527 ++Size; 528 ++Ptr; 529Slash: 530 // Common case, backslash-char where the char is not whitespace. 531 if (!isWhitespace(Ptr[0])) return '\\'; 532 533 // See if we have optional whitespace characters followed by a newline. 534 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 535 // Found backslash<whitespace><newline>. Parse the char after it. 536 Size += EscapedNewLineSize; 537 Ptr += EscapedNewLineSize; 538 539 // Use slow version to accumulate a correct size field. 540 return getCharAndSizeSlowNoWarn(Ptr, Size, Features); 541 } 542 543 // Otherwise, this is not an escaped newline, just return the slash. 544 return '\\'; 545 } 546 547 // If this is a trigraph, process it. 548 if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 549 // If this is actually a legal trigraph (not something like "??x"), return 550 // it. 551 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 552 Ptr += 3; 553 Size += 3; 554 if (C == '\\') goto Slash; 555 return C; 556 } 557 } 558 559 // If this is neither, return a single character. 560 ++Size; 561 return *Ptr; 562} 563 564//===----------------------------------------------------------------------===// 565// Helper methods for lexing. 566//===----------------------------------------------------------------------===// 567 568void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 569 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 570 unsigned Size; 571 unsigned char C = *CurPtr++; 572 while (isIdentifierBody(C)) { 573 C = *CurPtr++; 574 } 575 --CurPtr; // Back up over the skipped character. 576 577 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 578 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 579 // FIXME: UCNs. 580 if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) { 581FinishIdentifier: 582 const char *IdStart = BufferPtr; 583 FormTokenWithChars(Result, CurPtr, tok::identifier); 584 585 // If we are in raw mode, return this identifier raw. There is no need to 586 // look up identifier information or attempt to macro expand it. 587 if (LexingRawMode) return; 588 589 // Fill in Result.IdentifierInfo, looking up the identifier in the 590 // identifier table. 591 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart); 592 593 // Change the kind of this identifier to the appropriate token kind, e.g. 594 // turning "for" into a keyword. 595 Result.setKind(II->getTokenID()); 596 597 // Finally, now that we know we have an identifier, pass this off to the 598 // preprocessor, which may macro expand it or something. 599 if (II->isHandleIdentifierCase()) 600 PP->HandleIdentifier(Result); 601 return; 602 } 603 604 // Otherwise, $,\,? in identifier found. Enter slower path. 605 606 C = getCharAndSize(CurPtr, Size); 607 while (1) { 608 if (C == '$') { 609 // If we hit a $ and they are not supported in identifiers, we are done. 610 if (!Features.DollarIdents) goto FinishIdentifier; 611 612 // Otherwise, emit a diagnostic and continue. 613 if (!isLexingRawMode()) 614 Diag(CurPtr, diag::ext_dollar_in_identifier); 615 CurPtr = ConsumeChar(CurPtr, Size, Result); 616 C = getCharAndSize(CurPtr, Size); 617 continue; 618 } else if (!isIdentifierBody(C)) { // FIXME: UCNs. 619 // Found end of identifier. 620 goto FinishIdentifier; 621 } 622 623 // Otherwise, this character is good, consume it. 624 CurPtr = ConsumeChar(CurPtr, Size, Result); 625 626 C = getCharAndSize(CurPtr, Size); 627 while (isIdentifierBody(C)) { // FIXME: UCNs. 628 CurPtr = ConsumeChar(CurPtr, Size, Result); 629 C = getCharAndSize(CurPtr, Size); 630 } 631 } 632} 633 634 635/// LexNumericConstant - Lex the remainder of a integer or floating point 636/// constant. From[-1] is the first character lexed. Return the end of the 637/// constant. 638void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 639 unsigned Size; 640 char C = getCharAndSize(CurPtr, Size); 641 char PrevCh = 0; 642 while (isNumberBody(C)) { // FIXME: UCNs? 643 CurPtr = ConsumeChar(CurPtr, Size, Result); 644 PrevCh = C; 645 C = getCharAndSize(CurPtr, Size); 646 } 647 648 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 649 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) 650 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 651 652 // If we have a hex FP constant, continue. 653 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) 654 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 655 656 // Update the location of token as well as BufferPtr. 657 const char *TokStart = BufferPtr; 658 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 659 Result.setLiteralData(TokStart); 660} 661 662/// LexStringLiteral - Lex the remainder of a string literal, after having lexed 663/// either " or L". 664void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { 665 const char *NulCharacter = 0; // Does this string contain the \0 character? 666 667 char C = getAndAdvanceChar(CurPtr, Result); 668 while (C != '"') { 669 // Skip escaped characters. 670 if (C == '\\') { 671 // Skip the escaped character. 672 C = getAndAdvanceChar(CurPtr, Result); 673 } else if (C == '\n' || C == '\r' || // Newline. 674 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 675 if (!isLexingRawMode() && !Features.AsmPreprocessor) 676 Diag(BufferPtr, diag::err_unterminated_string); 677 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 678 return; 679 } else if (C == 0) { 680 NulCharacter = CurPtr-1; 681 } 682 C = getAndAdvanceChar(CurPtr, Result); 683 } 684 685 // If a nul character existed in the string, warn about it. 686 if (NulCharacter && !isLexingRawMode()) 687 Diag(NulCharacter, diag::null_in_string); 688 689 // Update the location of the token as well as the BufferPtr instance var. 690 const char *TokStart = BufferPtr; 691 FormTokenWithChars(Result, CurPtr, 692 Wide ? tok::wide_string_literal : tok::string_literal); 693 Result.setLiteralData(TokStart); 694} 695 696/// LexAngledStringLiteral - Lex the remainder of an angled string literal, 697/// after having lexed the '<' character. This is used for #include filenames. 698void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 699 const char *NulCharacter = 0; // Does this string contain the \0 character? 700 const char *AfterLessPos = CurPtr; 701 char C = getAndAdvanceChar(CurPtr, Result); 702 while (C != '>') { 703 // Skip escaped characters. 704 if (C == '\\') { 705 // Skip the escaped character. 706 C = getAndAdvanceChar(CurPtr, Result); 707 } else if (C == '\n' || C == '\r' || // Newline. 708 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 709 // If the filename is unterminated, then it must just be a lone < 710 // character. Return this as such. 711 FormTokenWithChars(Result, AfterLessPos, tok::less); 712 return; 713 } else if (C == 0) { 714 NulCharacter = CurPtr-1; 715 } 716 C = getAndAdvanceChar(CurPtr, Result); 717 } 718 719 // If a nul character existed in the string, warn about it. 720 if (NulCharacter && !isLexingRawMode()) 721 Diag(NulCharacter, diag::null_in_string); 722 723 // Update the location of token as well as BufferPtr. 724 const char *TokStart = BufferPtr; 725 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 726 Result.setLiteralData(TokStart); 727} 728 729 730/// LexCharConstant - Lex the remainder of a character constant, after having 731/// lexed either ' or L'. 732void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { 733 const char *NulCharacter = 0; // Does this character contain the \0 character? 734 735 // Handle the common case of 'x' and '\y' efficiently. 736 char C = getAndAdvanceChar(CurPtr, Result); 737 if (C == '\'') { 738 if (!isLexingRawMode() && !Features.AsmPreprocessor) 739 Diag(BufferPtr, diag::err_empty_character); 740 FormTokenWithChars(Result, CurPtr, tok::unknown); 741 return; 742 } else if (C == '\\') { 743 // Skip the escaped character. 744 // FIXME: UCN's. 745 C = getAndAdvanceChar(CurPtr, Result); 746 } 747 748 if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') { 749 ++CurPtr; 750 } else { 751 // Fall back on generic code for embedded nulls, newlines, wide chars. 752 do { 753 // Skip escaped characters. 754 if (C == '\\') { 755 // Skip the escaped character. 756 C = getAndAdvanceChar(CurPtr, Result); 757 } else if (C == '\n' || C == '\r' || // Newline. 758 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 759 if (!isLexingRawMode() && !Features.AsmPreprocessor) 760 Diag(BufferPtr, diag::err_unterminated_char); 761 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 762 return; 763 } else if (C == 0) { 764 NulCharacter = CurPtr-1; 765 } 766 C = getAndAdvanceChar(CurPtr, Result); 767 } while (C != '\''); 768 } 769 770 if (NulCharacter && !isLexingRawMode()) 771 Diag(NulCharacter, diag::null_in_char); 772 773 // Update the location of token as well as BufferPtr. 774 const char *TokStart = BufferPtr; 775 FormTokenWithChars(Result, CurPtr, tok::char_constant); 776 Result.setLiteralData(TokStart); 777} 778 779/// SkipWhitespace - Efficiently skip over a series of whitespace characters. 780/// Update BufferPtr to point to the next non-whitespace character and return. 781/// 782/// This method forms a token and returns true if KeepWhitespaceMode is enabled. 783/// 784bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { 785 // Whitespace - Skip it, then return the token after the whitespace. 786 unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. 787 while (1) { 788 // Skip horizontal whitespace very aggressively. 789 while (isHorizontalWhitespace(Char)) 790 Char = *++CurPtr; 791 792 // Otherwise if we have something other than whitespace, we're done. 793 if (Char != '\n' && Char != '\r') 794 break; 795 796 if (ParsingPreprocessorDirective) { 797 // End of preprocessor directive line, let LexTokenInternal handle this. 798 BufferPtr = CurPtr; 799 return false; 800 } 801 802 // ok, but handle newline. 803 // The returned token is at the start of the line. 804 Result.setFlag(Token::StartOfLine); 805 // No leading whitespace seen so far. 806 Result.clearFlag(Token::LeadingSpace); 807 Char = *++CurPtr; 808 } 809 810 // If this isn't immediately after a newline, there is leading space. 811 char PrevChar = CurPtr[-1]; 812 if (PrevChar != '\n' && PrevChar != '\r') 813 Result.setFlag(Token::LeadingSpace); 814 815 // If the client wants us to return whitespace, return it now. 816 if (isKeepWhitespaceMode()) { 817 FormTokenWithChars(Result, CurPtr, tok::unknown); 818 return true; 819 } 820 821 BufferPtr = CurPtr; 822 return false; 823} 824 825// SkipBCPLComment - We have just read the // characters from input. Skip until 826// we find the newline character thats terminate the comment. Then update 827/// BufferPtr and return. If we're in KeepCommentMode, this will form the token 828/// and return true. 829bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { 830 // If BCPL comments aren't explicitly enabled for this language, emit an 831 // extension warning. 832 if (!Features.BCPLComment && !isLexingRawMode()) { 833 Diag(BufferPtr, diag::ext_bcpl_comment); 834 835 // Mark them enabled so we only emit one warning for this translation 836 // unit. 837 Features.BCPLComment = true; 838 } 839 840 // Scan over the body of the comment. The common case, when scanning, is that 841 // the comment contains normal ascii characters with nothing interesting in 842 // them. As such, optimize for this case with the inner loop. 843 char C; 844 do { 845 C = *CurPtr; 846 // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character. 847 // If we find a \n character, scan backwards, checking to see if it's an 848 // escaped newline, like we do for block comments. 849 850 // Skip over characters in the fast loop. 851 while (C != 0 && // Potentially EOF. 852 C != '\\' && // Potentially escaped newline. 853 C != '?' && // Potentially trigraph. 854 C != '\n' && C != '\r') // Newline or DOS-style newline. 855 C = *++CurPtr; 856 857 // If this is a newline, we're done. 858 if (C == '\n' || C == '\r') 859 break; // Found the newline? Break out! 860 861 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 862 // properly decode the character. Read it in raw mode to avoid emitting 863 // diagnostics about things like trigraphs. If we see an escaped newline, 864 // we'll handle it below. 865 const char *OldPtr = CurPtr; 866 bool OldRawMode = isLexingRawMode(); 867 LexingRawMode = true; 868 C = getAndAdvanceChar(CurPtr, Result); 869 LexingRawMode = OldRawMode; 870 871 // If the char that we finally got was a \n, then we must have had something 872 // like \<newline><newline>. We don't want to have consumed the second 873 // newline, we want CurPtr, to end up pointing to it down below. 874 if (C == '\n' || C == '\r') { 875 --CurPtr; 876 C = 'x'; // doesn't matter what this is. 877 } 878 879 // If we read multiple characters, and one of those characters was a \r or 880 // \n, then we had an escaped newline within the comment. Emit diagnostic 881 // unless the next line is also a // comment. 882 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 883 for (; OldPtr != CurPtr; ++OldPtr) 884 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 885 // Okay, we found a // comment that ends in a newline, if the next 886 // line is also a // comment, but has spaces, don't emit a diagnostic. 887 if (isspace(C)) { 888 const char *ForwardPtr = CurPtr; 889 while (isspace(*ForwardPtr)) // Skip whitespace. 890 ++ForwardPtr; 891 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 892 break; 893 } 894 895 if (!isLexingRawMode()) 896 Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); 897 break; 898 } 899 } 900 901 if (CurPtr == BufferEnd+1) { --CurPtr; break; } 902 } while (C != '\n' && C != '\r'); 903 904 // Found but did not consume the newline. 905 906 // If we are returning comments as tokens, return this comment as a token. 907 if (inKeepCommentMode()) 908 return SaveBCPLComment(Result, CurPtr); 909 910 // If we are inside a preprocessor directive and we see the end of line, 911 // return immediately, so that the lexer can return this as an EOM token. 912 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 913 BufferPtr = CurPtr; 914 return false; 915 } 916 917 // Otherwise, eat the \n character. We don't care if this is a \n\r or 918 // \r\n sequence. This is an efficiency hack (because we know the \n can't 919 // contribute to another token), it isn't needed for correctness. Note that 920 // this is ok even in KeepWhitespaceMode, because we would have returned the 921 /// comment above in that mode. 922 ++CurPtr; 923 924 // The next returned token is at the start of the line. 925 Result.setFlag(Token::StartOfLine); 926 // No leading whitespace seen so far. 927 Result.clearFlag(Token::LeadingSpace); 928 BufferPtr = CurPtr; 929 return false; 930} 931 932/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 933/// an appropriate way and return it. 934bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 935 // If we're not in a preprocessor directive, just return the // comment 936 // directly. 937 FormTokenWithChars(Result, CurPtr, tok::comment); 938 939 if (!ParsingPreprocessorDirective) 940 return true; 941 942 // If this BCPL-style comment is in a macro definition, transmogrify it into 943 // a C-style block comment. 944 std::string Spelling = PP->getSpelling(Result); 945 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); 946 Spelling[1] = '*'; // Change prefix to "/*". 947 Spelling += "*/"; // add suffix. 948 949 Result.setKind(tok::comment); 950 PP->CreateString(&Spelling[0], Spelling.size(), Result, 951 Result.getLocation()); 952 return true; 953} 954 955/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 956/// character (either \n or \r) is part of an escaped newline sequence. Issue a 957/// diagnostic if so. We know that the newline is inside of a block comment. 958static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 959 Lexer *L) { 960 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 961 962 // Back up off the newline. 963 --CurPtr; 964 965 // If this is a two-character newline sequence, skip the other character. 966 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 967 // \n\n or \r\r -> not escaped newline. 968 if (CurPtr[0] == CurPtr[1]) 969 return false; 970 // \n\r or \r\n -> skip the newline. 971 --CurPtr; 972 } 973 974 // If we have horizontal whitespace, skip over it. We allow whitespace 975 // between the slash and newline. 976 bool HasSpace = false; 977 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 978 --CurPtr; 979 HasSpace = true; 980 } 981 982 // If we have a slash, we know this is an escaped newline. 983 if (*CurPtr == '\\') { 984 if (CurPtr[-1] != '*') return false; 985 } else { 986 // It isn't a slash, is it the ?? / trigraph? 987 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 988 CurPtr[-3] != '*') 989 return false; 990 991 // This is the trigraph ending the comment. Emit a stern warning! 992 CurPtr -= 2; 993 994 // If no trigraphs are enabled, warn that we ignored this trigraph and 995 // ignore this * character. 996 if (!L->getFeatures().Trigraphs) { 997 if (!L->isLexingRawMode()) 998 L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 999 return false; 1000 } 1001 if (!L->isLexingRawMode()) 1002 L->Diag(CurPtr, diag::trigraph_ends_block_comment); 1003 } 1004 1005 // Warn about having an escaped newline between the */ characters. 1006 if (!L->isLexingRawMode()) 1007 L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 1008 1009 // If there was space between the backslash and newline, warn about it. 1010 if (HasSpace && !L->isLexingRawMode()) 1011 L->Diag(CurPtr, diag::backslash_newline_space); 1012 1013 return true; 1014} 1015 1016#ifdef __SSE2__ 1017#include <emmintrin.h> 1018#elif __ALTIVEC__ 1019#include <altivec.h> 1020#undef bool 1021#endif 1022 1023/// SkipBlockComment - We have just read the /* characters from input. Read 1024/// until we find the */ characters that terminate the comment. Note that we 1025/// don't bother decoding trigraphs or escaped newlines in block comments, 1026/// because they cannot cause the comment to end. The only thing that can 1027/// happen is the comment could end with an escaped newline between the */ end 1028/// of comment. 1029/// 1030/// If KeepCommentMode is enabled, this forms a token from the comment and 1031/// returns true. 1032bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { 1033 // Scan one character past where we should, looking for a '/' character. Once 1034 // we find it, check to see if it was preceeded by a *. This common 1035 // optimization helps people who like to put a lot of * characters in their 1036 // comments. 1037 1038 // The first character we get with newlines and trigraphs skipped to handle 1039 // the degenerate /*/ case below correctly if the * has an escaped newline 1040 // after it. 1041 unsigned CharSize; 1042 unsigned char C = getCharAndSize(CurPtr, CharSize); 1043 CurPtr += CharSize; 1044 if (C == 0 && CurPtr == BufferEnd+1) { 1045 if (!isLexingRawMode()) 1046 Diag(BufferPtr, diag::err_unterminated_block_comment); 1047 --CurPtr; 1048 1049 // KeepWhitespaceMode should return this broken comment as a token. Since 1050 // it isn't a well formed comment, just return it as an 'unknown' token. 1051 if (isKeepWhitespaceMode()) { 1052 FormTokenWithChars(Result, CurPtr, tok::unknown); 1053 return true; 1054 } 1055 1056 BufferPtr = CurPtr; 1057 return false; 1058 } 1059 1060 // Check to see if the first character after the '/*' is another /. If so, 1061 // then this slash does not end the block comment, it is part of it. 1062 if (C == '/') 1063 C = *CurPtr++; 1064 1065 while (1) { 1066 // Skip over all non-interesting characters until we find end of buffer or a 1067 // (probably ending) '/' character. 1068 if (CurPtr + 24 < BufferEnd) { 1069 // While not aligned to a 16-byte boundary. 1070 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 1071 C = *CurPtr++; 1072 1073 if (C == '/') goto FoundSlash; 1074 1075#ifdef __SSE2__ 1076 __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/', 1077 '/', '/', '/', '/', '/', '/', '/', '/'); 1078 while (CurPtr+16 <= BufferEnd && 1079 _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0) 1080 CurPtr += 16; 1081#elif __ALTIVEC__ 1082 __vector unsigned char Slashes = { 1083 '/', '/', '/', '/', '/', '/', '/', '/', 1084 '/', '/', '/', '/', '/', '/', '/', '/' 1085 }; 1086 while (CurPtr+16 <= BufferEnd && 1087 !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes)) 1088 CurPtr += 16; 1089#else 1090 // Scan for '/' quickly. Many block comments are very large. 1091 while (CurPtr[0] != '/' && 1092 CurPtr[1] != '/' && 1093 CurPtr[2] != '/' && 1094 CurPtr[3] != '/' && 1095 CurPtr+4 < BufferEnd) { 1096 CurPtr += 4; 1097 } 1098#endif 1099 1100 // It has to be one of the bytes scanned, increment to it and read one. 1101 C = *CurPtr++; 1102 } 1103 1104 // Loop to scan the remainder. 1105 while (C != '/' && C != '\0') 1106 C = *CurPtr++; 1107 1108 FoundSlash: 1109 if (C == '/') { 1110 if (CurPtr[-2] == '*') // We found the final */. We're done! 1111 break; 1112 1113 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 1114 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 1115 // We found the final */, though it had an escaped newline between the 1116 // * and /. We're done! 1117 break; 1118 } 1119 } 1120 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 1121 // If this is a /* inside of the comment, emit a warning. Don't do this 1122 // if this is a /*/, which will end the comment. This misses cases with 1123 // embedded escaped newlines, but oh well. 1124 if (!isLexingRawMode()) 1125 Diag(CurPtr-1, diag::warn_nested_block_comment); 1126 } 1127 } else if (C == 0 && CurPtr == BufferEnd+1) { 1128 if (!isLexingRawMode()) 1129 Diag(BufferPtr, diag::err_unterminated_block_comment); 1130 // Note: the user probably forgot a */. We could continue immediately 1131 // after the /*, but this would involve lexing a lot of what really is the 1132 // comment, which surely would confuse the parser. 1133 --CurPtr; 1134 1135 // KeepWhitespaceMode should return this broken comment as a token. Since 1136 // it isn't a well formed comment, just return it as an 'unknown' token. 1137 if (isKeepWhitespaceMode()) { 1138 FormTokenWithChars(Result, CurPtr, tok::unknown); 1139 return true; 1140 } 1141 1142 BufferPtr = CurPtr; 1143 return false; 1144 } 1145 C = *CurPtr++; 1146 } 1147 1148 // If we are returning comments as tokens, return this comment as a token. 1149 if (inKeepCommentMode()) { 1150 FormTokenWithChars(Result, CurPtr, tok::comment); 1151 return true; 1152 } 1153 1154 // It is common for the tokens immediately after a /**/ comment to be 1155 // whitespace. Instead of going through the big switch, handle it 1156 // efficiently now. This is safe even in KeepWhitespaceMode because we would 1157 // have already returned above with the comment as a token. 1158 if (isHorizontalWhitespace(*CurPtr)) { 1159 Result.setFlag(Token::LeadingSpace); 1160 SkipWhitespace(Result, CurPtr+1); 1161 return false; 1162 } 1163 1164 // Otherwise, just return so that the next character will be lexed as a token. 1165 BufferPtr = CurPtr; 1166 Result.setFlag(Token::LeadingSpace); 1167 return false; 1168} 1169 1170//===----------------------------------------------------------------------===// 1171// Primary Lexing Entry Points 1172//===----------------------------------------------------------------------===// 1173 1174/// ReadToEndOfLine - Read the rest of the current preprocessor line as an 1175/// uninterpreted string. This switches the lexer out of directive mode. 1176std::string Lexer::ReadToEndOfLine() { 1177 assert(ParsingPreprocessorDirective && ParsingFilename == false && 1178 "Must be in a preprocessing directive!"); 1179 std::string Result; 1180 Token Tmp; 1181 1182 // CurPtr - Cache BufferPtr in an automatic variable. 1183 const char *CurPtr = BufferPtr; 1184 while (1) { 1185 char Char = getAndAdvanceChar(CurPtr, Tmp); 1186 switch (Char) { 1187 default: 1188 Result += Char; 1189 break; 1190 case 0: // Null. 1191 // Found end of file? 1192 if (CurPtr-1 != BufferEnd) { 1193 // Nope, normal character, continue. 1194 Result += Char; 1195 break; 1196 } 1197 // FALL THROUGH. 1198 case '\r': 1199 case '\n': 1200 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 1201 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 1202 BufferPtr = CurPtr-1; 1203 1204 // Next, lex the character, which should handle the EOM transition. 1205 Lex(Tmp); 1206 assert(Tmp.is(tok::eom) && "Unexpected token!"); 1207 1208 // Finally, we're done, return the string we found. 1209 return Result; 1210 } 1211 } 1212} 1213 1214/// LexEndOfFile - CurPtr points to the end of this file. Handle this 1215/// condition, reporting diagnostics and handling other edge cases as required. 1216/// This returns true if Result contains a token, false if PP.Lex should be 1217/// called again. 1218bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 1219 // If we hit the end of the file while parsing a preprocessor directive, 1220 // end the preprocessor directive first. The next token returned will 1221 // then be the end of file. 1222 if (ParsingPreprocessorDirective) { 1223 // Done parsing the "line". 1224 ParsingPreprocessorDirective = false; 1225 // Update the location of token as well as BufferPtr. 1226 FormTokenWithChars(Result, CurPtr, tok::eom); 1227 1228 // Restore comment saving mode, in case it was disabled for directive. 1229 SetCommentRetentionState(PP->getCommentRetentionState()); 1230 return true; // Have a token. 1231 } 1232 1233 // If we are in raw mode, return this event as an EOF token. Let the caller 1234 // that put us in raw mode handle the event. 1235 if (isLexingRawMode()) { 1236 Result.startToken(); 1237 BufferPtr = BufferEnd; 1238 FormTokenWithChars(Result, BufferEnd, tok::eof); 1239 return true; 1240 } 1241 1242 // Otherwise, issue diagnostics for unterminated #if and missing newline. 1243 1244 // If we are in a #if directive, emit an error. 1245 while (!ConditionalStack.empty()) { 1246 PP->Diag(ConditionalStack.back().IfLoc, 1247 diag::err_pp_unterminated_conditional); 1248 ConditionalStack.pop_back(); 1249 } 1250 1251 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 1252 // a pedwarn. 1253 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 1254 Diag(BufferEnd, diag::ext_no_newline_eof) 1255 << CodeModificationHint::CreateInsertion(getSourceLocation(BufferEnd), 1256 "\n"); 1257 1258 BufferPtr = CurPtr; 1259 1260 // Finally, let the preprocessor handle this. 1261 return PP->HandleEndOfFile(Result); 1262} 1263 1264/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 1265/// the specified lexer will return a tok::l_paren token, 0 if it is something 1266/// else and 2 if there are no more tokens in the buffer controlled by the 1267/// lexer. 1268unsigned Lexer::isNextPPTokenLParen() { 1269 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 1270 1271 // Switch to 'skipping' mode. This will ensure that we can lex a token 1272 // without emitting diagnostics, disables macro expansion, and will cause EOF 1273 // to return an EOF token instead of popping the include stack. 1274 LexingRawMode = true; 1275 1276 // Save state that can be changed while lexing so that we can restore it. 1277 const char *TmpBufferPtr = BufferPtr; 1278 bool inPPDirectiveMode = ParsingPreprocessorDirective; 1279 1280 Token Tok; 1281 Tok.startToken(); 1282 LexTokenInternal(Tok); 1283 1284 // Restore state that may have changed. 1285 BufferPtr = TmpBufferPtr; 1286 ParsingPreprocessorDirective = inPPDirectiveMode; 1287 1288 // Restore the lexer back to non-skipping mode. 1289 LexingRawMode = false; 1290 1291 if (Tok.is(tok::eof)) 1292 return 2; 1293 return Tok.is(tok::l_paren); 1294} 1295 1296 1297/// LexTokenInternal - This implements a simple C family lexer. It is an 1298/// extremely performance critical piece of code. This assumes that the buffer 1299/// has a null character at the end of the file. Return true if an error 1300/// occurred and compilation should terminate, false if normal. This returns a 1301/// preprocessing token, not a normal token, as such, it is an internal 1302/// interface. It assumes that the Flags of result have been cleared before 1303/// calling this. 1304void Lexer::LexTokenInternal(Token &Result) { 1305LexNextToken: 1306 // New token, can't need cleaning yet. 1307 Result.clearFlag(Token::NeedsCleaning); 1308 Result.setIdentifierInfo(0); 1309 1310 // CurPtr - Cache BufferPtr in an automatic variable. 1311 const char *CurPtr = BufferPtr; 1312 1313 // Small amounts of horizontal whitespace is very common between tokens. 1314 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 1315 ++CurPtr; 1316 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 1317 ++CurPtr; 1318 1319 // If we are keeping whitespace and other tokens, just return what we just 1320 // skipped. The next lexer invocation will return the token after the 1321 // whitespace. 1322 if (isKeepWhitespaceMode()) { 1323 FormTokenWithChars(Result, CurPtr, tok::unknown); 1324 return; 1325 } 1326 1327 BufferPtr = CurPtr; 1328 Result.setFlag(Token::LeadingSpace); 1329 } 1330 1331 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 1332 1333 // Read a character, advancing over it. 1334 char Char = getAndAdvanceChar(CurPtr, Result); 1335 tok::TokenKind Kind; 1336 1337 switch (Char) { 1338 case 0: // Null. 1339 // Found end of file? 1340 if (CurPtr-1 == BufferEnd) { 1341 // Read the PP instance variable into an automatic variable, because 1342 // LexEndOfFile will often delete 'this'. 1343 Preprocessor *PPCache = PP; 1344 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 1345 return; // Got a token to return. 1346 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 1347 return PPCache->Lex(Result); 1348 } 1349 1350 if (!isLexingRawMode()) 1351 Diag(CurPtr-1, diag::null_in_file); 1352 Result.setFlag(Token::LeadingSpace); 1353 if (SkipWhitespace(Result, CurPtr)) 1354 return; // KeepWhitespaceMode 1355 1356 goto LexNextToken; // GCC isn't tail call eliminating. 1357 case '\n': 1358 case '\r': 1359 // If we are inside a preprocessor directive and we see the end of line, 1360 // we know we are done with the directive, so return an EOM token. 1361 if (ParsingPreprocessorDirective) { 1362 // Done parsing the "line". 1363 ParsingPreprocessorDirective = false; 1364 1365 // Restore comment saving mode, in case it was disabled for directive. 1366 SetCommentRetentionState(PP->getCommentRetentionState()); 1367 1368 // Since we consumed a newline, we are back at the start of a line. 1369 IsAtStartOfLine = true; 1370 1371 Kind = tok::eom; 1372 break; 1373 } 1374 // The returned token is at the start of the line. 1375 Result.setFlag(Token::StartOfLine); 1376 // No leading whitespace seen so far. 1377 Result.clearFlag(Token::LeadingSpace); 1378 1379 if (SkipWhitespace(Result, CurPtr)) 1380 return; // KeepWhitespaceMode 1381 goto LexNextToken; // GCC isn't tail call eliminating. 1382 case ' ': 1383 case '\t': 1384 case '\f': 1385 case '\v': 1386 SkipHorizontalWhitespace: 1387 Result.setFlag(Token::LeadingSpace); 1388 if (SkipWhitespace(Result, CurPtr)) 1389 return; // KeepWhitespaceMode 1390 1391 SkipIgnoredUnits: 1392 CurPtr = BufferPtr; 1393 1394 // If the next token is obviously a // or /* */ comment, skip it efficiently 1395 // too (without going through the big switch stmt). 1396 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 1397 Features.BCPLComment) { 1398 SkipBCPLComment(Result, CurPtr+2); 1399 goto SkipIgnoredUnits; 1400 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 1401 SkipBlockComment(Result, CurPtr+2); 1402 goto SkipIgnoredUnits; 1403 } else if (isHorizontalWhitespace(*CurPtr)) { 1404 goto SkipHorizontalWhitespace; 1405 } 1406 goto LexNextToken; // GCC isn't tail call eliminating. 1407 1408 // C99 6.4.4.1: Integer Constants. 1409 // C99 6.4.4.2: Floating Constants. 1410 case '0': case '1': case '2': case '3': case '4': 1411 case '5': case '6': case '7': case '8': case '9': 1412 // Notify MIOpt that we read a non-whitespace/non-comment token. 1413 MIOpt.ReadToken(); 1414 return LexNumericConstant(Result, CurPtr); 1415 1416 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 1417 // Notify MIOpt that we read a non-whitespace/non-comment token. 1418 MIOpt.ReadToken(); 1419 Char = getCharAndSize(CurPtr, SizeTmp); 1420 1421 // Wide string literal. 1422 if (Char == '"') 1423 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 1424 true); 1425 1426 // Wide character constant. 1427 if (Char == '\'') 1428 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 1429 // FALL THROUGH, treating L like the start of an identifier. 1430 1431 // C99 6.4.2: Identifiers. 1432 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 1433 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 1434 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 1435 case 'V': case 'W': case 'X': case 'Y': case 'Z': 1436 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 1437 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 1438 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 1439 case 'v': case 'w': case 'x': case 'y': case 'z': 1440 case '_': 1441 // Notify MIOpt that we read a non-whitespace/non-comment token. 1442 MIOpt.ReadToken(); 1443 return LexIdentifier(Result, CurPtr); 1444 1445 case '$': // $ in identifiers. 1446 if (Features.DollarIdents) { 1447 if (!isLexingRawMode()) 1448 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 1449 // Notify MIOpt that we read a non-whitespace/non-comment token. 1450 MIOpt.ReadToken(); 1451 return LexIdentifier(Result, CurPtr); 1452 } 1453 1454 Kind = tok::unknown; 1455 break; 1456 1457 // C99 6.4.4: Character Constants. 1458 case '\'': 1459 // Notify MIOpt that we read a non-whitespace/non-comment token. 1460 MIOpt.ReadToken(); 1461 return LexCharConstant(Result, CurPtr); 1462 1463 // C99 6.4.5: String Literals. 1464 case '"': 1465 // Notify MIOpt that we read a non-whitespace/non-comment token. 1466 MIOpt.ReadToken(); 1467 return LexStringLiteral(Result, CurPtr, false); 1468 1469 // C99 6.4.6: Punctuators. 1470 case '?': 1471 Kind = tok::question; 1472 break; 1473 case '[': 1474 Kind = tok::l_square; 1475 break; 1476 case ']': 1477 Kind = tok::r_square; 1478 break; 1479 case '(': 1480 Kind = tok::l_paren; 1481 break; 1482 case ')': 1483 Kind = tok::r_paren; 1484 break; 1485 case '{': 1486 Kind = tok::l_brace; 1487 break; 1488 case '}': 1489 Kind = tok::r_brace; 1490 break; 1491 case '.': 1492 Char = getCharAndSize(CurPtr, SizeTmp); 1493 if (Char >= '0' && Char <= '9') { 1494 // Notify MIOpt that we read a non-whitespace/non-comment token. 1495 MIOpt.ReadToken(); 1496 1497 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 1498 } else if (Features.CPlusPlus && Char == '*') { 1499 Kind = tok::periodstar; 1500 CurPtr += SizeTmp; 1501 } else if (Char == '.' && 1502 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 1503 Kind = tok::ellipsis; 1504 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1505 SizeTmp2, Result); 1506 } else { 1507 Kind = tok::period; 1508 } 1509 break; 1510 case '&': 1511 Char = getCharAndSize(CurPtr, SizeTmp); 1512 if (Char == '&') { 1513 Kind = tok::ampamp; 1514 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1515 } else if (Char == '=') { 1516 Kind = tok::ampequal; 1517 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1518 } else { 1519 Kind = tok::amp; 1520 } 1521 break; 1522 case '*': 1523 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 1524 Kind = tok::starequal; 1525 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1526 } else { 1527 Kind = tok::star; 1528 } 1529 break; 1530 case '+': 1531 Char = getCharAndSize(CurPtr, SizeTmp); 1532 if (Char == '+') { 1533 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1534 Kind = tok::plusplus; 1535 } else if (Char == '=') { 1536 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1537 Kind = tok::plusequal; 1538 } else { 1539 Kind = tok::plus; 1540 } 1541 break; 1542 case '-': 1543 Char = getCharAndSize(CurPtr, SizeTmp); 1544 if (Char == '-') { // -- 1545 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1546 Kind = tok::minusminus; 1547 } else if (Char == '>' && Features.CPlusPlus && 1548 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 1549 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1550 SizeTmp2, Result); 1551 Kind = tok::arrowstar; 1552 } else if (Char == '>') { // -> 1553 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1554 Kind = tok::arrow; 1555 } else if (Char == '=') { // -= 1556 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1557 Kind = tok::minusequal; 1558 } else { 1559 Kind = tok::minus; 1560 } 1561 break; 1562 case '~': 1563 Kind = tok::tilde; 1564 break; 1565 case '!': 1566 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 1567 Kind = tok::exclaimequal; 1568 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1569 } else { 1570 Kind = tok::exclaim; 1571 } 1572 break; 1573 case '/': 1574 // 6.4.9: Comments 1575 Char = getCharAndSize(CurPtr, SizeTmp); 1576 if (Char == '/') { // BCPL comment. 1577 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 1578 // want to lex this as a comment. There is one problem with this though, 1579 // that in one particular corner case, this can change the behavior of the 1580 // resultant program. For example, In "foo //**/ bar", C89 would lex 1581 // this as "foo / bar" and langauges with BCPL comments would lex it as 1582 // "foo". Check to see if the character after the second slash is a '*'. 1583 // If so, we will lex that as a "/" instead of the start of a comment. 1584 if (Features.BCPLComment || 1585 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') { 1586 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 1587 return; // KeepCommentMode 1588 1589 // It is common for the tokens immediately after a // comment to be 1590 // whitespace (indentation for the next line). Instead of going through 1591 // the big switch, handle it efficiently now. 1592 goto SkipIgnoredUnits; 1593 } 1594 } 1595 1596 if (Char == '*') { // /**/ comment. 1597 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 1598 return; // KeepCommentMode 1599 goto LexNextToken; // GCC isn't tail call eliminating. 1600 } 1601 1602 if (Char == '=') { 1603 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1604 Kind = tok::slashequal; 1605 } else { 1606 Kind = tok::slash; 1607 } 1608 break; 1609 case '%': 1610 Char = getCharAndSize(CurPtr, SizeTmp); 1611 if (Char == '=') { 1612 Kind = tok::percentequal; 1613 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1614 } else if (Features.Digraphs && Char == '>') { 1615 Kind = tok::r_brace; // '%>' -> '}' 1616 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1617 } else if (Features.Digraphs && Char == ':') { 1618 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1619 Char = getCharAndSize(CurPtr, SizeTmp); 1620 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 1621 Kind = tok::hashhash; // '%:%:' -> '##' 1622 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1623 SizeTmp2, Result); 1624 } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize 1625 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1626 if (!isLexingRawMode()) 1627 Diag(BufferPtr, diag::charize_microsoft_ext); 1628 Kind = tok::hashat; 1629 } else { // '%:' -> '#' 1630 // We parsed a # character. If this occurs at the start of the line, 1631 // it's actually the start of a preprocessing directive. Callback to 1632 // the preprocessor to handle it. 1633 // FIXME: -fpreprocessed mode?? 1634 if (Result.isAtStartOfLine() && !LexingRawMode) { 1635 FormTokenWithChars(Result, CurPtr, tok::hash); 1636 PP->HandleDirective(Result); 1637 1638 // As an optimization, if the preprocessor didn't switch lexers, tail 1639 // recurse. 1640 if (PP->isCurrentLexer(this)) { 1641 // Start a new token. If this is a #include or something, the PP may 1642 // want us starting at the beginning of the line again. If so, set 1643 // the StartOfLine flag. 1644 if (IsAtStartOfLine) { 1645 Result.setFlag(Token::StartOfLine); 1646 IsAtStartOfLine = false; 1647 } 1648 goto LexNextToken; // GCC isn't tail call eliminating. 1649 } 1650 1651 return PP->Lex(Result); 1652 } 1653 1654 Kind = tok::hash; 1655 } 1656 } else { 1657 Kind = tok::percent; 1658 } 1659 break; 1660 case '<': 1661 Char = getCharAndSize(CurPtr, SizeTmp); 1662 if (ParsingFilename) { 1663 return LexAngledStringLiteral(Result, CurPtr); 1664 } else if (Char == '<' && 1665 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') { 1666 Kind = tok::lesslessequal; 1667 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1668 SizeTmp2, Result); 1669 } else if (Char == '<') { 1670 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1671 Kind = tok::lessless; 1672 } else if (Char == '=') { 1673 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1674 Kind = tok::lessequal; 1675 } else if (Features.Digraphs && Char == ':') { // '<:' -> '[' 1676 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1677 Kind = tok::l_square; 1678 } else if (Features.Digraphs && Char == '%') { // '<%' -> '{' 1679 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1680 Kind = tok::l_brace; 1681 } else { 1682 Kind = tok::less; 1683 } 1684 break; 1685 case '>': 1686 Char = getCharAndSize(CurPtr, SizeTmp); 1687 if (Char == '=') { 1688 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1689 Kind = tok::greaterequal; 1690 } else if (Char == '>' && 1691 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') { 1692 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1693 SizeTmp2, Result); 1694 Kind = tok::greatergreaterequal; 1695 } else if (Char == '>') { 1696 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1697 Kind = tok::greatergreater; 1698 } else { 1699 Kind = tok::greater; 1700 } 1701 break; 1702 case '^': 1703 Char = getCharAndSize(CurPtr, SizeTmp); 1704 if (Char == '=') { 1705 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1706 Kind = tok::caretequal; 1707 } else { 1708 Kind = tok::caret; 1709 } 1710 break; 1711 case '|': 1712 Char = getCharAndSize(CurPtr, SizeTmp); 1713 if (Char == '=') { 1714 Kind = tok::pipeequal; 1715 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1716 } else if (Char == '|') { 1717 Kind = tok::pipepipe; 1718 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1719 } else { 1720 Kind = tok::pipe; 1721 } 1722 break; 1723 case ':': 1724 Char = getCharAndSize(CurPtr, SizeTmp); 1725 if (Features.Digraphs && Char == '>') { 1726 Kind = tok::r_square; // ':>' -> ']' 1727 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1728 } else if (Features.CPlusPlus && Char == ':') { 1729 Kind = tok::coloncolon; 1730 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1731 } else { 1732 Kind = tok::colon; 1733 } 1734 break; 1735 case ';': 1736 Kind = tok::semi; 1737 break; 1738 case '=': 1739 Char = getCharAndSize(CurPtr, SizeTmp); 1740 if (Char == '=') { 1741 Kind = tok::equalequal; 1742 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1743 } else { 1744 Kind = tok::equal; 1745 } 1746 break; 1747 case ',': 1748 Kind = tok::comma; 1749 break; 1750 case '#': 1751 Char = getCharAndSize(CurPtr, SizeTmp); 1752 if (Char == '#') { 1753 Kind = tok::hashhash; 1754 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1755 } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize 1756 Kind = tok::hashat; 1757 if (!isLexingRawMode()) 1758 Diag(BufferPtr, diag::charize_microsoft_ext); 1759 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1760 } else { 1761 // We parsed a # character. If this occurs at the start of the line, 1762 // it's actually the start of a preprocessing directive. Callback to 1763 // the preprocessor to handle it. 1764 // FIXME: -fpreprocessed mode?? 1765 if (Result.isAtStartOfLine() && !LexingRawMode) { 1766 FormTokenWithChars(Result, CurPtr, tok::hash); 1767 PP->HandleDirective(Result); 1768 1769 // As an optimization, if the preprocessor didn't switch lexers, tail 1770 // recurse. 1771 if (PP->isCurrentLexer(this)) { 1772 // Start a new token. If this is a #include or something, the PP may 1773 // want us starting at the beginning of the line again. If so, set 1774 // the StartOfLine flag. 1775 if (IsAtStartOfLine) { 1776 Result.setFlag(Token::StartOfLine); 1777 IsAtStartOfLine = false; 1778 } 1779 goto LexNextToken; // GCC isn't tail call eliminating. 1780 } 1781 return PP->Lex(Result); 1782 } 1783 1784 Kind = tok::hash; 1785 } 1786 break; 1787 1788 case '@': 1789 // Objective C support. 1790 if (CurPtr[-1] == '@' && Features.ObjC1) 1791 Kind = tok::at; 1792 else 1793 Kind = tok::unknown; 1794 break; 1795 1796 case '\\': 1797 // FIXME: UCN's. 1798 // FALL THROUGH. 1799 default: 1800 Kind = tok::unknown; 1801 break; 1802 } 1803 1804 // Notify MIOpt that we read a non-whitespace/non-comment token. 1805 MIOpt.ReadToken(); 1806 1807 // Update the location of token as well as BufferPtr. 1808 FormTokenWithChars(Result, CurPtr, Kind); 1809} 1810