Lexer.cpp revision 04a94bcc56438b17e88db592708324041f75d48c
//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//
//
// TODO: GCC Diagnostics emitted by the lexer:
// PEDWARN: (form feed|vertical tab) in preprocessing directive
//
// Universal characters, unicode, char mapping:
// WARNING: `%.*s' is not in NFKC
// WARNING: `%.*s' is not in NFC
//
// Other:
// TODO: Options to support:
//    -fexec-charset,-fwide-exec-charset
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Lexer.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstring>
using namespace clang;

// Defined below; on first call it sanity-checks the statically-initialized
// CharInfo classification table with asserts.
static void InitCharacterInfo();

//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  if (IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  IdentifierInfo *specId = getIdentifierInfo();
  // Tokens without an IdentifierInfo can never be ObjC keywords.
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}


//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

void Lexer::anchor() { }

// InitLexer - Shared initialization for all the constructors below.  BufStart
// is the start of the underlying buffer, BufPtr is where lexing begins (may be
// past BufStart), and BufEnd must point at a nul terminator.
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  InitCharacterInfo();

  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
      .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process.  This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
  : PreprocessorLexer(&PP, FID),
    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
    Features(PP.getLangOptions()) {

  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
            InputFile->getBufferEnd());

  // Default to keeping comments if the preprocessor wants them.
  SetCommentRetentionState(PP.getCommentRetentionState());
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
             const char *BufStart, const char *BufPtr, const char *BufEnd)
  : FileLoc(fileloc), Features(features) {

  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
             const SourceManager &SM, const LangOptions &features)
  : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {

  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
            FromFile->getBufferEnd());

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion.  This has a variety of magic semantics that this method
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
158/// 159/// On entrance to this routine, TokStartLoc is a macro location which has a 160/// spelling loc that indicates the bytes to be lexed for the token and an 161/// expansion location that indicates where all lexed tokens should be 162/// "expanded from". 163/// 164/// FIXME: It would really be nice to make _Pragma just be a wrapper around a 165/// normal lexer that remaps tokens as they fly by. This would require making 166/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 167/// interface that could handle this stuff. This would pull GetMappedTokenLoc 168/// out of the critical path of the lexer! 169/// 170Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 171 SourceLocation ExpansionLocStart, 172 SourceLocation ExpansionLocEnd, 173 unsigned TokLen, Preprocessor &PP) { 174 SourceManager &SM = PP.getSourceManager(); 175 176 // Create the lexer as if we were going to lex the file normally. 177 FileID SpellingFID = SM.getFileID(SpellingLoc); 178 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 179 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 180 181 // Now that the lexer is created, change the start/end locations so that we 182 // just lex the subsection of the file that we want. This is lexing from a 183 // scratch buffer. 184 const char *StrData = SM.getCharacterData(SpellingLoc); 185 186 L->BufferPtr = StrData; 187 L->BufferEnd = StrData+TokLen; 188 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 189 190 // Set the SourceLocation with the remapping information. This ensures that 191 // GetMappedTokenLoc will remap the tokens as they are lexed. 192 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 193 ExpansionLocStart, 194 ExpansionLocEnd, TokLen); 195 196 // Ensure that the lexer thinks it is inside a directive, so that end \n will 197 // return an EOD token. 198 L->ParsingPreprocessorDirective = true; 199 200 // This lexer really is for _Pragma. 
201 L->Is_PragmaLexer = true; 202 return L; 203} 204 205 206/// Stringify - Convert the specified string into a C string, with surrounding 207/// ""'s, and with escaped \ and " characters. 208std::string Lexer::Stringify(const std::string &Str, bool Charify) { 209 std::string Result = Str; 210 char Quote = Charify ? '\'' : '"'; 211 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 212 if (Result[i] == '\\' || Result[i] == Quote) { 213 Result.insert(Result.begin()+i, '\\'); 214 ++i; ++e; 215 } 216 } 217 return Result; 218} 219 220/// Stringify - Convert the specified string into a C string by escaping '\' 221/// and " characters. This does not add surrounding ""'s to the string. 222void Lexer::Stringify(SmallVectorImpl<char> &Str) { 223 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 224 if (Str[i] == '\\' || Str[i] == '"') { 225 Str.insert(Str.begin()+i, '\\'); 226 ++i; ++e; 227 } 228 } 229} 230 231//===----------------------------------------------------------------------===// 232// Token Spelling 233//===----------------------------------------------------------------------===// 234 235/// getSpelling() - Return the 'spelling' of this token. The spelling of a 236/// token are the characters used to represent the token in the source file 237/// after trigraph expansion and escaped-newline folding. In particular, this 238/// wants to get the true, uncanonicalized, spelling of things like digraphs 239/// UCNs, etc. 240StringRef Lexer::getSpelling(SourceLocation loc, 241 SmallVectorImpl<char> &buffer, 242 const SourceManager &SM, 243 const LangOptions &options, 244 bool *invalid) { 245 // Break down the source location. 246 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 247 248 // Try to the load the file buffer. 
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (invalid) *invalid = true;
    return StringRef();
  }

  const char *tokenBegin = file.data() + locInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
              file.begin(), tokenBegin, file.end());
  Token token;
  lexer.LexFromRawLexer(token);

  unsigned length = token.getLength();

  // Common case:  no need for cleaning.
  if (!token.needsCleaning())
    return StringRef(tokenBegin, length);

  // Hard case, we need to relex the characters into the string.
  buffer.clear();
  buffer.reserve(length);

  // Walk the raw token text character-by-character, folding away trigraphs
  // and escaped newlines as we go.
  for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) {
    unsigned charSize;
    buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options));
    ti += charSize;
  }

  return StringRef(buffer.data(), buffer.size());
}

/// getSpelling() - Return the 'spelling' of this token.  The spelling of a
/// token are the characters used to represent the token in the source file
/// after trigraph expansion and escaped-newline folding.  In particular, this
/// wants to get the true, uncanonicalized, spelling of things like digraphs
/// UCNs, etc.
std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
                               const LangOptions &Features, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  // If this token contains nothing interesting, return it directly.
  bool CharDataInvalid = false;
  const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
                                                    &CharDataInvalid);
  if (Invalid)
    *Invalid = CharDataInvalid;
  if (CharDataInvalid)
    return std::string();

  if (!Tok.needsCleaning())
    return std::string(TokStart, TokStart+Tok.getLength());

  std::string Result;
  Result.reserve(Tok.getLength());

  // Otherwise, hard case, relex the characters into the string.
  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
       Ptr != End; ) {
    unsigned CharSize;
    Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features));
    Ptr += CharSize;
  }
  // If cleaning was really required, folding trigraphs/escaped newlines must
  // have shortened the text, so the sizes must differ.
  assert(Result.size() != unsigned(Tok.getLength()) &&
         "NeedsCleaning flag set on something that didn't need cleaning!");
  return Result;
}

/// getSpelling - This method is used to get the spelling of a token into a
/// preallocated buffer, instead of as an std::string.  The caller is required
/// to allocate enough space for the token, which is guaranteed to be at least
/// Tok.getLength() bytes long.  The actual length of the token is returned.
///
/// Note that this method may do two possible things: it may either fill in
/// the buffer specified with characters, or it may *change the input pointer*
/// to point to a constant buffer with the data already in it (avoiding a
/// copy).  The caller is not allowed to modify the returned buffer pointer
/// if an internal buffer is returned.
unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
                            const SourceManager &SourceMgr,
                            const LangOptions &Features, bool *Invalid) {
  assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");

  const char *TokStart = 0;
  // NOTE: this has to be checked *before* testing for an IdentifierInfo.
  if (Tok.is(tok::raw_identifier))
    TokStart = Tok.getRawIdentifierData();
  else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
    // Just return the string from the identifier table, which is very quick.
    Buffer = II->getNameStart();
    return II->getLength();
  }

  // NOTE: this can be checked even after testing for an IdentifierInfo.
  if (Tok.isLiteral())
    TokStart = Tok.getLiteralData();

  if (TokStart == 0) {
    // Compute the start of the token in the input lexer buffer.
    bool CharDataInvalid = false;
    TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
    if (Invalid)
      *Invalid = CharDataInvalid;
    if (CharDataInvalid) {
      Buffer = "";
      return 0;
    }
  }

  // If this token contains nothing interesting, return it directly.
  if (!Tok.needsCleaning()) {
    Buffer = TokStart;
    return Tok.getLength();
  }

  // Otherwise, hard case, relex the characters into the string.
  // NOTE(review): this writes through the caller-supplied Buffer pointer, so
  // the caller must have pointed it at writable storage in this case.
  char *OutBuf = const_cast<char*>(Buffer);
  for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength();
       Ptr != End; ) {
    unsigned CharSize;
    *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features);
    Ptr += CharSize;
  }
  assert(unsigned(OutBuf-Buffer) != Tok.getLength() &&
         "NeedsCleaning flag set on something that didn't need cleaning!");

  return OutBuf-Buffer;
}



static bool isWhitespace(unsigned char c);

/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
                                   const SourceManager &SM,
                                   const LangOptions &LangOpts) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
  // all obviously single-char tokens.  This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name, not
  // the token this macro expanded to.
  Loc = SM.getExpansionLoc(Loc);
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return 0;

  const char *StrData = Buffer.data()+LocInfo.second;

  // A location pointing at whitespace cannot be the start of a token.
  if (isWhitespace(StrData[0]))
    return 0;

  // Create a lexer starting at the beginning of this token.
  Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
                 Buffer.begin(), StrData, Buffer.end());
  TheLexer.SetCommentRetentionState(true);
  Token TheTok;
  TheLexer.LexFromRawLexer(TheTok);
  return TheTok.getLength();
}

// getBeginningOfFileToken - Given a file location, back up to the start of the
// line and relex forward until we find the token containing Loc; return that
// token's start location.  Returns Loc unchanged on any failure or when Loc
// points into whitespace.
static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
                                              const SourceManager &SM,
                                              const LangOptions &LangOpts) {
  assert(Loc.isFileID());
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  if (LocInfo.first.isInvalid())
    return Loc;

  bool Invalid = false;
  StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
  if (Invalid)
    return Loc;

  // Back up from the current location until we hit the beginning of a line
  // (or the buffer).  We'll relex from that point.
  const char *BufStart = Buffer.data();
  if (LocInfo.second >= Buffer.size())
    return Loc;

  const char *StrData = BufStart+LocInfo.second;
  if (StrData[0] == '\n' || StrData[0] == '\r')
    return Loc;

  const char *LexStart = StrData;
  while (LexStart != BufStart) {
    if (LexStart[0] == '\n' || LexStart[0] == '\r') {
      ++LexStart;
      break;
    }

    --LexStart;
  }

  // Create a lexer starting at the beginning of this token.
  SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
  Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end());
  TheLexer.SetCommentRetentionState(true);

  // Lex tokens until we find the token that contains the source location.
  Token TheTok;
  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (TheLexer.getBufferLocation() > StrData) {
      // Lexing this token has taken the lexer past the source location we're
      // looking for. If the current token encompasses our source location,
      // return the beginning of that token.
      if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
        return TheTok.getLocation();

      // We ended up skipping over the source location entirely, which means
      // that it points into whitespace. We're done here.
      break;
    }
  } while (TheTok.getKind() != tok::eof);

  // We've passed our source location; just return the original source location.
  return Loc;
}

SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
                                          const SourceManager &SM,
                                          const LangOptions &LangOpts) {
  if (Loc.isFileID())
    return getBeginningOfFileToken(Loc, SM, LangOpts);

  if (!SM.isMacroArgExpansion(Loc))
    return Loc;

  // For a macro argument, rewind its spelling location to the token start and
  // apply the same offset to the macro location.
  SourceLocation FileLoc = SM.getSpellingLoc(Loc);
  SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
  std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
  std::pair<FileID, unsigned> BeginFileLocInfo= SM.getDecomposedLoc(BeginFileLoc);
  assert(FileLocInfo.first == BeginFileLocInfo.first &&
         FileLocInfo.second >= BeginFileLocInfo.second);
  return Loc.getLocWithOffset(SM.getDecomposedLoc(BeginFileLoc).second -
                              SM.getDecomposedLoc(FileLoc).second);
}

namespace {
  // Classification of a preprocessor directive while scanning a preamble.
  enum PreambleDirectiveKind {
    PDK_Skipped,    // Directive allowed in a preamble; keep scanning.
    PDK_StartIf,    // #if/#ifdef/#ifndef - opens a conditional region.
    PDK_EndIf,      // #endif - closes a conditional region.
    PDK_Unknown     // Unrecognized directive; ends the preamble.
  };
}

std::pair<unsigned, bool>
Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer,
                       const LangOptions &Features, unsigned MaxLines) {
  // Create a lexer starting at the beginning of the file. Note that we use a
  // "fake" file source location at offset 1 so that the lexer will track our
  // position within the file.
  const unsigned StartOffset = 1;
  SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset);
  Lexer TheLexer(StartLoc, Features, Buffer->getBufferStart(),
                 Buffer->getBufferStart(), Buffer->getBufferEnd());

  bool InPreprocessorDirective = false;
  Token TheTok;
  Token IfStartTok;
  unsigned IfCount = 0;

  // If a line limit was requested, translate it into a byte offset limit by
  // counting newlines from the start of the buffer.
  unsigned MaxLineOffset = 0;
  if (MaxLines) {
    const char *CurPtr = Buffer->getBufferStart();
    unsigned CurLine = 0;
    while (CurPtr != Buffer->getBufferEnd()) {
      char ch = *CurPtr++;
      if (ch == '\n') {
        ++CurLine;
        if (CurLine == MaxLines)
          break;
      }
    }
    if (CurPtr != Buffer->getBufferEnd())
      MaxLineOffset = CurPtr - Buffer->getBufferStart();
  }

  do {
    TheLexer.LexFromRawLexer(TheTok);

    if (InPreprocessorDirective) {
      // If we've hit the end of the file, we're done.
      if (TheTok.getKind() == tok::eof) {
        InPreprocessorDirective = false;
        break;
      }

      // If we haven't hit the end of the preprocessor directive, skip this
      // token.
      if (!TheTok.isAtStartOfLine())
        continue;

      // We've passed the end of the preprocessor directive, and will look
      // at this token again below.
      InPreprocessorDirective = false;
    }

    // Keep track of the # of lines in the preamble.
    if (TheTok.isAtStartOfLine()) {
      unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;

      // If we were asked to limit the number of lines in the preamble,
      // and we're about to exceed that limit, we're done.
      if (MaxLineOffset && TokOffset >= MaxLineOffset)
        break;
    }

    // Comments are okay; skip over them.
    if (TheTok.getKind() == tok::comment)
      continue;

    if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
      // This is the start of a preprocessor directive.
      Token HashTok = TheTok;
      InPreprocessorDirective = true;

      // Figure out which directive this is. Since we're lexing raw tokens,
      // we don't have an identifier table available. Instead, just look at
      // the raw identifier to recognize and categorize preprocessor directives.
      TheLexer.LexFromRawLexer(TheTok);
      if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
        StringRef Keyword(TheTok.getRawIdentifierData(),
                          TheTok.getLength());
        PreambleDirectiveKind PDK
          = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
              .Case("include", PDK_Skipped)
              .Case("__include_macros", PDK_Skipped)
              .Case("define", PDK_Skipped)
              .Case("undef", PDK_Skipped)
              .Case("line", PDK_Skipped)
              .Case("error", PDK_Skipped)
              .Case("pragma", PDK_Skipped)
              .Case("import", PDK_Skipped)
              .Case("include_next", PDK_Skipped)
              .Case("warning", PDK_Skipped)
              .Case("ident", PDK_Skipped)
              .Case("sccs", PDK_Skipped)
              .Case("assert", PDK_Skipped)
              .Case("unassert", PDK_Skipped)
              .Case("if", PDK_StartIf)
              .Case("ifdef", PDK_StartIf)
              .Case("ifndef", PDK_StartIf)
              .Case("elif", PDK_Skipped)
              .Case("else", PDK_Skipped)
              .Case("endif", PDK_EndIf)
              .Default(PDK_Unknown);

        switch (PDK) {
        case PDK_Skipped:
          continue;

        case PDK_StartIf:
          if (IfCount == 0)
            IfStartTok = HashTok;

          ++IfCount;
          continue;

        case PDK_EndIf:
          // Mismatched #endif. The preamble ends here.
          // (This 'break' only exits the switch; control then falls through
          // to the roll-back code below, which ends the scan at the '#'.)
          if (IfCount == 0)
            break;

          --IfCount;
          continue;

        case PDK_Unknown:
          // We don't know what this directive is; stop at the '#'.
          break;
        }
      }

      // We only end up here if we didn't recognize the preprocessor
      // directive or it was one that can't occur in the preamble at this
      // point. Roll back the current token to the location of the '#'.
      InPreprocessorDirective = false;
      TheTok = HashTok;
    }

    // We hit a token that we don't recognize as being in the
    // "preprocessing only" part of the file, so we're no longer in
    // the preamble.
    break;
  } while (true);

  // If we're still inside an unterminated #if block, the preamble ends at the
  // token that opened it; otherwise it ends at the token that stopped us.
  SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation();
  return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(),
                        IfCount? IfStartTok.isAtStartOfLine()
                               : TheTok.isAtStartOfLine());
}


/// AdvanceToTokenCharacter - Given a location that specifies the start of a
/// token, return a new location that specifies a character within the token.
SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart,
                                              unsigned CharNo,
                                              const SourceManager &SM,
                                              const LangOptions &Features) {
  // Figure out how many physical characters away the specified expansion
  // character is.  This needs to take into consideration newlines and
  // trigraphs.
  bool Invalid = false;
  const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);

  // If they request the first char of the token, we're trivially done.
  if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
    return TokStart;

  unsigned PhysOffset = 0;

  // The usual case is that tokens don't contain anything interesting.  Skip
  // over the uninteresting characters.  If a token only consists of simple
  // chars, this method is extremely fast.
  while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
    if (CharNo == 0)
      return TokStart.getLocWithOffset(PhysOffset);
    ++TokPtr, --CharNo, ++PhysOffset;
  }

  // If we have a character that may be a trigraph or escaped newline, use a
  // lexer to parse it correctly.
  for (; CharNo; --CharNo) {
    unsigned Size;
    Lexer::getCharAndSizeNoWarn(TokPtr, Size, Features);
    TokPtr += Size;
    PhysOffset += Size;
  }

  // Final detail: if we end up on an escaped newline, we want to return the
  // location of the actual byte of the token.  For example foo\<newline>bar
  // advanced by 3 should return the location of b, not of \\.  One compounding
  // detail of this is that the escape may be made by a trigraph.
  if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
    PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;

  return TokStart.getLocWithOffset(PhysOffset);
}

/// \brief Computes the source location just past the end of the
/// token at this source location.
///
/// This routine can be used to produce a source location that
/// points just past the end of the token referenced by \p Loc, and
/// is generally used when a diagnostic needs to point just after a
/// token where it expected something different that it received. If
/// the returned source location would not be meaningful (e.g., if
/// it points into a macro), this routine returns an invalid
/// source location.
///
/// \param Offset an offset from the end of the token, where the source
/// location should refer to. The default offset (0) produces a source
/// location pointing just past the end of the token; an offset of 1 produces
/// a source location pointing to the last character in the token, etc.
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &Features) {
  if (Loc.isInvalid())
    return SourceLocation();

  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, Features))
      return SourceLocation(); // Points inside the macro expansion.

    // Continue and find the location just after the macro expansion.
    Loc = SM.getExpansionRange(Loc).second;
  }

  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, Features);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;  // Offset consumes the whole token; stay at its start.

  return Loc.getLocWithOffset(Len);
}

/// \brief Returns true if the given MacroID location points at the first
/// token of the macro expansion.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc);
  // FIXME: If the token comes from the macro token paste operator ('##')
  // this function will always return false;
  if (infoLoc.second > 0)
    return false; // Does not point at the start of token.

  // Recurse through nested expansions until we reach a file location.
  SourceLocation expansionLoc =
    SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart();
  if (expansionLoc.isFileID())
    return true; // No other macro expansions, this is the first.

  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts);
}

/// \brief Returns true if the given MacroID location points at the last
/// token of the macro expansion.
762bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 763 const SourceManager &SM, 764 const LangOptions &LangOpts) { 765 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 766 767 SourceLocation spellLoc = SM.getSpellingLoc(loc); 768 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 769 if (tokLen == 0) 770 return false; 771 772 FileID FID = SM.getFileID(loc); 773 SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1); 774 if (SM.isInFileID(afterLoc, FID)) 775 return false; // Still in the same FileID, does not point to the last token. 776 777 // FIXME: If the token comes from the macro token paste operator ('##') 778 // or the stringify operator ('#') this function will always return false; 779 780 SourceLocation expansionLoc = 781 SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd(); 782 if (expansionLoc.isFileID()) 783 return true; // No other macro expansions. 784 785 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts); 786} 787 788//===----------------------------------------------------------------------===// 789// Character information. 790//===----------------------------------------------------------------------===// 791 792enum { 793 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 794 CHAR_VERT_WS = 0x02, // '\r', '\n' 795 CHAR_LETTER = 0x04, // a-z,A-Z 796 CHAR_NUMBER = 0x08, // 0-9 797 CHAR_UNDER = 0x10, // _ 798 CHAR_PERIOD = 0x20, // . 
  CHAR_RAWDEL  = 0x40  // {}[]#<>%:;?*+-/^&|~!=,"'
};

// Statically initialize CharInfo table based on ASCII character set
// Reference: FreeBSD 7.2 /usr/share/misc/ascii
// Only indices 0-127 are spelled out; aggregate initialization zero-fills
// the remaining 128 entries, so all high-bit (non-ASCII) bytes map to 0.
static const unsigned char CharInfo[256] =
{
// 0 NUL    1 SOH    2 STX    3 ETX
// 4 EOT    5 ENQ    6 ACK    7 BEL
   0           , 0           , 0           , 0           ,
   0           , 0           , 0           , 0           ,
// 8 BS     9 HT    10 NL    11 VT
//12 NP    13 CR    14 SO    15 SI
   0           , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
   CHAR_HORZ_WS, CHAR_VERT_WS, 0           , 0           ,
//16 DLE   17 DC1   18 DC2   19 DC3
//20 DC4   21 NAK   22 SYN   23 ETB
   0           , 0           , 0           , 0           ,
   0           , 0           , 0           , 0           ,
//24 CAN   25 EM    26 SUB   27 ESC
//28 FS    29 GS    30 RS    31 US
   0           , 0           , 0           , 0           ,
   0           , 0           , 0           , 0           ,
//32 SP    33  !    34  "    35  #
//36  $    37  %    38  &    39  '
   CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//40  (    41  )    42  *    43  +
//44  ,    45  -    46  .    47  /
   0           , 0           , CHAR_RAWDEL , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
//48  0    49  1    50  2    51  3
//52  4    53  5    54  6    55  7
   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
//56  8    57  9    58  :    59  ;
//60  <    61  =    62  >    63  ?
   CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//64  @    65  A    66  B    67  C
//68  D    69  E    70  F    71  G
   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//72  H    73  I    74  J    75  K
//76  L    77  M    78  N    79  O
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//80  P    81  Q    82  R    83  S
//84  T    85  U    86  V    87  W
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//88  X    89  Y    90  Z    91  [
//92  \    93  ]    94  ^    95  _
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
//96  `    97  a    98  b    99  c
//100 d   101  e   102  f   103  g
   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//104 h   105  i   106  j   107  k
//108 l   109  m   110  n   111  o
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//112 p   113  q   114  r   115  s
//116 t   117  u   118  v   119  w
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//120 x   121  y   122  z   123  {
//124 |   125  }   126  ~   127 DEL
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
};

/// InitCharacterInfo - Cross-check the statically-initialized CharInfo table
/// against the character classes the lexer relies on. The checks are asserts,
/// so in a release (NDEBUG) build this degenerates to setting the guard flag.
/// Idempotent: the first call does the checking, later calls return at once.
static void InitCharacterInfo() {
  static bool isInited = false;
  if (isInited) return;
  // Check the statically-initialized CharInfo table.
  assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
  assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
  assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
  assert(CHAR_UNDER   == CharInfo[(int)'_']);
  assert(CHAR_PERIOD  == CharInfo[(int)'.']);
  for (unsigned i = 'a'; i <= 'z'; ++i) {
    assert(CHAR_LETTER == CharInfo[i]);
    // 'A'..'Z' mirror 'a'..'z' at a fixed ASCII offset.
    assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
  }
  for (unsigned i = '0'; i <= '9'; ++i)
    assert(CHAR_NUMBER == CharInfo[i]);

  isInited = true;
}


/// isIdentifierBody - Return true if this is the body character of an
/// identifier, which is [a-zA-Z0-9_].
static inline bool isIdentifierBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
}

/// isHorizontalWhitespace - Return true if this character is horizontal
/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
static inline bool isHorizontalWhitespace(unsigned char c) {
  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
}

/// isVerticalWhitespace - Return true if this character is vertical
/// whitespace: '\n', '\r'.  Note that this returns false for '\0'.
static inline bool isVerticalWhitespace(unsigned char c) {
  return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
}

/// isWhitespace - Return true if this character is horizontal or vertical
/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns
/// false for '\0'.
static inline bool isWhitespace(unsigned char c) {
  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
}

/// isNumberBody - Return true if this is the body character of a
/// preprocessing number, which is [a-zA-Z0-9_.].
static inline bool isNumberBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
    true : false;
}

/// isRawStringDelimBody - Return true if this is the body character of a
/// raw string delimiter (the d-char-sequence in R"delim(...)delim").
static inline bool isRawStringDelimBody(unsigned char c) {
  return (CharInfo[c] &
          (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
    true : false;
}


//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all expanded at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  std::pair<SourceLocation,SourceLocation> II =
    SM.getImmediateExpansionRange(FileLoc);

  return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics.  This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.  Returns '\0' when CP is not a
/// trigraph letter or when trigraphs are disabled.  Diagnostics are
/// suppressed when L is null or in raw-lexing mode.
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res || !L) return Res;

  if (!L->getFeatures().Trigraphs) {
    if (!L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0;
  }

  if (!L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline.  P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else in the whitespace
  // run with no newline at all.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (1) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape (??/ == '\\'), bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}

/// \brief Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
                                        tok::TokenKind TKind,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts,
                                        bool SkipTrailingWhitespaceAndNewLine) {
  // A macro location is only acceptable if it is the very end of an
  // expansion; otherwise there is no meaningful "after" position.
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts))
      return SourceLocation();
    Loc = SM.getExpansionRange(Loc).second;
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  llvm::StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return SourceLocation();

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location with a raw lexer.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  if (Tok.isNot(TKind))
    return SourceLocation();
  SourceLocation TokenLoc = Tok.getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) +
                           Tok.getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }
    // Consume at most one vertical whitespace character after the run.
    if (isVerticalWhitespace(C))
      NumWhitespaceChars++;
  }

  return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      // (Only when forming a real token and not raw-lexing.)
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // If the char that we finally got was a \n, then we must have had
      // something like \<newline><newline>.  We don't want to consume the
      // second newline.
      if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
        return ' ';

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // ??/ decodes to '\\', which may in turn start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}


/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.  This variant never emits diagnostics and never
/// marks a token as needing cleaning.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &Features) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // If the char that we finally got was a \n, then we must have had
      // something like \<newline><newline>.  We don't want to consume the
      // second newline.
      if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
        return ' ';

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// \brief Routine that indiscriminately skips bytes in the source file.
void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
  // Advance blindly, clamping at the end of the buffer.
  BufferPtr += Bytes;
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  IsAtStartOfLine = StartOfLine;
}

/// LexIdentifier - Lex the remainder of an identifier.  On entry, the first
/// character [_A-Za-z$] has already been consumed; CurPtr points just past it.
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  // FIXME: UCNs.
  //
  // TODO: Could merge these checks into a CharInfo flag to make the comparison
  // cheaper
  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      PP->HandleIdentifier(Result);

    return;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path that uses
  // getCharAndSize to decode trigraphs/escaped newlines character by
  // character.

  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!Features.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
      // Found end of identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) { // FIXME: UCNs.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}

/// isHexaLiteral - Return true if Start points to a hex constant.
/// Used in Microsoft mode (where "0x1234e+1" is supposed to be lexed as
/// several different tokens).
static bool isHexaLiteral(const char *Start, const LangOptions &Features) {
  unsigned Size;
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, Features);
  if (C1 != '0')
    return false;
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, Features);
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of an integer or floating point
/// constant.  From[-1] is the first character lexed.  Return the end of the
/// constant.
void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isNumberBody(C)) { // FIXME: UCNs?
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!Features.MicrosoftExt || !isHexaLiteral(BufferPtr, Features))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant (e.g. 0x1p-10), continue past the exponent
  // sign.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p'))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
}

/// LexStringLiteral - Lex the remainder of a string literal, after having
/// lexed either " or L" or u8" or u" or U".
void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
                             tok::TokenKind Kind) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  if (!isLexingRawMode() &&
      (Kind == tok::utf8_string_literal ||
       Kind == tok::utf16_string_literal ||
       Kind == tok::utf32_string_literal))
    Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !Features.AsmPreprocessor)
        Diag(BufferPtr, diag::warn_unterminated_string);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    }

    if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        return cutOffLexing();
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// LexRawStringLiteral - Lex the remainder of a raw string literal, after
/// having lexed R", LR", u8R", uR", or UR".
void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
                                tok::TokenKind Kind) {
  // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
  //  Between the initial and final double quote characters of the raw string,
  //  any transformations performed in phases 1 and 2 (trigraphs,
  //  universal-character-names, and line splicing) are reverted.

  if (!isLexingRawMode())
    Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);

  unsigned PrefixLen = 0;

  // The delimiter (d-char-sequence) is at most 16 characters long.
  while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
    ++PrefixLen;

  // If the last character was not a '(', then we didn't lex a valid delimiter.
  if (CurPtr[PrefixLen] != '(') {
    if (!isLexingRawMode()) {
      const char *PrefixEnd = &CurPtr[PrefixLen];
      if (PrefixLen == 16) {
        Diag(PrefixEnd, diag::err_raw_delim_too_long);
      } else {
        Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
          << StringRef(PrefixEnd, 1);
      }
    }

    // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
    // it's possible the '"' was intended to be part of the raw string, but
    // there's not much we can do about that.
    while (1) {
      char C = *CurPtr++;

      if (C == '"')
        break;
      if (C == 0 && CurPtr-1 == BufferEnd) {
        --CurPtr;
        break;
      }
    }

    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  }

  // Save prefix and move CurPtr past it
  const char *Prefix = CurPtr;
  CurPtr += PrefixLen + 1; // skip over prefix and '('

  // Scan for the matching )delimiter" terminator.
  while (1) {
    char C = *CurPtr++;

    if (C == ')') {
      // Check for prefix match and closing quote.
      if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
        CurPtr += PrefixLen + 1; // skip over prefix and '"'
        break;
      }
    } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_raw_string)
          << StringRef(Prefix, PrefixLen);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    }
  }

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||               // Newline.
               (C == 0 && (CurPtr-1 == BufferEnd ||    // End of file.
                           isCodeCompletionPoint(CurPtr-1)))) {
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return;
    } else if (C == 0) {
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  Result.setLiteralData(TokStart);
}


/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L' or u' or U'.
void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
                            tok::TokenKind Kind) {
  const char *NulCharacter = 0; // Does this character contain the \0 character?

  if (!isLexingRawMode() &&
      (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant))
    Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal);

  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant.
    if (!isLexingRawMode() && !Features.AsmPreprocessor)
      Diag(BufferPtr, diag::err_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      // FIXME: UCN's
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode() && !Features.AsmPreprocessor)
        Diag(BufferPtr, diag::warn_unterminated_char);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    } else if (C == 0) {
      if (isCodeCompletionPoint(CurPtr-1)) {
        PP->CodeCompleteNaturalLanguage();
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        return cutOffLexing();
      }

      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, Kind);
  Result.setLiteralData(TokStart);
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is
/// enabled.
///
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
  // Whitespace - Skip it, then return the token after the whitespace.
  unsigned char Char = *CurPtr;  // Skip consecutive spaces efficiently.
  while (1) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (Char != '\n' && Char != '\r')
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // OK, but handle newline.
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);
    Char = *++CurPtr;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  if (PrevChar != '\n' && PrevChar != '\r')
    Result.setFlag(Token::LeadingSpace);

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// SkipBCPLComment - We have just read the // characters from input.  Skip
/// until we find the newline character that terminates the comment.  Then
/// update BufferPtr and return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
  // If BCPL comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!Features.BCPLComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_bcpl_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    Features.BCPLComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is
  // that the comment contains normal ascii characters with nothing interesting
  // in them.  As such, optimize for this case with the inner loop.
  char C;
  do {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
        --EscapePtr;

      if (*EscapePtr == '\\') // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?') // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      C = *CurPtr;
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a
          // diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr)) // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
          break;
        }
    }

    // getAndAdvanceChar can advance one past the end on EOF; clamp back.
    if (CurPtr == BufferEnd+1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

  } while (C != '\n' && C != '\r');

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveBCPLComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
/// an appropriate way and return it.
bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective)
    return true;

  // If this BCPL-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(&Spelling[0], Spelling.size(), Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified
/// newline character (either \n or \r) is part of an escaped newline
/// sequence.  Issue a diagnostic if so.  We know that the newline is inside
/// of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // Only the sequence "*\<newline>" can end the comment this way.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getFeatures().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
1879bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { 1880 // Scan one character past where we should, looking for a '/' character. Once 1881 // we find it, check to see if it was preceded by a *. This common 1882 // optimization helps people who like to put a lot of * characters in their 1883 // comments. 1884 1885 // The first character we get with newlines and trigraphs skipped to handle 1886 // the degenerate /*/ case below correctly if the * has an escaped newline 1887 // after it. 1888 unsigned CharSize; 1889 unsigned char C = getCharAndSize(CurPtr, CharSize); 1890 CurPtr += CharSize; 1891 if (C == 0 && CurPtr == BufferEnd+1) { 1892 if (!isLexingRawMode()) 1893 Diag(BufferPtr, diag::err_unterminated_block_comment); 1894 --CurPtr; 1895 1896 // KeepWhitespaceMode should return this broken comment as a token. Since 1897 // it isn't a well formed comment, just return it as an 'unknown' token. 1898 if (isKeepWhitespaceMode()) { 1899 FormTokenWithChars(Result, CurPtr, tok::unknown); 1900 return true; 1901 } 1902 1903 BufferPtr = CurPtr; 1904 return false; 1905 } 1906 1907 // Check to see if the first character after the '/*' is another /. If so, 1908 // then this slash does not end the block comment, it is part of it. 1909 if (C == '/') 1910 C = *CurPtr++; 1911 1912 while (1) { 1913 // Skip over all non-interesting characters until we find end of buffer or a 1914 // (probably ending) '/' character. 1915 if (CurPtr + 24 < BufferEnd && 1916 // If there is a code-completion point avoid the fast scan because it 1917 // doesn't check for '\0'. 1918 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 1919 // While not aligned to a 16-byte boundary. 
1920 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 1921 C = *CurPtr++; 1922 1923 if (C == '/') goto FoundSlash; 1924 1925#ifdef __SSE2__ 1926 __m128i Slashes = _mm_set1_epi8('/'); 1927 while (CurPtr+16 <= BufferEnd) { 1928 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)); 1929 if (cmp != 0) { 1930 // Adjust the pointer to point directly after the first slash. It's 1931 // not necessary to set C here, it will be overwritten at the end of 1932 // the outer loop. 1933 CurPtr += llvm::CountTrailingZeros_32(cmp) + 1; 1934 goto FoundSlash; 1935 } 1936 CurPtr += 16; 1937 } 1938#elif __ALTIVEC__ 1939 __vector unsigned char Slashes = { 1940 '/', '/', '/', '/', '/', '/', '/', '/', 1941 '/', '/', '/', '/', '/', '/', '/', '/' 1942 }; 1943 while (CurPtr+16 <= BufferEnd && 1944 !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes)) 1945 CurPtr += 16; 1946#else 1947 // Scan for '/' quickly. Many block comments are very large. 1948 while (CurPtr[0] != '/' && 1949 CurPtr[1] != '/' && 1950 CurPtr[2] != '/' && 1951 CurPtr[3] != '/' && 1952 CurPtr+4 < BufferEnd) { 1953 CurPtr += 4; 1954 } 1955#endif 1956 1957 // It has to be one of the bytes scanned, increment to it and read one. 1958 C = *CurPtr++; 1959 } 1960 1961 // Loop to scan the remainder. 1962 while (C != '/' && C != '\0') 1963 C = *CurPtr++; 1964 1965 if (C == '/') { 1966 FoundSlash: 1967 if (CurPtr[-2] == '*') // We found the final */. We're done! 1968 break; 1969 1970 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 1971 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 1972 // We found the final */, though it had an escaped newline between the 1973 // * and /. We're done! 1974 break; 1975 } 1976 } 1977 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 1978 // If this is a /* inside of the comment, emit a warning. Don't do this 1979 // if this is a /*/, which will end the comment. This misses cases with 1980 // embedded escaped newlines, but oh well. 
1981 if (!isLexingRawMode()) 1982 Diag(CurPtr-1, diag::warn_nested_block_comment); 1983 } 1984 } else if (C == 0 && CurPtr == BufferEnd+1) { 1985 if (!isLexingRawMode()) 1986 Diag(BufferPtr, diag::err_unterminated_block_comment); 1987 // Note: the user probably forgot a */. We could continue immediately 1988 // after the /*, but this would involve lexing a lot of what really is the 1989 // comment, which surely would confuse the parser. 1990 --CurPtr; 1991 1992 // KeepWhitespaceMode should return this broken comment as a token. Since 1993 // it isn't a well formed comment, just return it as an 'unknown' token. 1994 if (isKeepWhitespaceMode()) { 1995 FormTokenWithChars(Result, CurPtr, tok::unknown); 1996 return true; 1997 } 1998 1999 BufferPtr = CurPtr; 2000 return false; 2001 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2002 PP->CodeCompleteNaturalLanguage(); 2003 cutOffLexing(); 2004 return false; 2005 } 2006 2007 C = *CurPtr++; 2008 } 2009 2010 // Notify comment handlers about the comment unless we're in a #if 0 block. 2011 if (PP && !isLexingRawMode() && 2012 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2013 getSourceLocation(CurPtr)))) { 2014 BufferPtr = CurPtr; 2015 return true; // A token has to be returned. 2016 } 2017 2018 // If we are returning comments as tokens, return this comment as a token. 2019 if (inKeepCommentMode()) { 2020 FormTokenWithChars(Result, CurPtr, tok::comment); 2021 return true; 2022 } 2023 2024 // It is common for the tokens immediately after a /**/ comment to be 2025 // whitespace. Instead of going through the big switch, handle it 2026 // efficiently now. This is safe even in KeepWhitespaceMode because we would 2027 // have already returned above with the comment as a token. 
2028 if (isHorizontalWhitespace(*CurPtr)) { 2029 Result.setFlag(Token::LeadingSpace); 2030 SkipWhitespace(Result, CurPtr+1); 2031 return false; 2032 } 2033 2034 // Otherwise, just return so that the next character will be lexed as a token. 2035 BufferPtr = CurPtr; 2036 Result.setFlag(Token::LeadingSpace); 2037 return false; 2038} 2039 2040//===----------------------------------------------------------------------===// 2041// Primary Lexing Entry Points 2042//===----------------------------------------------------------------------===// 2043 2044/// ReadToEndOfLine - Read the rest of the current preprocessor line as an 2045/// uninterpreted string. This switches the lexer out of directive mode. 2046std::string Lexer::ReadToEndOfLine() { 2047 assert(ParsingPreprocessorDirective && ParsingFilename == false && 2048 "Must be in a preprocessing directive!"); 2049 std::string Result; 2050 Token Tmp; 2051 2052 // CurPtr - Cache BufferPtr in an automatic variable. 2053 const char *CurPtr = BufferPtr; 2054 while (1) { 2055 char Char = getAndAdvanceChar(CurPtr, Tmp); 2056 switch (Char) { 2057 default: 2058 Result += Char; 2059 break; 2060 case 0: // Null. 2061 // Found end of file? 2062 if (CurPtr-1 != BufferEnd) { 2063 if (isCodeCompletionPoint(CurPtr-1)) { 2064 PP->CodeCompleteNaturalLanguage(); 2065 cutOffLexing(); 2066 return Result; 2067 } 2068 2069 // Nope, normal character, continue. 2070 Result += Char; 2071 break; 2072 } 2073 // FALL THROUGH. 2074 case '\r': 2075 case '\n': 2076 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 2077 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 2078 BufferPtr = CurPtr-1; 2079 2080 // Next, lex the character, which should handle the EOD transition. 2081 Lex(Tmp); 2082 if (Tmp.is(tok::code_completion)) { 2083 if (PP) 2084 PP->CodeCompleteNaturalLanguage(); 2085 Lex(Tmp); 2086 } 2087 assert(Tmp.is(tok::eod) && "Unexpected token!"); 2088 2089 // Finally, we're done, return the string we found. 
2090 return Result; 2091 } 2092 } 2093} 2094 2095/// LexEndOfFile - CurPtr points to the end of this file. Handle this 2096/// condition, reporting diagnostics and handling other edge cases as required. 2097/// This returns true if Result contains a token, false if PP.Lex should be 2098/// called again. 2099bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 2100 // If we hit the end of the file while parsing a preprocessor directive, 2101 // end the preprocessor directive first. The next token returned will 2102 // then be the end of file. 2103 if (ParsingPreprocessorDirective) { 2104 // Done parsing the "line". 2105 ParsingPreprocessorDirective = false; 2106 // Update the location of token as well as BufferPtr. 2107 FormTokenWithChars(Result, CurPtr, tok::eod); 2108 2109 // Restore comment saving mode, in case it was disabled for directive. 2110 SetCommentRetentionState(PP->getCommentRetentionState()); 2111 return true; // Have a token. 2112 } 2113 2114 // If we are in raw mode, return this event as an EOF token. Let the caller 2115 // that put us in raw mode handle the event. 2116 if (isLexingRawMode()) { 2117 Result.startToken(); 2118 BufferPtr = BufferEnd; 2119 FormTokenWithChars(Result, BufferEnd, tok::eof); 2120 return true; 2121 } 2122 2123 // Issue diagnostics for unterminated #if and missing newline. 2124 2125 // If we are in a #if directive, emit an error. 2126 while (!ConditionalStack.empty()) { 2127 if (PP->getCodeCompletionFileLoc() != FileLoc) 2128 PP->Diag(ConditionalStack.back().IfLoc, 2129 diag::err_pp_unterminated_conditional); 2130 ConditionalStack.pop_back(); 2131 } 2132 2133 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 2134 // a pedwarn. 
2135 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 2136 Diag(BufferEnd, diag::ext_no_newline_eof) 2137 << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); 2138 2139 BufferPtr = CurPtr; 2140 2141 // Finally, let the preprocessor handle this. 2142 return PP->HandleEndOfFile(Result); 2143} 2144 2145/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 2146/// the specified lexer will return a tok::l_paren token, 0 if it is something 2147/// else and 2 if there are no more tokens in the buffer controlled by the 2148/// lexer. 2149unsigned Lexer::isNextPPTokenLParen() { 2150 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 2151 2152 // Switch to 'skipping' mode. This will ensure that we can lex a token 2153 // without emitting diagnostics, disables macro expansion, and will cause EOF 2154 // to return an EOF token instead of popping the include stack. 2155 LexingRawMode = true; 2156 2157 // Save state that can be changed while lexing so that we can restore it. 2158 const char *TmpBufferPtr = BufferPtr; 2159 bool inPPDirectiveMode = ParsingPreprocessorDirective; 2160 2161 Token Tok; 2162 Tok.startToken(); 2163 LexTokenInternal(Tok); 2164 2165 // Restore state that may have changed. 2166 BufferPtr = TmpBufferPtr; 2167 ParsingPreprocessorDirective = inPPDirectiveMode; 2168 2169 // Restore the lexer back to non-skipping mode. 2170 LexingRawMode = false; 2171 2172 if (Tok.is(tok::eof)) 2173 return 2; 2174 return Tok.is(tok::l_paren); 2175} 2176 2177/// FindConflictEnd - Find the end of a version control conflict marker. 2178static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 2179 ConflictMarkerKind CMK) { 2180 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 2181 size_t TermLen = CMK == CMK_Perforce ? 
5 : 7; 2182 StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen); 2183 size_t Pos = RestOfBuffer.find(Terminator); 2184 while (Pos != StringRef::npos) { 2185 // Must occur at start of line. 2186 if (RestOfBuffer[Pos-1] != '\r' && 2187 RestOfBuffer[Pos-1] != '\n') { 2188 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 2189 Pos = RestOfBuffer.find(Terminator); 2190 continue; 2191 } 2192 return RestOfBuffer.data()+Pos; 2193 } 2194 return 0; 2195} 2196 2197/// IsStartOfConflictMarker - If the specified pointer is the start of a version 2198/// control conflict marker like '<<<<<<<', recognize it as such, emit an error 2199/// and recover nicely. This returns true if it is a conflict marker and false 2200/// if not. 2201bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 2202 // Only a conflict marker if it starts at the beginning of a line. 2203 if (CurPtr != BufferStart && 2204 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2205 return false; 2206 2207 // Check to see if we have <<<<<<< or >>>>. 2208 if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") && 2209 (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> ")) 2210 return false; 2211 2212 // If we have a situation where we don't care about conflict markers, ignore 2213 // it. 2214 if (CurrentConflictMarkerState || isLexingRawMode()) 2215 return false; 2216 2217 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 2218 2219 // Check to see if there is an ending marker somewhere in the buffer at the 2220 // start of a line to terminate this conflict marker. 2221 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 2222 // We found a match. We are really in a conflict marker. 2223 // Diagnose this, and ignore to the end of line. 2224 Diag(CurPtr, diag::err_conflict_marker); 2225 CurrentConflictMarkerState = Kind; 2226 2227 // Skip ahead to the end of line. We know this exists because the 2228 // end-of-conflict marker starts with \r or \n. 
2229 while (*CurPtr != '\r' && *CurPtr != '\n') { 2230 assert(CurPtr != BufferEnd && "Didn't find end of line"); 2231 ++CurPtr; 2232 } 2233 BufferPtr = CurPtr; 2234 return true; 2235 } 2236 2237 // No end of conflict marker found. 2238 return false; 2239} 2240 2241 2242/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 2243/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 2244/// is the end of a conflict marker. Handle it by ignoring up until the end of 2245/// the line. This returns true if it is a conflict marker and false if not. 2246bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2247 // Only a conflict marker if it starts at the beginning of a line. 2248 if (CurPtr != BufferStart && 2249 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2250 return false; 2251 2252 // If we have a situation where we don't care about conflict markers, ignore 2253 // it. 2254 if (!CurrentConflictMarkerState || isLexingRawMode()) 2255 return false; 2256 2257 // Check to see if we have the marker (4 characters in a row). 2258 for (unsigned i = 1; i != 4; ++i) 2259 if (CurPtr[i] != CurPtr[0]) 2260 return false; 2261 2262 // If we do have it, search for the end of the conflict marker. This could 2263 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2264 // be the end of conflict marker. 2265 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 2266 CurrentConflictMarkerState)) { 2267 CurPtr = End; 2268 2269 // Skip ahead to the end of line. 2270 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2271 ++CurPtr; 2272 2273 BufferPtr = CurPtr; 2274 2275 // No longer in the conflict marker. 
2276 CurrentConflictMarkerState = CMK_None; 2277 return true; 2278 } 2279 2280 return false; 2281} 2282 2283bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 2284 if (PP && PP->isCodeCompletionEnabled()) { 2285 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 2286 return Loc == PP->getCodeCompletionLoc(); 2287 } 2288 2289 return false; 2290} 2291 2292 2293/// LexTokenInternal - This implements a simple C family lexer. It is an 2294/// extremely performance critical piece of code. This assumes that the buffer 2295/// has a null character at the end of the file. This returns a preprocessing 2296/// token, not a normal token, as such, it is an internal interface. It assumes 2297/// that the Flags of result have been cleared before calling this. 2298void Lexer::LexTokenInternal(Token &Result) { 2299LexNextToken: 2300 // New token, can't need cleaning yet. 2301 Result.clearFlag(Token::NeedsCleaning); 2302 Result.setIdentifierInfo(0); 2303 2304 // CurPtr - Cache BufferPtr in an automatic variable. 2305 const char *CurPtr = BufferPtr; 2306 2307 // Small amounts of horizontal whitespace is very common between tokens. 2308 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 2309 ++CurPtr; 2310 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 2311 ++CurPtr; 2312 2313 // If we are keeping whitespace and other tokens, just return what we just 2314 // skipped. The next lexer invocation will return the token after the 2315 // whitespace. 2316 if (isKeepWhitespaceMode()) { 2317 FormTokenWithChars(Result, CurPtr, tok::unknown); 2318 return; 2319 } 2320 2321 BufferPtr = CurPtr; 2322 Result.setFlag(Token::LeadingSpace); 2323 } 2324 2325 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 2326 2327 // Read a character, advancing over it. 2328 char Char = getAndAdvanceChar(CurPtr, Result); 2329 tok::TokenKind Kind; 2330 2331 switch (Char) { 2332 case 0: // Null. 2333 // Found end of file? 
2334 if (CurPtr-1 == BufferEnd) { 2335 // Read the PP instance variable into an automatic variable, because 2336 // LexEndOfFile will often delete 'this'. 2337 Preprocessor *PPCache = PP; 2338 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 2339 return; // Got a token to return. 2340 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 2341 return PPCache->Lex(Result); 2342 } 2343 2344 // Check if we are performing code completion. 2345 if (isCodeCompletionPoint(CurPtr-1)) { 2346 // Return the code-completion token. 2347 Result.startToken(); 2348 FormTokenWithChars(Result, CurPtr, tok::code_completion); 2349 return; 2350 } 2351 2352 if (!isLexingRawMode()) 2353 Diag(CurPtr-1, diag::null_in_file); 2354 Result.setFlag(Token::LeadingSpace); 2355 if (SkipWhitespace(Result, CurPtr)) 2356 return; // KeepWhitespaceMode 2357 2358 goto LexNextToken; // GCC isn't tail call eliminating. 2359 2360 case 26: // DOS & CP/M EOF: "^Z". 2361 // If we're in Microsoft extensions mode, treat this as end of file. 2362 if (Features.MicrosoftExt) { 2363 // Read the PP instance variable into an automatic variable, because 2364 // LexEndOfFile will often delete 'this'. 2365 Preprocessor *PPCache = PP; 2366 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 2367 return; // Got a token to return. 2368 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 2369 return PPCache->Lex(Result); 2370 } 2371 // If Microsoft extensions are disabled, this is just random garbage. 2372 Kind = tok::unknown; 2373 break; 2374 2375 case '\n': 2376 case '\r': 2377 // If we are inside a preprocessor directive and we see the end of line, 2378 // we know we are done with the directive, so return an EOD token. 2379 if (ParsingPreprocessorDirective) { 2380 // Done parsing the "line". 2381 ParsingPreprocessorDirective = false; 2382 2383 // Restore comment saving mode, in case it was disabled for directive. 
2384 SetCommentRetentionState(PP->getCommentRetentionState()); 2385 2386 // Since we consumed a newline, we are back at the start of a line. 2387 IsAtStartOfLine = true; 2388 2389 Kind = tok::eod; 2390 break; 2391 } 2392 // The returned token is at the start of the line. 2393 Result.setFlag(Token::StartOfLine); 2394 // No leading whitespace seen so far. 2395 Result.clearFlag(Token::LeadingSpace); 2396 2397 if (SkipWhitespace(Result, CurPtr)) 2398 return; // KeepWhitespaceMode 2399 goto LexNextToken; // GCC isn't tail call eliminating. 2400 case ' ': 2401 case '\t': 2402 case '\f': 2403 case '\v': 2404 SkipHorizontalWhitespace: 2405 Result.setFlag(Token::LeadingSpace); 2406 if (SkipWhitespace(Result, CurPtr)) 2407 return; // KeepWhitespaceMode 2408 2409 SkipIgnoredUnits: 2410 CurPtr = BufferPtr; 2411 2412 // If the next token is obviously a // or /* */ comment, skip it efficiently 2413 // too (without going through the big switch stmt). 2414 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 2415 Features.BCPLComment && !Features.TraditionalCPP) { 2416 if (SkipBCPLComment(Result, CurPtr+2)) 2417 return; // There is a token to return. 2418 goto SkipIgnoredUnits; 2419 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 2420 if (SkipBlockComment(Result, CurPtr+2)) 2421 return; // There is a token to return. 2422 goto SkipIgnoredUnits; 2423 } else if (isHorizontalWhitespace(*CurPtr)) { 2424 goto SkipHorizontalWhitespace; 2425 } 2426 goto LexNextToken; // GCC isn't tail call eliminating. 2427 2428 // C99 6.4.4.1: Integer Constants. 2429 // C99 6.4.4.2: Floating Constants. 2430 case '0': case '1': case '2': case '3': case '4': 2431 case '5': case '6': case '7': case '8': case '9': 2432 // Notify MIOpt that we read a non-whitespace/non-comment token. 
2433 MIOpt.ReadToken(); 2434 return LexNumericConstant(Result, CurPtr); 2435 2436 case 'u': // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal 2437 // Notify MIOpt that we read a non-whitespace/non-comment token. 2438 MIOpt.ReadToken(); 2439 2440 if (Features.CPlusPlus0x) { 2441 Char = getCharAndSize(CurPtr, SizeTmp); 2442 2443 // UTF-16 string literal 2444 if (Char == '"') 2445 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2446 tok::utf16_string_literal); 2447 2448 // UTF-16 character constant 2449 if (Char == '\'') 2450 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2451 tok::utf16_char_constant); 2452 2453 // UTF-16 raw string literal 2454 if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2455 return LexRawStringLiteral(Result, 2456 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2457 SizeTmp2, Result), 2458 tok::utf16_string_literal); 2459 2460 if (Char == '8') { 2461 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 2462 2463 // UTF-8 string literal 2464 if (Char2 == '"') 2465 return LexStringLiteral(Result, 2466 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2467 SizeTmp2, Result), 2468 tok::utf8_string_literal); 2469 2470 if (Char2 == 'R') { 2471 unsigned SizeTmp3; 2472 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 2473 // UTF-8 raw string literal 2474 if (Char3 == '"') { 2475 return LexRawStringLiteral(Result, 2476 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2477 SizeTmp2, Result), 2478 SizeTmp3, Result), 2479 tok::utf8_string_literal); 2480 } 2481 } 2482 } 2483 } 2484 2485 // treat u like the start of an identifier. 2486 return LexIdentifier(Result, CurPtr); 2487 2488 case 'U': // Identifier (Uber) or C++0x UTF-32 string literal 2489 // Notify MIOpt that we read a non-whitespace/non-comment token. 
2490 MIOpt.ReadToken(); 2491 2492 if (Features.CPlusPlus0x) { 2493 Char = getCharAndSize(CurPtr, SizeTmp); 2494 2495 // UTF-32 string literal 2496 if (Char == '"') 2497 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2498 tok::utf32_string_literal); 2499 2500 // UTF-32 character constant 2501 if (Char == '\'') 2502 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2503 tok::utf32_char_constant); 2504 2505 // UTF-32 raw string literal 2506 if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2507 return LexRawStringLiteral(Result, 2508 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2509 SizeTmp2, Result), 2510 tok::utf32_string_literal); 2511 } 2512 2513 // treat U like the start of an identifier. 2514 return LexIdentifier(Result, CurPtr); 2515 2516 case 'R': // Identifier or C++0x raw string literal 2517 // Notify MIOpt that we read a non-whitespace/non-comment token. 2518 MIOpt.ReadToken(); 2519 2520 if (Features.CPlusPlus0x) { 2521 Char = getCharAndSize(CurPtr, SizeTmp); 2522 2523 if (Char == '"') 2524 return LexRawStringLiteral(Result, 2525 ConsumeChar(CurPtr, SizeTmp, Result), 2526 tok::string_literal); 2527 } 2528 2529 // treat R like the start of an identifier. 2530 return LexIdentifier(Result, CurPtr); 2531 2532 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 2533 // Notify MIOpt that we read a non-whitespace/non-comment token. 2534 MIOpt.ReadToken(); 2535 Char = getCharAndSize(CurPtr, SizeTmp); 2536 2537 // Wide string literal. 2538 if (Char == '"') 2539 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2540 tok::wide_string_literal); 2541 2542 // Wide raw string literal. 2543 if (Features.CPlusPlus0x && Char == 'R' && 2544 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2545 return LexRawStringLiteral(Result, 2546 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2547 SizeTmp2, Result), 2548 tok::wide_string_literal); 2549 2550 // Wide character constant. 
2551 if (Char == '\'') 2552 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2553 tok::wide_char_constant); 2554 // FALL THROUGH, treating L like the start of an identifier. 2555 2556 // C99 6.4.2: Identifiers. 2557 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 2558 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 2559 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 2560 case 'V': case 'W': case 'X': case 'Y': case 'Z': 2561 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 2562 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 2563 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 2564 case 'v': case 'w': case 'x': case 'y': case 'z': 2565 case '_': 2566 // Notify MIOpt that we read a non-whitespace/non-comment token. 2567 MIOpt.ReadToken(); 2568 return LexIdentifier(Result, CurPtr); 2569 2570 case '$': // $ in identifiers. 2571 if (Features.DollarIdents) { 2572 if (!isLexingRawMode()) 2573 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 2574 // Notify MIOpt that we read a non-whitespace/non-comment token. 2575 MIOpt.ReadToken(); 2576 return LexIdentifier(Result, CurPtr); 2577 } 2578 2579 Kind = tok::unknown; 2580 break; 2581 2582 // C99 6.4.4: Character Constants. 2583 case '\'': 2584 // Notify MIOpt that we read a non-whitespace/non-comment token. 2585 MIOpt.ReadToken(); 2586 return LexCharConstant(Result, CurPtr, tok::char_constant); 2587 2588 // C99 6.4.5: String Literals. 2589 case '"': 2590 // Notify MIOpt that we read a non-whitespace/non-comment token. 2591 MIOpt.ReadToken(); 2592 return LexStringLiteral(Result, CurPtr, tok::string_literal); 2593 2594 // C99 6.4.6: Punctuators. 
2595 case '?': 2596 Kind = tok::question; 2597 break; 2598 case '[': 2599 Kind = tok::l_square; 2600 break; 2601 case ']': 2602 Kind = tok::r_square; 2603 break; 2604 case '(': 2605 Kind = tok::l_paren; 2606 break; 2607 case ')': 2608 Kind = tok::r_paren; 2609 break; 2610 case '{': 2611 Kind = tok::l_brace; 2612 break; 2613 case '}': 2614 Kind = tok::r_brace; 2615 break; 2616 case '.': 2617 Char = getCharAndSize(CurPtr, SizeTmp); 2618 if (Char >= '0' && Char <= '9') { 2619 // Notify MIOpt that we read a non-whitespace/non-comment token. 2620 MIOpt.ReadToken(); 2621 2622 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 2623 } else if (Features.CPlusPlus && Char == '*') { 2624 Kind = tok::periodstar; 2625 CurPtr += SizeTmp; 2626 } else if (Char == '.' && 2627 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 2628 Kind = tok::ellipsis; 2629 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2630 SizeTmp2, Result); 2631 } else { 2632 Kind = tok::period; 2633 } 2634 break; 2635 case '&': 2636 Char = getCharAndSize(CurPtr, SizeTmp); 2637 if (Char == '&') { 2638 Kind = tok::ampamp; 2639 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2640 } else if (Char == '=') { 2641 Kind = tok::ampequal; 2642 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2643 } else { 2644 Kind = tok::amp; 2645 } 2646 break; 2647 case '*': 2648 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2649 Kind = tok::starequal; 2650 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2651 } else { 2652 Kind = tok::star; 2653 } 2654 break; 2655 case '+': 2656 Char = getCharAndSize(CurPtr, SizeTmp); 2657 if (Char == '+') { 2658 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2659 Kind = tok::plusplus; 2660 } else if (Char == '=') { 2661 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2662 Kind = tok::plusequal; 2663 } else { 2664 Kind = tok::plus; 2665 } 2666 break; 2667 case '-': 2668 Char = getCharAndSize(CurPtr, SizeTmp); 2669 if (Char == '-') { // -- 2670 CurPtr = ConsumeChar(CurPtr, 
SizeTmp, Result); 2671 Kind = tok::minusminus; 2672 } else if (Char == '>' && Features.CPlusPlus && 2673 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 2674 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2675 SizeTmp2, Result); 2676 Kind = tok::arrowstar; 2677 } else if (Char == '>') { // -> 2678 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2679 Kind = tok::arrow; 2680 } else if (Char == '=') { // -= 2681 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2682 Kind = tok::minusequal; 2683 } else { 2684 Kind = tok::minus; 2685 } 2686 break; 2687 case '~': 2688 Kind = tok::tilde; 2689 break; 2690 case '!': 2691 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2692 Kind = tok::exclaimequal; 2693 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2694 } else { 2695 Kind = tok::exclaim; 2696 } 2697 break; 2698 case '/': 2699 // 6.4.9: Comments 2700 Char = getCharAndSize(CurPtr, SizeTmp); 2701 if (Char == '/') { // BCPL comment. 2702 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 2703 // want to lex this as a comment. There is one problem with this though, 2704 // that in one particular corner case, this can change the behavior of the 2705 // resultant program. For example, In "foo //**/ bar", C89 would lex 2706 // this as "foo / bar" and langauges with BCPL comments would lex it as 2707 // "foo". Check to see if the character after the second slash is a '*'. 2708 // If so, we will lex that as a "/" instead of the start of a comment. 2709 // However, we never do this in -traditional-cpp mode. 2710 if ((Features.BCPLComment || 2711 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') && 2712 !Features.TraditionalCPP) { 2713 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2714 return; // There is a token to return. 2715 2716 // It is common for the tokens immediately after a // comment to be 2717 // whitespace (indentation for the next line). Instead of going through 2718 // the big switch, handle it efficiently now. 
2719 goto SkipIgnoredUnits; 2720 } 2721 } 2722 2723 if (Char == '*') { // /**/ comment. 2724 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2725 return; // There is a token to return. 2726 goto LexNextToken; // GCC isn't tail call eliminating. 2727 } 2728 2729 if (Char == '=') { 2730 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2731 Kind = tok::slashequal; 2732 } else { 2733 Kind = tok::slash; 2734 } 2735 break; 2736 case '%': 2737 Char = getCharAndSize(CurPtr, SizeTmp); 2738 if (Char == '=') { 2739 Kind = tok::percentequal; 2740 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2741 } else if (Features.Digraphs && Char == '>') { 2742 Kind = tok::r_brace; // '%>' -> '}' 2743 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2744 } else if (Features.Digraphs && Char == ':') { 2745 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2746 Char = getCharAndSize(CurPtr, SizeTmp); 2747 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 2748 Kind = tok::hashhash; // '%:%:' -> '##' 2749 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2750 SizeTmp2, Result); 2751 } else if (Char == '@' && Features.MicrosoftExt) {// %:@ -> #@ -> Charize 2752 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2753 if (!isLexingRawMode()) 2754 Diag(BufferPtr, diag::ext_charize_microsoft); 2755 Kind = tok::hashat; 2756 } else { // '%:' -> '#' 2757 // We parsed a # character. If this occurs at the start of the line, 2758 // it's actually the start of a preprocessing directive. Callback to 2759 // the preprocessor to handle it. 2760 // FIXME: -fpreprocessed mode?? 2761 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 2762 FormTokenWithChars(Result, CurPtr, tok::hash); 2763 PP->HandleDirective(Result); 2764 2765 // As an optimization, if the preprocessor didn't switch lexers, tail 2766 // recurse. 2767 if (PP->isCurrentLexer(this)) { 2768 // Start a new token. 
If this is a #include or something, the PP may 2769 // want us starting at the beginning of the line again. If so, set 2770 // the StartOfLine flag and clear LeadingSpace. 2771 if (IsAtStartOfLine) { 2772 Result.setFlag(Token::StartOfLine); 2773 Result.clearFlag(Token::LeadingSpace); 2774 IsAtStartOfLine = false; 2775 } 2776 goto LexNextToken; // GCC isn't tail call eliminating. 2777 } 2778 2779 return PP->Lex(Result); 2780 } 2781 2782 Kind = tok::hash; 2783 } 2784 } else { 2785 Kind = tok::percent; 2786 } 2787 break; 2788 case '<': 2789 Char = getCharAndSize(CurPtr, SizeTmp); 2790 if (ParsingFilename) { 2791 return LexAngledStringLiteral(Result, CurPtr); 2792 } else if (Char == '<') { 2793 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 2794 if (After == '=') { 2795 Kind = tok::lesslessequal; 2796 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2797 SizeTmp2, Result); 2798 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 2799 // If this is actually a '<<<<<<<' version control conflict marker, 2800 // recognize it as such and recover nicely. 2801 goto LexNextToken; 2802 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 2803 // If this is '<<<<' and we're in a Perforce-style conflict marker, 2804 // ignore it. 
2805 goto LexNextToken; 2806 } else if (Features.CUDA && After == '<') { 2807 Kind = tok::lesslessless; 2808 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2809 SizeTmp2, Result); 2810 } else { 2811 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2812 Kind = tok::lessless; 2813 } 2814 } else if (Char == '=') { 2815 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2816 Kind = tok::lessequal; 2817 } else if (Features.Digraphs && Char == ':') { // '<:' -> '[' 2818 if (Features.CPlusPlus0x && 2819 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 2820 // C++0x [lex.pptoken]p3: 2821 // Otherwise, if the next three characters are <:: and the subsequent 2822 // character is neither : nor >, the < is treated as a preprocessor 2823 // token by itself and not as the first character of the alternative 2824 // token <:. 2825 unsigned SizeTmp3; 2826 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 2827 if (After != ':' && After != '>') { 2828 Kind = tok::less; 2829 if (!isLexingRawMode()) 2830 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 2831 break; 2832 } 2833 } 2834 2835 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2836 Kind = tok::l_square; 2837 } else if (Features.Digraphs && Char == '%') { // '<%' -> '{' 2838 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2839 Kind = tok::l_brace; 2840 } else { 2841 Kind = tok::less; 2842 } 2843 break; 2844 case '>': 2845 Char = getCharAndSize(CurPtr, SizeTmp); 2846 if (Char == '=') { 2847 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2848 Kind = tok::greaterequal; 2849 } else if (Char == '>') { 2850 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 2851 if (After == '=') { 2852 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2853 SizeTmp2, Result); 2854 Kind = tok::greatergreaterequal; 2855 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 2856 // If this is actually a '>>>>' conflict marker, recognize it as such 2857 // and recover nicely. 
2858 goto LexNextToken; 2859 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 2860 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 2861 goto LexNextToken; 2862 } else if (Features.CUDA && After == '>') { 2863 Kind = tok::greatergreatergreater; 2864 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2865 SizeTmp2, Result); 2866 } else { 2867 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2868 Kind = tok::greatergreater; 2869 } 2870 2871 } else { 2872 Kind = tok::greater; 2873 } 2874 break; 2875 case '^': 2876 Char = getCharAndSize(CurPtr, SizeTmp); 2877 if (Char == '=') { 2878 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2879 Kind = tok::caretequal; 2880 } else { 2881 Kind = tok::caret; 2882 } 2883 break; 2884 case '|': 2885 Char = getCharAndSize(CurPtr, SizeTmp); 2886 if (Char == '=') { 2887 Kind = tok::pipeequal; 2888 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2889 } else if (Char == '|') { 2890 // If this is '|||||||' and we're in a conflict marker, ignore it. 2891 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 2892 goto LexNextToken; 2893 Kind = tok::pipepipe; 2894 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2895 } else { 2896 Kind = tok::pipe; 2897 } 2898 break; 2899 case ':': 2900 Char = getCharAndSize(CurPtr, SizeTmp); 2901 if (Features.Digraphs && Char == '>') { 2902 Kind = tok::r_square; // ':>' -> ']' 2903 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2904 } else if (Features.CPlusPlus && Char == ':') { 2905 Kind = tok::coloncolon; 2906 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2907 } else { 2908 Kind = tok::colon; 2909 } 2910 break; 2911 case ';': 2912 Kind = tok::semi; 2913 break; 2914 case '=': 2915 Char = getCharAndSize(CurPtr, SizeTmp); 2916 if (Char == '=') { 2917 // If this is '====' and we're in a conflict marker, ignore it. 
2918 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 2919 goto LexNextToken; 2920 2921 Kind = tok::equalequal; 2922 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2923 } else { 2924 Kind = tok::equal; 2925 } 2926 break; 2927 case ',': 2928 Kind = tok::comma; 2929 break; 2930 case '#': 2931 Char = getCharAndSize(CurPtr, SizeTmp); 2932 if (Char == '#') { 2933 Kind = tok::hashhash; 2934 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2935 } else if (Char == '@' && Features.MicrosoftExt) { // #@ -> Charize 2936 Kind = tok::hashat; 2937 if (!isLexingRawMode()) 2938 Diag(BufferPtr, diag::ext_charize_microsoft); 2939 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2940 } else { 2941 // We parsed a # character. If this occurs at the start of the line, 2942 // it's actually the start of a preprocessing directive. Callback to 2943 // the preprocessor to handle it. 2944 // FIXME: -fpreprocessed mode?? 2945 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 2946 FormTokenWithChars(Result, CurPtr, tok::hash); 2947 PP->HandleDirective(Result); 2948 2949 // As an optimization, if the preprocessor didn't switch lexers, tail 2950 // recurse. 2951 if (PP->isCurrentLexer(this)) { 2952 // Start a new token. If this is a #include or something, the PP may 2953 // want us starting at the beginning of the line again. If so, set 2954 // the StartOfLine flag and clear LeadingSpace. 2955 if (IsAtStartOfLine) { 2956 Result.setFlag(Token::StartOfLine); 2957 Result.clearFlag(Token::LeadingSpace); 2958 IsAtStartOfLine = false; 2959 } 2960 goto LexNextToken; // GCC isn't tail call eliminating. 2961 } 2962 return PP->Lex(Result); 2963 } 2964 2965 Kind = tok::hash; 2966 } 2967 break; 2968 2969 case '@': 2970 // Objective C support. 2971 if (CurPtr[-1] == '@' && Features.ObjC1) 2972 Kind = tok::at; 2973 else 2974 Kind = tok::unknown; 2975 break; 2976 2977 case '\\': 2978 // FIXME: UCN's. 2979 // FALL THROUGH. 
2980 default: 2981 Kind = tok::unknown; 2982 break; 2983 } 2984 2985 // Notify MIOpt that we read a non-whitespace/non-comment token. 2986 MIOpt.ReadToken(); 2987 2988 // Update the location of token as well as BufferPtr. 2989 FormTokenWithChars(Result, CurPtr, Kind); 2990} 2991