Lexer.cpp revision 5cc2c6eb67b6e5361bbe96f79b519fd62ec666d6
//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
//  This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//
//
// TODO: GCC Diagnostics emitted by the lexer:
// PEDWARN: (form feed|vertical tab) in preprocessing directive
//
// Universal characters, unicode, char mapping:
// WARNING: `%.*s' is not in NFKC
// WARNING: `%.*s' is not in NFC
//
// Other:
// TODO: Options to support:
//    -fexec-charset,-fwide-exec-charset
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Lexer.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Lex/LexDiagnostic.h"
#include "clang/Lex/CodeCompletionHandler.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cstring>
using namespace clang;

// Forward declaration; called at the top of Lexer::InitLexer below.
static void InitCharacterInfo();

//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if this token has an IdentifierInfo whose
/// ObjC keyword ID equals \p objcKey.  Tokens without an IdentifierInfo never
/// match.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  if (IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind of this token's
/// identifier, or tok::objc_not_keyword if the token has no IdentifierInfo.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}


//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

// Intentionally empty.
// NOTE(review): presumably the out-of-line "anchor" method that pins the
// class's vtable to this file -- confirm against the declaration in Lexer.h.
void Lexer::anchor() { }

/// InitLexer - Shared initialization used by all of the constructors: record
/// the buffer bounds, skip a leading UTF-8 BOM, and reset the lexer state
/// flags to their defaults.
///
/// \param BufStart first byte of the buffer being lexed.
/// \param BufPtr   position at which lexing begins.
/// \param BufEnd   one past the last byte; must point at a nul character.
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  InitCharacterInfo();

  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  // Check whether we have a BOM in the beginning of the buffer. If yes - act
  // accordingly. Right now we support only UTF-8 with and without BOM, so, just
  // skip the UTF-8 BOM if it's present.  This is only done when lexing starts
  // at the very beginning of the buffer.
  if (BufferStart == BufferPtr) {
    // Determine the size of the BOM.
    StringRef Buf(BufferStart, BufferEnd - BufferStart);
    size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
      .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
      .Default(0);

    // Skip the BOM.
    BufferPtr += BOMLength;
  }

  Is_PragmaLexer = false;
  CurrentConflictMarkerState = CMK_None;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;
}

/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process.
/// This lexer assumes that the associated file buffer and Preprocessor
/// objects will outlive it, so it doesn't take ownership of either of them.
/// Lexing starts at the beginning of \p InputFile, and the comment retention
/// state is taken from \p PP.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP)
  : PreprocessorLexer(&PP, FID),
    FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
    Features(PP.getLangOptions()) {

  InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(),
            InputFile->getBufferEnd());

  // Default to keeping comments if the preprocessor wants them.
  SetCommentRetentionState(PP.getCommentRetentionState());
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
///
/// Lexing covers [BufStart, BufEnd) and begins at \p BufPtr; \p fileloc is
/// stored as the location of the start of the buffer.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
             const char *BufStart, const char *BufPtr, const char *BufEnd)
  : FileLoc(fileloc), Features(features) {

  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
///
/// This overload lexes all of \p FromFile, starting at its first byte.
Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile,
             const SourceManager &SM, const LangOptions &features)
  : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {

  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
            FromFile->getBufferEnd());

  // We *are* in raw mode.
  LexingRawMode = true;
}

/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
/// _Pragma expansion.  This has a variety of magic semantics that this method
/// sets up.  It returns a new'd Lexer that must be delete'd when done.
159/// 160/// On entrance to this routine, TokStartLoc is a macro location which has a 161/// spelling loc that indicates the bytes to be lexed for the token and an 162/// expansion location that indicates where all lexed tokens should be 163/// "expanded from". 164/// 165/// FIXME: It would really be nice to make _Pragma just be a wrapper around a 166/// normal lexer that remaps tokens as they fly by. This would require making 167/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 168/// interface that could handle this stuff. This would pull GetMappedTokenLoc 169/// out of the critical path of the lexer! 170/// 171Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 172 SourceLocation ExpansionLocStart, 173 SourceLocation ExpansionLocEnd, 174 unsigned TokLen, Preprocessor &PP) { 175 SourceManager &SM = PP.getSourceManager(); 176 177 // Create the lexer as if we were going to lex the file normally. 178 FileID SpellingFID = SM.getFileID(SpellingLoc); 179 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 180 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 181 182 // Now that the lexer is created, change the start/end locations so that we 183 // just lex the subsection of the file that we want. This is lexing from a 184 // scratch buffer. 185 const char *StrData = SM.getCharacterData(SpellingLoc); 186 187 L->BufferPtr = StrData; 188 L->BufferEnd = StrData+TokLen; 189 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 190 191 // Set the SourceLocation with the remapping information. This ensures that 192 // GetMappedTokenLoc will remap the tokens as they are lexed. 193 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 194 ExpansionLocStart, 195 ExpansionLocEnd, TokLen); 196 197 // Ensure that the lexer thinks it is inside a directive, so that end \n will 198 // return an EOD token. 199 L->ParsingPreprocessorDirective = true; 200 201 // This lexer really is for _Pragma. 
202 L->Is_PragmaLexer = true; 203 return L; 204} 205 206 207/// Stringify - Convert the specified string into a C string, with surrounding 208/// ""'s, and with escaped \ and " characters. 209std::string Lexer::Stringify(const std::string &Str, bool Charify) { 210 std::string Result = Str; 211 char Quote = Charify ? '\'' : '"'; 212 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 213 if (Result[i] == '\\' || Result[i] == Quote) { 214 Result.insert(Result.begin()+i, '\\'); 215 ++i; ++e; 216 } 217 } 218 return Result; 219} 220 221/// Stringify - Convert the specified string into a C string by escaping '\' 222/// and " characters. This does not add surrounding ""'s to the string. 223void Lexer::Stringify(SmallVectorImpl<char> &Str) { 224 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 225 if (Str[i] == '\\' || Str[i] == '"') { 226 Str.insert(Str.begin()+i, '\\'); 227 ++i; ++e; 228 } 229 } 230} 231 232//===----------------------------------------------------------------------===// 233// Token Spelling 234//===----------------------------------------------------------------------===// 235 236/// getSpelling() - Return the 'spelling' of this token. The spelling of a 237/// token are the characters used to represent the token in the source file 238/// after trigraph expansion and escaped-newline folding. In particular, this 239/// wants to get the true, uncanonicalized, spelling of things like digraphs 240/// UCNs, etc. 241StringRef Lexer::getSpelling(SourceLocation loc, 242 SmallVectorImpl<char> &buffer, 243 const SourceManager &SM, 244 const LangOptions &options, 245 bool *invalid) { 246 // Break down the source location. 247 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 248 249 // Try to the load the file buffer. 
250 bool invalidTemp = false; 251 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 252 if (invalidTemp) { 253 if (invalid) *invalid = true; 254 return StringRef(); 255 } 256 257 const char *tokenBegin = file.data() + locInfo.second; 258 259 // Lex from the start of the given location. 260 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 261 file.begin(), tokenBegin, file.end()); 262 Token token; 263 lexer.LexFromRawLexer(token); 264 265 unsigned length = token.getLength(); 266 267 // Common case: no need for cleaning. 268 if (!token.needsCleaning()) 269 return StringRef(tokenBegin, length); 270 271 // Hard case, we need to relex the characters into the string. 272 buffer.clear(); 273 buffer.reserve(length); 274 275 for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) { 276 unsigned charSize; 277 buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options)); 278 ti += charSize; 279 } 280 281 return StringRef(buffer.data(), buffer.size()); 282} 283 284/// getSpelling() - Return the 'spelling' of this token. The spelling of a 285/// token are the characters used to represent the token in the source file 286/// after trigraph expansion and escaped-newline folding. In particular, this 287/// wants to get the true, uncanonicalized, spelling of things like digraphs 288/// UCNs, etc. 289std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 290 const LangOptions &Features, bool *Invalid) { 291 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 292 293 // If this token contains nothing interesting, return it directly. 
294 bool CharDataInvalid = false; 295 const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 296 &CharDataInvalid); 297 if (Invalid) 298 *Invalid = CharDataInvalid; 299 if (CharDataInvalid) 300 return std::string(); 301 302 if (!Tok.needsCleaning()) 303 return std::string(TokStart, TokStart+Tok.getLength()); 304 305 std::string Result; 306 Result.reserve(Tok.getLength()); 307 308 // Otherwise, hard case, relex the characters into the string. 309 for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); 310 Ptr != End; ) { 311 unsigned CharSize; 312 Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features)); 313 Ptr += CharSize; 314 } 315 assert(Result.size() != unsigned(Tok.getLength()) && 316 "NeedsCleaning flag set on something that didn't need cleaning!"); 317 return Result; 318} 319 320/// getSpelling - This method is used to get the spelling of a token into a 321/// preallocated buffer, instead of as an std::string. The caller is required 322/// to allocate enough space for the token, which is guaranteed to be at least 323/// Tok.getLength() bytes long. The actual length of the token is returned. 324/// 325/// Note that this method may do two possible things: it may either fill in 326/// the buffer specified with characters, or it may *change the input pointer* 327/// to point to a constant buffer with the data already in it (avoiding a 328/// copy). The caller is not allowed to modify the returned buffer pointer 329/// if an internal buffer is returned. 330unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 331 const SourceManager &SourceMgr, 332 const LangOptions &Features, bool *Invalid) { 333 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 334 335 const char *TokStart = 0; 336 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 
337 if (Tok.is(tok::raw_identifier)) 338 TokStart = Tok.getRawIdentifierData(); 339 else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 340 // Just return the string from the identifier table, which is very quick. 341 Buffer = II->getNameStart(); 342 return II->getLength(); 343 } 344 345 // NOTE: this can be checked even after testing for an IdentifierInfo. 346 if (Tok.isLiteral()) 347 TokStart = Tok.getLiteralData(); 348 349 if (TokStart == 0) { 350 // Compute the start of the token in the input lexer buffer. 351 bool CharDataInvalid = false; 352 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 353 if (Invalid) 354 *Invalid = CharDataInvalid; 355 if (CharDataInvalid) { 356 Buffer = ""; 357 return 0; 358 } 359 } 360 361 // If this token contains nothing interesting, return it directly. 362 if (!Tok.needsCleaning()) { 363 Buffer = TokStart; 364 return Tok.getLength(); 365 } 366 367 // Otherwise, hard case, relex the characters into the string. 368 char *OutBuf = const_cast<char*>(Buffer); 369 for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); 370 Ptr != End; ) { 371 unsigned CharSize; 372 *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features); 373 Ptr += CharSize; 374 } 375 assert(unsigned(OutBuf-Buffer) != Tok.getLength() && 376 "NeedsCleaning flag set on something that didn't need cleaning!"); 377 378 return OutBuf-Buffer; 379} 380 381 382 383static bool isWhitespace(unsigned char c); 384 385/// MeasureTokenLength - Relex the token at the specified location and return 386/// its length in bytes in the input file. If the token needs cleaning (e.g. 387/// includes a trigraph or an escaped newline) then this count includes bytes 388/// that are part of that. 389unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 390 const SourceManager &SM, 391 const LangOptions &LangOpts) { 392 // TODO: this could be special cased for common tokens like identifiers, ')', 393 // etc to make this faster, if it mattered. 
Just look at StrData[0] to handle 394 // all obviously single-char tokens. This could use 395 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 396 // something. 397 398 // If this comes from a macro expansion, we really do want the macro name, not 399 // the token this macro expanded to. 400 Loc = SM.getExpansionLoc(Loc); 401 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 402 bool Invalid = false; 403 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 404 if (Invalid) 405 return 0; 406 407 const char *StrData = Buffer.data()+LocInfo.second; 408 409 if (isWhitespace(StrData[0])) 410 return 0; 411 412 // Create a lexer starting at the beginning of this token. 413 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 414 Buffer.begin(), StrData, Buffer.end()); 415 TheLexer.SetCommentRetentionState(true); 416 Token TheTok; 417 TheLexer.LexFromRawLexer(TheTok); 418 return TheTok.getLength(); 419} 420 421static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 422 const SourceManager &SM, 423 const LangOptions &LangOpts) { 424 assert(Loc.isFileID()); 425 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 426 if (LocInfo.first.isInvalid()) 427 return Loc; 428 429 bool Invalid = false; 430 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 431 if (Invalid) 432 return Loc; 433 434 // Back up from the current location until we hit the beginning of a line 435 // (or the buffer). We'll relex from that point. 436 const char *BufStart = Buffer.data(); 437 if (LocInfo.second >= Buffer.size()) 438 return Loc; 439 440 const char *StrData = BufStart+LocInfo.second; 441 if (StrData[0] == '\n' || StrData[0] == '\r') 442 return Loc; 443 444 const char *LexStart = StrData; 445 while (LexStart != BufStart) { 446 if (LexStart[0] == '\n' || LexStart[0] == '\r') { 447 ++LexStart; 448 break; 449 } 450 451 --LexStart; 452 } 453 454 // Create a lexer starting at the beginning of this token. 
455 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 456 Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end()); 457 TheLexer.SetCommentRetentionState(true); 458 459 // Lex tokens until we find the token that contains the source location. 460 Token TheTok; 461 do { 462 TheLexer.LexFromRawLexer(TheTok); 463 464 if (TheLexer.getBufferLocation() > StrData) { 465 // Lexing this token has taken the lexer past the source location we're 466 // looking for. If the current token encompasses our source location, 467 // return the beginning of that token. 468 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 469 return TheTok.getLocation(); 470 471 // We ended up skipping over the source location entirely, which means 472 // that it points into whitespace. We're done here. 473 break; 474 } 475 } while (TheTok.getKind() != tok::eof); 476 477 // We've passed our source location; just return the original source location. 478 return Loc; 479} 480 481SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 482 const SourceManager &SM, 483 const LangOptions &LangOpts) { 484 if (Loc.isFileID()) 485 return getBeginningOfFileToken(Loc, SM, LangOpts); 486 487 if (!SM.isMacroArgExpansion(Loc)) 488 return Loc; 489 490 SourceLocation FileLoc = SM.getSpellingLoc(Loc); 491 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 492 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 493 std::pair<FileID, unsigned> BeginFileLocInfo 494 = SM.getDecomposedLoc(BeginFileLoc); 495 assert(FileLocInfo.first == BeginFileLocInfo.first && 496 FileLocInfo.second >= BeginFileLocInfo.second); 497 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 498} 499 500namespace { 501 enum PreambleDirectiveKind { 502 PDK_Skipped, 503 PDK_StartIf, 504 PDK_EndIf, 505 PDK_Unknown 506 }; 507} 508 509std::pair<unsigned, bool> 510Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, 511 const 
LangOptions &Features, unsigned MaxLines) { 512 // Create a lexer starting at the beginning of the file. Note that we use a 513 // "fake" file source location at offset 1 so that the lexer will track our 514 // position within the file. 515 const unsigned StartOffset = 1; 516 SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset); 517 Lexer TheLexer(StartLoc, Features, Buffer->getBufferStart(), 518 Buffer->getBufferStart(), Buffer->getBufferEnd()); 519 520 bool InPreprocessorDirective = false; 521 Token TheTok; 522 Token IfStartTok; 523 unsigned IfCount = 0; 524 525 unsigned MaxLineOffset = 0; 526 if (MaxLines) { 527 const char *CurPtr = Buffer->getBufferStart(); 528 unsigned CurLine = 0; 529 while (CurPtr != Buffer->getBufferEnd()) { 530 char ch = *CurPtr++; 531 if (ch == '\n') { 532 ++CurLine; 533 if (CurLine == MaxLines) 534 break; 535 } 536 } 537 if (CurPtr != Buffer->getBufferEnd()) 538 MaxLineOffset = CurPtr - Buffer->getBufferStart(); 539 } 540 541 do { 542 TheLexer.LexFromRawLexer(TheTok); 543 544 if (InPreprocessorDirective) { 545 // If we've hit the end of the file, we're done. 546 if (TheTok.getKind() == tok::eof) { 547 InPreprocessorDirective = false; 548 break; 549 } 550 551 // If we haven't hit the end of the preprocessor directive, skip this 552 // token. 553 if (!TheTok.isAtStartOfLine()) 554 continue; 555 556 // We've passed the end of the preprocessor directive, and will look 557 // at this token again below. 558 InPreprocessorDirective = false; 559 } 560 561 // Keep track of the # of lines in the preamble. 562 if (TheTok.isAtStartOfLine()) { 563 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 564 565 // If we were asked to limit the number of lines in the preamble, 566 // and we're about to exceed that limit, we're done. 567 if (MaxLineOffset && TokOffset >= MaxLineOffset) 568 break; 569 } 570 571 // Comments are okay; skip over them. 
572 if (TheTok.getKind() == tok::comment) 573 continue; 574 575 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 576 // This is the start of a preprocessor directive. 577 Token HashTok = TheTok; 578 InPreprocessorDirective = true; 579 580 // Figure out which directive this is. Since we're lexing raw tokens, 581 // we don't have an identifier table available. Instead, just look at 582 // the raw identifier to recognize and categorize preprocessor directives. 583 TheLexer.LexFromRawLexer(TheTok); 584 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 585 StringRef Keyword(TheTok.getRawIdentifierData(), 586 TheTok.getLength()); 587 PreambleDirectiveKind PDK 588 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 589 .Case("include", PDK_Skipped) 590 .Case("__include_macros", PDK_Skipped) 591 .Case("define", PDK_Skipped) 592 .Case("undef", PDK_Skipped) 593 .Case("line", PDK_Skipped) 594 .Case("error", PDK_Skipped) 595 .Case("pragma", PDK_Skipped) 596 .Case("import", PDK_Skipped) 597 .Case("include_next", PDK_Skipped) 598 .Case("warning", PDK_Skipped) 599 .Case("ident", PDK_Skipped) 600 .Case("sccs", PDK_Skipped) 601 .Case("assert", PDK_Skipped) 602 .Case("unassert", PDK_Skipped) 603 .Case("if", PDK_StartIf) 604 .Case("ifdef", PDK_StartIf) 605 .Case("ifndef", PDK_StartIf) 606 .Case("elif", PDK_Skipped) 607 .Case("else", PDK_Skipped) 608 .Case("endif", PDK_EndIf) 609 .Default(PDK_Unknown); 610 611 switch (PDK) { 612 case PDK_Skipped: 613 continue; 614 615 case PDK_StartIf: 616 if (IfCount == 0) 617 IfStartTok = HashTok; 618 619 ++IfCount; 620 continue; 621 622 case PDK_EndIf: 623 // Mismatched #endif. The preamble ends here. 624 if (IfCount == 0) 625 break; 626 627 --IfCount; 628 continue; 629 630 case PDK_Unknown: 631 // We don't know what this directive is; stop at the '#'. 
632 break; 633 } 634 } 635 636 // We only end up here if we didn't recognize the preprocessor 637 // directive or it was one that can't occur in the preamble at this 638 // point. Roll back the current token to the location of the '#'. 639 InPreprocessorDirective = false; 640 TheTok = HashTok; 641 } 642 643 // We hit a token that we don't recognize as being in the 644 // "preprocessing only" part of the file, so we're no longer in 645 // the preamble. 646 break; 647 } while (true); 648 649 SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation(); 650 return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(), 651 IfCount? IfStartTok.isAtStartOfLine() 652 : TheTok.isAtStartOfLine()); 653} 654 655 656/// AdvanceToTokenCharacter - Given a location that specifies the start of a 657/// token, return a new location that specifies a character within the token. 658SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, 659 unsigned CharNo, 660 const SourceManager &SM, 661 const LangOptions &Features) { 662 // Figure out how many physical characters away the specified expansion 663 // character is. This needs to take into consideration newlines and 664 // trigraphs. 665 bool Invalid = false; 666 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 667 668 // If they request the first char of the token, we're trivially done. 669 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 670 return TokStart; 671 672 unsigned PhysOffset = 0; 673 674 // The usual case is that tokens don't contain anything interesting. Skip 675 // over the uninteresting characters. If a token only consists of simple 676 // chars, this method is extremely fast. 
677 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 678 if (CharNo == 0) 679 return TokStart.getLocWithOffset(PhysOffset); 680 ++TokPtr, --CharNo, ++PhysOffset; 681 } 682 683 // If we have a character that may be a trigraph or escaped newline, use a 684 // lexer to parse it correctly. 685 for (; CharNo; --CharNo) { 686 unsigned Size; 687 Lexer::getCharAndSizeNoWarn(TokPtr, Size, Features); 688 TokPtr += Size; 689 PhysOffset += Size; 690 } 691 692 // Final detail: if we end up on an escaped newline, we want to return the 693 // location of the actual byte of the token. For example foo\<newline>bar 694 // advanced by 3 should return the location of b, not of \\. One compounding 695 // detail of this is that the escape may be made by a trigraph. 696 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 697 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 698 699 return TokStart.getLocWithOffset(PhysOffset); 700} 701 702/// \brief Computes the source location just past the end of the 703/// token at this source location. 704/// 705/// This routine can be used to produce a source location that 706/// points just past the end of the token referenced by \p Loc, and 707/// is generally used when a diagnostic needs to point just after a 708/// token where it expected something different that it received. If 709/// the returned source location would not be meaningful (e.g., if 710/// it points into a macro), this routine returns an invalid 711/// source location. 712/// 713/// \param Offset an offset from the end of the token, where the source 714/// location should refer to. The default offset (0) produces a source 715/// location pointing just past the end of the token; an offset of 1 produces 716/// a source location pointing to the last character in the token, etc. 
SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
                                          const SourceManager &SM,
                                          const LangOptions &Features) {
  if (Loc.isInvalid())
    return SourceLocation();

  // A macro location is only usable when no offset is requested and it is the
  // last token of its expansion; Loc is then replaced by the location that
  // isAtEndOfMacroExpansion reports through its out-parameter.
  if (Loc.isMacroID()) {
    if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, Features, &Loc))
      return SourceLocation(); // Points inside the macro expansion.
  }

  // If the requested offset is at least the token length (including a
  // measured length of 0), Loc itself is returned.
  unsigned Len = Lexer::MeasureTokenLength(Loc, SM, Features);
  if (Len > Offset)
    Len = Len - Offset;
  else
    return Loc;

  return Loc.getLocWithOffset(Len);
}

/// \brief Returns true if the given MacroID location points at the first
/// token of the macro expansion.
///
/// \param MacroBegin if non-null and the function returns true, receives the
/// file location at which the outermost expansion starts.
bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
                                      const SourceManager &SM,
                                      const LangOptions &LangOpts,
                                      SourceLocation *MacroBegin) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc);
  // FIXME: If the token comes from the macro token paste operator ('##')
  // this function will always return false;
  if (infoLoc.second > 0)
    return false; // Does not point at the start of token.

  SourceLocation expansionLoc =
    SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart();
  if (expansionLoc.isFileID()) {
    // No other macro expansions, this is the first.
    if (MacroBegin)
      *MacroBegin = expansionLoc;
    return true;
  }

  // The expansion start is itself a macro location: repeat the check one
  // expansion level further out.
  return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
}

/// \brief Returns true if the given MacroID location points at the last
/// token of the macro expansion.
bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
                                    const SourceManager &SM,
                                    const LangOptions &LangOpts,
                                    SourceLocation *MacroEnd) {
  assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");

  // Measure the token at the spelling location; a length of 0 means no token
  // could be measured there.
  SourceLocation spellLoc = SM.getSpellingLoc(loc);
  unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
  if (tokLen == 0)
    return false;

  FileID FID = SM.getFileID(loc);
  SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1);
  if (SM.isInFileID(afterLoc, FID))
    return false; // Still in the same FileID, does not point to the last token.

  // FIXME: If the token comes from the macro token paste operator ('##')
  // or the stringify operator ('#') this function will always return false;

  SourceLocation expansionLoc =
    SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd();
  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  // The expansion end is itself a macro location: repeat the check one
  // expansion level further out.
  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}

/// makeRangeFromFileLocs - Turn a range whose endpoints are both file
/// locations into a character range.  For a token range the end is first
/// moved just past the end of its last token.  Returns a default-constructed
/// CharSourceRange if that fails, if the begin location has an invalid FileID,
/// if the end is not in the same FileID as the begin, or if begin > end.
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  assert(Begin.isFileID() && End.isFileID());
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    if (End.isInvalid())
      return CharSourceRange();
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return CharSourceRange();

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return CharSourceRange();

  return CharSourceRange::getCharRange(Begin, End);
}

/// \brief Accepts a range and returns a character range with file locations.
///
/// Returns a null range if a part of the range resides inside a macro
/// expansion or the range does not reside on the same FileID.
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return CharSourceRange();

  // Both endpoints are already file locations.
  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // Only the begin is a macro location: it must be the first token of its
  // expansion.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return CharSourceRange();
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Only the end is a macro location.  For a token range it must be the last
  // token of its expansion; for a char range it must be the first.
  if (Begin.isFileID() && End.isMacroID()) {
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                          &End)) ||
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                           &End)))
      return CharSourceRange();
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Both endpoints are macro locations.  First try to map each of them to the
  // file location of its expansion boundary.
  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Otherwise both endpoints must lie in the same FileID, in order.
  FileID FID;
  unsigned BeginOffs;
  llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return CharSourceRange();

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return CharSourceRange();

  // If that FileID is a macro argument expansion spelled in a file, map both
  // offsets onto the argument's spelling location.
  const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
  const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
  if (Expansion.isMacroArgExpansion() &&
      Expansion.getSpellingLoc().isFileID()) {
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    Range.setBegin(SpellLoc.getLocWithOffset(BeginOffs));
    Range.setEnd(SpellLoc.getLocWithOffset(EndOffs));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  return CharSourceRange();
}

/// getSourceText - Return the source text covered by \p Range as a StringRef
/// into the file buffer.  The range is first converted with
/// makeFileCharRange.
///
/// \param Invalid if non-null, set to true when the range cannot be converted,
/// its endpoints are inconsistent, or the buffer cannot be loaded (an empty
/// StringRef is returned in those cases), and to false on success.
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

/// getImmediateMacroName - Given a macro location, find the expansion it
/// belongs to (looking through macro-argument expansions) and return the text
/// of the token spelled at the start of that expansion, as a StringRef into
/// the buffer where it was written.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonble to call this on macros");

  // Find the location of the immediate macro expansion.
  while (1) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).first;
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

//===----------------------------------------------------------------------===//
// Character information.
//===----------------------------------------------------------------------===//

enum {
  CHAR_HORZ_WS  = 0x01,  // ' ', '\t', '\f', '\v'.
Note, no '\0' 980 CHAR_VERT_WS = 0x02, // '\r', '\n' 981 CHAR_LETTER = 0x04, // a-z,A-Z 982 CHAR_NUMBER = 0x08, // 0-9 983 CHAR_UNDER = 0x10, // _ 984 CHAR_PERIOD = 0x20, // . 985 CHAR_RAWDEL = 0x40 // {}[]#<>%:;?*+-/^&|~!=,"' 986}; 987 988// Statically initialize CharInfo table based on ASCII character set 989// Reference: FreeBSD 7.2 /usr/share/misc/ascii 990static const unsigned char CharInfo[256] = 991{ 992// 0 NUL 1 SOH 2 STX 3 ETX 993// 4 EOT 5 ENQ 6 ACK 7 BEL 994 0 , 0 , 0 , 0 , 995 0 , 0 , 0 , 0 , 996// 8 BS 9 HT 10 NL 11 VT 997//12 NP 13 CR 14 SO 15 SI 998 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, 999 CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , 1000//16 DLE 17 DC1 18 DC2 19 DC3 1001//20 DC4 21 NAK 22 SYN 23 ETB 1002 0 , 0 , 0 , 0 , 1003 0 , 0 , 0 , 0 , 1004//24 CAN 25 EM 26 SUB 27 ESC 1005//28 FS 29 GS 30 RS 31 US 1006 0 , 0 , 0 , 0 , 1007 0 , 0 , 0 , 0 , 1008//32 SP 33 ! 34 " 35 # 1009//36 $ 37 % 38 & 39 ' 1010 CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 1011 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 1012//40 ( 41 ) 42 * 43 + 1013//44 , 45 - 46 . 47 / 1014 0 , 0 , CHAR_RAWDEL , CHAR_RAWDEL , 1015 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL , 1016//48 0 49 1 50 2 51 3 1017//52 4 53 5 54 6 55 7 1018 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 1019 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 1020//56 8 57 9 58 : 59 ; 1021//60 < 61 = 62 > 63 ? 
1022 CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL , 1023 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 1024//64 @ 65 A 66 B 67 C 1025//68 D 69 E 70 F 71 G 1026 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1027 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1028//72 H 73 I 74 J 75 K 1029//76 L 77 M 78 N 79 O 1030 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1031 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1032//80 P 81 Q 82 R 83 S 1033//84 T 85 U 86 V 87 W 1034 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1035 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1036//88 X 89 Y 90 Z 91 [ 1037//92 \ 93 ] 94 ^ 95 _ 1038 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , 1039 0 , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER , 1040//96 ` 97 a 98 b 99 c 1041//100 d 101 e 102 f 103 g 1042 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1043 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1044//104 h 105 i 106 j 107 k 1045//108 l 109 m 110 n 111 o 1046 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1047 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1048//112 p 113 q 114 r 115 s 1049//116 t 117 u 118 v 119 w 1050 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1051 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 1052//120 x 121 y 122 z 123 { 1053//124 | 125 } 126 ~ 127 DEL 1054 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL , 1055 CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0 1056}; 1057 1058static void InitCharacterInfo() { 1059 static bool isInited = false; 1060 if (isInited) return; 1061 // check the statically-initialized CharInfo table 1062 assert(CHAR_HORZ_WS == CharInfo[(int)' ']); 1063 assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); 1064 assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); 1065 assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); 1066 assert(CHAR_VERT_WS == CharInfo[(int)'\n']); 1067 assert(CHAR_VERT_WS == CharInfo[(int)'\r']); 1068 assert(CHAR_UNDER == 
CharInfo[(int)'_']); 1069 assert(CHAR_PERIOD == CharInfo[(int)'.']); 1070 for (unsigned i = 'a'; i <= 'z'; ++i) { 1071 assert(CHAR_LETTER == CharInfo[i]); 1072 assert(CHAR_LETTER == CharInfo[i+'A'-'a']); 1073 } 1074 for (unsigned i = '0'; i <= '9'; ++i) 1075 assert(CHAR_NUMBER == CharInfo[i]); 1076 1077 isInited = true; 1078} 1079 1080 1081/// isIdentifierHead - Return true if this is the first character of an 1082/// identifier, which is [a-zA-Z_]. 1083static inline bool isIdentifierHead(unsigned char c) { 1084 return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false; 1085} 1086 1087/// isIdentifierBody - Return true if this is the body character of an 1088/// identifier, which is [a-zA-Z0-9_]. 1089static inline bool isIdentifierBody(unsigned char c) { 1090 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 1091} 1092 1093/// isHorizontalWhitespace - Return true if this character is horizontal 1094/// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. 1095static inline bool isHorizontalWhitespace(unsigned char c) { 1096 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 1097} 1098 1099/// isVerticalWhitespace - Return true if this character is vertical 1100/// whitespace: '\n', '\r'. Note that this returns false for '\0'. 1101static inline bool isVerticalWhitespace(unsigned char c) { 1102 return (CharInfo[c] & CHAR_VERT_WS) ? true : false; 1103} 1104 1105/// isWhitespace - Return true if this character is horizontal or vertical 1106/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false 1107/// for '\0'. 1108static inline bool isWhitespace(unsigned char c) { 1109 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 1110} 1111 1112/// isNumberBody - Return true if this is the body character of an 1113/// preprocessing number, which is [a-zA-Z0-9_.]. 
1114static inline bool isNumberBody(unsigned char c) { 1115 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 1116 true : false; 1117} 1118 1119/// isRawStringDelimBody - Return true if this is the body character of a 1120/// raw string delimiter. 1121static inline bool isRawStringDelimBody(unsigned char c) { 1122 return (CharInfo[c] & 1123 (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ? 1124 true : false; 1125} 1126 1127 1128//===----------------------------------------------------------------------===// 1129// Diagnostics forwarding code. 1130//===----------------------------------------------------------------------===// 1131 1132/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 1133/// lexer buffer was all expanded at a single point, perform the mapping. 1134/// This is currently only used for _Pragma implementation, so it is the slow 1135/// path of the hot getSourceLocation method. Do not allow it to be inlined. 1136static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc( 1137 Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen); 1138static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 1139 SourceLocation FileLoc, 1140 unsigned CharNo, unsigned TokLen) { 1141 assert(FileLoc.isMacroID() && "Must be a macro expansion"); 1142 1143 // Otherwise, we're lexing "mapped tokens". This is used for things like 1144 // _Pragma handling. Combine the expansion location of FileLoc with the 1145 // spelling location. 1146 SourceManager &SM = PP.getSourceManager(); 1147 1148 // Create a new SLoc which is expanded from Expansion(FileLoc) but whose 1149 // characters come from spelling(FileLoc)+Offset. 1150 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 1151 SpellingLoc = SpellingLoc.getLocWithOffset(CharNo); 1152 1153 // Figure out the expansion loc range, which is the range covered by the 1154 // original _Pragma(...) sequence. 
1155 std::pair<SourceLocation,SourceLocation> II = 1156 SM.getImmediateExpansionRange(FileLoc); 1157 1158 return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen); 1159} 1160 1161/// getSourceLocation - Return a source location identifier for the specified 1162/// offset in the current file. 1163SourceLocation Lexer::getSourceLocation(const char *Loc, 1164 unsigned TokLen) const { 1165 assert(Loc >= BufferStart && Loc <= BufferEnd && 1166 "Location out of range for this buffer!"); 1167 1168 // In the normal case, we're just lexing from a simple file buffer, return 1169 // the file id from FileLoc with the offset specified. 1170 unsigned CharNo = Loc-BufferStart; 1171 if (FileLoc.isFileID()) 1172 return FileLoc.getLocWithOffset(CharNo); 1173 1174 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 1175 // tokens are lexed from where the _Pragma was defined. 1176 assert(PP && "This doesn't work on raw lexers"); 1177 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 1178} 1179 1180/// Diag - Forwarding function for diagnostics. This translate a source 1181/// position in the current buffer into a SourceLocation object for rendering. 1182DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 1183 return PP->Diag(getSourceLocation(Loc), DiagID); 1184} 1185 1186//===----------------------------------------------------------------------===// 1187// Trigraph and Escaped Newline Handling Code. 1188//===----------------------------------------------------------------------===// 1189 1190/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 1191/// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 
1192static char GetTrigraphCharForLetter(char Letter) { 1193 switch (Letter) { 1194 default: return 0; 1195 case '=': return '#'; 1196 case ')': return ']'; 1197 case '(': return '['; 1198 case '!': return '|'; 1199 case '\'': return '^'; 1200 case '>': return '}'; 1201 case '/': return '\\'; 1202 case '<': return '{'; 1203 case '-': return '~'; 1204 } 1205} 1206 1207/// DecodeTrigraphChar - If the specified character is a legal trigraph when 1208/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 1209/// return the result character. Finally, emit a warning about trigraph use 1210/// whether trigraphs are enabled or not. 1211static char DecodeTrigraphChar(const char *CP, Lexer *L) { 1212 char Res = GetTrigraphCharForLetter(*CP); 1213 if (!Res || !L) return Res; 1214 1215 if (!L->getFeatures().Trigraphs) { 1216 if (!L->isLexingRawMode()) 1217 L->Diag(CP-2, diag::trigraph_ignored); 1218 return 0; 1219 } 1220 1221 if (!L->isLexingRawMode()) 1222 L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1); 1223 return Res; 1224} 1225 1226/// getEscapedNewLineSize - Return the size of the specified escaped newline, 1227/// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a 1228/// trigraph equivalent on entry to this function. 1229unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 1230 unsigned Size = 0; 1231 while (isWhitespace(Ptr[Size])) { 1232 ++Size; 1233 1234 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 1235 continue; 1236 1237 // If this is a \r\n or \n\r, skip the other half. 1238 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 1239 Ptr[Size-1] != Ptr[Size]) 1240 ++Size; 1241 1242 return Size; 1243 } 1244 1245 // Not an escaped newline, must be a \t or something else. 1246 return 0; 1247} 1248 1249/// SkipEscapedNewLines - If P points to an escaped newline (or a series of 1250/// them), skip over them and return the first non-escaped-newline found, 1251/// otherwise return P. 
1252const char *Lexer::SkipEscapedNewLines(const char *P) { 1253 while (1) { 1254 const char *AfterEscape; 1255 if (*P == '\\') { 1256 AfterEscape = P+1; 1257 } else if (*P == '?') { 1258 // If not a trigraph for escape, bail out. 1259 if (P[1] != '?' || P[2] != '/') 1260 return P; 1261 AfterEscape = P+3; 1262 } else { 1263 return P; 1264 } 1265 1266 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 1267 if (NewLineSize == 0) return P; 1268 P = AfterEscape+NewLineSize; 1269 } 1270} 1271 1272/// \brief Checks that the given token is the first token that occurs after the 1273/// given location (this excludes comments and whitespace). Returns the location 1274/// immediately after the specified token. If the token is not found or the 1275/// location is inside a macro, the returned source location will be invalid. 1276SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc, 1277 tok::TokenKind TKind, 1278 const SourceManager &SM, 1279 const LangOptions &LangOpts, 1280 bool SkipTrailingWhitespaceAndNewLine) { 1281 if (Loc.isMacroID()) { 1282 if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 1283 return SourceLocation(); 1284 } 1285 Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts); 1286 1287 // Break down the source location. 1288 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 1289 1290 // Try to load the file buffer. 1291 bool InvalidTemp = false; 1292 llvm::StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp); 1293 if (InvalidTemp) 1294 return SourceLocation(); 1295 1296 const char *TokenBegin = File.data() + LocInfo.second; 1297 1298 // Lex from the start of the given location. 1299 Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(), 1300 TokenBegin, File.end()); 1301 // Find the token. 
1302 Token Tok; 1303 lexer.LexFromRawLexer(Tok); 1304 if (Tok.isNot(TKind)) 1305 return SourceLocation(); 1306 SourceLocation TokenLoc = Tok.getLocation(); 1307 1308 // Calculate how much whitespace needs to be skipped if any. 1309 unsigned NumWhitespaceChars = 0; 1310 if (SkipTrailingWhitespaceAndNewLine) { 1311 const char *TokenEnd = SM.getCharacterData(TokenLoc) + 1312 Tok.getLength(); 1313 unsigned char C = *TokenEnd; 1314 while (isHorizontalWhitespace(C)) { 1315 C = *(++TokenEnd); 1316 NumWhitespaceChars++; 1317 } 1318 if (isVerticalWhitespace(C)) 1319 NumWhitespaceChars++; 1320 } 1321 1322 return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars); 1323} 1324 1325/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 1326/// get its size, and return it. This is tricky in several cases: 1327/// 1. If currently at the start of a trigraph, we warn about the trigraph, 1328/// then either return the trigraph (skipping 3 chars) or the '?', 1329/// depending on whether trigraphs are enabled or not. 1330/// 2. If this is an escaped newline (potentially with whitespace between 1331/// the backslash and newline), implicitly skip the newline and return 1332/// the char after it. 1333/// 3. If this is a UCN, return it. FIXME: C++ UCN's? 1334/// 1335/// This handles the slow/uncommon case of the getCharAndSize method. Here we 1336/// know that we can accumulate into Size, and that we have already incremented 1337/// Ptr by Size bytes. 1338/// 1339/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 1340/// be updated to match. 1341/// 1342char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 1343 Token *Tok) { 1344 // If we have a slash, look for an escaped newline. 1345 if (Ptr[0] == '\\') { 1346 ++Size; 1347 ++Ptr; 1348Slash: 1349 // Common case, backslash-char where the char is not whitespace. 
1350 if (!isWhitespace(Ptr[0])) return '\\'; 1351 1352 // See if we have optional whitespace characters between the slash and 1353 // newline. 1354 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1355 // Remember that this token needs to be cleaned. 1356 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1357 1358 // Warn if there was whitespace between the backslash and newline. 1359 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 1360 Diag(Ptr, diag::backslash_newline_space); 1361 1362 // Found backslash<whitespace><newline>. Parse the char after it. 1363 Size += EscapedNewLineSize; 1364 Ptr += EscapedNewLineSize; 1365 1366 // If the char that we finally got was a \n, then we must have had 1367 // something like \<newline><newline>. We don't want to consume the 1368 // second newline. 1369 if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0') 1370 return ' '; 1371 1372 // Use slow version to accumulate a correct size field. 1373 return getCharAndSizeSlow(Ptr, Size, Tok); 1374 } 1375 1376 // Otherwise, this is not an escaped newline, just return the slash. 1377 return '\\'; 1378 } 1379 1380 // If this is a trigraph, process it. 1381 if (Ptr[0] == '?' && Ptr[1] == '?') { 1382 // If this is actually a legal trigraph (not something like "??x"), emit 1383 // a trigraph warning. If so, and if trigraphs are enabled, return it. 1384 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { 1385 // Remember that this token needs to be cleaned. 1386 if (Tok) Tok->setFlag(Token::NeedsCleaning); 1387 1388 Ptr += 3; 1389 Size += 3; 1390 if (C == '\\') goto Slash; 1391 return C; 1392 } 1393 } 1394 1395 // If this is neither, return a single character. 1396 ++Size; 1397 return *Ptr; 1398} 1399 1400 1401/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 1402/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 1403/// and that we have already incremented Ptr by Size bytes. 
1404/// 1405/// NOTE: When this method is updated, getCharAndSizeSlow (above) should 1406/// be updated to match. 1407char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 1408 const LangOptions &Features) { 1409 // If we have a slash, look for an escaped newline. 1410 if (Ptr[0] == '\\') { 1411 ++Size; 1412 ++Ptr; 1413Slash: 1414 // Common case, backslash-char where the char is not whitespace. 1415 if (!isWhitespace(Ptr[0])) return '\\'; 1416 1417 // See if we have optional whitespace characters followed by a newline. 1418 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1419 // Found backslash<whitespace><newline>. Parse the char after it. 1420 Size += EscapedNewLineSize; 1421 Ptr += EscapedNewLineSize; 1422 1423 // If the char that we finally got was a \n, then we must have had 1424 // something like \<newline><newline>. We don't want to consume the 1425 // second newline. 1426 if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0') 1427 return ' '; 1428 1429 // Use slow version to accumulate a correct size field. 1430 return getCharAndSizeSlowNoWarn(Ptr, Size, Features); 1431 } 1432 1433 // Otherwise, this is not an escaped newline, just return the slash. 1434 return '\\'; 1435 } 1436 1437 // If this is a trigraph, process it. 1438 if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1439 // If this is actually a legal trigraph (not something like "??x"), return 1440 // it. 1441 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1442 Ptr += 3; 1443 Size += 3; 1444 if (C == '\\') goto Slash; 1445 return C; 1446 } 1447 } 1448 1449 // If this is neither, return a single character. 1450 ++Size; 1451 return *Ptr; 1452} 1453 1454//===----------------------------------------------------------------------===// 1455// Helper methods for lexing. 1456//===----------------------------------------------------------------------===// 1457 1458/// \brief Routine that indiscriminately skips bytes in the source file. 
1459void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) { 1460 BufferPtr += Bytes; 1461 if (BufferPtr > BufferEnd) 1462 BufferPtr = BufferEnd; 1463 IsAtStartOfLine = StartOfLine; 1464} 1465 1466void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 1467 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 1468 unsigned Size; 1469 unsigned char C = *CurPtr++; 1470 while (isIdentifierBody(C)) 1471 C = *CurPtr++; 1472 1473 --CurPtr; // Back up over the skipped character. 1474 1475 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 1476 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 1477 // FIXME: UCNs. 1478 // 1479 // TODO: Could merge these checks into a CharInfo flag to make the comparison 1480 // cheaper 1481 if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) { 1482FinishIdentifier: 1483 const char *IdStart = BufferPtr; 1484 FormTokenWithChars(Result, CurPtr, tok::raw_identifier); 1485 Result.setRawIdentifierData(IdStart); 1486 1487 // If we are in raw mode, return this identifier raw. There is no need to 1488 // look up identifier information or attempt to macro expand it. 1489 if (LexingRawMode) 1490 return; 1491 1492 // Fill in Result.IdentifierInfo and update the token kind, 1493 // looking up the identifier in the identifier table. 1494 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result); 1495 1496 // Finally, now that we know we have an identifier, pass this off to the 1497 // preprocessor, which may macro expand it or something. 1498 if (II->isHandleIdentifierCase()) 1499 PP->HandleIdentifier(Result); 1500 1501 return; 1502 } 1503 1504 // Otherwise, $,\,? in identifier found. Enter slower path. 1505 1506 C = getCharAndSize(CurPtr, Size); 1507 while (1) { 1508 if (C == '$') { 1509 // If we hit a $ and they are not supported in identifiers, we are done. 1510 if (!Features.DollarIdents) goto FinishIdentifier; 1511 1512 // Otherwise, emit a diagnostic and continue. 
1513 if (!isLexingRawMode()) 1514 Diag(CurPtr, diag::ext_dollar_in_identifier); 1515 CurPtr = ConsumeChar(CurPtr, Size, Result); 1516 C = getCharAndSize(CurPtr, Size); 1517 continue; 1518 } else if (!isIdentifierBody(C)) { // FIXME: UCNs. 1519 // Found end of identifier. 1520 goto FinishIdentifier; 1521 } 1522 1523 // Otherwise, this character is good, consume it. 1524 CurPtr = ConsumeChar(CurPtr, Size, Result); 1525 1526 C = getCharAndSize(CurPtr, Size); 1527 while (isIdentifierBody(C)) { // FIXME: UCNs. 1528 CurPtr = ConsumeChar(CurPtr, Size, Result); 1529 C = getCharAndSize(CurPtr, Size); 1530 } 1531 } 1532} 1533 1534/// isHexaLiteral - Return true if Start points to a hex constant. 1535/// in microsoft mode (where this is supposed to be several different tokens). 1536static bool isHexaLiteral(const char *Start, const LangOptions &Features) { 1537 unsigned Size; 1538 char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, Features); 1539 if (C1 != '0') 1540 return false; 1541 char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, Features); 1542 return (C2 == 'x' || C2 == 'X'); 1543} 1544 1545/// LexNumericConstant - Lex the remainder of a integer or floating point 1546/// constant. From[-1] is the first character lexed. Return the end of the 1547/// constant. 1548void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 1549 unsigned Size; 1550 char C = getCharAndSize(CurPtr, Size); 1551 char PrevCh = 0; 1552 while (isNumberBody(C)) { // FIXME: UCNs. 1553 CurPtr = ConsumeChar(CurPtr, Size, Result); 1554 PrevCh = C; 1555 C = getCharAndSize(CurPtr, Size); 1556 } 1557 1558 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 1559 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) { 1560 // If we are in Microsoft mode, don't continue if the constant is hex. 
1561 // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1 1562 if (!Features.MicrosoftExt || !isHexaLiteral(BufferPtr, Features)) 1563 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 1564 } 1565 1566 // If we have a hex FP constant, continue. 1567 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) 1568 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 1569 1570 // Update the location of token as well as BufferPtr. 1571 const char *TokStart = BufferPtr; 1572 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 1573 Result.setLiteralData(TokStart); 1574} 1575 1576/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes 1577/// in C++11. 1578const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) { 1579 assert(getFeatures().CPlusPlus0x && "ud-suffix only exists in C++11"); 1580 1581 // Maximally munch an identifier. FIXME: UCNs. 1582 unsigned Size; 1583 char C = getCharAndSize(CurPtr, Size); 1584 if (isIdentifierHead(C)) { 1585 do { 1586 CurPtr = ConsumeChar(CurPtr, Size, Result); 1587 C = getCharAndSize(CurPtr, Size); 1588 } while (isIdentifierBody(C)); 1589 } 1590 return CurPtr; 1591} 1592 1593/// LexStringLiteral - Lex the remainder of a string literal, after having lexed 1594/// either " or L" or u8" or u" or U". 1595void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 1596 tok::TokenKind Kind) { 1597 const char *NulCharacter = 0; // Does this string contain the \0 character? 1598 1599 if (!isLexingRawMode() && 1600 (Kind == tok::utf8_string_literal || 1601 Kind == tok::utf16_string_literal || 1602 Kind == tok::utf32_string_literal)) 1603 Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal); 1604 1605 char C = getAndAdvanceChar(CurPtr, Result); 1606 while (C != '"') { 1607 // Skip escaped characters. Escaped newlines will already be processed by 1608 // getAndAdvanceChar. 
1609 if (C == '\\') 1610 C = getAndAdvanceChar(CurPtr, Result); 1611 1612 if (C == '\n' || C == '\r' || // Newline. 1613 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 1614 if (!isLexingRawMode() && !Features.AsmPreprocessor) 1615 Diag(BufferPtr, diag::warn_unterminated_string); 1616 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1617 return; 1618 } 1619 1620 if (C == 0) { 1621 if (isCodeCompletionPoint(CurPtr-1)) { 1622 PP->CodeCompleteNaturalLanguage(); 1623 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1624 return cutOffLexing(); 1625 } 1626 1627 NulCharacter = CurPtr-1; 1628 } 1629 C = getAndAdvanceChar(CurPtr, Result); 1630 } 1631 1632 // If we are in C++11, lex the optional ud-suffix. 1633 if (getFeatures().CPlusPlus0x) 1634 CurPtr = LexUDSuffix(Result, CurPtr); 1635 1636 // If a nul character existed in the string, warn about it. 1637 if (NulCharacter && !isLexingRawMode()) 1638 Diag(NulCharacter, diag::null_in_string); 1639 1640 // Update the location of the token as well as the BufferPtr instance var. 1641 const char *TokStart = BufferPtr; 1642 FormTokenWithChars(Result, CurPtr, Kind); 1643 Result.setLiteralData(TokStart); 1644} 1645 1646/// LexRawStringLiteral - Lex the remainder of a raw string literal, after 1647/// having lexed R", LR", u8R", uR", or UR". 1648void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 1649 tok::TokenKind Kind) { 1650 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 1651 // Between the initial and final double quote characters of the raw string, 1652 // any transformations performed in phases 1 and 2 (trigraphs, 1653 // universal-character-names, and line splicing) are reverted. 
1654 1655 if (!isLexingRawMode()) 1656 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 1657 1658 unsigned PrefixLen = 0; 1659 1660 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 1661 ++PrefixLen; 1662 1663 // If the last character was not a '(', then we didn't lex a valid delimiter. 1664 if (CurPtr[PrefixLen] != '(') { 1665 if (!isLexingRawMode()) { 1666 const char *PrefixEnd = &CurPtr[PrefixLen]; 1667 if (PrefixLen == 16) { 1668 Diag(PrefixEnd, diag::err_raw_delim_too_long); 1669 } else { 1670 Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 1671 << StringRef(PrefixEnd, 1); 1672 } 1673 } 1674 1675 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 1676 // it's possible the '"' was intended to be part of the raw string, but 1677 // there's not much we can do about that. 1678 while (1) { 1679 char C = *CurPtr++; 1680 1681 if (C == '"') 1682 break; 1683 if (C == 0 && CurPtr-1 == BufferEnd) { 1684 --CurPtr; 1685 break; 1686 } 1687 } 1688 1689 FormTokenWithChars(Result, CurPtr, tok::unknown); 1690 return; 1691 } 1692 1693 // Save prefix and move CurPtr past it 1694 const char *Prefix = CurPtr; 1695 CurPtr += PrefixLen + 1; // skip over prefix and '(' 1696 1697 while (1) { 1698 char C = *CurPtr++; 1699 1700 if (C == ')') { 1701 // Check for prefix match and closing quote. 1702 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 1703 CurPtr += PrefixLen + 1; // skip over prefix and '"' 1704 break; 1705 } 1706 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 1707 if (!isLexingRawMode()) 1708 Diag(BufferPtr, diag::err_unterminated_raw_string) 1709 << StringRef(Prefix, PrefixLen); 1710 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1711 return; 1712 } 1713 } 1714 1715 // If we are in C++11, lex the optional ud-suffix. 1716 if (getFeatures().CPlusPlus0x) 1717 CurPtr = LexUDSuffix(Result, CurPtr); 1718 1719 // Update the location of token as well as BufferPtr. 
1720 const char *TokStart = BufferPtr; 1721 FormTokenWithChars(Result, CurPtr, Kind); 1722 Result.setLiteralData(TokStart); 1723} 1724 1725/// LexAngledStringLiteral - Lex the remainder of an angled string literal, 1726/// after having lexed the '<' character. This is used for #include filenames. 1727void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 1728 const char *NulCharacter = 0; // Does this string contain the \0 character? 1729 const char *AfterLessPos = CurPtr; 1730 char C = getAndAdvanceChar(CurPtr, Result); 1731 while (C != '>') { 1732 // Skip escaped characters. 1733 if (C == '\\') { 1734 // Skip the escaped character. 1735 C = getAndAdvanceChar(CurPtr, Result); 1736 } else if (C == '\n' || C == '\r' || // Newline. 1737 (C == 0 && (CurPtr-1 == BufferEnd || // End of file. 1738 isCodeCompletionPoint(CurPtr-1)))) { 1739 // If the filename is unterminated, then it must just be a lone < 1740 // character. Return this as such. 1741 FormTokenWithChars(Result, AfterLessPos, tok::less); 1742 return; 1743 } else if (C == 0) { 1744 NulCharacter = CurPtr-1; 1745 } 1746 C = getAndAdvanceChar(CurPtr, Result); 1747 } 1748 1749 // If a nul character existed in the string, warn about it. 1750 if (NulCharacter && !isLexingRawMode()) 1751 Diag(NulCharacter, diag::null_in_string); 1752 1753 // Update the location of token as well as BufferPtr. 1754 const char *TokStart = BufferPtr; 1755 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 1756 Result.setLiteralData(TokStart); 1757} 1758 1759 1760/// LexCharConstant - Lex the remainder of a character constant, after having 1761/// lexed either ' or L' or u' or U'. 1762void Lexer::LexCharConstant(Token &Result, const char *CurPtr, 1763 tok::TokenKind Kind) { 1764 const char *NulCharacter = 0; // Does this character contain the \0 character? 
1765 1766 if (!isLexingRawMode() && 1767 (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)) 1768 Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal); 1769 1770 char C = getAndAdvanceChar(CurPtr, Result); 1771 if (C == '\'') { 1772 if (!isLexingRawMode() && !Features.AsmPreprocessor) 1773 Diag(BufferPtr, diag::err_empty_character); 1774 FormTokenWithChars(Result, CurPtr, tok::unknown); 1775 return; 1776 } 1777 1778 while (C != '\'') { 1779 // Skip escaped characters. 1780 if (C == '\\') { 1781 // Skip the escaped character. 1782 // FIXME: UCN's 1783 C = getAndAdvanceChar(CurPtr, Result); 1784 } else if (C == '\n' || C == '\r' || // Newline. 1785 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 1786 if (!isLexingRawMode() && !Features.AsmPreprocessor) 1787 Diag(BufferPtr, diag::warn_unterminated_char); 1788 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1789 return; 1790 } else if (C == 0) { 1791 if (isCodeCompletionPoint(CurPtr-1)) { 1792 PP->CodeCompleteNaturalLanguage(); 1793 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1794 return cutOffLexing(); 1795 } 1796 1797 NulCharacter = CurPtr-1; 1798 } 1799 C = getAndAdvanceChar(CurPtr, Result); 1800 } 1801 1802 // If we are in C++11, lex the optional ud-suffix. 1803 if (getFeatures().CPlusPlus0x) 1804 CurPtr = LexUDSuffix(Result, CurPtr); 1805 1806 // If a nul character existed in the character, warn about it. 1807 if (NulCharacter && !isLexingRawMode()) 1808 Diag(NulCharacter, diag::null_in_char); 1809 1810 // Update the location of token as well as BufferPtr. 1811 const char *TokStart = BufferPtr; 1812 FormTokenWithChars(Result, CurPtr, Kind); 1813 Result.setLiteralData(TokStart); 1814} 1815 1816/// SkipWhitespace - Efficiently skip over a series of whitespace characters. 1817/// Update BufferPtr to point to the next non-whitespace character and return. 1818/// 1819/// This method forms a token and returns true if KeepWhitespaceMode is enabled. 
1820/// 1821bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { 1822 // Whitespace - Skip it, then return the token after the whitespace. 1823 unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. 1824 while (1) { 1825 // Skip horizontal whitespace very aggressively. 1826 while (isHorizontalWhitespace(Char)) 1827 Char = *++CurPtr; 1828 1829 // Otherwise if we have something other than whitespace, we're done. 1830 if (Char != '\n' && Char != '\r') 1831 break; 1832 1833 if (ParsingPreprocessorDirective) { 1834 // End of preprocessor directive line, let LexTokenInternal handle this. 1835 BufferPtr = CurPtr; 1836 return false; 1837 } 1838 1839 // ok, but handle newline. 1840 // The returned token is at the start of the line. 1841 Result.setFlag(Token::StartOfLine); 1842 // No leading whitespace seen so far. 1843 Result.clearFlag(Token::LeadingSpace); 1844 Char = *++CurPtr; 1845 } 1846 1847 // If this isn't immediately after a newline, there is leading space. 1848 char PrevChar = CurPtr[-1]; 1849 if (PrevChar != '\n' && PrevChar != '\r') 1850 Result.setFlag(Token::LeadingSpace); 1851 1852 // If the client wants us to return whitespace, return it now. 1853 if (isKeepWhitespaceMode()) { 1854 FormTokenWithChars(Result, CurPtr, tok::unknown); 1855 return true; 1856 } 1857 1858 BufferPtr = CurPtr; 1859 return false; 1860} 1861 1862// SkipBCPLComment - We have just read the // characters from input. Skip until 1863// we find the newline character thats terminate the comment. Then update 1864/// BufferPtr and return. 1865/// 1866/// If we're in KeepCommentMode or any CommentHandler has inserted 1867/// some tokens, this will store the first token and return true. 1868bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { 1869 // If BCPL comments aren't explicitly enabled for this language, emit an 1870 // extension warning. 
1871 if (!Features.BCPLComment && !isLexingRawMode()) { 1872 Diag(BufferPtr, diag::ext_bcpl_comment); 1873 1874 // Mark them enabled so we only emit one warning for this translation 1875 // unit. 1876 Features.BCPLComment = true; 1877 } 1878 1879 // Scan over the body of the comment. The common case, when scanning, is that 1880 // the comment contains normal ascii characters with nothing interesting in 1881 // them. As such, optimize for this case with the inner loop. 1882 char C; 1883 do { 1884 C = *CurPtr; 1885 // Skip over characters in the fast loop. 1886 while (C != 0 && // Potentially EOF. 1887 C != '\n' && C != '\r') // Newline or DOS-style newline. 1888 C = *++CurPtr; 1889 1890 const char *NextLine = CurPtr; 1891 if (C != 0) { 1892 // We found a newline, see if it's escaped. 1893 const char *EscapePtr = CurPtr-1; 1894 while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace. 1895 --EscapePtr; 1896 1897 if (*EscapePtr == '\\') // Escaped newline. 1898 CurPtr = EscapePtr; 1899 else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' && 1900 EscapePtr[-2] == '?') // Trigraph-escaped newline. 1901 CurPtr = EscapePtr-2; 1902 else 1903 break; // This is a newline, we're done. 1904 1905 C = *CurPtr; 1906 } 1907 1908 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 1909 // properly decode the character. Read it in raw mode to avoid emitting 1910 // diagnostics about things like trigraphs. If we see an escaped newline, 1911 // we'll handle it below. 1912 const char *OldPtr = CurPtr; 1913 bool OldRawMode = isLexingRawMode(); 1914 LexingRawMode = true; 1915 C = getAndAdvanceChar(CurPtr, Result); 1916 LexingRawMode = OldRawMode; 1917 1918 // If we only read only one character, then no special handling is needed. 1919 // We're done and can skip forward to the newline. 
1920 if (C != 0 && CurPtr == OldPtr+1) { 1921 CurPtr = NextLine; 1922 break; 1923 } 1924 1925 // If we read multiple characters, and one of those characters was a \r or 1926 // \n, then we had an escaped newline within the comment. Emit diagnostic 1927 // unless the next line is also a // comment. 1928 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 1929 for (; OldPtr != CurPtr; ++OldPtr) 1930 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 1931 // Okay, we found a // comment that ends in a newline, if the next 1932 // line is also a // comment, but has spaces, don't emit a diagnostic. 1933 if (isWhitespace(C)) { 1934 const char *ForwardPtr = CurPtr; 1935 while (isWhitespace(*ForwardPtr)) // Skip whitespace. 1936 ++ForwardPtr; 1937 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 1938 break; 1939 } 1940 1941 if (!isLexingRawMode()) 1942 Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); 1943 break; 1944 } 1945 } 1946 1947 if (CurPtr == BufferEnd+1) { 1948 --CurPtr; 1949 break; 1950 } 1951 1952 if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 1953 PP->CodeCompleteNaturalLanguage(); 1954 cutOffLexing(); 1955 return false; 1956 } 1957 1958 } while (C != '\n' && C != '\r'); 1959 1960 // Found but did not consume the newline. Notify comment handlers about the 1961 // comment unless we're in a #if 0 block. 1962 if (PP && !isLexingRawMode() && 1963 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 1964 getSourceLocation(CurPtr)))) { 1965 BufferPtr = CurPtr; 1966 return true; // A token has to be returned. 1967 } 1968 1969 // If we are returning comments as tokens, return this comment as a token. 1970 if (inKeepCommentMode()) 1971 return SaveBCPLComment(Result, CurPtr); 1972 1973 // If we are inside a preprocessor directive and we see the end of line, 1974 // return immediately, so that the lexer can return this as an EOD token. 
1975 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 1976 BufferPtr = CurPtr; 1977 return false; 1978 } 1979 1980 // Otherwise, eat the \n character. We don't care if this is a \n\r or 1981 // \r\n sequence. This is an efficiency hack (because we know the \n can't 1982 // contribute to another token), it isn't needed for correctness. Note that 1983 // this is ok even in KeepWhitespaceMode, because we would have returned the 1984 /// comment above in that mode. 1985 ++CurPtr; 1986 1987 // The next returned token is at the start of the line. 1988 Result.setFlag(Token::StartOfLine); 1989 // No leading whitespace seen so far. 1990 Result.clearFlag(Token::LeadingSpace); 1991 BufferPtr = CurPtr; 1992 return false; 1993} 1994 1995/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 1996/// an appropriate way and return it. 1997bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 1998 // If we're not in a preprocessor directive, just return the // comment 1999 // directly. 2000 FormTokenWithChars(Result, CurPtr, tok::comment); 2001 2002 if (!ParsingPreprocessorDirective) 2003 return true; 2004 2005 // If this BCPL-style comment is in a macro definition, transmogrify it into 2006 // a C-style block comment. 2007 bool Invalid = false; 2008 std::string Spelling = PP->getSpelling(Result, &Invalid); 2009 if (Invalid) 2010 return true; 2011 2012 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); 2013 Spelling[1] = '*'; // Change prefix to "/*". 2014 Spelling += "*/"; // add suffix. 2015 2016 Result.setKind(tok::comment); 2017 PP->CreateString(&Spelling[0], Spelling.size(), Result, 2018 Result.getLocation(), Result.getLocation()); 2019 return true; 2020} 2021 2022/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 2023/// character (either \n or \r) is part of an escaped newline sequence. Issue a 2024/// diagnostic if so. We know that the newline is inside of a block comment. 
2025static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, 2026 Lexer *L) { 2027 assert(CurPtr[0] == '\n' || CurPtr[0] == '\r'); 2028 2029 // Back up off the newline. 2030 --CurPtr; 2031 2032 // If this is a two-character newline sequence, skip the other character. 2033 if (CurPtr[0] == '\n' || CurPtr[0] == '\r') { 2034 // \n\n or \r\r -> not escaped newline. 2035 if (CurPtr[0] == CurPtr[1]) 2036 return false; 2037 // \n\r or \r\n -> skip the newline. 2038 --CurPtr; 2039 } 2040 2041 // If we have horizontal whitespace, skip over it. We allow whitespace 2042 // between the slash and newline. 2043 bool HasSpace = false; 2044 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 2045 --CurPtr; 2046 HasSpace = true; 2047 } 2048 2049 // If we have a slash, we know this is an escaped newline. 2050 if (*CurPtr == '\\') { 2051 if (CurPtr[-1] != '*') return false; 2052 } else { 2053 // It isn't a slash, is it the ?? / trigraph? 2054 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 2055 CurPtr[-3] != '*') 2056 return false; 2057 2058 // This is the trigraph ending the comment. Emit a stern warning! 2059 CurPtr -= 2; 2060 2061 // If no trigraphs are enabled, warn that we ignored this trigraph and 2062 // ignore this * character. 2063 if (!L->getFeatures().Trigraphs) { 2064 if (!L->isLexingRawMode()) 2065 L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 2066 return false; 2067 } 2068 if (!L->isLexingRawMode()) 2069 L->Diag(CurPtr, diag::trigraph_ends_block_comment); 2070 } 2071 2072 // Warn about having an escaped newline between the */ characters. 2073 if (!L->isLexingRawMode()) 2074 L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 2075 2076 // If there was space between the backslash and newline, warn about it. 
2077 if (HasSpace && !L->isLexingRawMode()) 2078 L->Diag(CurPtr, diag::backslash_newline_space); 2079 2080 return true; 2081} 2082 2083#ifdef __SSE2__ 2084#include <emmintrin.h> 2085#elif __ALTIVEC__ 2086#include <altivec.h> 2087#undef bool 2088#endif 2089 2090/// SkipBlockComment - We have just read the /* characters from input. Read 2091/// until we find the */ characters that terminate the comment. Note that we 2092/// don't bother decoding trigraphs or escaped newlines in block comments, 2093/// because they cannot cause the comment to end. The only thing that can 2094/// happen is the comment could end with an escaped newline between the */ end 2095/// of comment. 2096/// 2097/// If we're in KeepCommentMode or any CommentHandler has inserted 2098/// some tokens, this will store the first token and return true. 2099bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { 2100 // Scan one character past where we should, looking for a '/' character. Once 2101 // we find it, check to see if it was preceded by a *. This common 2102 // optimization helps people who like to put a lot of * characters in their 2103 // comments. 2104 2105 // The first character we get with newlines and trigraphs skipped to handle 2106 // the degenerate /*/ case below correctly if the * has an escaped newline 2107 // after it. 2108 unsigned CharSize; 2109 unsigned char C = getCharAndSize(CurPtr, CharSize); 2110 CurPtr += CharSize; 2111 if (C == 0 && CurPtr == BufferEnd+1) { 2112 if (!isLexingRawMode()) 2113 Diag(BufferPtr, diag::err_unterminated_block_comment); 2114 --CurPtr; 2115 2116 // KeepWhitespaceMode should return this broken comment as a token. Since 2117 // it isn't a well formed comment, just return it as an 'unknown' token. 2118 if (isKeepWhitespaceMode()) { 2119 FormTokenWithChars(Result, CurPtr, tok::unknown); 2120 return true; 2121 } 2122 2123 BufferPtr = CurPtr; 2124 return false; 2125 } 2126 2127 // Check to see if the first character after the '/*' is another /. 
If so, 2128 // then this slash does not end the block comment, it is part of it. 2129 if (C == '/') 2130 C = *CurPtr++; 2131 2132 while (1) { 2133 // Skip over all non-interesting characters until we find end of buffer or a 2134 // (probably ending) '/' character. 2135 if (CurPtr + 24 < BufferEnd && 2136 // If there is a code-completion point avoid the fast scan because it 2137 // doesn't check for '\0'. 2138 !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) { 2139 // While not aligned to a 16-byte boundary. 2140 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 2141 C = *CurPtr++; 2142 2143 if (C == '/') goto FoundSlash; 2144 2145#ifdef __SSE2__ 2146 __m128i Slashes = _mm_set1_epi8('/'); 2147 while (CurPtr+16 <= BufferEnd) { 2148 int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)); 2149 if (cmp != 0) { 2150 // Adjust the pointer to point directly after the first slash. It's 2151 // not necessary to set C here, it will be overwritten at the end of 2152 // the outer loop. 2153 CurPtr += llvm::CountTrailingZeros_32(cmp) + 1; 2154 goto FoundSlash; 2155 } 2156 CurPtr += 16; 2157 } 2158#elif __ALTIVEC__ 2159 __vector unsigned char Slashes = { 2160 '/', '/', '/', '/', '/', '/', '/', '/', 2161 '/', '/', '/', '/', '/', '/', '/', '/' 2162 }; 2163 while (CurPtr+16 <= BufferEnd && 2164 !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes)) 2165 CurPtr += 16; 2166#else 2167 // Scan for '/' quickly. Many block comments are very large. 2168 while (CurPtr[0] != '/' && 2169 CurPtr[1] != '/' && 2170 CurPtr[2] != '/' && 2171 CurPtr[3] != '/' && 2172 CurPtr+4 < BufferEnd) { 2173 CurPtr += 4; 2174 } 2175#endif 2176 2177 // It has to be one of the bytes scanned, increment to it and read one. 2178 C = *CurPtr++; 2179 } 2180 2181 // Loop to scan the remainder. 2182 while (C != '/' && C != '\0') 2183 C = *CurPtr++; 2184 2185 if (C == '/') { 2186 FoundSlash: 2187 if (CurPtr[-2] == '*') // We found the final */. We're done! 
2188 break; 2189 2190 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 2191 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 2192 // We found the final */, though it had an escaped newline between the 2193 // * and /. We're done! 2194 break; 2195 } 2196 } 2197 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 2198 // If this is a /* inside of the comment, emit a warning. Don't do this 2199 // if this is a /*/, which will end the comment. This misses cases with 2200 // embedded escaped newlines, but oh well. 2201 if (!isLexingRawMode()) 2202 Diag(CurPtr-1, diag::warn_nested_block_comment); 2203 } 2204 } else if (C == 0 && CurPtr == BufferEnd+1) { 2205 if (!isLexingRawMode()) 2206 Diag(BufferPtr, diag::err_unterminated_block_comment); 2207 // Note: the user probably forgot a */. We could continue immediately 2208 // after the /*, but this would involve lexing a lot of what really is the 2209 // comment, which surely would confuse the parser. 2210 --CurPtr; 2211 2212 // KeepWhitespaceMode should return this broken comment as a token. Since 2213 // it isn't a well formed comment, just return it as an 'unknown' token. 2214 if (isKeepWhitespaceMode()) { 2215 FormTokenWithChars(Result, CurPtr, tok::unknown); 2216 return true; 2217 } 2218 2219 BufferPtr = CurPtr; 2220 return false; 2221 } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) { 2222 PP->CodeCompleteNaturalLanguage(); 2223 cutOffLexing(); 2224 return false; 2225 } 2226 2227 C = *CurPtr++; 2228 } 2229 2230 // Notify comment handlers about the comment unless we're in a #if 0 block. 2231 if (PP && !isLexingRawMode() && 2232 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 2233 getSourceLocation(CurPtr)))) { 2234 BufferPtr = CurPtr; 2235 return true; // A token has to be returned. 2236 } 2237 2238 // If we are returning comments as tokens, return this comment as a token. 
2239 if (inKeepCommentMode()) { 2240 FormTokenWithChars(Result, CurPtr, tok::comment); 2241 return true; 2242 } 2243 2244 // It is common for the tokens immediately after a /**/ comment to be 2245 // whitespace. Instead of going through the big switch, handle it 2246 // efficiently now. This is safe even in KeepWhitespaceMode because we would 2247 // have already returned above with the comment as a token. 2248 if (isHorizontalWhitespace(*CurPtr)) { 2249 Result.setFlag(Token::LeadingSpace); 2250 SkipWhitespace(Result, CurPtr+1); 2251 return false; 2252 } 2253 2254 // Otherwise, just return so that the next character will be lexed as a token. 2255 BufferPtr = CurPtr; 2256 Result.setFlag(Token::LeadingSpace); 2257 return false; 2258} 2259 2260//===----------------------------------------------------------------------===// 2261// Primary Lexing Entry Points 2262//===----------------------------------------------------------------------===// 2263 2264/// ReadToEndOfLine - Read the rest of the current preprocessor line as an 2265/// uninterpreted string. This switches the lexer out of directive mode. 2266std::string Lexer::ReadToEndOfLine() { 2267 assert(ParsingPreprocessorDirective && ParsingFilename == false && 2268 "Must be in a preprocessing directive!"); 2269 std::string Result; 2270 Token Tmp; 2271 2272 // CurPtr - Cache BufferPtr in an automatic variable. 2273 const char *CurPtr = BufferPtr; 2274 while (1) { 2275 char Char = getAndAdvanceChar(CurPtr, Tmp); 2276 switch (Char) { 2277 default: 2278 Result += Char; 2279 break; 2280 case 0: // Null. 2281 // Found end of file? 2282 if (CurPtr-1 != BufferEnd) { 2283 if (isCodeCompletionPoint(CurPtr-1)) { 2284 PP->CodeCompleteNaturalLanguage(); 2285 cutOffLexing(); 2286 return Result; 2287 } 2288 2289 // Nope, normal character, continue. 2290 Result += Char; 2291 break; 2292 } 2293 // FALL THROUGH. 2294 case '\r': 2295 case '\n': 2296 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 
2297 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 2298 BufferPtr = CurPtr-1; 2299 2300 // Next, lex the character, which should handle the EOD transition. 2301 Lex(Tmp); 2302 if (Tmp.is(tok::code_completion)) { 2303 if (PP) 2304 PP->CodeCompleteNaturalLanguage(); 2305 Lex(Tmp); 2306 } 2307 assert(Tmp.is(tok::eod) && "Unexpected token!"); 2308 2309 // Finally, we're done, return the string we found. 2310 return Result; 2311 } 2312 } 2313} 2314 2315/// LexEndOfFile - CurPtr points to the end of this file. Handle this 2316/// condition, reporting diagnostics and handling other edge cases as required. 2317/// This returns true if Result contains a token, false if PP.Lex should be 2318/// called again. 2319bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) { 2320 // If we hit the end of the file while parsing a preprocessor directive, 2321 // end the preprocessor directive first. The next token returned will 2322 // then be the end of file. 2323 if (ParsingPreprocessorDirective) { 2324 // Done parsing the "line". 2325 ParsingPreprocessorDirective = false; 2326 // Update the location of token as well as BufferPtr. 2327 FormTokenWithChars(Result, CurPtr, tok::eod); 2328 2329 // Restore comment saving mode, in case it was disabled for directive. 2330 SetCommentRetentionState(PP->getCommentRetentionState()); 2331 return true; // Have a token. 2332 } 2333 2334 // If we are in raw mode, return this event as an EOF token. Let the caller 2335 // that put us in raw mode handle the event. 2336 if (isLexingRawMode()) { 2337 Result.startToken(); 2338 BufferPtr = BufferEnd; 2339 FormTokenWithChars(Result, BufferEnd, tok::eof); 2340 return true; 2341 } 2342 2343 // Issue diagnostics for unterminated #if and missing newline. 2344 2345 // If we are in a #if directive, emit an error. 
2346 while (!ConditionalStack.empty()) { 2347 if (PP->getCodeCompletionFileLoc() != FileLoc) 2348 PP->Diag(ConditionalStack.back().IfLoc, 2349 diag::err_pp_unterminated_conditional); 2350 ConditionalStack.pop_back(); 2351 } 2352 2353 // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue 2354 // a pedwarn. 2355 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 2356 Diag(BufferEnd, diag::ext_no_newline_eof) 2357 << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); 2358 2359 BufferPtr = CurPtr; 2360 2361 // Finally, let the preprocessor handle this. 2362 return PP->HandleEndOfFile(Result); 2363} 2364 2365/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 2366/// the specified lexer will return a tok::l_paren token, 0 if it is something 2367/// else and 2 if there are no more tokens in the buffer controlled by the 2368/// lexer. 2369unsigned Lexer::isNextPPTokenLParen() { 2370 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 2371 2372 // Switch to 'skipping' mode. This will ensure that we can lex a token 2373 // without emitting diagnostics, disables macro expansion, and will cause EOF 2374 // to return an EOF token instead of popping the include stack. 2375 LexingRawMode = true; 2376 2377 // Save state that can be changed while lexing so that we can restore it. 2378 const char *TmpBufferPtr = BufferPtr; 2379 bool inPPDirectiveMode = ParsingPreprocessorDirective; 2380 2381 Token Tok; 2382 Tok.startToken(); 2383 LexTokenInternal(Tok); 2384 2385 // Restore state that may have changed. 2386 BufferPtr = TmpBufferPtr; 2387 ParsingPreprocessorDirective = inPPDirectiveMode; 2388 2389 // Restore the lexer back to non-skipping mode. 2390 LexingRawMode = false; 2391 2392 if (Tok.is(tok::eof)) 2393 return 2; 2394 return Tok.is(tok::l_paren); 2395} 2396 2397/// FindConflictEnd - Find the end of a version control conflict marker. 
2398static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 2399 ConflictMarkerKind CMK) { 2400 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 2401 size_t TermLen = CMK == CMK_Perforce ? 5 : 7; 2402 StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen); 2403 size_t Pos = RestOfBuffer.find(Terminator); 2404 while (Pos != StringRef::npos) { 2405 // Must occur at start of line. 2406 if (RestOfBuffer[Pos-1] != '\r' && 2407 RestOfBuffer[Pos-1] != '\n') { 2408 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 2409 Pos = RestOfBuffer.find(Terminator); 2410 continue; 2411 } 2412 return RestOfBuffer.data()+Pos; 2413 } 2414 return 0; 2415} 2416 2417/// IsStartOfConflictMarker - If the specified pointer is the start of a version 2418/// control conflict marker like '<<<<<<<', recognize it as such, emit an error 2419/// and recover nicely. This returns true if it is a conflict marker and false 2420/// if not. 2421bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 2422 // Only a conflict marker if it starts at the beginning of a line. 2423 if (CurPtr != BufferStart && 2424 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2425 return false; 2426 2427 // Check to see if we have <<<<<<< or >>>>. 2428 if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") && 2429 (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> ")) 2430 return false; 2431 2432 // If we have a situation where we don't care about conflict markers, ignore 2433 // it. 2434 if (CurrentConflictMarkerState || isLexingRawMode()) 2435 return false; 2436 2437 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 2438 2439 // Check to see if there is an ending marker somewhere in the buffer at the 2440 // start of a line to terminate this conflict marker. 2441 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 2442 // We found a match. We are really in a conflict marker. 2443 // Diagnose this, and ignore to the end of line. 
2444 Diag(CurPtr, diag::err_conflict_marker); 2445 CurrentConflictMarkerState = Kind; 2446 2447 // Skip ahead to the end of line. We know this exists because the 2448 // end-of-conflict marker starts with \r or \n. 2449 while (*CurPtr != '\r' && *CurPtr != '\n') { 2450 assert(CurPtr != BufferEnd && "Didn't find end of line"); 2451 ++CurPtr; 2452 } 2453 BufferPtr = CurPtr; 2454 return true; 2455 } 2456 2457 // No end of conflict marker found. 2458 return false; 2459} 2460 2461 2462/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 2463/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 2464/// is the end of a conflict marker. Handle it by ignoring up until the end of 2465/// the line. This returns true if it is a conflict marker and false if not. 2466bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2467 // Only a conflict marker if it starts at the beginning of a line. 2468 if (CurPtr != BufferStart && 2469 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2470 return false; 2471 2472 // If we have a situation where we don't care about conflict markers, ignore 2473 // it. 2474 if (!CurrentConflictMarkerState || isLexingRawMode()) 2475 return false; 2476 2477 // Check to see if we have the marker (4 characters in a row). 2478 for (unsigned i = 1; i != 4; ++i) 2479 if (CurPtr[i] != CurPtr[0]) 2480 return false; 2481 2482 // If we do have it, search for the end of the conflict marker. This could 2483 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2484 // be the end of conflict marker. 2485 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 2486 CurrentConflictMarkerState)) { 2487 CurPtr = End; 2488 2489 // Skip ahead to the end of line. 2490 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2491 ++CurPtr; 2492 2493 BufferPtr = CurPtr; 2494 2495 // No longer in the conflict marker. 
2496 CurrentConflictMarkerState = CMK_None; 2497 return true; 2498 } 2499 2500 return false; 2501} 2502 2503bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 2504 if (PP && PP->isCodeCompletionEnabled()) { 2505 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 2506 return Loc == PP->getCodeCompletionLoc(); 2507 } 2508 2509 return false; 2510} 2511 2512 2513/// LexTokenInternal - This implements a simple C family lexer. It is an 2514/// extremely performance critical piece of code. This assumes that the buffer 2515/// has a null character at the end of the file. This returns a preprocessing 2516/// token, not a normal token, as such, it is an internal interface. It assumes 2517/// that the Flags of result have been cleared before calling this. 2518void Lexer::LexTokenInternal(Token &Result) { 2519LexNextToken: 2520 // New token, can't need cleaning yet. 2521 Result.clearFlag(Token::NeedsCleaning); 2522 Result.setIdentifierInfo(0); 2523 2524 // CurPtr - Cache BufferPtr in an automatic variable. 2525 const char *CurPtr = BufferPtr; 2526 2527 // Small amounts of horizontal whitespace is very common between tokens. 2528 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 2529 ++CurPtr; 2530 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 2531 ++CurPtr; 2532 2533 // If we are keeping whitespace and other tokens, just return what we just 2534 // skipped. The next lexer invocation will return the token after the 2535 // whitespace. 2536 if (isKeepWhitespaceMode()) { 2537 FormTokenWithChars(Result, CurPtr, tok::unknown); 2538 return; 2539 } 2540 2541 BufferPtr = CurPtr; 2542 Result.setFlag(Token::LeadingSpace); 2543 } 2544 2545 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 2546 2547 // Read a character, advancing over it. 2548 char Char = getAndAdvanceChar(CurPtr, Result); 2549 tok::TokenKind Kind; 2550 2551 switch (Char) { 2552 case 0: // Null. 2553 // Found end of file? 
2554 if (CurPtr-1 == BufferEnd) { 2555 // Read the PP instance variable into an automatic variable, because 2556 // LexEndOfFile will often delete 'this'. 2557 Preprocessor *PPCache = PP; 2558 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 2559 return; // Got a token to return. 2560 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 2561 return PPCache->Lex(Result); 2562 } 2563 2564 // Check if we are performing code completion. 2565 if (isCodeCompletionPoint(CurPtr-1)) { 2566 // Return the code-completion token. 2567 Result.startToken(); 2568 FormTokenWithChars(Result, CurPtr, tok::code_completion); 2569 return; 2570 } 2571 2572 if (!isLexingRawMode()) 2573 Diag(CurPtr-1, diag::null_in_file); 2574 Result.setFlag(Token::LeadingSpace); 2575 if (SkipWhitespace(Result, CurPtr)) 2576 return; // KeepWhitespaceMode 2577 2578 goto LexNextToken; // GCC isn't tail call eliminating. 2579 2580 case 26: // DOS & CP/M EOF: "^Z". 2581 // If we're in Microsoft extensions mode, treat this as end of file. 2582 if (Features.MicrosoftExt) { 2583 // Read the PP instance variable into an automatic variable, because 2584 // LexEndOfFile will often delete 'this'. 2585 Preprocessor *PPCache = PP; 2586 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 2587 return; // Got a token to return. 2588 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 2589 return PPCache->Lex(Result); 2590 } 2591 // If Microsoft extensions are disabled, this is just random garbage. 2592 Kind = tok::unknown; 2593 break; 2594 2595 case '\n': 2596 case '\r': 2597 // If we are inside a preprocessor directive and we see the end of line, 2598 // we know we are done with the directive, so return an EOD token. 2599 if (ParsingPreprocessorDirective) { 2600 // Done parsing the "line". 2601 ParsingPreprocessorDirective = false; 2602 2603 // Restore comment saving mode, in case it was disabled for directive. 
2604 SetCommentRetentionState(PP->getCommentRetentionState()); 2605 2606 // Since we consumed a newline, we are back at the start of a line. 2607 IsAtStartOfLine = true; 2608 2609 Kind = tok::eod; 2610 break; 2611 } 2612 // The returned token is at the start of the line. 2613 Result.setFlag(Token::StartOfLine); 2614 // No leading whitespace seen so far. 2615 Result.clearFlag(Token::LeadingSpace); 2616 2617 if (SkipWhitespace(Result, CurPtr)) 2618 return; // KeepWhitespaceMode 2619 goto LexNextToken; // GCC isn't tail call eliminating. 2620 case ' ': 2621 case '\t': 2622 case '\f': 2623 case '\v': 2624 SkipHorizontalWhitespace: 2625 Result.setFlag(Token::LeadingSpace); 2626 if (SkipWhitespace(Result, CurPtr)) 2627 return; // KeepWhitespaceMode 2628 2629 SkipIgnoredUnits: 2630 CurPtr = BufferPtr; 2631 2632 // If the next token is obviously a // or /* */ comment, skip it efficiently 2633 // too (without going through the big switch stmt). 2634 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 2635 Features.BCPLComment && !Features.TraditionalCPP) { 2636 if (SkipBCPLComment(Result, CurPtr+2)) 2637 return; // There is a token to return. 2638 goto SkipIgnoredUnits; 2639 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 2640 if (SkipBlockComment(Result, CurPtr+2)) 2641 return; // There is a token to return. 2642 goto SkipIgnoredUnits; 2643 } else if (isHorizontalWhitespace(*CurPtr)) { 2644 goto SkipHorizontalWhitespace; 2645 } 2646 goto LexNextToken; // GCC isn't tail call eliminating. 2647 2648 // C99 6.4.4.1: Integer Constants. 2649 // C99 6.4.4.2: Floating Constants. 2650 case '0': case '1': case '2': case '3': case '4': 2651 case '5': case '6': case '7': case '8': case '9': 2652 // Notify MIOpt that we read a non-whitespace/non-comment token. 
2653 MIOpt.ReadToken(); 2654 return LexNumericConstant(Result, CurPtr); 2655 2656 case 'u': // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal 2657 // Notify MIOpt that we read a non-whitespace/non-comment token. 2658 MIOpt.ReadToken(); 2659 2660 if (Features.CPlusPlus0x) { 2661 Char = getCharAndSize(CurPtr, SizeTmp); 2662 2663 // UTF-16 string literal 2664 if (Char == '"') 2665 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2666 tok::utf16_string_literal); 2667 2668 // UTF-16 character constant 2669 if (Char == '\'') 2670 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2671 tok::utf16_char_constant); 2672 2673 // UTF-16 raw string literal 2674 if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2675 return LexRawStringLiteral(Result, 2676 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2677 SizeTmp2, Result), 2678 tok::utf16_string_literal); 2679 2680 if (Char == '8') { 2681 char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2); 2682 2683 // UTF-8 string literal 2684 if (Char2 == '"') 2685 return LexStringLiteral(Result, 2686 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2687 SizeTmp2, Result), 2688 tok::utf8_string_literal); 2689 2690 if (Char2 == 'R') { 2691 unsigned SizeTmp3; 2692 char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 2693 // UTF-8 raw string literal 2694 if (Char3 == '"') { 2695 return LexRawStringLiteral(Result, 2696 ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2697 SizeTmp2, Result), 2698 SizeTmp3, Result), 2699 tok::utf8_string_literal); 2700 } 2701 } 2702 } 2703 } 2704 2705 // treat u like the start of an identifier. 2706 return LexIdentifier(Result, CurPtr); 2707 2708 case 'U': // Identifier (Uber) or C++0x UTF-32 string literal 2709 // Notify MIOpt that we read a non-whitespace/non-comment token. 
2710 MIOpt.ReadToken(); 2711 2712 if (Features.CPlusPlus0x) { 2713 Char = getCharAndSize(CurPtr, SizeTmp); 2714 2715 // UTF-32 string literal 2716 if (Char == '"') 2717 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2718 tok::utf32_string_literal); 2719 2720 // UTF-32 character constant 2721 if (Char == '\'') 2722 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2723 tok::utf32_char_constant); 2724 2725 // UTF-32 raw string literal 2726 if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2727 return LexRawStringLiteral(Result, 2728 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2729 SizeTmp2, Result), 2730 tok::utf32_string_literal); 2731 } 2732 2733 // treat U like the start of an identifier. 2734 return LexIdentifier(Result, CurPtr); 2735 2736 case 'R': // Identifier or C++0x raw string literal 2737 // Notify MIOpt that we read a non-whitespace/non-comment token. 2738 MIOpt.ReadToken(); 2739 2740 if (Features.CPlusPlus0x) { 2741 Char = getCharAndSize(CurPtr, SizeTmp); 2742 2743 if (Char == '"') 2744 return LexRawStringLiteral(Result, 2745 ConsumeChar(CurPtr, SizeTmp, Result), 2746 tok::string_literal); 2747 } 2748 2749 // treat R like the start of an identifier. 2750 return LexIdentifier(Result, CurPtr); 2751 2752 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 2753 // Notify MIOpt that we read a non-whitespace/non-comment token. 2754 MIOpt.ReadToken(); 2755 Char = getCharAndSize(CurPtr, SizeTmp); 2756 2757 // Wide string literal. 2758 if (Char == '"') 2759 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2760 tok::wide_string_literal); 2761 2762 // Wide raw string literal. 2763 if (Features.CPlusPlus0x && Char == 'R' && 2764 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"') 2765 return LexRawStringLiteral(Result, 2766 ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2767 SizeTmp2, Result), 2768 tok::wide_string_literal); 2769 2770 // Wide character constant. 
2771 if (Char == '\'') 2772 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2773 tok::wide_char_constant); 2774 // FALL THROUGH, treating L like the start of an identifier. 2775 2776 // C99 6.4.2: Identifiers. 2777 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 2778 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 2779 case 'O': case 'P': case 'Q': /*'R'*/case 'S': case 'T': /*'U'*/ 2780 case 'V': case 'W': case 'X': case 'Y': case 'Z': 2781 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 2782 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 2783 case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/ 2784 case 'v': case 'w': case 'x': case 'y': case 'z': 2785 case '_': 2786 // Notify MIOpt that we read a non-whitespace/non-comment token. 2787 MIOpt.ReadToken(); 2788 return LexIdentifier(Result, CurPtr); 2789 2790 case '$': // $ in identifiers. 2791 if (Features.DollarIdents) { 2792 if (!isLexingRawMode()) 2793 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 2794 // Notify MIOpt that we read a non-whitespace/non-comment token. 2795 MIOpt.ReadToken(); 2796 return LexIdentifier(Result, CurPtr); 2797 } 2798 2799 Kind = tok::unknown; 2800 break; 2801 2802 // C99 6.4.4: Character Constants. 2803 case '\'': 2804 // Notify MIOpt that we read a non-whitespace/non-comment token. 2805 MIOpt.ReadToken(); 2806 return LexCharConstant(Result, CurPtr, tok::char_constant); 2807 2808 // C99 6.4.5: String Literals. 2809 case '"': 2810 // Notify MIOpt that we read a non-whitespace/non-comment token. 2811 MIOpt.ReadToken(); 2812 return LexStringLiteral(Result, CurPtr, tok::string_literal); 2813 2814 // C99 6.4.6: Punctuators. 
2815 case '?': 2816 Kind = tok::question; 2817 break; 2818 case '[': 2819 Kind = tok::l_square; 2820 break; 2821 case ']': 2822 Kind = tok::r_square; 2823 break; 2824 case '(': 2825 Kind = tok::l_paren; 2826 break; 2827 case ')': 2828 Kind = tok::r_paren; 2829 break; 2830 case '{': 2831 Kind = tok::l_brace; 2832 break; 2833 case '}': 2834 Kind = tok::r_brace; 2835 break; 2836 case '.': 2837 Char = getCharAndSize(CurPtr, SizeTmp); 2838 if (Char >= '0' && Char <= '9') { 2839 // Notify MIOpt that we read a non-whitespace/non-comment token. 2840 MIOpt.ReadToken(); 2841 2842 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 2843 } else if (Features.CPlusPlus && Char == '*') { 2844 Kind = tok::periodstar; 2845 CurPtr += SizeTmp; 2846 } else if (Char == '.' && 2847 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 2848 Kind = tok::ellipsis; 2849 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2850 SizeTmp2, Result); 2851 } else { 2852 Kind = tok::period; 2853 } 2854 break; 2855 case '&': 2856 Char = getCharAndSize(CurPtr, SizeTmp); 2857 if (Char == '&') { 2858 Kind = tok::ampamp; 2859 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2860 } else if (Char == '=') { 2861 Kind = tok::ampequal; 2862 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2863 } else { 2864 Kind = tok::amp; 2865 } 2866 break; 2867 case '*': 2868 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2869 Kind = tok::starequal; 2870 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2871 } else { 2872 Kind = tok::star; 2873 } 2874 break; 2875 case '+': 2876 Char = getCharAndSize(CurPtr, SizeTmp); 2877 if (Char == '+') { 2878 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2879 Kind = tok::plusplus; 2880 } else if (Char == '=') { 2881 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2882 Kind = tok::plusequal; 2883 } else { 2884 Kind = tok::plus; 2885 } 2886 break; 2887 case '-': 2888 Char = getCharAndSize(CurPtr, SizeTmp); 2889 if (Char == '-') { // -- 2890 CurPtr = ConsumeChar(CurPtr, 
SizeTmp, Result); 2891 Kind = tok::minusminus; 2892 } else if (Char == '>' && Features.CPlusPlus && 2893 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 2894 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2895 SizeTmp2, Result); 2896 Kind = tok::arrowstar; 2897 } else if (Char == '>') { // -> 2898 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2899 Kind = tok::arrow; 2900 } else if (Char == '=') { // -= 2901 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2902 Kind = tok::minusequal; 2903 } else { 2904 Kind = tok::minus; 2905 } 2906 break; 2907 case '~': 2908 Kind = tok::tilde; 2909 break; 2910 case '!': 2911 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2912 Kind = tok::exclaimequal; 2913 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2914 } else { 2915 Kind = tok::exclaim; 2916 } 2917 break; 2918 case '/': 2919 // 6.4.9: Comments 2920 Char = getCharAndSize(CurPtr, SizeTmp); 2921 if (Char == '/') { // BCPL comment. 2922 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 2923 // want to lex this as a comment. There is one problem with this though, 2924 // that in one particular corner case, this can change the behavior of the 2925 // resultant program. For example, In "foo //**/ bar", C89 would lex 2926 // this as "foo / bar" and langauges with BCPL comments would lex it as 2927 // "foo". Check to see if the character after the second slash is a '*'. 2928 // If so, we will lex that as a "/" instead of the start of a comment. 2929 // However, we never do this in -traditional-cpp mode. 2930 if ((Features.BCPLComment || 2931 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') && 2932 !Features.TraditionalCPP) { 2933 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2934 return; // There is a token to return. 2935 2936 // It is common for the tokens immediately after a // comment to be 2937 // whitespace (indentation for the next line). Instead of going through 2938 // the big switch, handle it efficiently now. 
2939 goto SkipIgnoredUnits; 2940 } 2941 } 2942 2943 if (Char == '*') { // /**/ comment. 2944 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2945 return; // There is a token to return. 2946 goto LexNextToken; // GCC isn't tail call eliminating. 2947 } 2948 2949 if (Char == '=') { 2950 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2951 Kind = tok::slashequal; 2952 } else { 2953 Kind = tok::slash; 2954 } 2955 break; 2956 case '%': 2957 Char = getCharAndSize(CurPtr, SizeTmp); 2958 if (Char == '=') { 2959 Kind = tok::percentequal; 2960 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2961 } else if (Features.Digraphs && Char == '>') { 2962 Kind = tok::r_brace; // '%>' -> '}' 2963 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2964 } else if (Features.Digraphs && Char == ':') { 2965 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2966 Char = getCharAndSize(CurPtr, SizeTmp); 2967 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 2968 Kind = tok::hashhash; // '%:%:' -> '##' 2969 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2970 SizeTmp2, Result); 2971 } else if (Char == '@' && Features.MicrosoftExt) {// %:@ -> #@ -> Charize 2972 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2973 if (!isLexingRawMode()) 2974 Diag(BufferPtr, diag::ext_charize_microsoft); 2975 Kind = tok::hashat; 2976 } else { // '%:' -> '#' 2977 // We parsed a # character. If this occurs at the start of the line, 2978 // it's actually the start of a preprocessing directive. Callback to 2979 // the preprocessor to handle it. 2980 // FIXME: -fpreprocessed mode?? 2981 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 2982 FormTokenWithChars(Result, CurPtr, tok::hash); 2983 PP->HandleDirective(Result); 2984 2985 // As an optimization, if the preprocessor didn't switch lexers, tail 2986 // recurse. 2987 if (PP->isCurrentLexer(this)) { 2988 // Start a new token. 
If this is a #include or something, the PP may 2989 // want us starting at the beginning of the line again. If so, set 2990 // the StartOfLine flag and clear LeadingSpace. 2991 if (IsAtStartOfLine) { 2992 Result.setFlag(Token::StartOfLine); 2993 Result.clearFlag(Token::LeadingSpace); 2994 IsAtStartOfLine = false; 2995 } 2996 goto LexNextToken; // GCC isn't tail call eliminating. 2997 } 2998 2999 return PP->Lex(Result); 3000 } 3001 3002 Kind = tok::hash; 3003 } 3004 } else { 3005 Kind = tok::percent; 3006 } 3007 break; 3008 case '<': 3009 Char = getCharAndSize(CurPtr, SizeTmp); 3010 if (ParsingFilename) { 3011 return LexAngledStringLiteral(Result, CurPtr); 3012 } else if (Char == '<') { 3013 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3014 if (After == '=') { 3015 Kind = tok::lesslessequal; 3016 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3017 SizeTmp2, Result); 3018 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 3019 // If this is actually a '<<<<<<<' version control conflict marker, 3020 // recognize it as such and recover nicely. 3021 goto LexNextToken; 3022 } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) { 3023 // If this is '<<<<' and we're in a Perforce-style conflict marker, 3024 // ignore it. 
3025 goto LexNextToken; 3026 } else if (Features.CUDA && After == '<') { 3027 Kind = tok::lesslessless; 3028 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3029 SizeTmp2, Result); 3030 } else { 3031 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3032 Kind = tok::lessless; 3033 } 3034 } else if (Char == '=') { 3035 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3036 Kind = tok::lessequal; 3037 } else if (Features.Digraphs && Char == ':') { // '<:' -> '[' 3038 if (Features.CPlusPlus0x && 3039 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 3040 // C++0x [lex.pptoken]p3: 3041 // Otherwise, if the next three characters are <:: and the subsequent 3042 // character is neither : nor >, the < is treated as a preprocessor 3043 // token by itself and not as the first character of the alternative 3044 // token <:. 3045 unsigned SizeTmp3; 3046 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 3047 if (After != ':' && After != '>') { 3048 Kind = tok::less; 3049 if (!isLexingRawMode()) 3050 Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon); 3051 break; 3052 } 3053 } 3054 3055 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3056 Kind = tok::l_square; 3057 } else if (Features.Digraphs && Char == '%') { // '<%' -> '{' 3058 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3059 Kind = tok::l_brace; 3060 } else { 3061 Kind = tok::less; 3062 } 3063 break; 3064 case '>': 3065 Char = getCharAndSize(CurPtr, SizeTmp); 3066 if (Char == '=') { 3067 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3068 Kind = tok::greaterequal; 3069 } else if (Char == '>') { 3070 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 3071 if (After == '=') { 3072 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3073 SizeTmp2, Result); 3074 Kind = tok::greatergreaterequal; 3075 } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) { 3076 // If this is actually a '>>>>' conflict marker, recognize it as such 3077 // and recover nicely. 
3078 goto LexNextToken; 3079 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 3080 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 3081 goto LexNextToken; 3082 } else if (Features.CUDA && After == '>') { 3083 Kind = tok::greatergreatergreater; 3084 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 3085 SizeTmp2, Result); 3086 } else { 3087 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3088 Kind = tok::greatergreater; 3089 } 3090 3091 } else { 3092 Kind = tok::greater; 3093 } 3094 break; 3095 case '^': 3096 Char = getCharAndSize(CurPtr, SizeTmp); 3097 if (Char == '=') { 3098 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3099 Kind = tok::caretequal; 3100 } else { 3101 Kind = tok::caret; 3102 } 3103 break; 3104 case '|': 3105 Char = getCharAndSize(CurPtr, SizeTmp); 3106 if (Char == '=') { 3107 Kind = tok::pipeequal; 3108 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3109 } else if (Char == '|') { 3110 // If this is '|||||||' and we're in a conflict marker, ignore it. 3111 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 3112 goto LexNextToken; 3113 Kind = tok::pipepipe; 3114 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3115 } else { 3116 Kind = tok::pipe; 3117 } 3118 break; 3119 case ':': 3120 Char = getCharAndSize(CurPtr, SizeTmp); 3121 if (Features.Digraphs && Char == '>') { 3122 Kind = tok::r_square; // ':>' -> ']' 3123 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3124 } else if (Features.CPlusPlus && Char == ':') { 3125 Kind = tok::coloncolon; 3126 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3127 } else { 3128 Kind = tok::colon; 3129 } 3130 break; 3131 case ';': 3132 Kind = tok::semi; 3133 break; 3134 case '=': 3135 Char = getCharAndSize(CurPtr, SizeTmp); 3136 if (Char == '=') { 3137 // If this is '====' and we're in a conflict marker, ignore it. 
3138 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 3139 goto LexNextToken; 3140 3141 Kind = tok::equalequal; 3142 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3143 } else { 3144 Kind = tok::equal; 3145 } 3146 break; 3147 case ',': 3148 Kind = tok::comma; 3149 break; 3150 case '#': 3151 Char = getCharAndSize(CurPtr, SizeTmp); 3152 if (Char == '#') { 3153 Kind = tok::hashhash; 3154 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3155 } else if (Char == '@' && Features.MicrosoftExt) { // #@ -> Charize 3156 Kind = tok::hashat; 3157 if (!isLexingRawMode()) 3158 Diag(BufferPtr, diag::ext_charize_microsoft); 3159 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 3160 } else { 3161 // We parsed a # character. If this occurs at the start of the line, 3162 // it's actually the start of a preprocessing directive. Callback to 3163 // the preprocessor to handle it. 3164 // FIXME: -fpreprocessed mode?? 3165 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 3166 FormTokenWithChars(Result, CurPtr, tok::hash); 3167 PP->HandleDirective(Result); 3168 3169 // As an optimization, if the preprocessor didn't switch lexers, tail 3170 // recurse. 3171 if (PP->isCurrentLexer(this)) { 3172 // Start a new token. If this is a #include or something, the PP may 3173 // want us starting at the beginning of the line again. If so, set 3174 // the StartOfLine flag and clear LeadingSpace. 3175 if (IsAtStartOfLine) { 3176 Result.setFlag(Token::StartOfLine); 3177 Result.clearFlag(Token::LeadingSpace); 3178 IsAtStartOfLine = false; 3179 } 3180 goto LexNextToken; // GCC isn't tail call eliminating. 3181 } 3182 return PP->Lex(Result); 3183 } 3184 3185 Kind = tok::hash; 3186 } 3187 break; 3188 3189 case '@': 3190 // Objective C support. 3191 if (CurPtr[-1] == '@' && Features.ObjC1) 3192 Kind = tok::at; 3193 else 3194 Kind = tok::unknown; 3195 break; 3196 3197 case '\\': 3198 // FIXME: UCN's. 3199 // FALL THROUGH. 
3200 default: 3201 Kind = tok::unknown; 3202 break; 3203 } 3204 3205 // Notify MIOpt that we read a non-whitespace/non-comment token. 3206 MIOpt.ReadToken(); 3207 3208 // Update the location of token as well as BufferPtr. 3209 FormTokenWithChars(Result, CurPtr, Kind); 3210} 3211