Lexer.cpp revision bf340e452339e374ea6eef78c1f0a2abdd16c5a3
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the Lexer and Token interfaces. 11// 12//===----------------------------------------------------------------------===// 13// 14// TODO: GCC Diagnostics emitted by the lexer: 15// PEDWARN: (form feed|vertical tab) in preprocessing directive 16// 17// Universal characters, unicode, char mapping: 18// WARNING: `%.*s' is not in NFKC 19// WARNING: `%.*s' is not in NFC 20// 21// Other: 22// TODO: Options to support: 23// -fexec-charset,-fwide-exec-charset 24// 25//===----------------------------------------------------------------------===// 26 27#include "clang/Lex/Lexer.h" 28#include "clang/Lex/Preprocessor.h" 29#include "clang/Lex/LexDiagnostic.h" 30#include "clang/Lex/CodeCompletionHandler.h" 31#include "clang/Basic/SourceManager.h" 32#include "llvm/ADT/StringSwitch.h" 33#include "llvm/Support/Compiler.h" 34#include "llvm/Support/MemoryBuffer.h" 35#include <cctype> 36using namespace clang; 37 38static void InitCharacterInfo(); 39 40//===----------------------------------------------------------------------===// 41// Token Class Implementation 42//===----------------------------------------------------------------------===// 43 44/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 45bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 46 if (IdentifierInfo *II = getIdentifierInfo()) 47 return II->getObjCKeywordID() == objcKey; 48 return false; 49} 50 51/// getObjCKeywordID - Return the ObjC keyword kind. 52tok::ObjCKeywordKind Token::getObjCKeywordID() const { 53 IdentifierInfo *specId = getIdentifierInfo(); 54 return specId ? 
specId->getObjCKeywordID() : tok::objc_not_keyword; 55} 56 57 58//===----------------------------------------------------------------------===// 59// Lexer Class Implementation 60//===----------------------------------------------------------------------===// 61 62void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 63 const char *BufEnd) { 64 InitCharacterInfo(); 65 66 BufferStart = BufStart; 67 BufferPtr = BufPtr; 68 BufferEnd = BufEnd; 69 70 assert(BufEnd[0] == 0 && 71 "We assume that the input buffer has a null character at the end" 72 " to simplify lexing!"); 73 74 // Check whether we have a BOM in the beginning of the buffer. If yes - act 75 // accordingly. Right now we support only UTF-8 with and without BOM, so, just 76 // skip the UTF-8 BOM if it's present. 77 if (BufferStart == BufferPtr) { 78 // Determine the size of the BOM. 79 StringRef Buf(BufferStart, BufferEnd - BufferStart); 80 size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 81 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 82 .Default(0); 83 84 // Skip the BOM. 85 BufferPtr += BOMLength; 86 } 87 88 Is_PragmaLexer = false; 89 IsInConflictMarker = false; 90 91 // Start of the file is a start of line. 92 IsAtStartOfLine = true; 93 94 // We are not after parsing a #. 95 ParsingPreprocessorDirective = false; 96 97 // We are not after parsing #include. 98 ParsingFilename = false; 99 100 // We are not in raw mode. Raw mode disables diagnostics and interpretation 101 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 102 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 103 // or otherwise skipping over tokens. 104 LexingRawMode = false; 105 106 // Default to not keeping comments. 107 ExtendedTokenMode = 0; 108} 109 110/// Lexer constructor - Create a new lexer object for the specified buffer 111/// with the specified preprocessor managing the lexing process. 
This lexer 112/// assumes that the associated file buffer and Preprocessor objects will 113/// outlive it, so it doesn't take ownership of either of them. 114Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 115 : PreprocessorLexer(&PP, FID), 116 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 117 Features(PP.getLangOptions()) { 118 119 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 120 InputFile->getBufferEnd()); 121 122 // Default to keeping comments if the preprocessor wants them. 123 SetCommentRetentionState(PP.getCommentRetentionState()); 124} 125 126/// Lexer constructor - Create a new raw lexer object. This object is only 127/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 128/// range will outlive it, so it doesn't take ownership of it. 129Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, 130 const char *BufStart, const char *BufPtr, const char *BufEnd) 131 : FileLoc(fileloc), Features(features) { 132 133 InitLexer(BufStart, BufPtr, BufEnd); 134 135 // We *are* in raw mode. 136 LexingRawMode = true; 137} 138 139/// Lexer constructor - Create a new raw lexer object. This object is only 140/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 141/// range will outlive it, so it doesn't take ownership of it. 142Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 143 const SourceManager &SM, const LangOptions &features) 144 : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { 145 146 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 147 FromFile->getBufferEnd()); 148 149 // We *are* in raw mode. 150 LexingRawMode = true; 151} 152 153/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 154/// _Pragma expansion. This has a variety of magic semantics that this method 155/// sets up. It returns a new'd Lexer that must be delete'd when done. 
156/// 157/// On entrance to this routine, TokStartLoc is a macro location which has a 158/// spelling loc that indicates the bytes to be lexed for the token and an 159/// expansion location that indicates where all lexed tokens should be 160/// "expanded from". 161/// 162/// FIXME: It would really be nice to make _Pragma just be a wrapper around a 163/// normal lexer that remaps tokens as they fly by. This would require making 164/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 165/// interface that could handle this stuff. This would pull GetMappedTokenLoc 166/// out of the critical path of the lexer! 167/// 168Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 169 SourceLocation ExpansionLocStart, 170 SourceLocation ExpansionLocEnd, 171 unsigned TokLen, Preprocessor &PP) { 172 SourceManager &SM = PP.getSourceManager(); 173 174 // Create the lexer as if we were going to lex the file normally. 175 FileID SpellingFID = SM.getFileID(SpellingLoc); 176 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 177 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 178 179 // Now that the lexer is created, change the start/end locations so that we 180 // just lex the subsection of the file that we want. This is lexing from a 181 // scratch buffer. 182 const char *StrData = SM.getCharacterData(SpellingLoc); 183 184 L->BufferPtr = StrData; 185 L->BufferEnd = StrData+TokLen; 186 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 187 188 // Set the SourceLocation with the remapping information. This ensures that 189 // GetMappedTokenLoc will remap the tokens as they are lexed. 190 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 191 ExpansionLocStart, 192 ExpansionLocEnd, TokLen); 193 194 // Ensure that the lexer thinks it is inside a directive, so that end \n will 195 // return an EOD token. 196 L->ParsingPreprocessorDirective = true; 197 198 // This lexer really is for _Pragma. 
199 L->Is_PragmaLexer = true; 200 return L; 201} 202 203 204/// Stringify - Convert the specified string into a C string, with surrounding 205/// ""'s, and with escaped \ and " characters. 206std::string Lexer::Stringify(const std::string &Str, bool Charify) { 207 std::string Result = Str; 208 char Quote = Charify ? '\'' : '"'; 209 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 210 if (Result[i] == '\\' || Result[i] == Quote) { 211 Result.insert(Result.begin()+i, '\\'); 212 ++i; ++e; 213 } 214 } 215 return Result; 216} 217 218/// Stringify - Convert the specified string into a C string by escaping '\' 219/// and " characters. This does not add surrounding ""'s to the string. 220void Lexer::Stringify(SmallVectorImpl<char> &Str) { 221 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 222 if (Str[i] == '\\' || Str[i] == '"') { 223 Str.insert(Str.begin()+i, '\\'); 224 ++i; ++e; 225 } 226 } 227} 228 229//===----------------------------------------------------------------------===// 230// Token Spelling 231//===----------------------------------------------------------------------===// 232 233/// getSpelling() - Return the 'spelling' of this token. The spelling of a 234/// token are the characters used to represent the token in the source file 235/// after trigraph expansion and escaped-newline folding. In particular, this 236/// wants to get the true, uncanonicalized, spelling of things like digraphs 237/// UCNs, etc. 238StringRef Lexer::getSpelling(SourceLocation loc, 239 SmallVectorImpl<char> &buffer, 240 const SourceManager &SM, 241 const LangOptions &options, 242 bool *invalid) { 243 // Break down the source location. 244 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 245 246 // Try to the load the file buffer. 
247 bool invalidTemp = false; 248 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 249 if (invalidTemp) { 250 if (invalid) *invalid = true; 251 return StringRef(); 252 } 253 254 const char *tokenBegin = file.data() + locInfo.second; 255 256 // Lex from the start of the given location. 257 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 258 file.begin(), tokenBegin, file.end()); 259 Token token; 260 lexer.LexFromRawLexer(token); 261 262 unsigned length = token.getLength(); 263 264 // Common case: no need for cleaning. 265 if (!token.needsCleaning()) 266 return StringRef(tokenBegin, length); 267 268 // Hard case, we need to relex the characters into the string. 269 buffer.clear(); 270 buffer.reserve(length); 271 272 for (const char *ti = tokenBegin, *te = ti + length; ti != te; ) { 273 unsigned charSize; 274 buffer.push_back(Lexer::getCharAndSizeNoWarn(ti, charSize, options)); 275 ti += charSize; 276 } 277 278 return StringRef(buffer.data(), buffer.size()); 279} 280 281/// getSpelling() - Return the 'spelling' of this token. The spelling of a 282/// token are the characters used to represent the token in the source file 283/// after trigraph expansion and escaped-newline folding. In particular, this 284/// wants to get the true, uncanonicalized, spelling of things like digraphs 285/// UCNs, etc. 286std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 287 const LangOptions &Features, bool *Invalid) { 288 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 289 290 // If this token contains nothing interesting, return it directly. 
291 bool CharDataInvalid = false; 292 const char* TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 293 &CharDataInvalid); 294 if (Invalid) 295 *Invalid = CharDataInvalid; 296 if (CharDataInvalid) 297 return std::string(); 298 299 if (!Tok.needsCleaning()) 300 return std::string(TokStart, TokStart+Tok.getLength()); 301 302 std::string Result; 303 Result.reserve(Tok.getLength()); 304 305 // Otherwise, hard case, relex the characters into the string. 306 for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); 307 Ptr != End; ) { 308 unsigned CharSize; 309 Result.push_back(Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features)); 310 Ptr += CharSize; 311 } 312 assert(Result.size() != unsigned(Tok.getLength()) && 313 "NeedsCleaning flag set on something that didn't need cleaning!"); 314 return Result; 315} 316 317/// getSpelling - This method is used to get the spelling of a token into a 318/// preallocated buffer, instead of as an std::string. The caller is required 319/// to allocate enough space for the token, which is guaranteed to be at least 320/// Tok.getLength() bytes long. The actual length of the token is returned. 321/// 322/// Note that this method may do two possible things: it may either fill in 323/// the buffer specified with characters, or it may *change the input pointer* 324/// to point to a constant buffer with the data already in it (avoiding a 325/// copy). The caller is not allowed to modify the returned buffer pointer 326/// if an internal buffer is returned. 327unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 328 const SourceManager &SourceMgr, 329 const LangOptions &Features, bool *Invalid) { 330 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 331 332 const char *TokStart = 0; 333 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 
334 if (Tok.is(tok::raw_identifier)) 335 TokStart = Tok.getRawIdentifierData(); 336 else if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 337 // Just return the string from the identifier table, which is very quick. 338 Buffer = II->getNameStart(); 339 return II->getLength(); 340 } 341 342 // NOTE: this can be checked even after testing for an IdentifierInfo. 343 if (Tok.isLiteral()) 344 TokStart = Tok.getLiteralData(); 345 346 if (TokStart == 0) { 347 // Compute the start of the token in the input lexer buffer. 348 bool CharDataInvalid = false; 349 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 350 if (Invalid) 351 *Invalid = CharDataInvalid; 352 if (CharDataInvalid) { 353 Buffer = ""; 354 return 0; 355 } 356 } 357 358 // If this token contains nothing interesting, return it directly. 359 if (!Tok.needsCleaning()) { 360 Buffer = TokStart; 361 return Tok.getLength(); 362 } 363 364 // Otherwise, hard case, relex the characters into the string. 365 char *OutBuf = const_cast<char*>(Buffer); 366 for (const char *Ptr = TokStart, *End = TokStart+Tok.getLength(); 367 Ptr != End; ) { 368 unsigned CharSize; 369 *OutBuf++ = Lexer::getCharAndSizeNoWarn(Ptr, CharSize, Features); 370 Ptr += CharSize; 371 } 372 assert(unsigned(OutBuf-Buffer) != Tok.getLength() && 373 "NeedsCleaning flag set on something that didn't need cleaning!"); 374 375 return OutBuf-Buffer; 376} 377 378 379 380static bool isWhitespace(unsigned char c); 381 382/// MeasureTokenLength - Relex the token at the specified location and return 383/// its length in bytes in the input file. If the token needs cleaning (e.g. 384/// includes a trigraph or an escaped newline) then this count includes bytes 385/// that are part of that. 386unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 387 const SourceManager &SM, 388 const LangOptions &LangOpts) { 389 // TODO: this could be special cased for common tokens like identifiers, ')', 390 // etc to make this faster, if it mattered. 
Just look at StrData[0] to handle 391 // all obviously single-char tokens. This could use 392 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 393 // something. 394 395 // If this comes from a macro expansion, we really do want the macro name, not 396 // the token this macro expanded to. 397 Loc = SM.getExpansionLoc(Loc); 398 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 399 bool Invalid = false; 400 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 401 if (Invalid) 402 return 0; 403 404 const char *StrData = Buffer.data()+LocInfo.second; 405 406 if (isWhitespace(StrData[0])) 407 return 0; 408 409 // Create a lexer starting at the beginning of this token. 410 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 411 Buffer.begin(), StrData, Buffer.end()); 412 TheLexer.SetCommentRetentionState(true); 413 Token TheTok; 414 TheLexer.LexFromRawLexer(TheTok); 415 return TheTok.getLength(); 416} 417 418SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 419 const SourceManager &SM, 420 const LangOptions &LangOpts) { 421 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 422 if (LocInfo.first.isInvalid()) 423 return Loc; 424 425 bool Invalid = false; 426 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 427 if (Invalid) 428 return Loc; 429 430 // Back up from the current location until we hit the beginning of a line 431 // (or the buffer). We'll relex from that point. 432 const char *BufStart = Buffer.data(); 433 if (LocInfo.second >= Buffer.size()) 434 return Loc; 435 436 const char *StrData = BufStart+LocInfo.second; 437 if (StrData[0] == '\n' || StrData[0] == '\r') 438 return Loc; 439 440 const char *LexStart = StrData; 441 while (LexStart != BufStart) { 442 if (LexStart[0] == '\n' || LexStart[0] == '\r') { 443 ++LexStart; 444 break; 445 } 446 447 --LexStart; 448 } 449 450 // Create a lexer starting at the beginning of this token. 
451 SourceLocation LexerStartLoc = Loc.getFileLocWithOffset(-LocInfo.second); 452 Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end()); 453 TheLexer.SetCommentRetentionState(true); 454 455 // Lex tokens until we find the token that contains the source location. 456 Token TheTok; 457 do { 458 TheLexer.LexFromRawLexer(TheTok); 459 460 if (TheLexer.getBufferLocation() > StrData) { 461 // Lexing this token has taken the lexer past the source location we're 462 // looking for. If the current token encompasses our source location, 463 // return the beginning of that token. 464 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 465 return TheTok.getLocation(); 466 467 // We ended up skipping over the source location entirely, which means 468 // that it points into whitespace. We're done here. 469 break; 470 } 471 } while (TheTok.getKind() != tok::eof); 472 473 // We've passed our source location; just return the original source location. 474 return Loc; 475} 476 477namespace { 478 enum PreambleDirectiveKind { 479 PDK_Skipped, 480 PDK_StartIf, 481 PDK_EndIf, 482 PDK_Unknown 483 }; 484} 485 486std::pair<unsigned, bool> 487Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, unsigned MaxLines) { 488 // Create a lexer starting at the beginning of the file. Note that we use a 489 // "fake" file source location at offset 1 so that the lexer will track our 490 // position within the file. 491 const unsigned StartOffset = 1; 492 SourceLocation StartLoc = SourceLocation::getFromRawEncoding(StartOffset); 493 LangOptions LangOpts; 494 Lexer TheLexer(StartLoc, LangOpts, Buffer->getBufferStart(), 495 Buffer->getBufferStart(), Buffer->getBufferEnd()); 496 497 bool InPreprocessorDirective = false; 498 Token TheTok; 499 Token IfStartTok; 500 unsigned IfCount = 0; 501 unsigned Line = 0; 502 503 do { 504 TheLexer.LexFromRawLexer(TheTok); 505 506 if (InPreprocessorDirective) { 507 // If we've hit the end of the file, we're done. 
508 if (TheTok.getKind() == tok::eof) { 509 InPreprocessorDirective = false; 510 break; 511 } 512 513 // If we haven't hit the end of the preprocessor directive, skip this 514 // token. 515 if (!TheTok.isAtStartOfLine()) 516 continue; 517 518 // We've passed the end of the preprocessor directive, and will look 519 // at this token again below. 520 InPreprocessorDirective = false; 521 } 522 523 // Keep track of the # of lines in the preamble. 524 if (TheTok.isAtStartOfLine()) { 525 ++Line; 526 527 // If we were asked to limit the number of lines in the preamble, 528 // and we're about to exceed that limit, we're done. 529 if (MaxLines && Line >= MaxLines) 530 break; 531 } 532 533 // Comments are okay; skip over them. 534 if (TheTok.getKind() == tok::comment) 535 continue; 536 537 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 538 // This is the start of a preprocessor directive. 539 Token HashTok = TheTok; 540 InPreprocessorDirective = true; 541 542 // Figure out which directive this is. Since we're lexing raw tokens, 543 // we don't have an identifier table available. Instead, just look at 544 // the raw identifier to recognize and categorize preprocessor directives. 
545 TheLexer.LexFromRawLexer(TheTok); 546 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 547 StringRef Keyword(TheTok.getRawIdentifierData(), 548 TheTok.getLength()); 549 PreambleDirectiveKind PDK 550 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 551 .Case("include", PDK_Skipped) 552 .Case("__include_macros", PDK_Skipped) 553 .Case("define", PDK_Skipped) 554 .Case("undef", PDK_Skipped) 555 .Case("line", PDK_Skipped) 556 .Case("error", PDK_Skipped) 557 .Case("pragma", PDK_Skipped) 558 .Case("import", PDK_Skipped) 559 .Case("include_next", PDK_Skipped) 560 .Case("warning", PDK_Skipped) 561 .Case("ident", PDK_Skipped) 562 .Case("sccs", PDK_Skipped) 563 .Case("assert", PDK_Skipped) 564 .Case("unassert", PDK_Skipped) 565 .Case("if", PDK_StartIf) 566 .Case("ifdef", PDK_StartIf) 567 .Case("ifndef", PDK_StartIf) 568 .Case("elif", PDK_Skipped) 569 .Case("else", PDK_Skipped) 570 .Case("endif", PDK_EndIf) 571 .Default(PDK_Unknown); 572 573 switch (PDK) { 574 case PDK_Skipped: 575 continue; 576 577 case PDK_StartIf: 578 if (IfCount == 0) 579 IfStartTok = HashTok; 580 581 ++IfCount; 582 continue; 583 584 case PDK_EndIf: 585 // Mismatched #endif. The preamble ends here. 586 if (IfCount == 0) 587 break; 588 589 --IfCount; 590 continue; 591 592 case PDK_Unknown: 593 // We don't know what this directive is; stop at the '#'. 594 break; 595 } 596 } 597 598 // We only end up here if we didn't recognize the preprocessor 599 // directive or it was one that can't occur in the preamble at this 600 // point. Roll back the current token to the location of the '#'. 601 InPreprocessorDirective = false; 602 TheTok = HashTok; 603 } 604 605 // We hit a token that we don't recognize as being in the 606 // "preprocessing only" part of the file, so we're no longer in 607 // the preamble. 608 break; 609 } while (true); 610 611 SourceLocation End = IfCount? 
IfStartTok.getLocation() : TheTok.getLocation(); 612 return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(), 613 IfCount? IfStartTok.isAtStartOfLine() 614 : TheTok.isAtStartOfLine()); 615} 616 617 618/// AdvanceToTokenCharacter - Given a location that specifies the start of a 619/// token, return a new location that specifies a character within the token. 620SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, 621 unsigned CharNo, 622 const SourceManager &SM, 623 const LangOptions &Features) { 624 // Figure out how many physical characters away the specified expansion 625 // character is. This needs to take into consideration newlines and 626 // trigraphs. 627 bool Invalid = false; 628 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 629 630 // If they request the first char of the token, we're trivially done. 631 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 632 return TokStart; 633 634 unsigned PhysOffset = 0; 635 636 // The usual case is that tokens don't contain anything interesting. Skip 637 // over the uninteresting characters. If a token only consists of simple 638 // chars, this method is extremely fast. 639 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 640 if (CharNo == 0) 641 return TokStart.getFileLocWithOffset(PhysOffset); 642 ++TokPtr, --CharNo, ++PhysOffset; 643 } 644 645 // If we have a character that may be a trigraph or escaped newline, use a 646 // lexer to parse it correctly. 647 for (; CharNo; --CharNo) { 648 unsigned Size; 649 Lexer::getCharAndSizeNoWarn(TokPtr, Size, Features); 650 TokPtr += Size; 651 PhysOffset += Size; 652 } 653 654 // Final detail: if we end up on an escaped newline, we want to return the 655 // location of the actual byte of the token. For example foo\<newline>bar 656 // advanced by 3 should return the location of b, not of \\. One compounding 657 // detail of this is that the escape may be made by a trigraph. 
658 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 659 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 660 661 return TokStart.getFileLocWithOffset(PhysOffset); 662} 663 664/// \brief Computes the source location just past the end of the 665/// token at this source location. 666/// 667/// This routine can be used to produce a source location that 668/// points just past the end of the token referenced by \p Loc, and 669/// is generally used when a diagnostic needs to point just after a 670/// token where it expected something different that it received. If 671/// the returned source location would not be meaningful (e.g., if 672/// it points into a macro), this routine returns an invalid 673/// source location. 674/// 675/// \param Offset an offset from the end of the token, where the source 676/// location should refer to. The default offset (0) produces a source 677/// location pointing just past the end of the token; an offset of 1 produces 678/// a source location pointing to the last character in the token, etc. 679SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 680 const SourceManager &SM, 681 const LangOptions &Features) { 682 if (Loc.isInvalid()) 683 return SourceLocation(); 684 685 if (Loc.isMacroID()) { 686 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, Features)) 687 return SourceLocation(); // Points inside the macro expansion. 688 689 // Continue and find the location just after the macro expansion. 690 Loc = SM.getExpansionRange(Loc).second; 691 } 692 693 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, Features); 694 if (Len > Offset) 695 Len = Len - Offset; 696 else 697 return Loc; 698 699 return Loc.getFileLocWithOffset(Len); 700} 701 702/// \brief Returns true if the given MacroID location points at the first 703/// token of the macro expansion. 
704bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 705 const SourceManager &SM, 706 const LangOptions &LangOpts) { 707 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 708 709 std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc); 710 // FIXME: If the token comes from the macro token paste operator ('##') 711 // this function will always return false; 712 if (infoLoc.second > 0) 713 return false; // Does not point at the start of token. 714 715 SourceLocation expansionLoc = 716 SM.getSLocEntry(infoLoc.first) 717 .getInstantiation().getInstantiationLocStart(); 718 if (expansionLoc.isFileID()) 719 return true; // No other macro expansions, this is the first. 720 721 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts); 722} 723 724/// \brief Returns true if the given MacroID location points at the last 725/// token of the macro expansion. 726bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 727 const SourceManager &SM, 728 const LangOptions &LangOpts) { 729 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 730 731 SourceLocation spellLoc = SM.getSpellingLoc(loc); 732 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 733 if (tokLen == 0) 734 return false; 735 736 FileID FID = SM.getFileID(loc); 737 SourceLocation afterLoc = loc.getFileLocWithOffset(tokLen+1); 738 if (!SM.isBeforeInSourceLocationOffset(afterLoc, SM.getNextLocalOffset())) 739 return true; // We got past the last FileID, this points to the last token. 740 741 // FIXME: If the token comes from the macro token paste operator ('##') 742 // or the stringify operator ('#') this function will always return false; 743 if (FID == SM.getFileID(afterLoc)) 744 return false; // Still in the same FileID, does not point to the last token. 745 746 SourceLocation expansionLoc = 747 SM.getSLocEntry(FID).getInstantiation().getInstantiationLocEnd(); 748 if (expansionLoc.isFileID()) 749 return true; // No other macro expansions. 
750 751 return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts); 752} 753 754//===----------------------------------------------------------------------===// 755// Character information. 756//===----------------------------------------------------------------------===// 757 758enum { 759 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 760 CHAR_VERT_WS = 0x02, // '\r', '\n' 761 CHAR_LETTER = 0x04, // a-z,A-Z 762 CHAR_NUMBER = 0x08, // 0-9 763 CHAR_UNDER = 0x10, // _ 764 CHAR_PERIOD = 0x20 // . 765}; 766 767// Statically initialize CharInfo table based on ASCII character set 768// Reference: FreeBSD 7.2 /usr/share/misc/ascii 769static const unsigned char CharInfo[256] = 770{ 771// 0 NUL 1 SOH 2 STX 3 ETX 772// 4 EOT 5 ENQ 6 ACK 7 BEL 773 0 , 0 , 0 , 0 , 774 0 , 0 , 0 , 0 , 775// 8 BS 9 HT 10 NL 11 VT 776//12 NP 13 CR 14 SO 15 SI 777 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, 778 CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , 779//16 DLE 17 DC1 18 DC2 19 DC3 780//20 DC4 21 NAK 22 SYN 23 ETB 781 0 , 0 , 0 , 0 , 782 0 , 0 , 0 , 0 , 783//24 CAN 25 EM 26 SUB 27 ESC 784//28 FS 29 GS 30 RS 31 US 785 0 , 0 , 0 , 0 , 786 0 , 0 , 0 , 0 , 787//32 SP 33 ! 34 " 35 # 788//36 $ 37 % 38 & 39 ' 789 CHAR_HORZ_WS, 0 , 0 , 0 , 790 0 , 0 , 0 , 0 , 791//40 ( 41 ) 42 * 43 + 792//44 , 45 - 46 . 47 / 793 0 , 0 , 0 , 0 , 794 0 , 0 , CHAR_PERIOD , 0 , 795//48 0 49 1 50 2 51 3 796//52 4 53 5 54 6 55 7 797 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 798 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 799//56 8 57 9 58 : 59 ; 800//60 < 61 = 62 > 63 ? 
801 CHAR_NUMBER , CHAR_NUMBER , 0 , 0 , 802 0 , 0 , 0 , 0 , 803//64 @ 65 A 66 B 67 C 804//68 D 69 E 70 F 71 G 805 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 806 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 807//72 H 73 I 74 J 75 K 808//76 L 77 M 78 N 79 O 809 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 810 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 811//80 P 81 Q 82 R 83 S 812//84 T 85 U 86 V 87 W 813 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 814 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 815//88 X 89 Y 90 Z 91 [ 816//92 \ 93 ] 94 ^ 95 _ 817 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , 818 0 , 0 , 0 , CHAR_UNDER , 819//96 ` 97 a 98 b 99 c 820//100 d 101 e 102 f 103 g 821 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 822 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 823//104 h 105 i 106 j 107 k 824//108 l 109 m 110 n 111 o 825 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 826 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 827//112 p 113 q 114 r 115 s 828//116 t 117 u 118 v 119 w 829 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 830 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 831//120 x 121 y 122 z 123 { 832//124 | 125 } 126 ~ 127 DEL 833 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , 834 0 , 0 , 0 , 0 835}; 836 837static void InitCharacterInfo() { 838 static bool isInited = false; 839 if (isInited) return; 840 // check the statically-initialized CharInfo table 841 assert(CHAR_HORZ_WS == CharInfo[(int)' ']); 842 assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); 843 assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); 844 assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); 845 assert(CHAR_VERT_WS == CharInfo[(int)'\n']); 846 assert(CHAR_VERT_WS == CharInfo[(int)'\r']); 847 assert(CHAR_UNDER == CharInfo[(int)'_']); 848 assert(CHAR_PERIOD == CharInfo[(int)'.']); 849 for (unsigned i = 'a'; i <= 'z'; ++i) { 850 assert(CHAR_LETTER == CharInfo[i]); 851 assert(CHAR_LETTER == 
CharInfo[i+'A'-'a']); 852 } 853 for (unsigned i = '0'; i <= '9'; ++i) 854 assert(CHAR_NUMBER == CharInfo[i]); 855 856 isInited = true; 857} 858 859 860/// isIdentifierBody - Return true if this is the body character of an 861/// identifier, which is [a-zA-Z0-9_]. 862static inline bool isIdentifierBody(unsigned char c) { 863 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 864} 865 866/// isHorizontalWhitespace - Return true if this character is horizontal 867/// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. 868static inline bool isHorizontalWhitespace(unsigned char c) { 869 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 870} 871 872/// isWhitespace - Return true if this character is horizontal or vertical 873/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false 874/// for '\0'. 875static inline bool isWhitespace(unsigned char c) { 876 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 877} 878 879/// isNumberBody - Return true if this is the body character of an 880/// preprocessing number, which is [a-zA-Z0-9_.]. 881static inline bool isNumberBody(unsigned char c) { 882 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 883 true : false; 884} 885 886 887//===----------------------------------------------------------------------===// 888// Diagnostics forwarding code. 889//===----------------------------------------------------------------------===// 890 891/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 892/// lexer buffer was all expanded at a single point, perform the mapping. 893/// This is currently only used for _Pragma implementation, so it is the slow 894/// path of the hot getSourceLocation method. Do not allow it to be inlined. 
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  std::pair<SourceLocation,SourceLocation> II =
    SM.getImmediateExpansionRange(FileLoc);

  return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getFileLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics.  This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.  A null L (raw lexer) suppresses all
/// diagnostics but still decodes the character.
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res || !L) return Res;

  if (!L->getFeatures().Trigraphs) {
    if (!L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0;
  }

  if (!L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline.  P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    // Keep scanning until we hit the newline itself; GCC allows (and we
    // accept) horizontal whitespace between the backslash and the newline.
    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (1) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}


/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr += EscapedNewLineSize;
      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // ??/ decodes to '\\', which may itself begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
1096 ++Size; 1097 return *Ptr; 1098} 1099 1100 1101/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 1102/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 1103/// and that we have already incremented Ptr by Size bytes. 1104/// 1105/// NOTE: When this method is updated, getCharAndSizeSlow (above) should 1106/// be updated to match. 1107char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 1108 const LangOptions &Features) { 1109 // If we have a slash, look for an escaped newline. 1110 if (Ptr[0] == '\\') { 1111 ++Size; 1112 ++Ptr; 1113Slash: 1114 // Common case, backslash-char where the char is not whitespace. 1115 if (!isWhitespace(Ptr[0])) return '\\'; 1116 1117 // See if we have optional whitespace characters followed by a newline. 1118 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1119 // Found backslash<whitespace><newline>. Parse the char after it. 1120 Size += EscapedNewLineSize; 1121 Ptr += EscapedNewLineSize; 1122 1123 // Use slow version to accumulate a correct size field. 1124 return getCharAndSizeSlowNoWarn(Ptr, Size, Features); 1125 } 1126 1127 // Otherwise, this is not an escaped newline, just return the slash. 1128 return '\\'; 1129 } 1130 1131 // If this is a trigraph, process it. 1132 if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1133 // If this is actually a legal trigraph (not something like "??x"), return 1134 // it. 1135 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1136 Ptr += 3; 1137 Size += 3; 1138 if (C == '\\') goto Slash; 1139 return C; 1140 } 1141 } 1142 1143 // If this is neither, return a single character. 1144 ++Size; 1145 return *Ptr; 1146} 1147 1148//===----------------------------------------------------------------------===// 1149// Helper methods for lexing. 1150//===----------------------------------------------------------------------===// 1151 1152/// \brief Routine that indiscriminately skips bytes in the source file. 
void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
  BufferPtr += Bytes;
  // Clamp to the end of the buffer so we never scan past it.
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  IsAtStartOfLine = StartOfLine;
}

/// LexIdentifier - Lex the remainder of an identifier token.  On entry the
/// first character ([_A-Za-z$]) has already been consumed; CurPtr points just
/// past it.  In raw mode the token is returned as a raw_identifier; otherwise
/// it is looked up in the identifier table and handed to the preprocessor.
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  // FIXME: UCNs.
  //
  // TODO: Could merge these checks into a CharInfo flag to make the comparison
  // cheaper
  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      PP->HandleIdentifier(Result);
    return;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path that decodes
  // trigraphs and escaped newlines via getCharAndSize.

  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!Features.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
      // Found end of identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) { // FIXME: UCNs.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}

/// isHexaLiteral - Return true if Start points to a hex constant.
/// Used in Microsoft mode (where this is supposed to be several different
/// tokens).
static bool isHexaLiteral(const char *Start, const LangOptions &Features) {
  unsigned Size;
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, Features);
  if (C1 != '0')
    return false;
  // Size now holds the width of the first char; peek at the one after it.
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, Features);
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant.  From[-1] is the first character lexed.  Return the end of the
/// constant.
void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isNumberBody(C)) { // FIXME: UCNs?
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!Features.Microsoft || !isHexaLiteral(BufferPtr, Features))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue (p/P exponent, e.g. 0x1.8p3).
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
      !Features.CPlusPlus0x)
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L".
void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.  Escaped newlines will already be processed by
    // getAndAdvanceChar.
    if (C == '\\')
      C = getAndAdvanceChar(CurPtr, Result);

    if (C == '\n' || C == '\r' ||             // Newline.
        (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // An unterminated string: diagnose (unless this is the code-completion
      // file or assembler-with-cpp mode) and return an 'unknown' token.
      if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
        PP->CodeCompleteNaturalLanguage();
      else if (!isLexingRawMode() && !Features.AsmPreprocessor)
        Diag(BufferPtr, diag::warn_unterminated_string);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    }

    if (C == 0)
      NulCharacter = CurPtr-1;
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of the token as well as the BufferPtr instance var.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr,
                     Wide ? tok::wide_string_literal : tok::string_literal);
  Result.setLiteralData(TokStart);
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?
  const char *AfterLessPos = CurPtr;
  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // If the filename is unterminated, then it must just be a lone <
      // character.  Return this as such.
      FormTokenWithChars(Result, AfterLessPos, tok::less);
      return;
    } else if (C == 0) {
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
  Result.setLiteralData(TokStart);
}


/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L'.
void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this character contain the \0 character?
  char C = getAndAdvanceChar(CurPtr, Result);
  if (C == '\'') {
    // '' is not a valid character constant; diagnose and return 'unknown'.
    if (!isLexingRawMode() && !Features.AsmPreprocessor)
      Diag(BufferPtr, diag::err_empty_character);
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return;
  }

  while (C != '\'') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      // FIXME: UCN's
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      // Unterminated character constant: diagnose (unless code-completion
      // file or assembler-with-cpp mode) and return an 'unknown' token.
      if (C == 0 && PP && PP->isCodeCompletionFile(FileLoc))
        PP->CodeCompleteNaturalLanguage();
      else if (!isLexingRawMode() && !Features.AsmPreprocessor)
        Diag(BufferPtr, diag::warn_unterminated_char);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    } else if (C == 0) {
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the character, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::char_constant);
  Result.setLiteralData(TokStart);
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
  // Whitespace - Skip it, then return the token after the whitespace.
  unsigned char Char = *CurPtr;  // Skip consecutive spaces efficiently.
  while (1) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (Char != '\n' && Char != '\r')
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // ok, but handle newline.
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);
    Char = *++CurPtr;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  if (PrevChar != '\n' && PrevChar != '\r')
    Result.setFlag(Token::LeadingSpace);

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// SkipBCPLComment - We have just read the // characters from input.  Skip
/// until we find the newline character that terminates the comment.  Then
/// update BufferPtr and return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
  // If BCPL comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!Features.BCPLComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_bcpl_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    Features.BCPLComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  char C;
  do {
    C = *CurPtr;
    // FIXME: Speedup BCPL comment lexing.  Just scan for a \n or \r character.
    // If we find a \n character, scan backwards, checking to see if it's an
    // escaped newline, like we do for block comments.

    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\\' &&             // Potentially escaped newline.
           C != '?' &&              // Potentially trigraph.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    // If this is a newline, we're done.
    if (C == '\n' || C == '\r')
      break;  // Found the newline? Break out!

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If the char that we finally got was a \n, then we must have had something
    // like \<newline><newline>.  We don't want to have consumed the second
    // newline, we want CurPtr, to end up pointing to it down below.
    if (C == '\n' || C == '\r') {
      --CurPtr;
      C = 'x'; // doesn't matter what this is.
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isspace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isspace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
          break;
        }
    }

    // getAndAdvanceChar walked one past BufferEnd: end of file reached.
    if (CurPtr == BufferEnd+1) {
      if (PP && PP->isCodeCompletionFile(FileLoc))
        PP->CodeCompleteNaturalLanguage();

      --CurPtr;
      break;
    }
  } while (C != '\n' && C != '\r');

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveBCPLComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
/// an appropriate way and return it.
bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective)
    return true;

  // If this BCPL-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(&Spelling[0], Spelling.size(), Result,
                   Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \n or \r) is part of an escaped newline sequence.  Issue a
/// diagnostic if so.  We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
1596 --CurPtr; 1597 } 1598 1599 // If we have horizontal whitespace, skip over it. We allow whitespace 1600 // between the slash and newline. 1601 bool HasSpace = false; 1602 while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) { 1603 --CurPtr; 1604 HasSpace = true; 1605 } 1606 1607 // If we have a slash, we know this is an escaped newline. 1608 if (*CurPtr == '\\') { 1609 if (CurPtr[-1] != '*') return false; 1610 } else { 1611 // It isn't a slash, is it the ?? / trigraph? 1612 if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' || 1613 CurPtr[-3] != '*') 1614 return false; 1615 1616 // This is the trigraph ending the comment. Emit a stern warning! 1617 CurPtr -= 2; 1618 1619 // If no trigraphs are enabled, warn that we ignored this trigraph and 1620 // ignore this * character. 1621 if (!L->getFeatures().Trigraphs) { 1622 if (!L->isLexingRawMode()) 1623 L->Diag(CurPtr, diag::trigraph_ignored_block_comment); 1624 return false; 1625 } 1626 if (!L->isLexingRawMode()) 1627 L->Diag(CurPtr, diag::trigraph_ends_block_comment); 1628 } 1629 1630 // Warn about having an escaped newline between the */ characters. 1631 if (!L->isLexingRawMode()) 1632 L->Diag(CurPtr, diag::escaped_newline_block_comment_end); 1633 1634 // If there was space between the backslash and newline, warn about it. 1635 if (HasSpace && !L->isLexingRawMode()) 1636 L->Diag(CurPtr, diag::backslash_newline_space); 1637 1638 return true; 1639} 1640 1641#ifdef __SSE2__ 1642#include <emmintrin.h> 1643#elif __ALTIVEC__ 1644#include <altivec.h> 1645#undef bool 1646#endif 1647 1648/// SkipBlockComment - We have just read the /* characters from input. Read 1649/// until we find the */ characters that terminate the comment. Note that we 1650/// don't bother decoding trigraphs or escaped newlines in block comments, 1651/// because they cannot cause the comment to end. The only thing that can 1652/// happen is the comment could end with an escaped newline between the */ end 1653/// of comment. 
1654/// 1655/// If we're in KeepCommentMode or any CommentHandler has inserted 1656/// some tokens, this will store the first token and return true. 1657bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) { 1658 // Scan one character past where we should, looking for a '/' character. Once 1659 // we find it, check to see if it was preceded by a *. This common 1660 // optimization helps people who like to put a lot of * characters in their 1661 // comments. 1662 1663 // The first character we get with newlines and trigraphs skipped to handle 1664 // the degenerate /*/ case below correctly if the * has an escaped newline 1665 // after it. 1666 unsigned CharSize; 1667 unsigned char C = getCharAndSize(CurPtr, CharSize); 1668 CurPtr += CharSize; 1669 if (C == 0 && CurPtr == BufferEnd+1) { 1670 if (!isLexingRawMode() && 1671 !PP->isCodeCompletionFile(FileLoc)) 1672 Diag(BufferPtr, diag::err_unterminated_block_comment); 1673 --CurPtr; 1674 1675 // KeepWhitespaceMode should return this broken comment as a token. Since 1676 // it isn't a well formed comment, just return it as an 'unknown' token. 1677 if (isKeepWhitespaceMode()) { 1678 FormTokenWithChars(Result, CurPtr, tok::unknown); 1679 return true; 1680 } 1681 1682 BufferPtr = CurPtr; 1683 return false; 1684 } 1685 1686 // Check to see if the first character after the '/*' is another /. If so, 1687 // then this slash does not end the block comment, it is part of it. 1688 if (C == '/') 1689 C = *CurPtr++; 1690 1691 while (1) { 1692 // Skip over all non-interesting characters until we find end of buffer or a 1693 // (probably ending) '/' character. 1694 if (CurPtr + 24 < BufferEnd) { 1695 // While not aligned to a 16-byte boundary. 
1696 while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0) 1697 C = *CurPtr++; 1698 1699 if (C == '/') goto FoundSlash; 1700 1701#ifdef __SSE2__ 1702 __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/', 1703 '/', '/', '/', '/', '/', '/', '/', '/'); 1704 while (CurPtr+16 <= BufferEnd && 1705 _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0) 1706 CurPtr += 16; 1707#elif __ALTIVEC__ 1708 __vector unsigned char Slashes = { 1709 '/', '/', '/', '/', '/', '/', '/', '/', 1710 '/', '/', '/', '/', '/', '/', '/', '/' 1711 }; 1712 while (CurPtr+16 <= BufferEnd && 1713 !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes)) 1714 CurPtr += 16; 1715#else 1716 // Scan for '/' quickly. Many block comments are very large. 1717 while (CurPtr[0] != '/' && 1718 CurPtr[1] != '/' && 1719 CurPtr[2] != '/' && 1720 CurPtr[3] != '/' && 1721 CurPtr+4 < BufferEnd) { 1722 CurPtr += 4; 1723 } 1724#endif 1725 1726 // It has to be one of the bytes scanned, increment to it and read one. 1727 C = *CurPtr++; 1728 } 1729 1730 // Loop to scan the remainder. 1731 while (C != '/' && C != '\0') 1732 C = *CurPtr++; 1733 1734 FoundSlash: 1735 if (C == '/') { 1736 if (CurPtr[-2] == '*') // We found the final */. We're done! 1737 break; 1738 1739 if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) { 1740 if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) { 1741 // We found the final */, though it had an escaped newline between the 1742 // * and /. We're done! 1743 break; 1744 } 1745 } 1746 if (CurPtr[0] == '*' && CurPtr[1] != '/') { 1747 // If this is a /* inside of the comment, emit a warning. Don't do this 1748 // if this is a /*/, which will end the comment. This misses cases with 1749 // embedded escaped newlines, but oh well. 
1750 if (!isLexingRawMode()) 1751 Diag(CurPtr-1, diag::warn_nested_block_comment); 1752 } 1753 } else if (C == 0 && CurPtr == BufferEnd+1) { 1754 if (PP && PP->isCodeCompletionFile(FileLoc)) 1755 PP->CodeCompleteNaturalLanguage(); 1756 else if (!isLexingRawMode()) 1757 Diag(BufferPtr, diag::err_unterminated_block_comment); 1758 // Note: the user probably forgot a */. We could continue immediately 1759 // after the /*, but this would involve lexing a lot of what really is the 1760 // comment, which surely would confuse the parser. 1761 --CurPtr; 1762 1763 // KeepWhitespaceMode should return this broken comment as a token. Since 1764 // it isn't a well formed comment, just return it as an 'unknown' token. 1765 if (isKeepWhitespaceMode()) { 1766 FormTokenWithChars(Result, CurPtr, tok::unknown); 1767 return true; 1768 } 1769 1770 BufferPtr = CurPtr; 1771 return false; 1772 } 1773 C = *CurPtr++; 1774 } 1775 1776 // Notify comment handlers about the comment unless we're in a #if 0 block. 1777 if (PP && !isLexingRawMode() && 1778 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 1779 getSourceLocation(CurPtr)))) { 1780 BufferPtr = CurPtr; 1781 return true; // A token has to be returned. 1782 } 1783 1784 // If we are returning comments as tokens, return this comment as a token. 1785 if (inKeepCommentMode()) { 1786 FormTokenWithChars(Result, CurPtr, tok::comment); 1787 return true; 1788 } 1789 1790 // It is common for the tokens immediately after a /**/ comment to be 1791 // whitespace. Instead of going through the big switch, handle it 1792 // efficiently now. This is safe even in KeepWhitespaceMode because we would 1793 // have already returned above with the comment as a token. 1794 if (isHorizontalWhitespace(*CurPtr)) { 1795 Result.setFlag(Token::LeadingSpace); 1796 SkipWhitespace(Result, CurPtr+1); 1797 return false; 1798 } 1799 1800 // Otherwise, just return so that the next character will be lexed as a token. 
1801 BufferPtr = CurPtr; 1802 Result.setFlag(Token::LeadingSpace); 1803 return false; 1804} 1805 1806//===----------------------------------------------------------------------===// 1807// Primary Lexing Entry Points 1808//===----------------------------------------------------------------------===// 1809 1810/// ReadToEndOfLine - Read the rest of the current preprocessor line as an 1811/// uninterpreted string. This switches the lexer out of directive mode. 1812std::string Lexer::ReadToEndOfLine() { 1813 assert(ParsingPreprocessorDirective && ParsingFilename == false && 1814 "Must be in a preprocessing directive!"); 1815 std::string Result; 1816 Token Tmp; 1817 1818 // CurPtr - Cache BufferPtr in an automatic variable. 1819 const char *CurPtr = BufferPtr; 1820 while (1) { 1821 char Char = getAndAdvanceChar(CurPtr, Tmp); 1822 switch (Char) { 1823 default: 1824 Result += Char; 1825 break; 1826 case 0: // Null. 1827 // Found end of file? 1828 if (CurPtr-1 != BufferEnd) { 1829 // Nope, normal character, continue. 1830 Result += Char; 1831 break; 1832 } 1833 // FALL THROUGH. 1834 case '\r': 1835 case '\n': 1836 // Okay, we found the end of the line. First, back up past the \0, \r, \n. 1837 assert(CurPtr[-1] == Char && "Trigraphs for newline?"); 1838 BufferPtr = CurPtr-1; 1839 1840 // Next, lex the character, which should handle the EOD transition. 1841 Lex(Tmp); 1842 if (Tmp.is(tok::code_completion)) { 1843 if (PP && PP->getCodeCompletionHandler()) 1844 PP->getCodeCompletionHandler()->CodeCompleteNaturalLanguage(); 1845 Lex(Tmp); 1846 } 1847 assert(Tmp.is(tok::eod) && "Unexpected token!"); 1848 1849 // Finally, we're done, return the string we found. 1850 return Result; 1851 } 1852 } 1853} 1854 1855/// LexEndOfFile - CurPtr points to the end of this file. Handle this 1856/// condition, reporting diagnostics and handling other edge cases as required. 1857/// This returns true if Result contains a token, false if PP.Lex should be 1858/// called again. 
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // Note: in the final (non-raw, non-directive) case this forwards to
  // PP->HandleEndOfFile, which may pop the include stack and delete this
  // lexer -- callers cache the PP pointer before calling us for that reason.

  // Check if we are performing code completion.
  if (PP && PP->isCodeCompletionFile(FileLoc)) {
    // We're at the end of the file, but we've been asked to consider the
    // end of the file to be a code-completion token. Return the
    // code-completion token.
    Result.startToken();
    FormTokenWithChars(Result, CurPtr, tok::code_completion);

    // Only do the eof -> code_completion translation once.
    PP->SetCodeCompletionPoint(0, 0, 0);

    // Silence any diagnostics that occur once we hit the code-completion point.
    PP->getDiagnostics().setSuppressAllDiagnostics(true);
    return true;
  }

  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    SetCommentRetentionState(PP->getCommentRetentionState());
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error for each unterminated
  // conditional still on the stack.
  while (!ConditionalStack.empty()) {
    if (!PP->isCodeCompletionFile(FileLoc))
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
    Diag(BufferEnd, diag::ext_no_newline_eof)
      << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result);
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;
  bool inPPDirectiveMode = ParsingPreprocessorDirective;

  Token Tok;
  Tok.startToken();
  LexTokenInternal(Tok);

  // Restore state that may have changed: this is a lookahead, so the lexer
  // must appear untouched to the caller.
  BufferPtr = TmpBufferPtr;
  ParsingPreprocessorDirective = inPPDirectiveMode;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}

/// FindConflictEnd - Find the end of a version control conflict marker.
1954static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd) { 1955 StringRef RestOfBuffer(CurPtr+7, BufferEnd-CurPtr-7); 1956 size_t Pos = RestOfBuffer.find(">>>>>>>"); 1957 while (Pos != StringRef::npos) { 1958 // Must occur at start of line. 1959 if (RestOfBuffer[Pos-1] != '\r' && 1960 RestOfBuffer[Pos-1] != '\n') { 1961 RestOfBuffer = RestOfBuffer.substr(Pos+7); 1962 Pos = RestOfBuffer.find(">>>>>>>"); 1963 continue; 1964 } 1965 return RestOfBuffer.data()+Pos; 1966 } 1967 return 0; 1968} 1969 1970/// IsStartOfConflictMarker - If the specified pointer is the start of a version 1971/// control conflict marker like '<<<<<<<', recognize it as such, emit an error 1972/// and recover nicely. This returns true if it is a conflict marker and false 1973/// if not. 1974bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 1975 // Only a conflict marker if it starts at the beginning of a line. 1976 if (CurPtr != BufferStart && 1977 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 1978 return false; 1979 1980 // Check to see if we have <<<<<<<. 1981 if (BufferEnd-CurPtr < 8 || 1982 StringRef(CurPtr, 7) != "<<<<<<<") 1983 return false; 1984 1985 // If we have a situation where we don't care about conflict markers, ignore 1986 // it. 1987 if (IsInConflictMarker || isLexingRawMode()) 1988 return false; 1989 1990 // Check to see if there is a >>>>>>> somewhere in the buffer at the start of 1991 // a line to terminate this conflict marker. 1992 if (FindConflictEnd(CurPtr, BufferEnd)) { 1993 // We found a match. We are really in a conflict marker. 1994 // Diagnose this, and ignore to the end of line. 1995 Diag(CurPtr, diag::err_conflict_marker); 1996 IsInConflictMarker = true; 1997 1998 // Skip ahead to the end of line. We know this exists because the 1999 // end-of-conflict marker starts with \r or \n. 
2000 while (*CurPtr != '\r' && *CurPtr != '\n') { 2001 assert(CurPtr != BufferEnd && "Didn't find end of line"); 2002 ++CurPtr; 2003 } 2004 BufferPtr = CurPtr; 2005 return true; 2006 } 2007 2008 // No end of conflict marker found. 2009 return false; 2010} 2011 2012 2013/// HandleEndOfConflictMarker - If this is a '=======' or '|||||||' or '>>>>>>>' 2014/// marker, then it is the end of a conflict marker. Handle it by ignoring up 2015/// until the end of the line. This returns true if it is a conflict marker and 2016/// false if not. 2017bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2018 // Only a conflict marker if it starts at the beginning of a line. 2019 if (CurPtr != BufferStart && 2020 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2021 return false; 2022 2023 // If we have a situation where we don't care about conflict markers, ignore 2024 // it. 2025 if (!IsInConflictMarker || isLexingRawMode()) 2026 return false; 2027 2028 // Check to see if we have the marker (7 characters in a row). 2029 for (unsigned i = 1; i != 7; ++i) 2030 if (CurPtr[i] != CurPtr[0]) 2031 return false; 2032 2033 // If we do have it, search for the end of the conflict marker. This could 2034 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2035 // be the end of conflict marker. 2036 if (const char *End = FindConflictEnd(CurPtr, BufferEnd)) { 2037 CurPtr = End; 2038 2039 // Skip ahead to the end of line. 2040 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2041 ++CurPtr; 2042 2043 BufferPtr = CurPtr; 2044 2045 // No longer in the conflict marker. 2046 IsInConflictMarker = false; 2047 return true; 2048 } 2049 2050 return false; 2051} 2052 2053 2054/// LexTokenInternal - This implements a simple C family lexer. It is an 2055/// extremely performance critical piece of code. This assumes that the buffer 2056/// has a null character at the end of the file. 
This returns a preprocessing 2057/// token, not a normal token, as such, it is an internal interface. It assumes 2058/// that the Flags of result have been cleared before calling this. 2059void Lexer::LexTokenInternal(Token &Result) { 2060LexNextToken: 2061 // New token, can't need cleaning yet. 2062 Result.clearFlag(Token::NeedsCleaning); 2063 Result.setIdentifierInfo(0); 2064 2065 // CurPtr - Cache BufferPtr in an automatic variable. 2066 const char *CurPtr = BufferPtr; 2067 2068 // Small amounts of horizontal whitespace is very common between tokens. 2069 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 2070 ++CurPtr; 2071 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 2072 ++CurPtr; 2073 2074 // If we are keeping whitespace and other tokens, just return what we just 2075 // skipped. The next lexer invocation will return the token after the 2076 // whitespace. 2077 if (isKeepWhitespaceMode()) { 2078 FormTokenWithChars(Result, CurPtr, tok::unknown); 2079 return; 2080 } 2081 2082 BufferPtr = CurPtr; 2083 Result.setFlag(Token::LeadingSpace); 2084 } 2085 2086 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 2087 2088 // Read a character, advancing over it. 2089 char Char = getAndAdvanceChar(CurPtr, Result); 2090 tok::TokenKind Kind; 2091 2092 switch (Char) { 2093 case 0: // Null. 2094 // Found end of file? 2095 if (CurPtr-1 == BufferEnd) { 2096 // Read the PP instance variable into an automatic variable, because 2097 // LexEndOfFile will often delete 'this'. 2098 Preprocessor *PPCache = PP; 2099 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 2100 return; // Got a token to return. 
2101 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 2102 return PPCache->Lex(Result); 2103 } 2104 2105 if (!isLexingRawMode()) 2106 Diag(CurPtr-1, diag::null_in_file); 2107 Result.setFlag(Token::LeadingSpace); 2108 if (SkipWhitespace(Result, CurPtr)) 2109 return; // KeepWhitespaceMode 2110 2111 goto LexNextToken; // GCC isn't tail call eliminating. 2112 2113 case 26: // DOS & CP/M EOF: "^Z". 2114 // If we're in Microsoft extensions mode, treat this as end of file. 2115 if (Features.Microsoft) { 2116 // Read the PP instance variable into an automatic variable, because 2117 // LexEndOfFile will often delete 'this'. 2118 Preprocessor *PPCache = PP; 2119 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 2120 return; // Got a token to return. 2121 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 2122 return PPCache->Lex(Result); 2123 } 2124 // If Microsoft extensions are disabled, this is just random garbage. 2125 Kind = tok::unknown; 2126 break; 2127 2128 case '\n': 2129 case '\r': 2130 // If we are inside a preprocessor directive and we see the end of line, 2131 // we know we are done with the directive, so return an EOD token. 2132 if (ParsingPreprocessorDirective) { 2133 // Done parsing the "line". 2134 ParsingPreprocessorDirective = false; 2135 2136 // Restore comment saving mode, in case it was disabled for directive. 2137 SetCommentRetentionState(PP->getCommentRetentionState()); 2138 2139 // Since we consumed a newline, we are back at the start of a line. 2140 IsAtStartOfLine = true; 2141 2142 Kind = tok::eod; 2143 break; 2144 } 2145 // The returned token is at the start of the line. 2146 Result.setFlag(Token::StartOfLine); 2147 // No leading whitespace seen so far. 2148 Result.clearFlag(Token::LeadingSpace); 2149 2150 if (SkipWhitespace(Result, CurPtr)) 2151 return; // KeepWhitespaceMode 2152 goto LexNextToken; // GCC isn't tail call eliminating. 
2153 case ' ': 2154 case '\t': 2155 case '\f': 2156 case '\v': 2157 SkipHorizontalWhitespace: 2158 Result.setFlag(Token::LeadingSpace); 2159 if (SkipWhitespace(Result, CurPtr)) 2160 return; // KeepWhitespaceMode 2161 2162 SkipIgnoredUnits: 2163 CurPtr = BufferPtr; 2164 2165 // If the next token is obviously a // or /* */ comment, skip it efficiently 2166 // too (without going through the big switch stmt). 2167 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 2168 Features.BCPLComment && !Features.TraditionalCPP) { 2169 if (SkipBCPLComment(Result, CurPtr+2)) 2170 return; // There is a token to return. 2171 goto SkipIgnoredUnits; 2172 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 2173 if (SkipBlockComment(Result, CurPtr+2)) 2174 return; // There is a token to return. 2175 goto SkipIgnoredUnits; 2176 } else if (isHorizontalWhitespace(*CurPtr)) { 2177 goto SkipHorizontalWhitespace; 2178 } 2179 goto LexNextToken; // GCC isn't tail call eliminating. 2180 2181 // C99 6.4.4.1: Integer Constants. 2182 // C99 6.4.4.2: Floating Constants. 2183 case '0': case '1': case '2': case '3': case '4': 2184 case '5': case '6': case '7': case '8': case '9': 2185 // Notify MIOpt that we read a non-whitespace/non-comment token. 2186 MIOpt.ReadToken(); 2187 return LexNumericConstant(Result, CurPtr); 2188 2189 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 2190 // Notify MIOpt that we read a non-whitespace/non-comment token. 2191 MIOpt.ReadToken(); 2192 Char = getCharAndSize(CurPtr, SizeTmp); 2193 2194 // Wide string literal. 2195 if (Char == '"') 2196 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 2197 true); 2198 2199 // Wide character constant. 2200 if (Char == '\'') 2201 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 2202 // FALL THROUGH, treating L like the start of an identifier. 2203 2204 // C99 6.4.2: Identifiers. 
2205 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 2206 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 2207 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 2208 case 'V': case 'W': case 'X': case 'Y': case 'Z': 2209 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 2210 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 2211 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 2212 case 'v': case 'w': case 'x': case 'y': case 'z': 2213 case '_': 2214 // Notify MIOpt that we read a non-whitespace/non-comment token. 2215 MIOpt.ReadToken(); 2216 return LexIdentifier(Result, CurPtr); 2217 2218 case '$': // $ in identifiers. 2219 if (Features.DollarIdents) { 2220 if (!isLexingRawMode()) 2221 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 2222 // Notify MIOpt that we read a non-whitespace/non-comment token. 2223 MIOpt.ReadToken(); 2224 return LexIdentifier(Result, CurPtr); 2225 } 2226 2227 Kind = tok::unknown; 2228 break; 2229 2230 // C99 6.4.4: Character Constants. 2231 case '\'': 2232 // Notify MIOpt that we read a non-whitespace/non-comment token. 2233 MIOpt.ReadToken(); 2234 return LexCharConstant(Result, CurPtr); 2235 2236 // C99 6.4.5: String Literals. 2237 case '"': 2238 // Notify MIOpt that we read a non-whitespace/non-comment token. 2239 MIOpt.ReadToken(); 2240 return LexStringLiteral(Result, CurPtr, false); 2241 2242 // C99 6.4.6: Punctuators. 
2243 case '?': 2244 Kind = tok::question; 2245 break; 2246 case '[': 2247 Kind = tok::l_square; 2248 break; 2249 case ']': 2250 Kind = tok::r_square; 2251 break; 2252 case '(': 2253 Kind = tok::l_paren; 2254 break; 2255 case ')': 2256 Kind = tok::r_paren; 2257 break; 2258 case '{': 2259 Kind = tok::l_brace; 2260 break; 2261 case '}': 2262 Kind = tok::r_brace; 2263 break; 2264 case '.': 2265 Char = getCharAndSize(CurPtr, SizeTmp); 2266 if (Char >= '0' && Char <= '9') { 2267 // Notify MIOpt that we read a non-whitespace/non-comment token. 2268 MIOpt.ReadToken(); 2269 2270 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 2271 } else if (Features.CPlusPlus && Char == '*') { 2272 Kind = tok::periodstar; 2273 CurPtr += SizeTmp; 2274 } else if (Char == '.' && 2275 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 2276 Kind = tok::ellipsis; 2277 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2278 SizeTmp2, Result); 2279 } else { 2280 Kind = tok::period; 2281 } 2282 break; 2283 case '&': 2284 Char = getCharAndSize(CurPtr, SizeTmp); 2285 if (Char == '&') { 2286 Kind = tok::ampamp; 2287 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2288 } else if (Char == '=') { 2289 Kind = tok::ampequal; 2290 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2291 } else { 2292 Kind = tok::amp; 2293 } 2294 break; 2295 case '*': 2296 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2297 Kind = tok::starequal; 2298 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2299 } else { 2300 Kind = tok::star; 2301 } 2302 break; 2303 case '+': 2304 Char = getCharAndSize(CurPtr, SizeTmp); 2305 if (Char == '+') { 2306 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2307 Kind = tok::plusplus; 2308 } else if (Char == '=') { 2309 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2310 Kind = tok::plusequal; 2311 } else { 2312 Kind = tok::plus; 2313 } 2314 break; 2315 case '-': 2316 Char = getCharAndSize(CurPtr, SizeTmp); 2317 if (Char == '-') { // -- 2318 CurPtr = ConsumeChar(CurPtr, 
SizeTmp, Result); 2319 Kind = tok::minusminus; 2320 } else if (Char == '>' && Features.CPlusPlus && 2321 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 2322 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2323 SizeTmp2, Result); 2324 Kind = tok::arrowstar; 2325 } else if (Char == '>') { // -> 2326 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2327 Kind = tok::arrow; 2328 } else if (Char == '=') { // -= 2329 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2330 Kind = tok::minusequal; 2331 } else { 2332 Kind = tok::minus; 2333 } 2334 break; 2335 case '~': 2336 Kind = tok::tilde; 2337 break; 2338 case '!': 2339 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 2340 Kind = tok::exclaimequal; 2341 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2342 } else { 2343 Kind = tok::exclaim; 2344 } 2345 break; 2346 case '/': 2347 // 6.4.9: Comments 2348 Char = getCharAndSize(CurPtr, SizeTmp); 2349 if (Char == '/') { // BCPL comment. 2350 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 2351 // want to lex this as a comment. There is one problem with this though, 2352 // that in one particular corner case, this can change the behavior of the 2353 // resultant program. For example, In "foo //**/ bar", C89 would lex 2354 // this as "foo / bar" and langauges with BCPL comments would lex it as 2355 // "foo". Check to see if the character after the second slash is a '*'. 2356 // If so, we will lex that as a "/" instead of the start of a comment. 2357 // However, we never do this in -traditional-cpp mode. 2358 if ((Features.BCPLComment || 2359 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') && 2360 !Features.TraditionalCPP) { 2361 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2362 return; // There is a token to return. 2363 2364 // It is common for the tokens immediately after a // comment to be 2365 // whitespace (indentation for the next line). Instead of going through 2366 // the big switch, handle it efficiently now. 
2367 goto SkipIgnoredUnits; 2368 } 2369 } 2370 2371 if (Char == '*') { // /**/ comment. 2372 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 2373 return; // There is a token to return. 2374 goto LexNextToken; // GCC isn't tail call eliminating. 2375 } 2376 2377 if (Char == '=') { 2378 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2379 Kind = tok::slashequal; 2380 } else { 2381 Kind = tok::slash; 2382 } 2383 break; 2384 case '%': 2385 Char = getCharAndSize(CurPtr, SizeTmp); 2386 if (Char == '=') { 2387 Kind = tok::percentequal; 2388 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2389 } else if (Features.Digraphs && Char == '>') { 2390 Kind = tok::r_brace; // '%>' -> '}' 2391 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2392 } else if (Features.Digraphs && Char == ':') { 2393 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2394 Char = getCharAndSize(CurPtr, SizeTmp); 2395 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 2396 Kind = tok::hashhash; // '%:%:' -> '##' 2397 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2398 SizeTmp2, Result); 2399 } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize 2400 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2401 if (!isLexingRawMode()) 2402 Diag(BufferPtr, diag::charize_microsoft_ext); 2403 Kind = tok::hashat; 2404 } else { // '%:' -> '#' 2405 // We parsed a # character. If this occurs at the start of the line, 2406 // it's actually the start of a preprocessing directive. Callback to 2407 // the preprocessor to handle it. 2408 // FIXME: -fpreprocessed mode?? 2409 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 2410 FormTokenWithChars(Result, CurPtr, tok::hash); 2411 PP->HandleDirective(Result); 2412 2413 // As an optimization, if the preprocessor didn't switch lexers, tail 2414 // recurse. 2415 if (PP->isCurrentLexer(this)) { 2416 // Start a new token. 
If this is a #include or something, the PP may 2417 // want us starting at the beginning of the line again. If so, set 2418 // the StartOfLine flag and clear LeadingSpace. 2419 if (IsAtStartOfLine) { 2420 Result.setFlag(Token::StartOfLine); 2421 Result.clearFlag(Token::LeadingSpace); 2422 IsAtStartOfLine = false; 2423 } 2424 goto LexNextToken; // GCC isn't tail call eliminating. 2425 } 2426 2427 return PP->Lex(Result); 2428 } 2429 2430 Kind = tok::hash; 2431 } 2432 } else { 2433 Kind = tok::percent; 2434 } 2435 break; 2436 case '<': 2437 Char = getCharAndSize(CurPtr, SizeTmp); 2438 if (ParsingFilename) { 2439 return LexAngledStringLiteral(Result, CurPtr); 2440 } else if (Char == '<') { 2441 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 2442 if (After == '=') { 2443 Kind = tok::lesslessequal; 2444 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2445 SizeTmp2, Result); 2446 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 2447 // If this is actually a '<<<<<<<' version control conflict marker, 2448 // recognize it as such and recover nicely. 2449 goto LexNextToken; 2450 } else if (Features.CUDA && After == '<') { 2451 Kind = tok::lesslessless; 2452 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2453 SizeTmp2, Result); 2454 } else { 2455 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2456 Kind = tok::lessless; 2457 } 2458 } else if (Char == '=') { 2459 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2460 Kind = tok::lessequal; 2461 } else if (Features.Digraphs && Char == ':') { // '<:' -> '[' 2462 if (Features.CPlusPlus0x && 2463 getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') { 2464 // C++0x [lex.pptoken]p3: 2465 // Otherwise, if the next three characters are <:: and the subsequent 2466 // character is neither : nor >, the < is treated as a preprocessor 2467 // token by itself and not as the first character of the alternative 2468 // token <:. 
2469 unsigned SizeTmp3; 2470 char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3); 2471 if (After != ':' && After != '>') { 2472 Kind = tok::less; 2473 break; 2474 } 2475 } 2476 2477 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2478 Kind = tok::l_square; 2479 } else if (Features.Digraphs && Char == '%') { // '<%' -> '{' 2480 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2481 Kind = tok::l_brace; 2482 } else { 2483 Kind = tok::less; 2484 } 2485 break; 2486 case '>': 2487 Char = getCharAndSize(CurPtr, SizeTmp); 2488 if (Char == '=') { 2489 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2490 Kind = tok::greaterequal; 2491 } else if (Char == '>') { 2492 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 2493 if (After == '=') { 2494 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2495 SizeTmp2, Result); 2496 Kind = tok::greatergreaterequal; 2497 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 2498 // If this is '>>>>>>>' and we're in a conflict marker, ignore it. 2499 goto LexNextToken; 2500 } else if (Features.CUDA && After == '>') { 2501 Kind = tok::greatergreatergreater; 2502 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 2503 SizeTmp2, Result); 2504 } else { 2505 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2506 Kind = tok::greatergreater; 2507 } 2508 2509 } else { 2510 Kind = tok::greater; 2511 } 2512 break; 2513 case '^': 2514 Char = getCharAndSize(CurPtr, SizeTmp); 2515 if (Char == '=') { 2516 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2517 Kind = tok::caretequal; 2518 } else { 2519 Kind = tok::caret; 2520 } 2521 break; 2522 case '|': 2523 Char = getCharAndSize(CurPtr, SizeTmp); 2524 if (Char == '=') { 2525 Kind = tok::pipeequal; 2526 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2527 } else if (Char == '|') { 2528 // If this is '|||||||' and we're in a conflict marker, ignore it. 
2529 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 2530 goto LexNextToken; 2531 Kind = tok::pipepipe; 2532 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2533 } else { 2534 Kind = tok::pipe; 2535 } 2536 break; 2537 case ':': 2538 Char = getCharAndSize(CurPtr, SizeTmp); 2539 if (Features.Digraphs && Char == '>') { 2540 Kind = tok::r_square; // ':>' -> ']' 2541 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2542 } else if (Features.CPlusPlus && Char == ':') { 2543 Kind = tok::coloncolon; 2544 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2545 } else { 2546 Kind = tok::colon; 2547 } 2548 break; 2549 case ';': 2550 Kind = tok::semi; 2551 break; 2552 case '=': 2553 Char = getCharAndSize(CurPtr, SizeTmp); 2554 if (Char == '=') { 2555 // If this is '=======' and we're in a conflict marker, ignore it. 2556 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 2557 goto LexNextToken; 2558 2559 Kind = tok::equalequal; 2560 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2561 } else { 2562 Kind = tok::equal; 2563 } 2564 break; 2565 case ',': 2566 Kind = tok::comma; 2567 break; 2568 case '#': 2569 Char = getCharAndSize(CurPtr, SizeTmp); 2570 if (Char == '#') { 2571 Kind = tok::hashhash; 2572 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2573 } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize 2574 Kind = tok::hashat; 2575 if (!isLexingRawMode()) 2576 Diag(BufferPtr, diag::charize_microsoft_ext); 2577 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2578 } else { 2579 // We parsed a # character. If this occurs at the start of the line, 2580 // it's actually the start of a preprocessing directive. Callback to 2581 // the preprocessor to handle it. 2582 // FIXME: -fpreprocessed mode?? 2583 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 2584 FormTokenWithChars(Result, CurPtr, tok::hash); 2585 PP->HandleDirective(Result); 2586 2587 // As an optimization, if the preprocessor didn't switch lexers, tail 2588 // recurse. 
2589 if (PP->isCurrentLexer(this)) { 2590 // Start a new token. If this is a #include or something, the PP may 2591 // want us starting at the beginning of the line again. If so, set 2592 // the StartOfLine flag and clear LeadingSpace. 2593 if (IsAtStartOfLine) { 2594 Result.setFlag(Token::StartOfLine); 2595 Result.clearFlag(Token::LeadingSpace); 2596 IsAtStartOfLine = false; 2597 } 2598 goto LexNextToken; // GCC isn't tail call eliminating. 2599 } 2600 return PP->Lex(Result); 2601 } 2602 2603 Kind = tok::hash; 2604 } 2605 break; 2606 2607 case '@': 2608 // Objective C support. 2609 if (CurPtr[-1] == '@' && Features.ObjC1) 2610 Kind = tok::at; 2611 else 2612 Kind = tok::unknown; 2613 break; 2614 2615 case '\\': 2616 // FIXME: UCN's. 2617 // FALL THROUGH. 2618 default: 2619 Kind = tok::unknown; 2620 break; 2621 } 2622 2623 // Notify MIOpt that we read a non-whitespace/non-comment token. 2624 MIOpt.ReadToken(); 2625 2626 // Update the location of token as well as BufferPtr. 2627 FormTokenWithChars(Result, CurPtr, Kind); 2628} 2629