// Lexer.cpp revision b87672b124ab4fbe6f2cabc2ad71655fc71230ea
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the Lexer and Token interfaces. 11// 12//===----------------------------------------------------------------------===// 13// 14// TODO: GCC Diagnostics emitted by the lexer: 15// PEDWARN: (form feed|vertical tab) in preprocessing directive 16// 17// Universal characters, unicode, char mapping: 18// WARNING: `%.*s' is not in NFKC 19// WARNING: `%.*s' is not in NFC 20// 21// Other: 22// TODO: Options to support: 23// -fexec-charset,-fwide-exec-charset 24// 25//===----------------------------------------------------------------------===// 26 27#include "clang/Lex/Lexer.h" 28#include "clang/Basic/ConvertUTF.h" 29#include "clang/Basic/SourceManager.h" 30#include "clang/Lex/CodeCompletionHandler.h" 31#include "clang/Lex/LexDiagnostic.h" 32#include "clang/Lex/Preprocessor.h" 33#include "llvm/ADT/STLExtras.h" 34#include "llvm/ADT/StringExtras.h" 35#include "llvm/ADT/StringSwitch.h" 36#include "llvm/Support/Compiler.h" 37#include "llvm/Support/MemoryBuffer.h" 38#include <cstring> 39using namespace clang; 40 41static void InitCharacterInfo(); 42 43//===----------------------------------------------------------------------===// 44// Token Class Implementation 45//===----------------------------------------------------------------------===// 46 47/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 48bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 49 if (IdentifierInfo *II = getIdentifierInfo()) 50 return II->getObjCKeywordID() == objcKey; 51 return false; 52} 53 54/// getObjCKeywordID - Return the ObjC keyword kind. 
55tok::ObjCKeywordKind Token::getObjCKeywordID() const { 56 IdentifierInfo *specId = getIdentifierInfo(); 57 return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword; 58} 59 60 61//===----------------------------------------------------------------------===// 62// Lexer Class Implementation 63//===----------------------------------------------------------------------===// 64 65void Lexer::anchor() { } 66 67void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 68 const char *BufEnd) { 69 InitCharacterInfo(); 70 71 BufferStart = BufStart; 72 BufferPtr = BufPtr; 73 BufferEnd = BufEnd; 74 75 assert(BufEnd[0] == 0 && 76 "We assume that the input buffer has a null character at the end" 77 " to simplify lexing!"); 78 79 // Check whether we have a BOM in the beginning of the buffer. If yes - act 80 // accordingly. Right now we support only UTF-8 with and without BOM, so, just 81 // skip the UTF-8 BOM if it's present. 82 if (BufferStart == BufferPtr) { 83 // Determine the size of the BOM. 84 StringRef Buf(BufferStart, BufferEnd - BufferStart); 85 size_t BOMLength = llvm::StringSwitch<size_t>(Buf) 86 .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM 87 .Default(0); 88 89 // Skip the BOM. 90 BufferPtr += BOMLength; 91 } 92 93 Is_PragmaLexer = false; 94 CurrentConflictMarkerState = CMK_None; 95 96 // Start of the file is a start of line. 97 IsAtStartOfLine = true; 98 99 // We are not after parsing a #. 100 ParsingPreprocessorDirective = false; 101 102 // We are not after parsing #include. 103 ParsingFilename = false; 104 105 // We are not in raw mode. Raw mode disables diagnostics and interpretation 106 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 107 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 108 // or otherwise skipping over tokens. 109 LexingRawMode = false; 110 111 // Default to not keeping comments. 
112 ExtendedTokenMode = 0; 113} 114 115/// Lexer constructor - Create a new lexer object for the specified buffer 116/// with the specified preprocessor managing the lexing process. This lexer 117/// assumes that the associated file buffer and Preprocessor objects will 118/// outlive it, so it doesn't take ownership of either of them. 119Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 120 : PreprocessorLexer(&PP, FID), 121 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 122 LangOpts(PP.getLangOpts()) { 123 124 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 125 InputFile->getBufferEnd()); 126 127 // Default to keeping comments if the preprocessor wants them. 128 SetCommentRetentionState(PP.getCommentRetentionState()); 129} 130 131/// Lexer constructor - Create a new raw lexer object. This object is only 132/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 133/// range will outlive it, so it doesn't take ownership of it. 134Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts, 135 const char *BufStart, const char *BufPtr, const char *BufEnd) 136 : FileLoc(fileloc), LangOpts(langOpts) { 137 138 InitLexer(BufStart, BufPtr, BufEnd); 139 140 // We *are* in raw mode. 141 LexingRawMode = true; 142} 143 144/// Lexer constructor - Create a new raw lexer object. This object is only 145/// suitable for calls to 'LexFromRawLexer'. This lexer assumes that the text 146/// range will outlive it, so it doesn't take ownership of it. 147Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 148 const SourceManager &SM, const LangOptions &langOpts) 149 : FileLoc(SM.getLocForStartOfFile(FID)), LangOpts(langOpts) { 150 151 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 152 FromFile->getBufferEnd()); 153 154 // We *are* in raw mode. 
155 LexingRawMode = true; 156} 157 158/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 159/// _Pragma expansion. This has a variety of magic semantics that this method 160/// sets up. It returns a new'd Lexer that must be delete'd when done. 161/// 162/// On entrance to this routine, TokStartLoc is a macro location which has a 163/// spelling loc that indicates the bytes to be lexed for the token and an 164/// expansion location that indicates where all lexed tokens should be 165/// "expanded from". 166/// 167/// FIXME: It would really be nice to make _Pragma just be a wrapper around a 168/// normal lexer that remaps tokens as they fly by. This would require making 169/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 170/// interface that could handle this stuff. This would pull GetMappedTokenLoc 171/// out of the critical path of the lexer! 172/// 173Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 174 SourceLocation ExpansionLocStart, 175 SourceLocation ExpansionLocEnd, 176 unsigned TokLen, Preprocessor &PP) { 177 SourceManager &SM = PP.getSourceManager(); 178 179 // Create the lexer as if we were going to lex the file normally. 180 FileID SpellingFID = SM.getFileID(SpellingLoc); 181 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 182 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 183 184 // Now that the lexer is created, change the start/end locations so that we 185 // just lex the subsection of the file that we want. This is lexing from a 186 // scratch buffer. 187 const char *StrData = SM.getCharacterData(SpellingLoc); 188 189 L->BufferPtr = StrData; 190 L->BufferEnd = StrData+TokLen; 191 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 192 193 // Set the SourceLocation with the remapping information. This ensures that 194 // GetMappedTokenLoc will remap the tokens as they are lexed. 
195 L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID), 196 ExpansionLocStart, 197 ExpansionLocEnd, TokLen); 198 199 // Ensure that the lexer thinks it is inside a directive, so that end \n will 200 // return an EOD token. 201 L->ParsingPreprocessorDirective = true; 202 203 // This lexer really is for _Pragma. 204 L->Is_PragmaLexer = true; 205 return L; 206} 207 208 209/// Stringify - Convert the specified string into a C string, with surrounding 210/// ""'s, and with escaped \ and " characters. 211std::string Lexer::Stringify(const std::string &Str, bool Charify) { 212 std::string Result = Str; 213 char Quote = Charify ? '\'' : '"'; 214 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 215 if (Result[i] == '\\' || Result[i] == Quote) { 216 Result.insert(Result.begin()+i, '\\'); 217 ++i; ++e; 218 } 219 } 220 return Result; 221} 222 223/// Stringify - Convert the specified string into a C string by escaping '\' 224/// and " characters. This does not add surrounding ""'s to the string. 225void Lexer::Stringify(SmallVectorImpl<char> &Str) { 226 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 227 if (Str[i] == '\\' || Str[i] == '"') { 228 Str.insert(Str.begin()+i, '\\'); 229 ++i; ++e; 230 } 231 } 232} 233 234//===----------------------------------------------------------------------===// 235// Token Spelling 236//===----------------------------------------------------------------------===// 237 238/// \brief Slow case of getSpelling. Extract the characters comprising the 239/// spelling of this token from the provided input buffer. 240static size_t getSpellingSlow(const Token &Tok, const char *BufPtr, 241 const LangOptions &LangOpts, char *Spelling) { 242 assert(Tok.needsCleaning() && "getSpellingSlow called on simple token"); 243 244 size_t Length = 0; 245 const char *BufEnd = BufPtr + Tok.getLength(); 246 247 if (Tok.is(tok::string_literal)) { 248 // Munch the encoding-prefix and opening double-quote. 
249 while (BufPtr < BufEnd) { 250 unsigned Size; 251 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 252 BufPtr += Size; 253 254 if (Spelling[Length - 1] == '"') 255 break; 256 } 257 258 // Raw string literals need special handling; trigraph expansion and line 259 // splicing do not occur within their d-char-sequence nor within their 260 // r-char-sequence. 261 if (Length >= 2 && 262 Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') { 263 // Search backwards from the end of the token to find the matching closing 264 // quote. 265 const char *RawEnd = BufEnd; 266 do --RawEnd; while (*RawEnd != '"'); 267 size_t RawLength = RawEnd - BufPtr + 1; 268 269 // Everything between the quotes is included verbatim in the spelling. 270 memcpy(Spelling + Length, BufPtr, RawLength); 271 Length += RawLength; 272 BufPtr += RawLength; 273 274 // The rest of the token is lexed normally. 275 } 276 } 277 278 while (BufPtr < BufEnd) { 279 unsigned Size; 280 Spelling[Length++] = Lexer::getCharAndSizeNoWarn(BufPtr, Size, LangOpts); 281 BufPtr += Size; 282 } 283 284 assert(Length < Tok.getLength() && 285 "NeedsCleaning flag set on token that didn't need cleaning!"); 286 return Length; 287} 288 289/// getSpelling() - Return the 'spelling' of this token. The spelling of a 290/// token are the characters used to represent the token in the source file 291/// after trigraph expansion and escaped-newline folding. In particular, this 292/// wants to get the true, uncanonicalized, spelling of things like digraphs 293/// UCNs, etc. 294StringRef Lexer::getSpelling(SourceLocation loc, 295 SmallVectorImpl<char> &buffer, 296 const SourceManager &SM, 297 const LangOptions &options, 298 bool *invalid) { 299 // Break down the source location. 300 std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc); 301 302 // Try to the load the file buffer. 
303 bool invalidTemp = false; 304 StringRef file = SM.getBufferData(locInfo.first, &invalidTemp); 305 if (invalidTemp) { 306 if (invalid) *invalid = true; 307 return StringRef(); 308 } 309 310 const char *tokenBegin = file.data() + locInfo.second; 311 312 // Lex from the start of the given location. 313 Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options, 314 file.begin(), tokenBegin, file.end()); 315 Token token; 316 lexer.LexFromRawLexer(token); 317 318 unsigned length = token.getLength(); 319 320 // Common case: no need for cleaning. 321 if (!token.needsCleaning()) 322 return StringRef(tokenBegin, length); 323 324 // Hard case, we need to relex the characters into the string. 325 buffer.resize(length); 326 buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data())); 327 return StringRef(buffer.data(), buffer.size()); 328} 329 330/// getSpelling() - Return the 'spelling' of this token. The spelling of a 331/// token are the characters used to represent the token in the source file 332/// after trigraph expansion and escaped-newline folding. In particular, this 333/// wants to get the true, uncanonicalized, spelling of things like digraphs 334/// UCNs, etc. 335std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr, 336 const LangOptions &LangOpts, bool *Invalid) { 337 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 338 339 bool CharDataInvalid = false; 340 const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(), 341 &CharDataInvalid); 342 if (Invalid) 343 *Invalid = CharDataInvalid; 344 if (CharDataInvalid) 345 return std::string(); 346 347 // If this token contains nothing interesting, return it directly. 
348 if (!Tok.needsCleaning()) 349 return std::string(TokStart, TokStart + Tok.getLength()); 350 351 std::string Result; 352 Result.resize(Tok.getLength()); 353 Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin())); 354 return Result; 355} 356 357/// getSpelling - This method is used to get the spelling of a token into a 358/// preallocated buffer, instead of as an std::string. The caller is required 359/// to allocate enough space for the token, which is guaranteed to be at least 360/// Tok.getLength() bytes long. The actual length of the token is returned. 361/// 362/// Note that this method may do two possible things: it may either fill in 363/// the buffer specified with characters, or it may *change the input pointer* 364/// to point to a constant buffer with the data already in it (avoiding a 365/// copy). The caller is not allowed to modify the returned buffer pointer 366/// if an internal buffer is returned. 367unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer, 368 const SourceManager &SourceMgr, 369 const LangOptions &LangOpts, bool *Invalid) { 370 assert((int)Tok.getLength() >= 0 && "Token character range is bogus!"); 371 372 const char *TokStart = 0; 373 // NOTE: this has to be checked *before* testing for an IdentifierInfo. 374 if (Tok.is(tok::raw_identifier)) 375 TokStart = Tok.getRawIdentifierData(); 376 else if (!Tok.hasUCN()) { 377 if (const IdentifierInfo *II = Tok.getIdentifierInfo()) { 378 // Just return the string from the identifier table, which is very quick. 379 Buffer = II->getNameStart(); 380 return II->getLength(); 381 } 382 } 383 384 // NOTE: this can be checked even after testing for an IdentifierInfo. 385 if (Tok.isLiteral()) 386 TokStart = Tok.getLiteralData(); 387 388 if (TokStart == 0) { 389 // Compute the start of the token in the input lexer buffer. 
390 bool CharDataInvalid = false; 391 TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid); 392 if (Invalid) 393 *Invalid = CharDataInvalid; 394 if (CharDataInvalid) { 395 Buffer = ""; 396 return 0; 397 } 398 } 399 400 // If this token contains nothing interesting, return it directly. 401 if (!Tok.needsCleaning()) { 402 Buffer = TokStart; 403 return Tok.getLength(); 404 } 405 406 // Otherwise, hard case, relex the characters into the string. 407 return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer)); 408} 409 410 411 412static bool isWhitespace(unsigned char c); 413 414/// MeasureTokenLength - Relex the token at the specified location and return 415/// its length in bytes in the input file. If the token needs cleaning (e.g. 416/// includes a trigraph or an escaped newline) then this count includes bytes 417/// that are part of that. 418unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 419 const SourceManager &SM, 420 const LangOptions &LangOpts) { 421 Token TheTok; 422 if (getRawToken(Loc, TheTok, SM, LangOpts)) 423 return 0; 424 return TheTok.getLength(); 425} 426 427/// \brief Relex the token at the specified location. 428/// \returns true if there was a failure, false on success. 429bool Lexer::getRawToken(SourceLocation Loc, Token &Result, 430 const SourceManager &SM, 431 const LangOptions &LangOpts) { 432 // TODO: this could be special cased for common tokens like identifiers, ')', 433 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 434 // all obviously single-char tokens. This could use 435 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 436 // something. 437 438 // If this comes from a macro expansion, we really do want the macro name, not 439 // the token this macro expanded to. 
440 Loc = SM.getExpansionLoc(Loc); 441 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 442 bool Invalid = false; 443 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 444 if (Invalid) 445 return true; 446 447 const char *StrData = Buffer.data()+LocInfo.second; 448 449 if (isWhitespace(StrData[0])) 450 return true; 451 452 // Create a lexer starting at the beginning of this token. 453 Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, 454 Buffer.begin(), StrData, Buffer.end()); 455 TheLexer.SetCommentRetentionState(true); 456 TheLexer.LexFromRawLexer(Result); 457 return false; 458} 459 460static SourceLocation getBeginningOfFileToken(SourceLocation Loc, 461 const SourceManager &SM, 462 const LangOptions &LangOpts) { 463 assert(Loc.isFileID()); 464 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 465 if (LocInfo.first.isInvalid()) 466 return Loc; 467 468 bool Invalid = false; 469 StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 470 if (Invalid) 471 return Loc; 472 473 // Back up from the current location until we hit the beginning of a line 474 // (or the buffer). We'll relex from that point. 475 const char *BufStart = Buffer.data(); 476 if (LocInfo.second >= Buffer.size()) 477 return Loc; 478 479 const char *StrData = BufStart+LocInfo.second; 480 if (StrData[0] == '\n' || StrData[0] == '\r') 481 return Loc; 482 483 const char *LexStart = StrData; 484 while (LexStart != BufStart) { 485 if (LexStart[0] == '\n' || LexStart[0] == '\r') { 486 ++LexStart; 487 break; 488 } 489 490 --LexStart; 491 } 492 493 // Create a lexer starting at the beginning of this token. 494 SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second); 495 Lexer TheLexer(LexerStartLoc, LangOpts, BufStart, LexStart, Buffer.end()); 496 TheLexer.SetCommentRetentionState(true); 497 498 // Lex tokens until we find the token that contains the source location. 
499 Token TheTok; 500 do { 501 TheLexer.LexFromRawLexer(TheTok); 502 503 if (TheLexer.getBufferLocation() > StrData) { 504 // Lexing this token has taken the lexer past the source location we're 505 // looking for. If the current token encompasses our source location, 506 // return the beginning of that token. 507 if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData) 508 return TheTok.getLocation(); 509 510 // We ended up skipping over the source location entirely, which means 511 // that it points into whitespace. We're done here. 512 break; 513 } 514 } while (TheTok.getKind() != tok::eof); 515 516 // We've passed our source location; just return the original source location. 517 return Loc; 518} 519 520SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc, 521 const SourceManager &SM, 522 const LangOptions &LangOpts) { 523 if (Loc.isFileID()) 524 return getBeginningOfFileToken(Loc, SM, LangOpts); 525 526 if (!SM.isMacroArgExpansion(Loc)) 527 return Loc; 528 529 SourceLocation FileLoc = SM.getSpellingLoc(Loc); 530 SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts); 531 std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc); 532 std::pair<FileID, unsigned> BeginFileLocInfo 533 = SM.getDecomposedLoc(BeginFileLoc); 534 assert(FileLocInfo.first == BeginFileLocInfo.first && 535 FileLocInfo.second >= BeginFileLocInfo.second); 536 return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second); 537} 538 539namespace { 540 enum PreambleDirectiveKind { 541 PDK_Skipped, 542 PDK_StartIf, 543 PDK_EndIf, 544 PDK_Unknown 545 }; 546} 547 548std::pair<unsigned, bool> 549Lexer::ComputePreamble(const llvm::MemoryBuffer *Buffer, 550 const LangOptions &LangOpts, unsigned MaxLines) { 551 // Create a lexer starting at the beginning of the file. Note that we use a 552 // "fake" file source location at offset 1 so that the lexer will track our 553 // position within the file. 
554 const unsigned StartOffset = 1; 555 SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset); 556 Lexer TheLexer(FileLoc, LangOpts, Buffer->getBufferStart(), 557 Buffer->getBufferStart(), Buffer->getBufferEnd()); 558 559 // StartLoc will differ from FileLoc if there is a BOM that was skipped. 560 SourceLocation StartLoc = TheLexer.getSourceLocation(); 561 562 bool InPreprocessorDirective = false; 563 Token TheTok; 564 Token IfStartTok; 565 unsigned IfCount = 0; 566 567 unsigned MaxLineOffset = 0; 568 if (MaxLines) { 569 const char *CurPtr = Buffer->getBufferStart(); 570 unsigned CurLine = 0; 571 while (CurPtr != Buffer->getBufferEnd()) { 572 char ch = *CurPtr++; 573 if (ch == '\n') { 574 ++CurLine; 575 if (CurLine == MaxLines) 576 break; 577 } 578 } 579 if (CurPtr != Buffer->getBufferEnd()) 580 MaxLineOffset = CurPtr - Buffer->getBufferStart(); 581 } 582 583 do { 584 TheLexer.LexFromRawLexer(TheTok); 585 586 if (InPreprocessorDirective) { 587 // If we've hit the end of the file, we're done. 588 if (TheTok.getKind() == tok::eof) { 589 break; 590 } 591 592 // If we haven't hit the end of the preprocessor directive, skip this 593 // token. 594 if (!TheTok.isAtStartOfLine()) 595 continue; 596 597 // We've passed the end of the preprocessor directive, and will look 598 // at this token again below. 599 InPreprocessorDirective = false; 600 } 601 602 // Keep track of the # of lines in the preamble. 603 if (TheTok.isAtStartOfLine()) { 604 unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset; 605 606 // If we were asked to limit the number of lines in the preamble, 607 // and we're about to exceed that limit, we're done. 608 if (MaxLineOffset && TokOffset >= MaxLineOffset) 609 break; 610 } 611 612 // Comments are okay; skip over them. 613 if (TheTok.getKind() == tok::comment) 614 continue; 615 616 if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) { 617 // This is the start of a preprocessor directive. 
618 Token HashTok = TheTok; 619 InPreprocessorDirective = true; 620 621 // Figure out which directive this is. Since we're lexing raw tokens, 622 // we don't have an identifier table available. Instead, just look at 623 // the raw identifier to recognize and categorize preprocessor directives. 624 TheLexer.LexFromRawLexer(TheTok); 625 if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) { 626 StringRef Keyword(TheTok.getRawIdentifierData(), 627 TheTok.getLength()); 628 PreambleDirectiveKind PDK 629 = llvm::StringSwitch<PreambleDirectiveKind>(Keyword) 630 .Case("include", PDK_Skipped) 631 .Case("__include_macros", PDK_Skipped) 632 .Case("define", PDK_Skipped) 633 .Case("undef", PDK_Skipped) 634 .Case("line", PDK_Skipped) 635 .Case("error", PDK_Skipped) 636 .Case("pragma", PDK_Skipped) 637 .Case("import", PDK_Skipped) 638 .Case("include_next", PDK_Skipped) 639 .Case("warning", PDK_Skipped) 640 .Case("ident", PDK_Skipped) 641 .Case("sccs", PDK_Skipped) 642 .Case("assert", PDK_Skipped) 643 .Case("unassert", PDK_Skipped) 644 .Case("if", PDK_StartIf) 645 .Case("ifdef", PDK_StartIf) 646 .Case("ifndef", PDK_StartIf) 647 .Case("elif", PDK_Skipped) 648 .Case("else", PDK_Skipped) 649 .Case("endif", PDK_EndIf) 650 .Default(PDK_Unknown); 651 652 switch (PDK) { 653 case PDK_Skipped: 654 continue; 655 656 case PDK_StartIf: 657 if (IfCount == 0) 658 IfStartTok = HashTok; 659 660 ++IfCount; 661 continue; 662 663 case PDK_EndIf: 664 // Mismatched #endif. The preamble ends here. 665 if (IfCount == 0) 666 break; 667 668 --IfCount; 669 continue; 670 671 case PDK_Unknown: 672 // We don't know what this directive is; stop at the '#'. 673 break; 674 } 675 } 676 677 // We only end up here if we didn't recognize the preprocessor 678 // directive or it was one that can't occur in the preamble at this 679 // point. Roll back the current token to the location of the '#'. 
680 InPreprocessorDirective = false; 681 TheTok = HashTok; 682 } 683 684 // We hit a token that we don't recognize as being in the 685 // "preprocessing only" part of the file, so we're no longer in 686 // the preamble. 687 break; 688 } while (true); 689 690 SourceLocation End = IfCount? IfStartTok.getLocation() : TheTok.getLocation(); 691 return std::make_pair(End.getRawEncoding() - StartLoc.getRawEncoding(), 692 IfCount? IfStartTok.isAtStartOfLine() 693 : TheTok.isAtStartOfLine()); 694} 695 696 697/// AdvanceToTokenCharacter - Given a location that specifies the start of a 698/// token, return a new location that specifies a character within the token. 699SourceLocation Lexer::AdvanceToTokenCharacter(SourceLocation TokStart, 700 unsigned CharNo, 701 const SourceManager &SM, 702 const LangOptions &LangOpts) { 703 // Figure out how many physical characters away the specified expansion 704 // character is. This needs to take into consideration newlines and 705 // trigraphs. 706 bool Invalid = false; 707 const char *TokPtr = SM.getCharacterData(TokStart, &Invalid); 708 709 // If they request the first char of the token, we're trivially done. 710 if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr))) 711 return TokStart; 712 713 unsigned PhysOffset = 0; 714 715 // The usual case is that tokens don't contain anything interesting. Skip 716 // over the uninteresting characters. If a token only consists of simple 717 // chars, this method is extremely fast. 718 while (Lexer::isObviouslySimpleCharacter(*TokPtr)) { 719 if (CharNo == 0) 720 return TokStart.getLocWithOffset(PhysOffset); 721 ++TokPtr, --CharNo, ++PhysOffset; 722 } 723 724 // If we have a character that may be a trigraph or escaped newline, use a 725 // lexer to parse it correctly. 
726 for (; CharNo; --CharNo) { 727 unsigned Size; 728 Lexer::getCharAndSizeNoWarn(TokPtr, Size, LangOpts); 729 TokPtr += Size; 730 PhysOffset += Size; 731 } 732 733 // Final detail: if we end up on an escaped newline, we want to return the 734 // location of the actual byte of the token. For example foo\<newline>bar 735 // advanced by 3 should return the location of b, not of \\. One compounding 736 // detail of this is that the escape may be made by a trigraph. 737 if (!Lexer::isObviouslySimpleCharacter(*TokPtr)) 738 PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr; 739 740 return TokStart.getLocWithOffset(PhysOffset); 741} 742 743/// \brief Computes the source location just past the end of the 744/// token at this source location. 745/// 746/// This routine can be used to produce a source location that 747/// points just past the end of the token referenced by \p Loc, and 748/// is generally used when a diagnostic needs to point just after a 749/// token where it expected something different that it received. If 750/// the returned source location would not be meaningful (e.g., if 751/// it points into a macro), this routine returns an invalid 752/// source location. 753/// 754/// \param Offset an offset from the end of the token, where the source 755/// location should refer to. The default offset (0) produces a source 756/// location pointing just past the end of the token; an offset of 1 produces 757/// a source location pointing to the last character in the token, etc. 758SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset, 759 const SourceManager &SM, 760 const LangOptions &LangOpts) { 761 if (Loc.isInvalid()) 762 return SourceLocation(); 763 764 if (Loc.isMacroID()) { 765 if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc)) 766 return SourceLocation(); // Points inside the macro expansion. 
767 } 768 769 unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts); 770 if (Len > Offset) 771 Len = Len - Offset; 772 else 773 return Loc; 774 775 return Loc.getLocWithOffset(Len); 776} 777 778/// \brief Returns true if the given MacroID location points at the first 779/// token of the macro expansion. 780bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc, 781 const SourceManager &SM, 782 const LangOptions &LangOpts, 783 SourceLocation *MacroBegin) { 784 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 785 786 std::pair<FileID, unsigned> infoLoc = SM.getDecomposedLoc(loc); 787 // FIXME: If the token comes from the macro token paste operator ('##') 788 // this function will always return false; 789 if (infoLoc.second > 0) 790 return false; // Does not point at the start of token. 791 792 SourceLocation expansionLoc = 793 SM.getSLocEntry(infoLoc.first).getExpansion().getExpansionLocStart(); 794 if (expansionLoc.isFileID()) { 795 // No other macro expansions, this is the first. 796 if (MacroBegin) 797 *MacroBegin = expansionLoc; 798 return true; 799 } 800 801 return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin); 802} 803 804/// \brief Returns true if the given MacroID location points at the last 805/// token of the macro expansion. 806bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc, 807 const SourceManager &SM, 808 const LangOptions &LangOpts, 809 SourceLocation *MacroEnd) { 810 assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc"); 811 812 SourceLocation spellLoc = SM.getSpellingLoc(loc); 813 unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts); 814 if (tokLen == 0) 815 return false; 816 817 FileID FID = SM.getFileID(loc); 818 SourceLocation afterLoc = loc.getLocWithOffset(tokLen+1); 819 if (SM.isInFileID(afterLoc, FID)) 820 return false; // Still in the same FileID, does not point to the last token. 

  // FIXME: If the token comes from the macro token paste operator ('##')
  // or the stringify operator ('#') this function will always return false;

  // Walk up one level of the expansion stack; recurse until we reach a
  // location that is spelled directly in a file.
  SourceLocation expansionLoc =
    SM.getSLocEntry(FID).getExpansion().getExpansionLocEnd();
  if (expansionLoc.isFileID()) {
    // No other macro expansions.
    if (MacroEnd)
      *MacroEnd = expansionLoc;
    return true;
  }

  return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
}

/// \brief Turn a range whose endpoints are both file locations into a
/// character range. A token range has its end extended to cover the last
/// token. Returns an invalid range if the endpoints are in different files
/// or out of order.
static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
                                             const SourceManager &SM,
                                             const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  assert(Begin.isFileID() && End.isFileID());
  if (Range.isTokenRange()) {
    End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
    if (End.isInvalid())
      return CharSourceRange();
  }

  // Break down the source locations.
  FileID FID;
  unsigned BeginOffs;
  llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return CharSourceRange();

  // Reject ranges that cross FileIDs or run backwards.
  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return CharSourceRange();

  return CharSourceRange::getCharRange(Begin, End);
}

/// \brief Produce a file character range from a range that may have macro
/// endpoints. Macro endpoints are accepted only when they cover a complete
/// macro expansion (or, as a last resort, when both endpoints fall inside
/// the same macro-argument expansion spelled in a file); otherwise an
/// invalid range is returned.
CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
                                         const SourceManager &SM,
                                         const LangOptions &LangOpts) {
  SourceLocation Begin = Range.getBegin();
  SourceLocation End = Range.getEnd();
  if (Begin.isInvalid() || End.isInvalid())
    return CharSourceRange();

  if (Begin.isFileID() && End.isFileID())
    return makeRangeFromFileLocs(Range, SM, LangOpts);

  // Begin is inside a macro: only usable if it is the first token of the
  // expansion, in which case we can substitute the expansion start.
  if (Begin.isMacroID() && End.isFileID()) {
    if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
      return CharSourceRange();
    Range.setBegin(Begin);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // End is inside a macro: a token range must end at the expansion's last
  // token, a char range must end at the expansion's first character.
  if (Begin.isFileID() && End.isMacroID()) {
    if ((Range.isTokenRange() && !isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                          &End)) ||
        (Range.isCharRange() && !isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                           &End)))
      return CharSourceRange();
    Range.setEnd(End);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  // Both endpoints are inside macros; accept if together they span whole
  // expansions.
  assert(Begin.isMacroID() && End.isMacroID());
  SourceLocation MacroBegin, MacroEnd;
  if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
      ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
                                                        &MacroEnd)) ||
       (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
                                                         &MacroEnd)))) {
    Range.setBegin(MacroBegin);
    Range.setEnd(MacroEnd);
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  FileID FID;
  unsigned BeginOffs;
  llvm::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
  if (FID.isInvalid())
    return CharSourceRange();

  unsigned EndOffs;
  if (!SM.isInFileID(End, FID, &EndOffs) ||
      BeginOffs > EndOffs)
    return CharSourceRange();

  // Last chance: if both locations are inside the same macro-argument
  // expansion whose argument is spelled in a file, map the offsets onto the
  // argument's spelling.
  const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
  const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
  if (Expansion.isMacroArgExpansion() &&
      Expansion.getSpellingLoc().isFileID()) {
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    Range.setBegin(SpellLoc.getLocWithOffset(BeginOffs));
    Range.setEnd(SpellLoc.getLocWithOffset(EndOffs));
    return makeRangeFromFileLocs(Range, SM, LangOpts);
  }

  return CharSourceRange();
}

/// \brief Return the source text covered by \p Range, or an empty StringRef
/// (with *Invalid set to true, if provided) when the range cannot be turned
/// into a file character range.
StringRef Lexer::getSourceText(CharSourceRange Range,
                               const SourceManager &SM,
                               const LangOptions &LangOpts,
                               bool *Invalid) {
  Range = makeFileCharRange(Range, SM, LangOpts);
  if (Range.isInvalid()) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  // Break down the source location.
  std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
  if (beginInfo.first.isInvalid()) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  // Both endpoints must be in the same FileID and properly ordered.
  unsigned EndOffs;
  if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
      beginInfo.second > EndOffs) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  // Try to load the file buffer.
  bool invalidTemp = false;
  StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
  if (invalidTemp) {
    if (Invalid) *Invalid = true;
    return StringRef();
  }

  if (Invalid) *Invalid = false;
  return file.substr(beginInfo.second, EndOffs - beginInfo.second);
}

/// \brief Retrieve the name of the immediate macro expansion containing
/// \p Loc, as spelled in the source.
StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
                                       const SourceManager &SM,
                                       const LangOptions &LangOpts) {
  assert(Loc.isMacroID() && "Only reasonble to call this on macros");

  // Find the location of the immediate macro expansion.
  while (1) {
    FileID FID = SM.getFileID(Loc);
    const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
    const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
    Loc = Expansion.getExpansionLocStart();
    if (!Expansion.isMacroArgExpansion())
      break;

    // For macro arguments we need to check that the argument did not come
    // from an inner macro, e.g: "MAC1( MAC2(foo) )"

    // Loc points to the argument id of the macro definition, move to the
    // macro expansion.
    Loc = SM.getImmediateExpansionRange(Loc).first;
    SourceLocation SpellLoc = Expansion.getSpellingLoc();
    if (SpellLoc.isFileID())
      break; // No inner macro.

    // If spelling location resides in the same FileID as macro expansion
    // location, it means there is no inner macro.
    FileID MacroFID = SM.getFileID(Loc);
    if (SM.isInFileID(SpellLoc, MacroFID))
      break;

    // Argument came from inner macro.
    Loc = SpellLoc;
  }

  // Find the spelling location of the start of the non-argument expansion
  // range. This is where the macro name was spelled in order to begin
  // expanding this macro.
  Loc = SM.getSpellingLoc(Loc);

  // Dig out the buffer where the macro name was spelled and the extents of the
  // name so that we can render it into the expansion note.
  std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
  unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
  StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
  return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
}

//===----------------------------------------------------------------------===//
// Character information.
//===----------------------------------------------------------------------===//

// Bit flags describing character classes; a character may carry at most one
// of these flags in the CharInfo table below.
enum {
  CHAR_HORZ_WS  = 0x01,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS  = 0x02,  // '\r', '\n'
  CHAR_LETTER   = 0x04,  // a-z,A-Z
  CHAR_NUMBER   = 0x08,  // 0-9
  CHAR_UNDER    = 0x10,  // _
  CHAR_PERIOD   = 0x20,  // .
  CHAR_RAWDEL   = 0x40   // {}[]#<>%:;?*+-/^&|~!=,"'
};

// Statically initialize CharInfo table based on ASCII character set
// Reference: FreeBSD 7.2 /usr/share/misc/ascii
// Entries 128-255 are implicitly zero-initialized (non-ASCII bytes carry no
// flags).
static const unsigned char CharInfo[256] =
{
// 0 NUL  1 SOH  2 STX  3 ETX
// 4 EOT  5 ENQ  6 ACK  7 BEL
   0   , 0   , 0   , 0   ,
   0   , 0   , 0   , 0   ,
// 8 BS   9 HT  10 NL  11 VT
//12 NP  13 CR  14 SO  15 SI
   0   , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS,
   CHAR_HORZ_WS, CHAR_VERT_WS, 0   , 0   ,
//16 DLE 17 DC1 18 DC2 19 DC3
//20 DC4 21 NAK 22 SYN 23 ETB
   0   , 0   , 0   , 0   ,
   0   , 0   , 0   , 0   ,
//24 CAN 25 EM  26 SUB 27 ESC
//28 FS  29 GS  30 RS  31 US
   0   , 0   , 0   , 0   ,
   0   , 0   , 0   , 0   ,
//32 SP  33  !  34  "  35  #
//36  $  37  %  38  &  39  '
   CHAR_HORZ_WS, CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//40  (  41  )  42  *  43  +
//44  ,  45  -  46  .  47  /
   0           , 0           , CHAR_RAWDEL , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_PERIOD , CHAR_RAWDEL ,
//48  0  49  1  50  2  51  3
//52  4  53  5  54  6  55  7
   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
   CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER ,
//56  8  57  9  58  :  59  ;
//60  <  61  =  62  >  63  ?
   CHAR_NUMBER , CHAR_NUMBER , CHAR_RAWDEL , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL ,
//64  @  65  A  66  B  67  C
//68  D  69  E  70  F  71  G
   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//72  H  73  I  74  J  75  K
//76  L  77  M  78  N  79  O
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//80  P  81  Q  82  R  83  S
//84  T  85  U  86  V  87  W
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//88  X  89  Y  90  Z  91  [
//92  \  93  ]  94  ^  95  _
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
   0           , CHAR_RAWDEL , CHAR_RAWDEL , CHAR_UNDER  ,
//96  `  97  a  98  b  99  c
//100 d 101  e 102  f 103  g
   0           , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//104 h 105  i 106  j 107  k
//108 l 109  m 110  n 111  o
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//112 p 113  q 114  r 115  s
//116 t 117  u 118  v 119  w
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER ,
//120 x 121  y 122  z 123  {
//124 | 125  } 126  ~ 127 DEL
   CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_RAWDEL ,
   CHAR_RAWDEL , CHAR_RAWDEL , CHAR_RAWDEL , 0
};

// Sanity-check the statically-initialized CharInfo table in assert builds;
// runs only once per process.
static void InitCharacterInfo() {
  static bool isInited = false;
  if (isInited) return;
  // check the statically-initialized CharInfo table
  assert(CHAR_HORZ_WS == CharInfo[(int)' ']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\t']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\f']);
  assert(CHAR_HORZ_WS == CharInfo[(int)'\v']);
  assert(CHAR_VERT_WS == CharInfo[(int)'\n']);
  assert(CHAR_VERT_WS == CharInfo[(int)'\r']);
  assert(CHAR_UNDER   == CharInfo[(int)'_']);
  assert(CHAR_PERIOD  == CharInfo[(int)'.']);
  for (unsigned i = 'a'; i <= 'z'; ++i) {
    assert(CHAR_LETTER == CharInfo[i]);
    assert(CHAR_LETTER == CharInfo[i+'A'-'a']);
  }
  for (unsigned i = '0'; i <= '9'; ++i)
    assert(CHAR_NUMBER == CharInfo[i]);

  isInited = true;
}


/// isIdentifierHead - Return true if this is the first character of an
/// identifier, which is [a-zA-Z_].
static inline bool isIdentifierHead(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_UNDER)) ? true : false;
}

/// isIdentifierBody - Return true if this is the body character of an
/// identifier, which is [a-zA-Z0-9_].
static inline bool isIdentifierBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
}

/// isHorizontalWhitespace - Return true if this character is horizontal
/// whitespace: ' ', '\\t', '\\f', '\\v'.  Note that this returns false for
/// '\\0'.
static inline bool isHorizontalWhitespace(unsigned char c) {
  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
}

/// isVerticalWhitespace - Return true if this character is vertical
/// whitespace: '\\n', '\\r'.  Note that this returns false for '\\0'.
static inline bool isVerticalWhitespace(unsigned char c) {
  return (CharInfo[c] & CHAR_VERT_WS) ? true : false;
}

/// isWhitespace - Return true if this character is horizontal or vertical
/// whitespace: ' ', '\\t', '\\f', '\\v', '\\n', '\\r'.  Note that this returns
/// false for '\\0'.
static inline bool isWhitespace(unsigned char c) {
  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
}

/// isNumberBody - Return true if this is the body character of an
/// preprocessing number, which is [a-zA-Z0-9_.].
static inline bool isNumberBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
    true : false;
}

/// isRawStringDelimBody - Return true if this is the body character of a
/// raw string delimiter.
static inline bool isRawStringDelimBody(unsigned char c) {
  return (CharInfo[c] &
          (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL)) ?
    true : false;
}

// Allow external clients to make use of CharInfo.
bool Lexer::isIdentifierBodyChar(char c, const LangOptions &LangOpts) {
  return isIdentifierBody(c) || (c == '$' && LangOpts.DollarIdents);
}


//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all expanded at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
    Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be a macro expansion");

  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the expansion location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  std::pair<SourceLocation,SourceLocation> II =
    SM.getImmediateExpansionRange(FileLoc);

  return SM.createExpansionLoc(SpellingLoc, II.first, II.second, TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics.  This translate a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  char Res = GetTrigraphCharForLetter(*CP);
  // A null Lexer means we're in no-warning mode; just report the decode.
  if (!Res || !L) return Res;

  if (!L->getLangOpts().Trigraphs) {
    if (!L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    return 0; // Trigraphs disabled: the '?' stands for itself.
  }

  if (!L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
  return Res;
}

/// getEscapedNewLineSize - Return the size of the specified escaped newline,
/// or 0 if it is not an escaped newline.  P[-1] is known to be a "\" or a
/// trigraph equivalent on entry to this function.
unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
  unsigned Size = 0;
  while (isWhitespace(Ptr[Size])) {
    ++Size;

    if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
      continue;

    // If this is a \r\n or \n\r, skip the other half.
    if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
        Ptr[Size-1] != Ptr[Size])
      ++Size;

    return Size;
  }

  // Not an escaped newline, must be a \t or something else.
  return 0;
}

/// SkipEscapedNewLines - If P points to an escaped newline (or a series of
/// them), skip over them and return the first non-escaped-newline found,
/// otherwise return P.
const char *Lexer::SkipEscapedNewLines(const char *P) {
  while (1) {
    const char *AfterEscape;
    if (*P == '\\') {
      AfterEscape = P+1;
    } else if (*P == '?') {
      // If not a trigraph for escape, bail out.
      if (P[1] != '?' || P[2] != '/')
        return P;
      AfterEscape = P+3;
    } else {
      return P;
    }

    unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
    if (NewLineSize == 0) return P;
    P = AfterEscape+NewLineSize;
  }
}

/// \brief Checks that the given token is the first token that occurs after the
/// given location (this excludes comments and whitespace). Returns the location
/// immediately after the specified token. If the token is not found or the
/// location is inside a macro, the returned source location will be invalid.
SourceLocation Lexer::findLocationAfterToken(SourceLocation Loc,
                                        tok::TokenKind TKind,
                                        const SourceManager &SM,
                                        const LangOptions &LangOpts,
                                        bool SkipTrailingWhitespaceAndNewLine) {
  // A macro location is usable only if it is the end of its expansion.
  if (Loc.isMacroID()) {
    if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
      return SourceLocation();
  }
  Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);

  // Break down the source location.
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);

  // Try to load the file buffer.
  bool InvalidTemp = false;
  StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
  if (InvalidTemp)
    return SourceLocation();

  const char *TokenBegin = File.data() + LocInfo.second;

  // Lex from the start of the given location.
  Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
              TokenBegin, File.end());
  // Find the token.
  Token Tok;
  lexer.LexFromRawLexer(Tok);
  if (Tok.isNot(TKind))
    return SourceLocation();
  SourceLocation TokenLoc = Tok.getLocation();

  // Calculate how much whitespace needs to be skipped if any.
  unsigned NumWhitespaceChars = 0;
  if (SkipTrailingWhitespaceAndNewLine) {
    const char *TokenEnd = SM.getCharacterData(TokenLoc) +
                           Tok.getLength();
    unsigned char C = *TokenEnd;
    while (isHorizontalWhitespace(C)) {
      C = *(++TokenEnd);
      NumWhitespaceChars++;
    }

    // Skip \r, \n, \r\n, or \n\r
    if (C == '\n' || C == '\r') {
      char PrevC = C;
      C = *(++TokenEnd);
      NumWhitespaceChars++;
      // A mixed pair (\r\n or \n\r) counts as a single line ending.
      if ((C == '\n' || C == '\r') && C != PrevC)
        NumWhitespaceChars++;
    }
  }

  return TokenLoc.getLocWithOffset(Tok.getLength() + NumWhitespaceChars);
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters between the slash and
    // newline.
    if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      // Warn if there was whitespace between the backslash and newline.
      if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
        Diag(Ptr, diag::backslash_newline_space);

      // Found backslash<whitespace><newline>.  Parse the char after it.
      Size += EscapedNewLineSize;
      Ptr  += EscapedNewLineSize;

      // If the char that we finally got was a \n, then we must have had
      // something like \<newline><newline>.  We don't want to consume the
      // second newline.
      if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0')
        return ' ';

      // Use slow version to accumulate a correct size field.
      return getCharAndSizeSlow(Ptr, Size, Tok);
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // A trigraph that decodes to '\' may begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}


/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
1456char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 1457 const LangOptions &LangOpts) { 1458 // If we have a slash, look for an escaped newline. 1459 if (Ptr[0] == '\\') { 1460 ++Size; 1461 ++Ptr; 1462Slash: 1463 // Common case, backslash-char where the char is not whitespace. 1464 if (!isWhitespace(Ptr[0])) return '\\'; 1465 1466 // See if we have optional whitespace characters followed by a newline. 1467 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 1468 // Found backslash<whitespace><newline>. Parse the char after it. 1469 Size += EscapedNewLineSize; 1470 Ptr += EscapedNewLineSize; 1471 1472 // If the char that we finally got was a \n, then we must have had 1473 // something like \<newline><newline>. We don't want to consume the 1474 // second newline. 1475 if (*Ptr == '\n' || *Ptr == '\r' || *Ptr == '\0') 1476 return ' '; 1477 1478 // Use slow version to accumulate a correct size field. 1479 return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts); 1480 } 1481 1482 // Otherwise, this is not an escaped newline, just return the slash. 1483 return '\\'; 1484 } 1485 1486 // If this is a trigraph, process it. 1487 if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 1488 // If this is actually a legal trigraph (not something like "??x"), return 1489 // it. 1490 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 1491 Ptr += 3; 1492 Size += 3; 1493 if (C == '\\') goto Slash; 1494 return C; 1495 } 1496 } 1497 1498 // If this is neither, return a single character. 1499 ++Size; 1500 return *Ptr; 1501} 1502 1503//===----------------------------------------------------------------------===// 1504// Helper methods for lexing. 1505//===----------------------------------------------------------------------===// 1506 1507/// \brief Routine that indiscriminately skips bytes in the source file. 
void Lexer::SkipBytes(unsigned Bytes, bool StartOfLine) {
  BufferPtr += Bytes;
  // Never advance past the end of the buffer.
  if (BufferPtr > BufferEnd)
    BufferPtr = BufferEnd;
  IsAtStartOfLine = StartOfLine;
}

namespace {
  // An inclusive range of code points allowed in identifiers via UCNs.
  struct UCNCharRange {
    uint32_t Lower;
    uint32_t Upper;
  };

  // C11 D.1, C++11 [charname.allowed]
  // FIXME: C99 and C++03 each have a different set of allowed UCNs.
  // Ranges are sorted ascending so isAllowedIDChar can binary search.
  const UCNCharRange UCNAllowedCharRanges[] = {
    // 1
    { 0x00A8, 0x00A8 }, { 0x00AA, 0x00AA }, { 0x00AD, 0x00AD },
    { 0x00AF, 0x00AF }, { 0x00B2, 0x00B5 }, { 0x00B7, 0x00BA },
    { 0x00BC, 0x00BE }, { 0x00C0, 0x00D6 }, { 0x00D8, 0x00F6 },
    { 0x00F8, 0x00FF },
    // 2
    { 0x0100, 0x167F }, { 0x1681, 0x180D }, { 0x180F, 0x1FFF },
    // 3
    { 0x200B, 0x200D }, { 0x202A, 0x202E }, { 0x203F, 0x2040 },
    { 0x2054, 0x2054 }, { 0x2060, 0x206F },
    // 4
    { 0x2070, 0x218F }, { 0x2460, 0x24FF }, { 0x2776, 0x2793 },
    { 0x2C00, 0x2DFF }, { 0x2E80, 0x2FFF },
    // 5
    { 0x3004, 0x3007 }, { 0x3021, 0x302F }, { 0x3031, 0x303F },
    // 6
    { 0x3040, 0xD7FF },
    // 7
    { 0xF900, 0xFD3D }, { 0xFD40, 0xFDCF }, { 0xFDF0, 0xFE44 },
    { 0xFE47, 0xFFFD },
    // 8
    { 0x10000, 0x1FFFD }, { 0x20000, 0x2FFFD }, { 0x30000, 0x3FFFD },
    { 0x40000, 0x4FFFD }, { 0x50000, 0x5FFFD }, { 0x60000, 0x6FFFD },
    { 0x70000, 0x7FFFD }, { 0x80000, 0x8FFFD }, { 0x90000, 0x9FFFD },
    { 0xA0000, 0xAFFFD }, { 0xB0000, 0xBFFFD }, { 0xC0000, 0xCFFFD },
    { 0xD0000, 0xDFFFD }, { 0xE0000, 0xEFFFD }
  };
}

// Return true if code point \p c is allowed anywhere in an identifier
// (per the table above).
static bool isAllowedIDChar(uint32_t c) {
  unsigned LowPoint = 0;
  unsigned HighPoint = llvm::array_lengthof(UCNAllowedCharRanges);

  // Binary search the UCNAllowedCharRanges set.
  while (HighPoint != LowPoint) {
    unsigned MidPoint = (HighPoint + LowPoint) / 2;
    if (c < UCNAllowedCharRanges[MidPoint].Lower)
      HighPoint = MidPoint;
    else if (c > UCNAllowedCharRanges[MidPoint].Upper)
      LowPoint = MidPoint + 1;
    else
      return true;
  }

  return false;
}

// Return true if code point \p c may appear as the *first* character of an
// identifier (excludes combining-mark ranges).
static bool isAllowedInitiallyIDChar(uint32_t c) {
  // C11 D.2, C++11 [charname.disallowed]
  // FIXME: C99 only forbids "digits", presumably as described in C99 Annex D.
  // FIXME: C++03 does not forbid any initial characters.
  return !(0x0300 <= c && c <= 0x036F) &&
         !(0x1DC0 <= c && c <= 0x1DFF) &&
         !(0x20D0 <= c && c <= 0x20FF) &&
         !(0xFE20 <= c && c <= 0xFE2F);
}

// Return true if byte \p C is a 7-bit ASCII character (high bit clear).
static inline bool isASCII(char C) {
  return static_cast<signed char>(C) >= 0;
}


void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C))
    C = *CurPtr++;

  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  //
  // TODO: Could merge these checks into a CharInfo flag to make the comparison
  // cheaper
  if (isASCII(C) && C != '\\' && C != '?' &&
      (C != '$' || !LangOpts.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
    Result.setRawIdentifierData(IdStart);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode)
      return;

    // Fill in Result.IdentifierInfo and update the token kind,
    // looking up the identifier in the identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      PP->HandleIdentifier(Result);

    return;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!LangOpts.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;

    } else if (C == '\\') {
      // Possible universal-character-name (\uXXXX or \UXXXXXXXX).
      const char *UCNPtr = CurPtr + Size;
      uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/0);
      if (CodePoint == 0 || !isAllowedIDChar(CodePoint))
        goto FinishIdentifier;

      Result.setFlag(Token::HasUCN);
      // If the UCN is written directly (no trigraphs/splices inside it), we
      // can jump straight past it; otherwise re-lex char-by-char so escaped
      // newlines inside the UCN are accounted for.
      if ((UCNPtr - CurPtr == 6 && CurPtr[1] == 'u') ||
          (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
        CurPtr = UCNPtr;
      else
        while (CurPtr != UCNPtr)
          (void)getAndAdvanceChar(CurPtr, Result);

      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isASCII(C)) {
      // Raw UTF-8 in the source: decode one code point and check it is
      // allowed in identifiers.
      const char *UnicodePtr = CurPtr;
      UTF32 CodePoint;
      ConversionResult Result = convertUTF8Sequence((const UTF8 **)&UnicodePtr,
                                                    (const UTF8 *)BufferEnd,
                                                    &CodePoint,
                                                    strictConversion);
      if (Result != conversionOK ||
          !isAllowedIDChar(static_cast<uint32_t>(CodePoint)))
        goto FinishIdentifier;

      CurPtr = UnicodePtr;
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) {
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) {
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}

/// isHexaLiteral - Return true if Start points to a hex constant.
/// in microsoft mode (where this is supposed to be several different tokens).
bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
  unsigned Size;
  char C1 = Lexer::getCharAndSizeNoWarn(Start, Size, LangOpts);
  if (C1 != '0')
    return false;
  char C2 = Lexer::getCharAndSizeNoWarn(Start + Size, Size, LangOpts);
  return (C2 == 'x' || C2 == 'X');
}

/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant.  From[-1] is the first character lexed.  Return the end of the
/// constant.
void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isNumberBody(C)) { // FIXME: UCNs in ud-suffix.
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
    // If we are in Microsoft mode, don't continue if the constant is hex.
    // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
    if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
    // Outside C99, we accept hexadecimal floating point numbers as a
    // not-quite-conforming extension. Only do so if this looks like it's
    // actually meant to be a hexfloat, and not if it has a ud-suffix.
    bool IsHexFloat = true;
    if (!LangOpts.C99) {
      if (!isHexaLiteral(BufferPtr, LangOpts))
        IsHexFloat = false;
      else if (std::find(BufferPtr, CurPtr, '_') != CurPtr)
        IsHexFloat = false; // Contains '_': likely a ud-suffix, not a hexfloat.
    }
    if (IsHexFloat)
      return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
  }

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
}

/// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
/// in C++11, or warn on a ud-suffix in C++98.
const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr) {
  assert(getLangOpts().CPlusPlus);

  // Maximally munch an identifier. FIXME: UCNs.
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  if (isIdentifierHead(C)) {
    if (!getLangOpts().CPlusPlus11) {
      // In C++98 mode, warn that this will become a ud-suffix in C++11, but
      // do not consume it.
      if (!isLexingRawMode())
        Diag(CurPtr,
             C == '_' ? diag::warn_cxx11_compat_user_defined_literal
                      : diag::warn_cxx11_compat_reserved_user_defined_literal)
          << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
      return CurPtr;
    }

    // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
    // that does not start with an underscore is ill-formed. As a conforming
    // extension, we treat all such suffixes as if they had whitespace before
    // them.
    if (C != '_') {
      if (!isLexingRawMode())
        Diag(CurPtr, getLangOpts().MicrosoftMode ?
1763 diag::ext_ms_reserved_user_defined_literal : 1764 diag::ext_reserved_user_defined_literal) 1765 << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " "); 1766 return CurPtr; 1767 } 1768 1769 Result.setFlag(Token::HasUDSuffix); 1770 do { 1771 CurPtr = ConsumeChar(CurPtr, Size, Result); 1772 C = getCharAndSize(CurPtr, Size); 1773 } while (isIdentifierBody(C)); 1774 } 1775 return CurPtr; 1776} 1777 1778/// LexStringLiteral - Lex the remainder of a string literal, after having lexed 1779/// either " or L" or u8" or u" or U". 1780void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, 1781 tok::TokenKind Kind) { 1782 const char *NulCharacter = 0; // Does this string contain the \0 character? 1783 1784 if (!isLexingRawMode() && 1785 (Kind == tok::utf8_string_literal || 1786 Kind == tok::utf16_string_literal || 1787 Kind == tok::utf32_string_literal)) 1788 Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal); 1789 1790 char C = getAndAdvanceChar(CurPtr, Result); 1791 while (C != '"') { 1792 // Skip escaped characters. Escaped newlines will already be processed by 1793 // getAndAdvanceChar. 1794 if (C == '\\') 1795 C = getAndAdvanceChar(CurPtr, Result); 1796 1797 if (C == '\n' || C == '\r' || // Newline. 1798 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 1799 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 1800 Diag(BufferPtr, diag::ext_unterminated_string); 1801 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1802 return; 1803 } 1804 1805 if (C == 0) { 1806 if (isCodeCompletionPoint(CurPtr-1)) { 1807 PP->CodeCompleteNaturalLanguage(); 1808 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1809 return cutOffLexing(); 1810 } 1811 1812 NulCharacter = CurPtr-1; 1813 } 1814 C = getAndAdvanceChar(CurPtr, Result); 1815 } 1816 1817 // If we are in C++11, lex the optional ud-suffix. 1818 if (getLangOpts().CPlusPlus) 1819 CurPtr = LexUDSuffix(Result, CurPtr); 1820 1821 // If a nul character existed in the string, warn about it. 
1822 if (NulCharacter && !isLexingRawMode()) 1823 Diag(NulCharacter, diag::null_in_string); 1824 1825 // Update the location of the token as well as the BufferPtr instance var. 1826 const char *TokStart = BufferPtr; 1827 FormTokenWithChars(Result, CurPtr, Kind); 1828 Result.setLiteralData(TokStart); 1829} 1830 1831/// LexRawStringLiteral - Lex the remainder of a raw string literal, after 1832/// having lexed R", LR", u8R", uR", or UR". 1833void Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr, 1834 tok::TokenKind Kind) { 1835 // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3: 1836 // Between the initial and final double quote characters of the raw string, 1837 // any transformations performed in phases 1 and 2 (trigraphs, 1838 // universal-character-names, and line splicing) are reverted. 1839 1840 if (!isLexingRawMode()) 1841 Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal); 1842 1843 unsigned PrefixLen = 0; 1844 1845 while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen])) 1846 ++PrefixLen; 1847 1848 // If the last character was not a '(', then we didn't lex a valid delimiter. 1849 if (CurPtr[PrefixLen] != '(') { 1850 if (!isLexingRawMode()) { 1851 const char *PrefixEnd = &CurPtr[PrefixLen]; 1852 if (PrefixLen == 16) { 1853 Diag(PrefixEnd, diag::err_raw_delim_too_long); 1854 } else { 1855 Diag(PrefixEnd, diag::err_invalid_char_raw_delim) 1856 << StringRef(PrefixEnd, 1); 1857 } 1858 } 1859 1860 // Search for the next '"' in hopes of salvaging the lexer. Unfortunately, 1861 // it's possible the '"' was intended to be part of the raw string, but 1862 // there's not much we can do about that. 
1863 while (1) { 1864 char C = *CurPtr++; 1865 1866 if (C == '"') 1867 break; 1868 if (C == 0 && CurPtr-1 == BufferEnd) { 1869 --CurPtr; 1870 break; 1871 } 1872 } 1873 1874 FormTokenWithChars(Result, CurPtr, tok::unknown); 1875 return; 1876 } 1877 1878 // Save prefix and move CurPtr past it 1879 const char *Prefix = CurPtr; 1880 CurPtr += PrefixLen + 1; // skip over prefix and '(' 1881 1882 while (1) { 1883 char C = *CurPtr++; 1884 1885 if (C == ')') { 1886 // Check for prefix match and closing quote. 1887 if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') { 1888 CurPtr += PrefixLen + 1; // skip over prefix and '"' 1889 break; 1890 } 1891 } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file. 1892 if (!isLexingRawMode()) 1893 Diag(BufferPtr, diag::err_unterminated_raw_string) 1894 << StringRef(Prefix, PrefixLen); 1895 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1896 return; 1897 } 1898 } 1899 1900 // If we are in C++11, lex the optional ud-suffix. 1901 if (getLangOpts().CPlusPlus) 1902 CurPtr = LexUDSuffix(Result, CurPtr); 1903 1904 // Update the location of token as well as BufferPtr. 1905 const char *TokStart = BufferPtr; 1906 FormTokenWithChars(Result, CurPtr, Kind); 1907 Result.setLiteralData(TokStart); 1908} 1909 1910/// LexAngledStringLiteral - Lex the remainder of an angled string literal, 1911/// after having lexed the '<' character. This is used for #include filenames. 1912void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 1913 const char *NulCharacter = 0; // Does this string contain the \0 character? 1914 const char *AfterLessPos = CurPtr; 1915 char C = getAndAdvanceChar(CurPtr, Result); 1916 while (C != '>') { 1917 // Skip escaped characters. 1918 if (C == '\\') { 1919 // Skip the escaped character. 1920 getAndAdvanceChar(CurPtr, Result); 1921 } else if (C == '\n' || C == '\r' || // Newline. 1922 (C == 0 && (CurPtr-1 == BufferEnd || // End of file. 
1923 isCodeCompletionPoint(CurPtr-1)))) { 1924 // If the filename is unterminated, then it must just be a lone < 1925 // character. Return this as such. 1926 FormTokenWithChars(Result, AfterLessPos, tok::less); 1927 return; 1928 } else if (C == 0) { 1929 NulCharacter = CurPtr-1; 1930 } 1931 C = getAndAdvanceChar(CurPtr, Result); 1932 } 1933 1934 // If a nul character existed in the string, warn about it. 1935 if (NulCharacter && !isLexingRawMode()) 1936 Diag(NulCharacter, diag::null_in_string); 1937 1938 // Update the location of token as well as BufferPtr. 1939 const char *TokStart = BufferPtr; 1940 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 1941 Result.setLiteralData(TokStart); 1942} 1943 1944 1945/// LexCharConstant - Lex the remainder of a character constant, after having 1946/// lexed either ' or L' or u' or U'. 1947void Lexer::LexCharConstant(Token &Result, const char *CurPtr, 1948 tok::TokenKind Kind) { 1949 const char *NulCharacter = 0; // Does this character contain the \0 character? 1950 1951 if (!isLexingRawMode() && 1952 (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)) 1953 Diag(BufferPtr, diag::warn_cxx98_compat_unicode_literal); 1954 1955 char C = getAndAdvanceChar(CurPtr, Result); 1956 if (C == '\'') { 1957 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 1958 Diag(BufferPtr, diag::ext_empty_character); 1959 FormTokenWithChars(Result, CurPtr, tok::unknown); 1960 return; 1961 } 1962 1963 while (C != '\'') { 1964 // Skip escaped characters. 1965 if (C == '\\') 1966 C = getAndAdvanceChar(CurPtr, Result); 1967 1968 if (C == '\n' || C == '\r' || // Newline. 1969 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
1970 if (!isLexingRawMode() && !LangOpts.AsmPreprocessor) 1971 Diag(BufferPtr, diag::ext_unterminated_char); 1972 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1973 return; 1974 } 1975 1976 if (C == 0) { 1977 if (isCodeCompletionPoint(CurPtr-1)) { 1978 PP->CodeCompleteNaturalLanguage(); 1979 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 1980 return cutOffLexing(); 1981 } 1982 1983 NulCharacter = CurPtr-1; 1984 } 1985 C = getAndAdvanceChar(CurPtr, Result); 1986 } 1987 1988 // If we are in C++11, lex the optional ud-suffix. 1989 if (getLangOpts().CPlusPlus) 1990 CurPtr = LexUDSuffix(Result, CurPtr); 1991 1992 // If a nul character existed in the character, warn about it. 1993 if (NulCharacter && !isLexingRawMode()) 1994 Diag(NulCharacter, diag::null_in_char); 1995 1996 // Update the location of token as well as BufferPtr. 1997 const char *TokStart = BufferPtr; 1998 FormTokenWithChars(Result, CurPtr, Kind); 1999 Result.setLiteralData(TokStart); 2000} 2001 2002/// SkipWhitespace - Efficiently skip over a series of whitespace characters. 2003/// Update BufferPtr to point to the next non-whitespace character and return. 2004/// 2005/// This method forms a token and returns true if KeepWhitespaceMode is enabled. 2006/// 2007bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { 2008 // Whitespace - Skip it, then return the token after the whitespace. 2009 unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. 2010 while (1) { 2011 // Skip horizontal whitespace very aggressively. 2012 while (isHorizontalWhitespace(Char)) 2013 Char = *++CurPtr; 2014 2015 // Otherwise if we have something other than whitespace, we're done. 2016 if (Char != '\n' && Char != '\r') 2017 break; 2018 2019 if (ParsingPreprocessorDirective) { 2020 // End of preprocessor directive line, let LexTokenInternal handle this. 2021 BufferPtr = CurPtr; 2022 return false; 2023 } 2024 2025 // ok, but handle newline. 2026 // The returned token is at the start of the line. 
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);
    Char = *++CurPtr;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  if (PrevChar != '\n' && PrevChar != '\r')
    Result.setFlag(Token::LeadingSpace);

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// We have just read the // characters from input.  Skip until we find the
/// newline character thats terminate the comment.  Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LangOpts.LineComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LangOpts.LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  char C;
  do {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      // NOTE(review): this backwards scan over horizontal whitespace has no
      // explicit lower bound; it relies on the "//" opener preceding any
      // newline we can reach here -- confirm before refactoring.
      while (isHorizontalWhitespace(*EscapePtr)) // Skip whitespace.
        --EscapePtr;

      if (*EscapePtr == '\\') // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?') // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    // getAndAdvanceChar advanced past the end-of-buffer nul; step back.
    if (CurPtr == BufferEnd+1) {
      --CurPtr;
      break;
    }

    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

  } while (C != '\n' && C != '\r');

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// If in save-comment mode, package up this Line comment in an appropriate
/// way and return it.
bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective || LexingRawMode)
    return true;

  // If this Line-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  bool Invalid = false;
  std::string Spelling = PP->getSpelling(Result, &Invalid);
  if (Invalid)
    return true;

  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(Spelling, Result,
                   Result.getLocation(), Result.getLocation());
  return true;
}

/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is part of an escaped newline sequence.  Issue
/// a diagnostic if so.  We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.
  // NOTE(review): nul bytes are deliberately skipped here alongside
  // whitespace, and the scan has no lower bound; it relies on "/*" preceding
  // this position -- confirm before refactoring.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a slash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a slash, is it the ?? / trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getLangOpts().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// We have just read from input the / and * characters that started a comment.
/// Read until we find the * and / characters that terminate the comment.
/// Note that we don't bother decoding trigraphs or escaped newlines in block
/// comments, because they cannot cause the comment to end.  The only thing
/// that can happen is the comment could end with an escaped newline between
/// the terminating * and /.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd &&
        // If there is a code-completion point avoid the fast scan because it
        // doesn't check for '\0'.
        !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; the movemask gives one bit
      // per byte, so CountTrailingZeros finds the first match.
      __m128i Slashes = _mm_set1_epi8('/');
      while (CurPtr+16 <= BufferEnd) {
        int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
                                                   Slashes));
        if (cmp != 0) {
          // Adjust the pointer to point directly after the first slash. It's
          // not necessary to set C here, it will be overwritten at the end of
          // the outer loop.
          CurPtr += llvm::CountTrailingZeros_32(cmp) + 1;
          goto FoundSlash;
        }
        CurPtr += 16;
      }
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

    if (C == '/') {
  FoundSlash:
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }

    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (1) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      if (Result)
        Result->push_back(Char);
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        if (isCodeCompletionPoint(CurPtr-1)) {
          PP->CodeCompleteNaturalLanguage();
          cutOffLexing();
          return;
        }

        // Nope, normal character, continue.
        if (Result)
          Result->push_back(Char);
        break;
      }
      // FALL THROUGH.
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOD transition.
      Lex(Tmp);
      if (Tmp.is(tok::code_completion)) {
        if (PP)
          PP->CodeCompleteNaturalLanguage();
        Lex(Tmp);
      }
      assert(Tmp.is(tok::eod) && "Unexpected token!");

      // Finally, we're done;
      return;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eod);

    // Restore comment saving mode, in case it was disabled for directive.
    SetCommentRetentionState(PP->getCommentRetentionState());
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  // Issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.
  while (!ConditionalStack.empty()) {
    if (PP->getCodeCompletionFileLoc() != FileLoc)
      PP->Diag(ConditionalStack.back().IfLoc,
               diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
2541 if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) 2542 Diag(BufferEnd, LangOpts.CPlusPlus11 ? // C++11 [lex.phases] 2.2 p2 2543 diag::warn_cxx98_compat_no_newline_eof : diag::ext_no_newline_eof) 2544 << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n"); 2545 2546 BufferPtr = CurPtr; 2547 2548 // Finally, let the preprocessor handle this. 2549 return PP->HandleEndOfFile(Result, isPragmaLexer()); 2550} 2551 2552/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from 2553/// the specified lexer will return a tok::l_paren token, 0 if it is something 2554/// else and 2 if there are no more tokens in the buffer controlled by the 2555/// lexer. 2556unsigned Lexer::isNextPPTokenLParen() { 2557 assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?"); 2558 2559 // Switch to 'skipping' mode. This will ensure that we can lex a token 2560 // without emitting diagnostics, disables macro expansion, and will cause EOF 2561 // to return an EOF token instead of popping the include stack. 2562 LexingRawMode = true; 2563 2564 // Save state that can be changed while lexing so that we can restore it. 2565 const char *TmpBufferPtr = BufferPtr; 2566 bool inPPDirectiveMode = ParsingPreprocessorDirective; 2567 2568 Token Tok; 2569 Tok.startToken(); 2570 LexTokenInternal(Tok); 2571 2572 // Restore state that may have changed. 2573 BufferPtr = TmpBufferPtr; 2574 ParsingPreprocessorDirective = inPPDirectiveMode; 2575 2576 // Restore the lexer back to non-skipping mode. 2577 LexingRawMode = false; 2578 2579 if (Tok.is(tok::eof)) 2580 return 2; 2581 return Tok.is(tok::l_paren); 2582} 2583 2584/// \brief Find the end of a version control conflict marker. 2585static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd, 2586 ConflictMarkerKind CMK) { 2587 const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>"; 2588 size_t TermLen = CMK == CMK_Perforce ? 
5 : 7; 2589 StringRef RestOfBuffer(CurPtr+TermLen, BufferEnd-CurPtr-TermLen); 2590 size_t Pos = RestOfBuffer.find(Terminator); 2591 while (Pos != StringRef::npos) { 2592 // Must occur at start of line. 2593 if (RestOfBuffer[Pos-1] != '\r' && 2594 RestOfBuffer[Pos-1] != '\n') { 2595 RestOfBuffer = RestOfBuffer.substr(Pos+TermLen); 2596 Pos = RestOfBuffer.find(Terminator); 2597 continue; 2598 } 2599 return RestOfBuffer.data()+Pos; 2600 } 2601 return 0; 2602} 2603 2604/// IsStartOfConflictMarker - If the specified pointer is the start of a version 2605/// control conflict marker like '<<<<<<<', recognize it as such, emit an error 2606/// and recover nicely. This returns true if it is a conflict marker and false 2607/// if not. 2608bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 2609 // Only a conflict marker if it starts at the beginning of a line. 2610 if (CurPtr != BufferStart && 2611 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2612 return false; 2613 2614 // Check to see if we have <<<<<<< or >>>>. 2615 if ((BufferEnd-CurPtr < 8 || StringRef(CurPtr, 7) != "<<<<<<<") && 2616 (BufferEnd-CurPtr < 6 || StringRef(CurPtr, 5) != ">>>> ")) 2617 return false; 2618 2619 // If we have a situation where we don't care about conflict markers, ignore 2620 // it. 2621 if (CurrentConflictMarkerState || isLexingRawMode()) 2622 return false; 2623 2624 ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce; 2625 2626 // Check to see if there is an ending marker somewhere in the buffer at the 2627 // start of a line to terminate this conflict marker. 2628 if (FindConflictEnd(CurPtr, BufferEnd, Kind)) { 2629 // We found a match. We are really in a conflict marker. 2630 // Diagnose this, and ignore to the end of line. 2631 Diag(CurPtr, diag::err_conflict_marker); 2632 CurrentConflictMarkerState = Kind; 2633 2634 // Skip ahead to the end of line. We know this exists because the 2635 // end-of-conflict marker starts with \r or \n. 
2636 while (*CurPtr != '\r' && *CurPtr != '\n') { 2637 assert(CurPtr != BufferEnd && "Didn't find end of line"); 2638 ++CurPtr; 2639 } 2640 BufferPtr = CurPtr; 2641 return true; 2642 } 2643 2644 // No end of conflict marker found. 2645 return false; 2646} 2647 2648 2649/// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if 2650/// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it 2651/// is the end of a conflict marker. Handle it by ignoring up until the end of 2652/// the line. This returns true if it is a conflict marker and false if not. 2653bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 2654 // Only a conflict marker if it starts at the beginning of a line. 2655 if (CurPtr != BufferStart && 2656 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 2657 return false; 2658 2659 // If we have a situation where we don't care about conflict markers, ignore 2660 // it. 2661 if (!CurrentConflictMarkerState || isLexingRawMode()) 2662 return false; 2663 2664 // Check to see if we have the marker (4 characters in a row). 2665 for (unsigned i = 1; i != 4; ++i) 2666 if (CurPtr[i] != CurPtr[0]) 2667 return false; 2668 2669 // If we do have it, search for the end of the conflict marker. This could 2670 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 2671 // be the end of conflict marker. 2672 if (const char *End = FindConflictEnd(CurPtr, BufferEnd, 2673 CurrentConflictMarkerState)) { 2674 CurPtr = End; 2675 2676 // Skip ahead to the end of line. 2677 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 2678 ++CurPtr; 2679 2680 BufferPtr = CurPtr; 2681 2682 // No longer in the conflict marker. 
2683 CurrentConflictMarkerState = CMK_None; 2684 return true; 2685 } 2686 2687 return false; 2688} 2689 2690bool Lexer::isCodeCompletionPoint(const char *CurPtr) const { 2691 if (PP && PP->isCodeCompletionEnabled()) { 2692 SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart); 2693 return Loc == PP->getCodeCompletionLoc(); 2694 } 2695 2696 return false; 2697} 2698 2699uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc, 2700 Token *Result) { 2701 assert(LangOpts.CPlusPlus || LangOpts.C99); 2702 2703 unsigned CharSize; 2704 char Kind = getCharAndSize(StartPtr, CharSize); 2705 2706 unsigned NumHexDigits; 2707 if (Kind == 'u') 2708 NumHexDigits = 4; 2709 else if (Kind == 'U') 2710 NumHexDigits = 8; 2711 else 2712 return 0; 2713 2714 const char *CurPtr = StartPtr + CharSize; 2715 const char *KindLoc = &CurPtr[-1]; 2716 2717 uint32_t CodePoint = 0; 2718 for (unsigned i = 0; i < NumHexDigits; ++i) { 2719 char C = getCharAndSize(CurPtr, CharSize); 2720 2721 unsigned Value = llvm::hexDigitValue(C); 2722 if (Value == -1U) { 2723 if (Result && !isLexingRawMode()) { 2724 if (i == 0) { 2725 Diag(BufferPtr, diag::warn_ucn_escape_no_digits) 2726 << StringRef(KindLoc, 1); 2727 } else { 2728 Diag(BufferPtr, diag::warn_ucn_escape_incomplete); 2729 2730 // If the user wrote \U1234, suggest a fixit to \u. 
2731 if (i == 4 && NumHexDigits == 8) { 2732 CharSourceRange URange = 2733 CharSourceRange::getCharRange(getSourceLocation(KindLoc), 2734 getSourceLocation(KindLoc + 1)); 2735 Diag(KindLoc, diag::note_ucn_four_not_eight) 2736 << FixItHint::CreateReplacement(URange, "u"); 2737 } 2738 } 2739 } 2740 2741 return 0; 2742 } 2743 2744 CodePoint <<= 4; 2745 CodePoint += Value; 2746 2747 CurPtr += CharSize; 2748 } 2749 2750 if (Result) { 2751 Result->setFlag(Token::HasUCN); 2752 if (CurPtr - StartPtr == NumHexDigits + 2) 2753 StartPtr = CurPtr; 2754 else 2755 while (StartPtr != CurPtr) 2756 (void)getAndAdvanceChar(StartPtr, *Result); 2757 } else { 2758 StartPtr = CurPtr; 2759 } 2760 2761 // C99 6.4.3p2: A universal character name shall not specify a character whose 2762 // short identifier is less than 00A0 other than 0024 ($), 0040 (@), or 2763 // 0060 (`), nor one in the range D800 through DFFF inclusive.) 2764 // C++11 [lex.charset]p2: If the hexadecimal value for a 2765 // universal-character-name corresponds to a surrogate code point (in the 2766 // range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally, 2767 // if the hexadecimal value for a universal-character-name outside the 2768 // c-char-sequence, s-char-sequence, or r-char-sequence of a character or 2769 // string literal corresponds to a control character (in either of the 2770 // ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the 2771 // basic source character set, the program is ill-formed. 2772 if (CodePoint < 0xA0) { 2773 if (CodePoint == 0x24 || CodePoint == 0x40 || CodePoint == 0x60) 2774 return CodePoint; 2775 2776 // We don't use isLexingRawMode() here because we need to warn about bad 2777 // UCNs even when skipping preprocessing tokens in a #if block. 
2778 if (Result && PP) { 2779 if (CodePoint < 0x20 || CodePoint >= 0x7F) 2780 Diag(BufferPtr, diag::err_ucn_control_character); 2781 else { 2782 char C = static_cast<char>(CodePoint); 2783 Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1); 2784 } 2785 } 2786 2787 return 0; 2788 2789 } else if ((!LangOpts.CPlusPlus || LangOpts.CPlusPlus11) && 2790 (CodePoint >= 0xD800 && CodePoint <= 0xDFFF)) { 2791 // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't. 2792 // We don't use isLexingRawMode() here because we need to warn about bad 2793 // UCNs even when skipping preprocessing tokens in a #if block. 2794 if (Result && PP) 2795 Diag(BufferPtr, diag::err_ucn_escape_invalid); 2796 return 0; 2797 } 2798 2799 return CodePoint; 2800} 2801 2802static bool isUnicodeWhitespace(uint32_t C) { 2803 return (C == 0x0085 || C == 0x00A0 || C == 0x1680 || 2804 C == 0x180E || (C >= 0x2000 && C <= 0x200A) || 2805 C == 0x2028 || C == 0x2029 || C == 0x202F || 2806 C == 0x205F || C == 0x3000); 2807} 2808 2809void Lexer::LexUnicode(Token &Result, uint32_t C, const char *CurPtr) { 2810 if (isUnicodeWhitespace(C)) { 2811 if (!isLexingRawMode()) { 2812 CharSourceRange CharRange = 2813 CharSourceRange::getCharRange(getSourceLocation(), 2814 getSourceLocation(CurPtr)); 2815 Diag(BufferPtr, diag::ext_unicode_whitespace) 2816 << CharRange; 2817 } 2818 2819 Result.setFlag(Token::LeadingSpace); 2820 if (SkipWhitespace(Result, CurPtr)) 2821 return; // KeepWhitespaceMode 2822 2823 return LexTokenInternal(Result); 2824 } 2825 2826 if (isAllowedIDChar(C) && isAllowedInitiallyIDChar(C)) { 2827 MIOpt.ReadToken(); 2828 return LexIdentifier(Result, CurPtr); 2829 } 2830 2831 if (!isASCII(*BufferPtr) && !isAllowedIDChar(C)) { 2832 // Non-ASCII characters tend to creep into source code unintentionally. 2833 // Instead of letting the parser complain about the unknown token, 2834 // just drop the character. 
    // Note that we can /only/ do this when the non-ASCII character is actually
    // spelled as Unicode, not written as a UCN. The standard requires that
    // we not throw away any possible preprocessor tokens, but there's a
    // loophole in the mapping of Unicode characters to basic character set
    // characters that allows us to map these particular characters to, say,
    // whitespace.
    if (!isLexingRawMode()) {
      CharSourceRange CharRange =
        CharSourceRange::getCharRange(getSourceLocation(),
                                      getSourceLocation(CurPtr));
      Diag(BufferPtr, diag::err_non_ascii)
        << FixItHint::CreateRemoval(CharRange);
    }

    // Drop the character and lex whatever follows in its place.
    BufferPtr = CurPtr;
    return LexTokenInternal(Result);
  }

  // Otherwise, we have an explicit UCN or a character that's unlikely to show
  // up by accident.
  MIOpt.ReadToken();
  FormTokenWithChars(Result, CurPtr, tok::unknown);
}


/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  This returns a preprocessing
/// token, not a normal token, as such, it is an internal interface.  It assumes
/// that the Flags of result have been cleared before calling this.
///
/// Structurally this is one big switch on the first character of the token;
/// cases that consume only whitespace/comments loop back via the LexNextToken
/// label instead of recursing (GCC isn't tail call eliminating).
void Lexer::LexTokenInternal(Token &Result) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(0);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?  (The buffer is guaranteed NUL-terminated, so a NUL
    // at BufferEnd means real EOF; elsewhere it's an embedded null.)
    if (CurPtr-1 == BufferEnd) {
      // Read the PP instance variable into an automatic variable, because
      // LexEndOfFile will often delete 'this'.
      Preprocessor *PPCache = PP;
      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
        return;   // Got a token to return.
      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
      return PPCache->Lex(Result);
    }

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return;
    }

    // An embedded null is diagnosed and otherwise treated as whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

    goto LexNextToken;   // GCC isn't tail call eliminating.

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt) {
      // Read the PP instance variable into an automatic variable, because
      // LexEndOfFile will often delete 'this'.
      Preprocessor *PPCache = PP;
      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
        return;   // Got a token to return.
      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
      return PPCache->Lex(Result);
    }
    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\n':
  case '\r':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        SetCommentRetentionState(PP->getCommentRetentionState());

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;

      Kind = tok::eod;
      break;
    }
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode
    goto LexNextToken;   // GCC isn't tail call eliminating.
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.LineComment && !LangOpts.TraditionalCPP) {
      if (SkipLineComment(Result, CurPtr+2))
        return; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2))
        return; // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    goto LexNextToken;   // GCC isn't tail call eliminating.

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u':   // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);

        if (Char2 == 'R') {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U':   // Identifier (Uber) or C++0x UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr, tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // '.5' etc. is a floating constant.
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      // NOTE(review): advances with CurPtr += SizeTmp rather than ConsumeChar
      // as the sibling cases do — verify this is intentional for '.*'.
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
      // this as "foo / bar" and langauges with Line comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this in -traditional-cpp mode.
      if ((LangOpts.LineComment ||
           getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') &&
          !LangOpts.TraditionalCPP) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
          return; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
        return; // There is a token to return.
      goto LexNextToken;   // GCC isn't tail call eliminating.
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // FIXME: -fpreprocessed mode??
        if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // Inside a #include, '<' starts an angled header name.
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }

    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.CPlusPlus && Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // FIXME: -fpreprocessed mode??
      if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && LangOpts.ObjC1)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result))
      return LexUnicode(Result, CodePoint, CurPtr);

    Kind = tok::unknown;
    break;

  default: {
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    ConversionResult Status = convertUTF8Sequence((const UTF8 **)&CurPtr,
                                                  (const UTF8 *)BufferEnd,
                                                  &CodePoint,
                                                  strictConversion);
    if (Status == conversionOK)
      return LexUnicode(Result, CodePoint, CurPtr);

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just warn that we don't have valid UTF-8, then drop the character.
    if (!isLexingRawMode())
      Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    goto LexNextToken;
  }
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  // As an optimization, if the preprocessor didn't switch lexers, tail
  // recurse.
  if (PP->isCurrentLexer(this)) {
    // Start a new token.  If this is a #include or something, the PP may
    // want us starting at the beginning of the line again.  If so, set
    // the StartOfLine flag and clear LeadingSpace.
    if (IsAtStartOfLine) {
      Result.setFlag(Token::StartOfLine);
      Result.clearFlag(Token::LeadingSpace);
      IsAtStartOfLine = false;
    }
    goto LexNextToken;   // GCC isn't tail call eliminating.
  }
  return PP->Lex(Result);
}