Lexer.cpp revision e91e93225db2e66906878513c6ef4dd6a7ee2b6a
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the Lexer and Token interfaces. 11// 12//===----------------------------------------------------------------------===// 13// 14// TODO: GCC Diagnostics emitted by the lexer: 15// PEDWARN: (form feed|vertical tab) in preprocessing directive 16// 17// Universal characters, unicode, char mapping: 18// WARNING: `%.*s' is not in NFKC 19// WARNING: `%.*s' is not in NFC 20// 21// Other: 22// TODO: Options to support: 23// -fexec-charset,-fwide-exec-charset 24// 25//===----------------------------------------------------------------------===// 26 27#include "clang/Lex/Lexer.h" 28#include "clang/Lex/Preprocessor.h" 29#include "clang/Lex/LexDiagnostic.h" 30#include "clang/Basic/SourceManager.h" 31#include "llvm/Support/Compiler.h" 32#include "llvm/Support/MemoryBuffer.h" 33#include <cctype> 34using namespace clang; 35 36static void InitCharacterInfo(); 37 38//===----------------------------------------------------------------------===// 39// Token Class Implementation 40//===----------------------------------------------------------------------===// 41 42/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 43bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 44 if (IdentifierInfo *II = getIdentifierInfo()) 45 return II->getObjCKeywordID() == objcKey; 46 return false; 47} 48 49/// getObjCKeywordID - Return the ObjC keyword kind. 50tok::ObjCKeywordKind Token::getObjCKeywordID() const { 51 IdentifierInfo *specId = getIdentifierInfo(); 52 return specId ? 
specId->getObjCKeywordID() : tok::objc_not_keyword; 53} 54 55 56//===----------------------------------------------------------------------===// 57// Lexer Class Implementation 58//===----------------------------------------------------------------------===// 59 60void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 61 const char *BufEnd) { 62 InitCharacterInfo(); 63 64 BufferStart = BufStart; 65 BufferPtr = BufPtr; 66 BufferEnd = BufEnd; 67 68 assert(BufEnd[0] == 0 && 69 "We assume that the input buffer has a null character at the end" 70 " to simplify lexing!"); 71 72 Is_PragmaLexer = false; 73 74 // Start of the file is a start of line. 75 IsAtStartOfLine = true; 76 77 // We are not after parsing a #. 78 ParsingPreprocessorDirective = false; 79 80 // We are not after parsing #include. 81 ParsingFilename = false; 82 83 // We are not in raw mode. Raw mode disables diagnostics and interpretation 84 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 85 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 86 // or otherwise skipping over tokens. 87 LexingRawMode = false; 88 89 // Default to not keeping comments. 90 ExtendedTokenMode = 0; 91} 92 93/// Lexer constructor - Create a new lexer object for the specified buffer 94/// with the specified preprocessor managing the lexing process. This lexer 95/// assumes that the associated file buffer and Preprocessor objects will 96/// outlive it, so it doesn't take ownership of either of them. 97Lexer::Lexer(FileID FID, Preprocessor &PP) 98 : PreprocessorLexer(&PP, FID), 99 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 100 Features(PP.getLangOptions()) { 101 102 const llvm::MemoryBuffer *InputFile = PP.getSourceManager().getBuffer(FID); 103 104 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 105 InputFile->getBufferEnd()); 106 107 // Default to keeping comments if the preprocessor wants them. 
108 SetCommentRetentionState(PP.getCommentRetentionState()); 109} 110 111/// Lexer constructor - Create a new raw lexer object. This object is only 112/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 113/// range will outlive it, so it doesn't take ownership of it. 114Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, 115 const char *BufStart, const char *BufPtr, const char *BufEnd) 116 : FileLoc(fileloc), Features(features) { 117 118 InitLexer(BufStart, BufPtr, BufEnd); 119 120 // We *are* in raw mode. 121 LexingRawMode = true; 122} 123 124/// Lexer constructor - Create a new raw lexer object. This object is only 125/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 126/// range will outlive it, so it doesn't take ownership of it. 127Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features) 128 : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { 129 const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID); 130 131 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 132 FromFile->getBufferEnd()); 133 134 // We *are* in raw mode. 135 LexingRawMode = true; 136} 137 138/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 139/// _Pragma expansion. This has a variety of magic semantics that this method 140/// sets up. It returns a new'd Lexer that must be delete'd when done. 141/// 142/// On entrance to this routine, TokStartLoc is a macro location which has a 143/// spelling loc that indicates the bytes to be lexed for the token and an 144/// instantiation location that indicates where all lexed tokens should be 145/// "expanded from". 146/// 147/// FIXME: It would really be nice to make _Pragma just be a wrapper around a 148/// normal lexer that remaps tokens as they fly by. This would require making 149/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 150/// interface that could handle this stuff. 
/// This would pull GetMappedTokenLoc
/// out of the critical path of the lexer!
///
Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
                                 SourceLocation InstantiationLocStart,
                                 SourceLocation InstantiationLocEnd,
                                 unsigned TokLen, Preprocessor &PP) {
  SourceManager &SM = PP.getSourceManager();

  // Create the lexer as if we were going to lex the file normally.
  FileID SpellingFID = SM.getFileID(SpellingLoc);
  Lexer *L = new Lexer(SpellingFID, PP);

  // Now that the lexer is created, change the start/end locations so that we
  // just lex the subsection of the file that we want.  This is lexing from a
  // scratch buffer.
  const char *StrData = SM.getCharacterData(SpellingLoc);

  L->BufferPtr = StrData;
  L->BufferEnd = StrData+TokLen;
  assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");

  // Set the SourceLocation with the remapping information.  This ensures that
  // GetMappedTokenLoc will remap the tokens as they are lexed.
  L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID),
                                         InstantiationLocStart,
                                         InstantiationLocEnd, TokLen);

  // Ensure that the lexer thinks it is inside a directive, so that end \n will
  // return an EOM token.
  L->ParsingPreprocessorDirective = true;

  // This lexer really is for _Pragma.
  L->Is_PragmaLexer = true;
  return L;
}


/// Stringify - Convert the specified string into a C string, with surrounding
/// ""'s, and with escaped \ and " characters.
std::string Lexer::Stringify(const std::string &Str, bool Charify) {
  std::string Result = Str;
  char Quote = Charify ? '\'' : '"';
  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
    if (Result[i] == '\\' || Result[i] == Quote) {
      Result.insert(Result.begin()+i, '\\');
      // Skip over the backslash we just inserted, and account for the grown
      // string so the loop still terminates at the (new) end.
      ++i; ++e;
    }
  }
  return Result;
}

/// Stringify - Convert the specified string into a C string by escaping '\'
/// and " characters.  This does not add surrounding ""'s to the string.
void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
    if (Str[i] == '\\' || Str[i] == '"') {
      Str.insert(Str.begin()+i, '\\');
      // Skip over the inserted backslash and track the new length.
      ++i; ++e;
    }
  }
}


/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
                                   const SourceManager &SM) {
  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
  // all obviously single-char tokens.  This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.

  // If this comes from a macro expansion, we really do want the macro name, not
  // the token this macro expanded to.
  Loc = SM.getInstantiationLoc(Loc);
  std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
  std::pair<const char *,const char *> Buffer = SM.getBufferData(LocInfo.first);
  const char *StrData = Buffer.first+LocInfo.second;

  // Create a langops struct and enable trigraphs.  This is sufficient for
  // measuring tokens.
  LangOptions LangOpts;
  LangOpts.Trigraphs = true;

  // Create a raw lexer starting at the beginning of this token; lexing one
  // token from it gives us its extent.
  Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second);
  Token TheTok;
  TheLexer.LexFromRawLexer(TheTok);
  return TheTok.getLength();
}

//===----------------------------------------------------------------------===//
// Character information.
//===----------------------------------------------------------------------===//

// Per-byte classification flags, filled in lazily by InitCharacterInfo.
static unsigned char CharInfo[256];

enum {
  CHAR_HORZ_WS  = 0x01,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS  = 0x02,  // '\r', '\n'
  CHAR_LETTER   = 0x04,  // a-z,A-Z
  CHAR_NUMBER   = 0x08,  // 0-9
  CHAR_UNDER    = 0x10,  // _
  CHAR_PERIOD   = 0x20   // .
};

static void InitCharacterInfo() {
  static bool isInited = false;
  if (isInited) return;
  isInited = true;

  // Initialize the CharInfo table.
  // TODO: statically initialize this.
  CharInfo[(int)' '] = CharInfo[(int)'\t'] =
  CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
  CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;

  CharInfo[(int)'_'] = CHAR_UNDER;
  CharInfo[(int)'.'] = CHAR_PERIOD;
  for (unsigned i = 'a'; i <= 'z'; ++i)
    CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
  for (unsigned i = '0'; i <= '9'; ++i)
    CharInfo[i] = CHAR_NUMBER;
}

/// isIdentifierBody - Return true if this is the body character of an
/// identifier, which is [a-zA-Z0-9_].
static inline bool isIdentifierBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
}

/// isHorizontalWhitespace - Return true if this character is horizontal
/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
static inline bool isHorizontalWhitespace(unsigned char c) {
  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
}

/// isWhitespace - Return true if this character is horizontal or vertical
/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.
/// Note that this returns false
/// for '\0'.
static inline bool isWhitespace(unsigned char c) {
  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
}

/// isNumberBody - Return true if this is the body character of a
/// preprocessing number, which is [a-zA-Z0-9_.].
static inline bool isNumberBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
    true : false;
}


//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all instantiated at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo,
                                        unsigned TokLen) DISABLE_INLINE;
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo, unsigned TokLen) {
  assert(FileLoc.isMacroID() && "Must be an instantiation");

  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the instantiation location of FileLoc with the
  // spelling location.
  SourceManager &SM = PP.getSourceManager();

  // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
  SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo);

  // Figure out the expansion loc range, which is the range covered by the
  // original _Pragma(...) sequence.
  std::pair<SourceLocation,SourceLocation> II =
    SM.getImmediateInstantiationRange(FileLoc);

  return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc,
                                        unsigned TokLen) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return FileLoc.getFileLocWithOffset(CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
}

/// Diag - Forwarding function for diagnostics.  This translate a source
/// position in the current buffer into a SourceLocation object for rendering.
/// Note: requires a non-null PP, i.e. it must not be called on raw lexers.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  default:   return 0;
  case '=':  return '#';
  case ')':  return ']';
  case '(':  return '[';
  case '!':  return '|';
  case '\'': return '^';
  case '>':  return '}';
  case '/':  return '\\';
  case '<':  return '{';
  case '-':  return '~';
  }
}

/// DecodeTrigraphChar - If the specified character is a legal trigraph when
/// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
/// return the result character.  Finally, emit a warning about trigraph use
/// whether trigraphs are enabled or not.
/// L may be null (raw decoding with no diagnostics); CP points at the letter
/// following the "??" pair, so CP-2 is the location of the trigraph itself.
static char DecodeTrigraphChar(const char *CP, Lexer *L) {
  char Res = GetTrigraphCharForLetter(*CP);
  if (!Res || !L) return Res;

  if (!L->getFeatures().Trigraphs) {
    if (!L->isLexingRawMode())
      L->Diag(CP-2, diag::trigraph_ignored);
    // Trigraphs are disabled: report 'no trigraph' to the caller.
    return 0;
  }

  if (!L->isLexingRawMode())
    L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res;
  return Res;
}

/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
/// get its size, and return it.  This is tricky in several cases:
///   1. If currently at the start of a trigraph, we warn about the trigraph,
///      then either return the trigraph (skipping 3 chars) or the '?',
///      depending on whether trigraphs are enabled or not.
///   2. If this is an escaped newline (potentially with whitespace between
///      the backslash and newline), implicitly skip the newline and return
///      the char after it.
///   3. If this is a UCN, return it.  FIXME: C++ UCN's?
///
/// This handles the slow/uncommon case of the getCharAndSize method.  Here we
/// know that we can accumulate into Size, and that we have already incremented
/// Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
/// be updated to match.
///
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    {
      unsigned SizeTmp = 0;
      do {
        ++SizeTmp;
        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
          // Remember that this token needs to be cleaned.
          if (Tok) Tok->setFlag(Token::NeedsCleaning);

          // Warn if there was whitespace between the backslash and newline.
          if (SizeTmp != 1 && Tok && !isLexingRawMode())
            Diag(Ptr, diag::backslash_newline_space);

          // If this is a \r\n or \n\r, skip the newlines.
          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
              Ptr[SizeTmp-1] != Ptr[SizeTmp])
            ++SizeTmp;

          // Found backslash<whitespace><newline>.  Parse the char after it.
          Size += SizeTmp;
          Ptr += SizeTmp;
          // Use slow version to accumulate a correct size field; the char
          // after an escaped newline may itself be a trigraph or another
          // escaped newline.
          return getCharAndSizeSlow(Ptr, Size, Tok);
        }
      } while (isWhitespace(Ptr[SizeTmp]));
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // ??/ decodes to '\\', which may in turn start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}


/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
/// and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &Features) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    {
      unsigned SizeTmp = 0;
      do {
        ++SizeTmp;
        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {

          // If this is a \r\n or \n\r, skip the newlines.
          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
              Ptr[SizeTmp-1] != Ptr[SizeTmp])
            ++SizeTmp;

          // Found backslash<whitespace><newline>.  Parse the char after it.
          Size += SizeTmp;
          Ptr += SizeTmp;

          // Use slow version to accumulate a correct size field.
          return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
        }
      } while (isWhitespace(Ptr[SizeTmp]));
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // ??/ decodes to '\\', which may in turn start an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C)) {
    C = *CurPtr++;
  }
  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  // FIXME: UCNs.
  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::identifier);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode) return;

    // Fill in Result.IdentifierInfo, looking up the identifier in the
    // identifier table.
    IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart);

    // Change the kind of this identifier to the appropriate token kind, e.g.
    // turning "for" into a keyword.
    Result.setKind(II->getTokenID());

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    if (II->isHandleIdentifierCase())
      PP->HandleIdentifier(Result);
    return;
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!Features.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
      // Found end of identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) { // FIXME: UCNs.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}


/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant.  From[-1] is the first character lexed.  Return the end of the
/// constant.
void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isNumberBody(C)) { // FIXME: UCNs?
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // If we have a hex FP constant, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
      (Features.HexFloats || !Features.NoExtensions))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
  Result.setLiteralData(TokStart);
}

/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
/// either " or L".
645void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { 646 const char *NulCharacter = 0; // Does this string contain the \0 character? 647 648 char C = getAndAdvanceChar(CurPtr, Result); 649 while (C != '"') { 650 // Skip escaped characters. 651 if (C == '\\') { 652 // Skip the escaped character. 653 C = getAndAdvanceChar(CurPtr, Result); 654 } else if (C == '\n' || C == '\r' || // Newline. 655 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 656 if (!isLexingRawMode()) 657 Diag(BufferPtr, diag::err_unterminated_string); 658 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 659 return; 660 } else if (C == 0) { 661 NulCharacter = CurPtr-1; 662 } 663 C = getAndAdvanceChar(CurPtr, Result); 664 } 665 666 // If a nul character existed in the string, warn about it. 667 if (NulCharacter && !isLexingRawMode()) 668 Diag(NulCharacter, diag::null_in_string); 669 670 // Update the location of the token as well as the BufferPtr instance var. 671 const char *TokStart = BufferPtr; 672 FormTokenWithChars(Result, CurPtr, 673 Wide ? tok::wide_string_literal : tok::string_literal); 674 Result.setLiteralData(TokStart); 675} 676 677/// LexAngledStringLiteral - Lex the remainder of an angled string literal, 678/// after having lexed the '<' character. This is used for #include filenames. 679void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 680 const char *NulCharacter = 0; // Does this string contain the \0 character? 681 682 char C = getAndAdvanceChar(CurPtr, Result); 683 while (C != '>') { 684 // Skip escaped characters. 685 if (C == '\\') { 686 // Skip the escaped character. 687 C = getAndAdvanceChar(CurPtr, Result); 688 } else if (C == '\n' || C == '\r' || // Newline. 689 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
690 if (!isLexingRawMode()) 691 Diag(BufferPtr, diag::err_unterminated_angled_string); 692 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 693 return; 694 } else if (C == 0) { 695 NulCharacter = CurPtr-1; 696 } 697 C = getAndAdvanceChar(CurPtr, Result); 698 } 699 700 // If a nul character existed in the string, warn about it. 701 if (NulCharacter && !isLexingRawMode()) 702 Diag(NulCharacter, diag::null_in_string); 703 704 // Update the location of token as well as BufferPtr. 705 const char *TokStart = BufferPtr; 706 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 707 Result.setLiteralData(TokStart); 708} 709 710 711/// LexCharConstant - Lex the remainder of a character constant, after having 712/// lexed either ' or L'. 713void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { 714 const char *NulCharacter = 0; // Does this character contain the \0 character? 715 716 // Handle the common case of 'x' and '\y' efficiently. 717 char C = getAndAdvanceChar(CurPtr, Result); 718 if (C == '\'') { 719 if (!isLexingRawMode()) 720 Diag(BufferPtr, diag::err_empty_character); 721 FormTokenWithChars(Result, CurPtr, tok::unknown); 722 return; 723 } else if (C == '\\') { 724 // Skip the escaped character. 725 // FIXME: UCN's. 726 C = getAndAdvanceChar(CurPtr, Result); 727 } 728 729 if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') { 730 ++CurPtr; 731 } else { 732 // Fall back on generic code for embedded nulls, newlines, wide chars. 733 do { 734 // Skip escaped characters. 735 if (C == '\\') { 736 // Skip the escaped character. 737 C = getAndAdvanceChar(CurPtr, Result); 738 } else if (C == '\n' || C == '\r' || // Newline. 739 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
        // Unterminated character constant (newline or EOF before the closing
        // quote): diagnose (unless in raw mode) and return an 'unknown' token.
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::err_unterminated_char);
        FormTokenWithChars(Result, CurPtr-1, tok::unknown);
        return;
      } else if (C == 0) {
        // Remember the position of an embedded NUL so we can diagnose it once,
        // after the constant has been fully scanned.
        NulCharacter = CurPtr-1;
      }
      C = getAndAdvanceChar(CurPtr, Result);
    } while (C != '\'');
  }

  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_char);

  // Update the location of token as well as BufferPtr.
  const char *TokStart = BufferPtr;
  FormTokenWithChars(Result, CurPtr, tok::char_constant);
  Result.setLiteralData(TokStart);
}

/// SkipWhitespace - Efficiently skip over a series of whitespace characters.
/// Update BufferPtr to point to the next non-whitespace character and return.
///
/// This method forms a token and returns true if KeepWhitespaceMode is enabled.
///
bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) {
  // Whitespace - Skip it, then return the token after the whitespace.
  unsigned char Char = *CurPtr;  // Skip consecutive spaces efficiently.
  while (1) {
    // Skip horizontal whitespace very aggressively.
    while (isHorizontalWhitespace(Char))
      Char = *++CurPtr;

    // Otherwise if we have something other than whitespace, we're done.
    if (Char != '\n' && Char != '\r')
      break;

    if (ParsingPreprocessorDirective) {
      // End of preprocessor directive line, let LexTokenInternal handle this.
      BufferPtr = CurPtr;
      return false;
    }

    // ok, but handle newline.
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);
    Char = *++CurPtr;
  }

  // If this isn't immediately after a newline, there is leading space.
  char PrevChar = CurPtr[-1];
  if (PrevChar != '\n' && PrevChar != '\r')
    Result.setFlag(Token::LeadingSpace);

  // If the client wants us to return whitespace, return it now.
  if (isKeepWhitespaceMode()) {
    FormTokenWithChars(Result, CurPtr, tok::unknown);
    return true;
  }

  BufferPtr = CurPtr;
  return false;
}

/// SkipBCPLComment - We have just read the // characters from input.  Skip
/// until we find the newline character that terminates the comment.  Then
/// update BufferPtr and return.  If we're in KeepCommentMode, this will form
/// the token and return true.
bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) {
  // If BCPL comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!Features.BCPLComment && !isLexingRawMode()) {
    Diag(BufferPtr, diag::ext_bcpl_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    Features.BCPLComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  char C;
  do {
    C = *CurPtr;
    // FIXME: Speedup BCPL comment lexing.  Just scan for a \n or \r character.
    // If we find a \n character, scan backwards, checking to see if it's an
    // escaped newline, like we do for block comments.

    // Skip over characters in the fast loop.
    while (C != 0 &&                // Potentially EOF.
           C != '\\' &&             // Potentially escaped newline.
           C != '?' &&              // Potentially trigraph.
           C != '\n' && C != '\r')  // Newline or DOS-style newline.
      C = *++CurPtr;

    // If this is a newline, we're done.
    if (C == '\n' || C == '\r')
      break;  // Found the newline? Break out!

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.  Raw mode is restored afterwards, whatever it was.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          // NOTE(review): isspace() is called with a plain (possibly signed)
          // char; a negative value here would be UB per <cctype> — confirm
          // inputs are plain ASCII or cast through unsigned char.
          if (isspace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isspace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment);
          break;
        }
    }

    // Walked off the end of the buffer (BufferEnd holds a NUL sentinel that
    // getAndAdvanceChar consumed): back up and stop scanning.
    if (CurPtr == BufferEnd+1) { --CurPtr; break; }
  } while (C != '\n' && C != '\r');

  // Found but did not consume the newline.

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveBCPLComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOM token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  ++CurPtr;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}

/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in
/// an appropriate way and return it.  Always returns true (a comment token is
/// produced); inside a macro definition the // comment is rewritten to a
/// /* */ form so it survives token pasting.
bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) {
  // If we're not in a preprocessor directive, just return the // comment
  // directly.
  FormTokenWithChars(Result, CurPtr, tok::comment);

  if (!ParsingPreprocessorDirective)
    return true;

  // If this BCPL-style comment is in a macro definition, transmogrify it into
  // a C-style block comment.
  std::string Spelling = PP->getSpelling(Result);
  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  Result.setKind(tok::comment);
  PP->CreateString(&Spelling[0], Spelling.size(), Result,
                   Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \n or \r) is part of an escaped newline sequence that
/// splits the closing "*/" of a block comment.  Issue a diagnostic if so.  We
/// know that the newline is inside of a block comment.  CurPtr points at the
/// newline; this routine scans backwards from it.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the backslash and newline.
  // NOTE(review): NUL bytes are also skipped here — presumably to tolerate
  // embedded nulls in the whitespace run; confirm intent.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a backslash, this is an escaped newline; check that a '*'
  // precedes it so the sequence is "*\<newline>/".
  if (*CurPtr == '\\') {
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a backslash; is it the ??/ trigraph (which spells backslash)?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getFeatures().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

// SIMD headers for the fast block-comment scanner below.  AltiVec's <altivec.h>
// may define a 'bool' macro, which we must undo to keep C++ compiling.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the * and /
/// of the comment terminator.
///
/// If KeepCommentMode is enabled, this forms a token from the comment and
/// returns true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  // A NUL at BufferEnd+1 means we consumed the end-of-buffer sentinel:
  // the comment was never terminated.
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.  Only take the vectorized path when at
    // least 24 bytes remain, so the 16-byte-wide loops can't run off the end.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; stop at the first block that
      // contains one.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    }
    C = *CurPtr++;
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.
/// This switches the lexer out of directive mode.
std::string Lexer::ReadToEndOfLine() {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  std::string Result;
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (1) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      Result += Char;
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        // Nope, normal character, continue.
        Result += Char;
        break;
      }
      // FALL THROUGH.
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOM transition.
      Lex(Tmp);
      assert(Tmp.is(tok::eom) && "Unexpected token!");

      // Finally, we're done, return the string we found.
      return Result;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eom);

    // Restore comment saving mode, in case it was disabled for directive.
    SetCommentRetentionState(PP->getCommentRetentionState());
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  // Otherwise, issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error.
  while (!ConditionalStack.empty()) {
    PP->Diag(ConditionalStack.back().IfLoc,
             diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
    Diag(BufferEnd, diag::ext_no_newline_eof);

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.  Note: this may pop the include
  // stack and delete this lexer.
  return PP->HandleEndOfFile(Result);
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;

  Token Tok;
  Tok.startToken();
  LexTokenInternal(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}


/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  Return true if an error
/// occurred and compilation should terminate, false if normal.  This returns a
/// preprocessing token, not a normal token, as such, it is an internal
/// interface.  It assumes that the Flags of result have been cleared before
/// calling this.
void Lexer::LexTokenInternal(Token &Result) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(0);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd) {
      // Read the PP instance variable into an automatic variable, because
      // LexEndOfFile will often delete 'this'.
      Preprocessor *PPCache = PP;
      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
        return;   // Got a token to return.
      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
      return PPCache->Lex(Result);
    }

    // Embedded NUL in the middle of the file: diagnose and treat as whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

    goto LexNextToken;   // GCC isn't tail call eliminating.
  case '\n':
  case '\r':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOM token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      SetCommentRetentionState(PP->getCommentRetentionState());

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;

      Kind = tok::eom;
      break;
    }
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode
    goto LexNextToken;   // GCC isn't tail call eliminating.
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        Features.BCPLComment) {
      SkipBCPLComment(Result, CurPtr+2);
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      SkipBlockComment(Result, CurPtr+2);
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    goto LexNextToken;   // GCC isn't tail call eliminating.

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              true);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    // FALL THROUGH, treating L like the start of an identifier.

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (Features.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr, false);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (Features.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && Features.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // BCPL comment.
      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with BCPL comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      if (Features.BCPLComment ||
          getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
          return; // KeepCommentMode

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
        return; // KeepCommentMode
      goto LexNextToken;   // GCC isn't tail call eliminating.
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && Features.Microsoft) {  // %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::charize_microsoft_ext);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // FIXME: -fpreprocessed mode??
        if (Result.isAtStartOfLine() && !LexingRawMode) {
          FormTokenWithChars(Result, CurPtr, tok::hash);
          PP->HandleDirective(Result);

          // As an optimization, if the preprocessor didn't switch lexers, tail
          // recurse.
          if (PP->isCurrentLexer(this)) {
            // Start a new token. If this is a #include or something, the PP may
            // want us starting at the beginning of the line again.  If so, set
            // the StartOfLine flag.
            if (IsAtStartOfLine) {
              Result.setFlag(Token::StartOfLine);
              IsAtStartOfLine = false;
            }
            goto LexNextToken;   // GCC isn't tail call eliminating.
          }

          return PP->Lex(Result);
        }

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      return LexAngledStringLiteral(Result, CurPtr+SizeTmp);
    } else if (Char == '<' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
      Kind = tok::lesslessequal;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else if (Char == '<') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessless;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::greatergreaterequal;
    } else if (Char == '>') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greatergreater;
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Features.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.CPlusPlus && Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && Features.Microsoft) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::charize_microsoft_ext);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // FIXME: -fpreprocessed mode??
      if (Result.isAtStartOfLine() && !LexingRawMode) {
        FormTokenWithChars(Result, CurPtr, tok::hash);
        PP->HandleDirective(Result);

        // As an optimization, if the preprocessor didn't switch lexers, tail
        // recurse.
        if (PP->isCurrentLexer(this)) {
          // Start a new token.  If this is a #include or something, the PP may
          // want us starting at the beginning of the line again.  If so, set
          // the StartOfLine flag.
          if (IsAtStartOfLine) {
            Result.setFlag(Token::StartOfLine);
            IsAtStartOfLine = false;
          }
          goto LexNextToken;   // GCC isn't tail call eliminating.
        }
        return PP->Lex(Result);
      }

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && Features.ObjC1)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  case '\\':
    // FIXME: UCN's.
    // FALL THROUGH.
  default:
    Kind = tok::unknown;
    break;
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
}