Lexer.cpp revision d1186fa38166a581b51975f0382a45fc3a0733d0
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the Lexer and Token interfaces. 11// 12//===----------------------------------------------------------------------===// 13// 14// TODO: GCC Diagnostics emitted by the lexer: 15// PEDWARN: (form feed|vertical tab) in preprocessing directive 16// 17// Universal characters, unicode, char mapping: 18// WARNING: `%.*s' is not in NFKC 19// WARNING: `%.*s' is not in NFC 20// 21// Other: 22// TODO: Options to support: 23// -fexec-charset,-fwide-exec-charset 24// 25//===----------------------------------------------------------------------===// 26 27#include "clang/Lex/Lexer.h" 28#include "clang/Lex/Preprocessor.h" 29#include "clang/Basic/Diagnostic.h" 30#include "clang/Basic/SourceManager.h" 31#include "llvm/Support/Compiler.h" 32#include "llvm/Support/MemoryBuffer.h" 33#include <cctype> 34using namespace clang; 35 36static void InitCharacterInfo(); 37 38//===----------------------------------------------------------------------===// 39// Token Class Implementation 40//===----------------------------------------------------------------------===// 41 42/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 43bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 44 if (IdentifierInfo *II = getIdentifierInfo()) 45 return II->getObjCKeywordID() == objcKey; 46 return false; 47} 48 49/// getObjCKeywordID - Return the ObjC keyword kind. 50tok::ObjCKeywordKind Token::getObjCKeywordID() const { 51 IdentifierInfo *specId = getIdentifierInfo(); 52 return specId ? 
specId->getObjCKeywordID() : tok::objc_not_keyword; 53} 54 55 56//===----------------------------------------------------------------------===// 57// Lexer Class Implementation 58//===----------------------------------------------------------------------===// 59 60void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 61 const char *BufEnd) { 62 InitCharacterInfo(); 63 64 BufferStart = BufStart; 65 BufferPtr = BufPtr; 66 BufferEnd = BufEnd; 67 68 assert(BufEnd[0] == 0 && 69 "We assume that the input buffer has a null character at the end" 70 " to simplify lexing!"); 71 72 Is_PragmaLexer = false; 73 74 // Start of the file is a start of line. 75 IsAtStartOfLine = true; 76 77 // We are not after parsing a #. 78 ParsingPreprocessorDirective = false; 79 80 // We are not after parsing #include. 81 ParsingFilename = false; 82 83 // We are not in raw mode. Raw mode disables diagnostics and interpretation 84 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 85 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 86 // or otherwise skipping over tokens. 87 LexingRawMode = false; 88 89 // Default to not keeping comments. 90 ExtendedTokenMode = 0; 91} 92 93/// Lexer constructor - Create a new lexer object for the specified buffer 94/// with the specified preprocessor managing the lexing process. This lexer 95/// assumes that the associated file buffer and Preprocessor objects will 96/// outlive it, so it doesn't take ownership of either of them. 97Lexer::Lexer(FileID FID, Preprocessor &PP) 98 : PreprocessorLexer(&PP, FID), 99 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 100 Features(PP.getLangOptions()) { 101 102 const llvm::MemoryBuffer *InputFile = PP.getSourceManager().getBuffer(FID); 103 104 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 105 InputFile->getBufferEnd()); 106 107 // Default to keeping comments if the preprocessor wants them. 
108 SetCommentRetentionState(PP.getCommentRetentionState()); 109} 110 111/// Lexer constructor - Create a new raw lexer object. This object is only 112/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 113/// range will outlive it, so it doesn't take ownership of it. 114Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, 115 const char *BufStart, const char *BufPtr, const char *BufEnd) 116 : FileLoc(fileloc), Features(features) { 117 118 InitLexer(BufStart, BufPtr, BufEnd); 119 120 // We *are* in raw mode. 121 LexingRawMode = true; 122} 123 124/// Lexer constructor - Create a new raw lexer object. This object is only 125/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 126/// range will outlive it, so it doesn't take ownership of it. 127Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features) 128 : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { 129 const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID); 130 131 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 132 FromFile->getBufferEnd()); 133 134 // We *are* in raw mode. 135 LexingRawMode = true; 136} 137 138/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 139/// _Pragma expansion. This has a variety of magic semantics that this method 140/// sets up. It returns a new'd Lexer that must be delete'd when done. 141/// 142/// On entrance to this routine, TokStartLoc is a macro location which has a 143/// spelling loc that indicates the bytes to be lexed for the token and an 144/// instantiation location that indicates where all lexed tokens should be 145/// "expanded from". 146/// 147/// FIXME: It would really be nice to make _Pragma just be a wrapper around a 148/// normal lexer that remaps tokens as they fly by. This would require making 149/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 150/// interface that could handle this stuff. 
This would pull GetMappedTokenLoc 151/// out of the critical path of the lexer! 152/// 153Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 154 SourceLocation InstantiationLoc, 155 unsigned TokLen, Preprocessor &PP) { 156 SourceManager &SM = PP.getSourceManager(); 157 158 // Create the lexer as if we were going to lex the file normally. 159 FileID SpellingFID = SM.getFileID(SpellingLoc); 160 Lexer *L = new Lexer(SpellingFID, PP); 161 162 // Now that the lexer is created, change the start/end locations so that we 163 // just lex the subsection of the file that we want. This is lexing from a 164 // scratch buffer. 165 const char *StrData = SM.getCharacterData(SpellingLoc); 166 167 L->BufferPtr = StrData; 168 L->BufferEnd = StrData+TokLen; 169 170 // Set the SourceLocation with the remapping information. This ensures that 171 // GetMappedTokenLoc will remap the tokens as they are lexed. 172 L->FileLoc = SM.getInstantiationLoc(SM.getLocForStartOfFile(SpellingFID), 173 InstantiationLoc); 174 175 // Ensure that the lexer thinks it is inside a directive, so that end \n will 176 // return an EOM token. 177 L->ParsingPreprocessorDirective = true; 178 179 // This lexer really is for _Pragma. 180 L->Is_PragmaLexer = true; 181 return L; 182} 183 184 185/// Stringify - Convert the specified string into a C string, with surrounding 186/// ""'s, and with escaped \ and " characters. 187std::string Lexer::Stringify(const std::string &Str, bool Charify) { 188 std::string Result = Str; 189 char Quote = Charify ? '\'' : '"'; 190 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 191 if (Result[i] == '\\' || Result[i] == Quote) { 192 Result.insert(Result.begin()+i, '\\'); 193 ++i; ++e; 194 } 195 } 196 return Result; 197} 198 199/// Stringify - Convert the specified string into a C string by escaping '\' 200/// and " characters. This does not add surrounding ""'s to the string. 
201void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) { 202 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 203 if (Str[i] == '\\' || Str[i] == '"') { 204 Str.insert(Str.begin()+i, '\\'); 205 ++i; ++e; 206 } 207 } 208} 209 210 211/// MeasureTokenLength - Relex the token at the specified location and return 212/// its length in bytes in the input file. If the token needs cleaning (e.g. 213/// includes a trigraph or an escaped newline) then this count includes bytes 214/// that are part of that. 215unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 216 const SourceManager &SM) { 217 // If this comes from a macro expansion, we really do want the macro name, not 218 // the token this macro expanded to. 219 Loc = SM.getInstantiationLoc(Loc); 220 221 // TODO: this could be special cased for common tokens like identifiers, ')', 222 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 223 // all obviously single-char tokens. This could use 224 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 225 // something. 226 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedFileLoc(Loc); 227 std::pair<const char *,const char *> Buffer = SM.getBufferData(LocInfo.first); 228 const char *StrData = Buffer.first+LocInfo.second; 229 230 // Create a langops struct and enable trigraphs. This is sufficient for 231 // measuring tokens. 232 LangOptions LangOpts; 233 LangOpts.Trigraphs = true; 234 235 // Create a lexer starting at the beginning of this token. 236 Lexer TheLexer(Loc, LangOpts, Buffer.first, StrData, Buffer.second); 237 Token TheTok; 238 TheLexer.LexFromRawLexer(TheTok); 239 return TheTok.getLength(); 240} 241 242//===----------------------------------------------------------------------===// 243// Character information. 244//===----------------------------------------------------------------------===// 245 246static unsigned char CharInfo[256]; 247 248enum { 249 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. 
Note, no '\0' 250 CHAR_VERT_WS = 0x02, // '\r', '\n' 251 CHAR_LETTER = 0x04, // a-z,A-Z 252 CHAR_NUMBER = 0x08, // 0-9 253 CHAR_UNDER = 0x10, // _ 254 CHAR_PERIOD = 0x20 // . 255}; 256 257static void InitCharacterInfo() { 258 static bool isInited = false; 259 if (isInited) return; 260 isInited = true; 261 262 // Intiialize the CharInfo table. 263 // TODO: statically initialize this. 264 CharInfo[(int)' '] = CharInfo[(int)'\t'] = 265 CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS; 266 CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS; 267 268 CharInfo[(int)'_'] = CHAR_UNDER; 269 CharInfo[(int)'.'] = CHAR_PERIOD; 270 for (unsigned i = 'a'; i <= 'z'; ++i) 271 CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER; 272 for (unsigned i = '0'; i <= '9'; ++i) 273 CharInfo[i] = CHAR_NUMBER; 274} 275 276/// isIdentifierBody - Return true if this is the body character of an 277/// identifier, which is [a-zA-Z0-9_]. 278static inline bool isIdentifierBody(unsigned char c) { 279 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 280} 281 282/// isHorizontalWhitespace - Return true if this character is horizontal 283/// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. 284static inline bool isHorizontalWhitespace(unsigned char c) { 285 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 286} 287 288/// isWhitespace - Return true if this character is horizontal or vertical 289/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false 290/// for '\0'. 291static inline bool isWhitespace(unsigned char c) { 292 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 293} 294 295/// isNumberBody - Return true if this is the body character of an 296/// preprocessing number, which is [a-zA-Z0-9_.]. 297static inline bool isNumberBody(unsigned char c) { 298 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 
299 true : false; 300} 301 302 303//===----------------------------------------------------------------------===// 304// Diagnostics forwarding code. 305//===----------------------------------------------------------------------===// 306 307/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 308/// lexer buffer was all instantiated at a single point, perform the mapping. 309/// This is currently only used for _Pragma implementation, so it is the slow 310/// path of the hot getSourceLocation method. Do not allow it to be inlined. 311static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 312 SourceLocation FileLoc, 313 unsigned CharNo) DISABLE_INLINE; 314static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 315 SourceLocation FileLoc, 316 unsigned CharNo) { 317 // Otherwise, we're lexing "mapped tokens". This is used for things like 318 // _Pragma handling. Combine the instantiation location of FileLoc with the 319 // spelling location. 320 SourceManager &SourceMgr = PP.getSourceManager(); 321 322 // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose 323 // characters come from spelling(FileLoc)+Offset. 324 SourceLocation InstLoc = SourceMgr.getInstantiationLoc(FileLoc); 325 SourceLocation SpellingLoc = SourceMgr.getSpellingLoc(FileLoc); 326 SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo); 327 return SourceMgr.getInstantiationLoc(SpellingLoc, InstLoc); 328} 329 330/// getSourceLocation - Return a source location identifier for the specified 331/// offset in the current file. 332SourceLocation Lexer::getSourceLocation(const char *Loc) const { 333 assert(Loc >= BufferStart && Loc <= BufferEnd && 334 "Location out of range for this buffer!"); 335 336 // In the normal case, we're just lexing from a simple file buffer, return 337 // the file id from FileLoc with the offset specified. 
338 unsigned CharNo = Loc-BufferStart; 339 if (FileLoc.isFileID()) 340 return FileLoc.getFileLocWithOffset(CharNo); 341 342 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 343 // tokens are lexed from where the _Pragma was defined. 344 assert(PP && "This doesn't work on raw lexers"); 345 return GetMappedTokenLoc(*PP, FileLoc, CharNo); 346} 347 348/// Diag - Forwarding function for diagnostics. This translate a source 349/// position in the current buffer into a SourceLocation object for rendering. 350DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 351 return PP->Diag(getSourceLocation(Loc), DiagID); 352} 353 354//===----------------------------------------------------------------------===// 355// Trigraph and Escaped Newline Handling Code. 356//===----------------------------------------------------------------------===// 357 358/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 359/// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 360static char GetTrigraphCharForLetter(char Letter) { 361 switch (Letter) { 362 default: return 0; 363 case '=': return '#'; 364 case ')': return ']'; 365 case '(': return '['; 366 case '!': return '|'; 367 case '\'': return '^'; 368 case '>': return '}'; 369 case '/': return '\\'; 370 case '<': return '{'; 371 case '-': return '~'; 372 } 373} 374 375/// DecodeTrigraphChar - If the specified character is a legal trigraph when 376/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 377/// return the result character. Finally, emit a warning about trigraph use 378/// whether trigraphs are enabled or not. 
379static char DecodeTrigraphChar(const char *CP, Lexer *L) { 380 char Res = GetTrigraphCharForLetter(*CP); 381 if (!Res || !L) return Res; 382 383 if (!L->getFeatures().Trigraphs) { 384 if (!L->isLexingRawMode()) 385 L->Diag(CP-2, diag::trigraph_ignored); 386 return 0; 387 } 388 389 if (!L->isLexingRawMode()) 390 L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res; 391 return Res; 392} 393 394/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 395/// get its size, and return it. This is tricky in several cases: 396/// 1. If currently at the start of a trigraph, we warn about the trigraph, 397/// then either return the trigraph (skipping 3 chars) or the '?', 398/// depending on whether trigraphs are enabled or not. 399/// 2. If this is an escaped newline (potentially with whitespace between 400/// the backslash and newline), implicitly skip the newline and return 401/// the char after it. 402/// 3. If this is a UCN, return it. FIXME: C++ UCN's? 403/// 404/// This handles the slow/uncommon case of the getCharAndSize method. Here we 405/// know that we can accumulate into Size, and that we have already incremented 406/// Ptr by Size bytes. 407/// 408/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 409/// be updated to match. 410/// 411char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 412 Token *Tok) { 413 // If we have a slash, look for an escaped newline. 414 if (Ptr[0] == '\\') { 415 ++Size; 416 ++Ptr; 417Slash: 418 // Common case, backslash-char where the char is not whitespace. 419 if (!isWhitespace(Ptr[0])) return '\\'; 420 421 // See if we have optional whitespace characters followed by a newline. 422 { 423 unsigned SizeTmp = 0; 424 do { 425 ++SizeTmp; 426 if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') { 427 // Remember that this token needs to be cleaned. 
428 if (Tok) Tok->setFlag(Token::NeedsCleaning); 429 430 // Warn if there was whitespace between the backslash and newline. 431 if (SizeTmp != 1 && Tok && !isLexingRawMode()) 432 Diag(Ptr, diag::backslash_newline_space); 433 434 // If this is a \r\n or \n\r, skip the newlines. 435 if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') && 436 Ptr[SizeTmp-1] != Ptr[SizeTmp]) 437 ++SizeTmp; 438 439 // Found backslash<whitespace><newline>. Parse the char after it. 440 Size += SizeTmp; 441 Ptr += SizeTmp; 442 // Use slow version to accumulate a correct size field. 443 return getCharAndSizeSlow(Ptr, Size, Tok); 444 } 445 } while (isWhitespace(Ptr[SizeTmp])); 446 } 447 448 // Otherwise, this is not an escaped newline, just return the slash. 449 return '\\'; 450 } 451 452 // If this is a trigraph, process it. 453 if (Ptr[0] == '?' && Ptr[1] == '?') { 454 // If this is actually a legal trigraph (not something like "??x"), emit 455 // a trigraph warning. If so, and if trigraphs are enabled, return it. 456 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { 457 // Remember that this token needs to be cleaned. 458 if (Tok) Tok->setFlag(Token::NeedsCleaning); 459 460 Ptr += 3; 461 Size += 3; 462 if (C == '\\') goto Slash; 463 return C; 464 } 465 } 466 467 // If this is neither, return a single character. 468 ++Size; 469 return *Ptr; 470} 471 472 473/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 474/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 475/// and that we have already incremented Ptr by Size bytes. 476/// 477/// NOTE: When this method is updated, getCharAndSizeSlow (above) should 478/// be updated to match. 479char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 480 const LangOptions &Features) { 481 // If we have a slash, look for an escaped newline. 482 if (Ptr[0] == '\\') { 483 ++Size; 484 ++Ptr; 485Slash: 486 // Common case, backslash-char where the char is not whitespace. 
487 if (!isWhitespace(Ptr[0])) return '\\'; 488 489 // See if we have optional whitespace characters followed by a newline. 490 { 491 unsigned SizeTmp = 0; 492 do { 493 ++SizeTmp; 494 if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') { 495 496 // If this is a \r\n or \n\r, skip the newlines. 497 if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') && 498 Ptr[SizeTmp-1] != Ptr[SizeTmp]) 499 ++SizeTmp; 500 501 // Found backslash<whitespace><newline>. Parse the char after it. 502 Size += SizeTmp; 503 Ptr += SizeTmp; 504 505 // Use slow version to accumulate a correct size field. 506 return getCharAndSizeSlowNoWarn(Ptr, Size, Features); 507 } 508 } while (isWhitespace(Ptr[SizeTmp])); 509 } 510 511 // Otherwise, this is not an escaped newline, just return the slash. 512 return '\\'; 513 } 514 515 // If this is a trigraph, process it. 516 if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 517 // If this is actually a legal trigraph (not something like "??x"), return 518 // it. 519 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 520 Ptr += 3; 521 Size += 3; 522 if (C == '\\') goto Slash; 523 return C; 524 } 525 } 526 527 // If this is neither, return a single character. 528 ++Size; 529 return *Ptr; 530} 531 532//===----------------------------------------------------------------------===// 533// Helper methods for lexing. 534//===----------------------------------------------------------------------===// 535 536void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 537 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 538 unsigned Size; 539 unsigned char C = *CurPtr++; 540 while (isIdentifierBody(C)) { 541 C = *CurPtr++; 542 } 543 --CurPtr; // Back up over the skipped character. 544 545 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 546 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 547 // FIXME: UCNs. 548 if (C != '\\' && C != '?' 
&& (C != '$' || !Features.DollarIdents)) { 549FinishIdentifier: 550 const char *IdStart = BufferPtr; 551 FormTokenWithChars(Result, CurPtr, tok::identifier); 552 553 // If we are in raw mode, return this identifier raw. There is no need to 554 // look up identifier information or attempt to macro expand it. 555 if (LexingRawMode) return; 556 557 // Fill in Result.IdentifierInfo, looking up the identifier in the 558 // identifier table. 559 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart); 560 561 // Finally, now that we know we have an identifier, pass this off to the 562 // preprocessor, which may macro expand it or something. 563 if (II->isHandleIdentifierCase()) 564 PP->HandleIdentifier(Result); 565 return; 566 } 567 568 // Otherwise, $,\,? in identifier found. Enter slower path. 569 570 C = getCharAndSize(CurPtr, Size); 571 while (1) { 572 if (C == '$') { 573 // If we hit a $ and they are not supported in identifiers, we are done. 574 if (!Features.DollarIdents) goto FinishIdentifier; 575 576 // Otherwise, emit a diagnostic and continue. 577 if (!isLexingRawMode()) 578 Diag(CurPtr, diag::ext_dollar_in_identifier); 579 CurPtr = ConsumeChar(CurPtr, Size, Result); 580 C = getCharAndSize(CurPtr, Size); 581 continue; 582 } else if (!isIdentifierBody(C)) { // FIXME: UCNs. 583 // Found end of identifier. 584 goto FinishIdentifier; 585 } 586 587 // Otherwise, this character is good, consume it. 588 CurPtr = ConsumeChar(CurPtr, Size, Result); 589 590 C = getCharAndSize(CurPtr, Size); 591 while (isIdentifierBody(C)) { // FIXME: UCNs. 592 CurPtr = ConsumeChar(CurPtr, Size, Result); 593 C = getCharAndSize(CurPtr, Size); 594 } 595 } 596} 597 598 599/// LexNumericConstant - Lex the remainder of a integer or floating point 600/// constant. From[-1] is the first character lexed. Return the end of the 601/// constant. 
602void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 603 unsigned Size; 604 char C = getCharAndSize(CurPtr, Size); 605 char PrevCh = 0; 606 while (isNumberBody(C)) { // FIXME: UCNs? 607 CurPtr = ConsumeChar(CurPtr, Size, Result); 608 PrevCh = C; 609 C = getCharAndSize(CurPtr, Size); 610 } 611 612 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 613 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) 614 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 615 616 // If we have a hex FP constant, continue. 617 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') && 618 (Features.HexFloats || !Features.NoExtensions)) 619 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 620 621 // Update the location of token as well as BufferPtr. 622 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 623} 624 625/// LexStringLiteral - Lex the remainder of a string literal, after having lexed 626/// either " or L". 627void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { 628 const char *NulCharacter = 0; // Does this string contain the \0 character? 629 630 char C = getAndAdvanceChar(CurPtr, Result); 631 while (C != '"') { 632 // Skip escaped characters. 633 if (C == '\\') { 634 // Skip the escaped character. 635 C = getAndAdvanceChar(CurPtr, Result); 636 } else if (C == '\n' || C == '\r' || // Newline. 637 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 638 if (!isLexingRawMode()) 639 Diag(BufferPtr, diag::err_unterminated_string); 640 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 641 return; 642 } else if (C == 0) { 643 NulCharacter = CurPtr-1; 644 } 645 C = getAndAdvanceChar(CurPtr, Result); 646 } 647 648 // If a nul character existed in the string, warn about it. 649 if (NulCharacter && !isLexingRawMode()) 650 Diag(NulCharacter, diag::null_in_string); 651 652 // Update the location of the token as well as the BufferPtr instance var. 
653 FormTokenWithChars(Result, CurPtr, 654 Wide ? tok::wide_string_literal : tok::string_literal); 655} 656 657/// LexAngledStringLiteral - Lex the remainder of an angled string literal, 658/// after having lexed the '<' character. This is used for #include filenames. 659void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 660 const char *NulCharacter = 0; // Does this string contain the \0 character? 661 662 char C = getAndAdvanceChar(CurPtr, Result); 663 while (C != '>') { 664 // Skip escaped characters. 665 if (C == '\\') { 666 // Skip the escaped character. 667 C = getAndAdvanceChar(CurPtr, Result); 668 } else if (C == '\n' || C == '\r' || // Newline. 669 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 670 if (!isLexingRawMode()) 671 Diag(BufferPtr, diag::err_unterminated_string); 672 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 673 return; 674 } else if (C == 0) { 675 NulCharacter = CurPtr-1; 676 } 677 C = getAndAdvanceChar(CurPtr, Result); 678 } 679 680 // If a nul character existed in the string, warn about it. 681 if (NulCharacter && !isLexingRawMode()) 682 Diag(NulCharacter, diag::null_in_string); 683 684 // Update the location of token as well as BufferPtr. 685 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 686} 687 688 689/// LexCharConstant - Lex the remainder of a character constant, after having 690/// lexed either ' or L'. 691void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { 692 const char *NulCharacter = 0; // Does this character contain the \0 character? 693 694 // Handle the common case of 'x' and '\y' efficiently. 695 char C = getAndAdvanceChar(CurPtr, Result); 696 if (C == '\'') { 697 if (!isLexingRawMode()) 698 Diag(BufferPtr, diag::err_empty_character); 699 FormTokenWithChars(Result, CurPtr, tok::unknown); 700 return; 701 } else if (C == '\\') { 702 // Skip the escaped character. 703 // FIXME: UCN's. 
704 C = getAndAdvanceChar(CurPtr, Result); 705 } 706 707 if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') { 708 ++CurPtr; 709 } else { 710 // Fall back on generic code for embedded nulls, newlines, wide chars. 711 do { 712 // Skip escaped characters. 713 if (C == '\\') { 714 // Skip the escaped character. 715 C = getAndAdvanceChar(CurPtr, Result); 716 } else if (C == '\n' || C == '\r' || // Newline. 717 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 718 if (!isLexingRawMode()) 719 Diag(BufferPtr, diag::err_unterminated_char); 720 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 721 return; 722 } else if (C == 0) { 723 NulCharacter = CurPtr-1; 724 } 725 C = getAndAdvanceChar(CurPtr, Result); 726 } while (C != '\''); 727 } 728 729 if (NulCharacter && !isLexingRawMode()) 730 Diag(NulCharacter, diag::null_in_char); 731 732 // Update the location of token as well as BufferPtr. 733 FormTokenWithChars(Result, CurPtr, tok::char_constant); 734} 735 736/// SkipWhitespace - Efficiently skip over a series of whitespace characters. 737/// Update BufferPtr to point to the next non-whitespace character and return. 738/// 739/// This method forms a token and returns true if KeepWhitespaceMode is enabled. 740/// 741bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { 742 // Whitespace - Skip it, then return the token after the whitespace. 743 unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. 744 while (1) { 745 // Skip horizontal whitespace very aggressively. 746 while (isHorizontalWhitespace(Char)) 747 Char = *++CurPtr; 748 749 // Otherwise if we have something other than whitespace, we're done. 750 if (Char != '\n' && Char != '\r') 751 break; 752 753 if (ParsingPreprocessorDirective) { 754 // End of preprocessor directive line, let LexTokenInternal handle this. 755 BufferPtr = CurPtr; 756 return false; 757 } 758 759 // ok, but handle newline. 760 // The returned token is at the start of the line. 
761 Result.setFlag(Token::StartOfLine); 762 // No leading whitespace seen so far. 763 Result.clearFlag(Token::LeadingSpace); 764 Char = *++CurPtr; 765 } 766 767 // If this isn't immediately after a newline, there is leading space. 768 char PrevChar = CurPtr[-1]; 769 if (PrevChar != '\n' && PrevChar != '\r') 770 Result.setFlag(Token::LeadingSpace); 771 772 // If the client wants us to return whitespace, return it now. 773 if (isKeepWhitespaceMode()) { 774 FormTokenWithChars(Result, CurPtr, tok::unknown); 775 return true; 776 } 777 778 BufferPtr = CurPtr; 779 return false; 780} 781 782// SkipBCPLComment - We have just read the // characters from input. Skip until 783// we find the newline character thats terminate the comment. Then update 784/// BufferPtr and return. If we're in KeepCommentMode, this will form the token 785/// and return true. 786bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { 787 // If BCPL comments aren't explicitly enabled for this language, emit an 788 // extension warning. 789 if (!Features.BCPLComment && !isLexingRawMode()) { 790 Diag(BufferPtr, diag::ext_bcpl_comment); 791 792 // Mark them enabled so we only emit one warning for this translation 793 // unit. 794 Features.BCPLComment = true; 795 } 796 797 // Scan over the body of the comment. The common case, when scanning, is that 798 // the comment contains normal ascii characters with nothing interesting in 799 // them. As such, optimize for this case with the inner loop. 800 char C; 801 do { 802 C = *CurPtr; 803 // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character. 804 // If we find a \n character, scan backwards, checking to see if it's an 805 // escaped newline, like we do for block comments. 806 807 // Skip over characters in the fast loop. 808 while (C != 0 && // Potentially EOF. 809 C != '\\' && // Potentially escaped newline. 810 C != '?' && // Potentially trigraph. 811 C != '\n' && C != '\r') // Newline or DOS-style newline. 
812 C = *++CurPtr; 813 814 // If this is a newline, we're done. 815 if (C == '\n' || C == '\r') 816 break; // Found the newline? Break out! 817 818 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 819 // properly decode the character. Read it in raw mode to avoid emitting 820 // diagnostics about things like trigraphs. If we see an escaped newline, 821 // we'll handle it below. 822 const char *OldPtr = CurPtr; 823 bool OldRawMode = isLexingRawMode(); 824 LexingRawMode = true; 825 C = getAndAdvanceChar(CurPtr, Result); 826 LexingRawMode = OldRawMode; 827 828 // If we read multiple characters, and one of those characters was a \r or 829 // \n, then we had an escaped newline within the comment. Emit diagnostic 830 // unless the next line is also a // comment. 831 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 832 for (; OldPtr != CurPtr; ++OldPtr) 833 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 834 // Okay, we found a // comment that ends in a newline, if the next 835 // line is also a // comment, but has spaces, don't emit a diagnostic. 836 if (isspace(C)) { 837 const char *ForwardPtr = CurPtr; 838 while (isspace(*ForwardPtr)) // Skip whitespace. 839 ++ForwardPtr; 840 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 841 break; 842 } 843 844 if (!isLexingRawMode()) 845 Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); 846 break; 847 } 848 } 849 850 if (CurPtr == BufferEnd+1) { --CurPtr; break; } 851 } while (C != '\n' && C != '\r'); 852 853 // Found but did not consume the newline. 854 855 // If we are returning comments as tokens, return this comment as a token. 856 if (inKeepCommentMode()) 857 return SaveBCPLComment(Result, CurPtr); 858 859 // If we are inside a preprocessor directive and we see the end of line, 860 // return immediately, so that the lexer can return this as an EOM token. 
861 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 862 BufferPtr = CurPtr; 863 return false; 864 } 865 866 // Otherwise, eat the \n character. We don't care if this is a \n\r or 867 // \r\n sequence. This is an efficiency hack (because we know the \n can't 868 // contribute to another token), it isn't needed for correctness. Note that 869 // this is ok even in KeepWhitespaceMode, because we would have returned the 870 /// comment above in that mode. 871 ++CurPtr; 872 873 // The next returned token is at the start of the line. 874 Result.setFlag(Token::StartOfLine); 875 // No leading whitespace seen so far. 876 Result.clearFlag(Token::LeadingSpace); 877 BufferPtr = CurPtr; 878 return false; 879} 880 881/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 882/// an appropriate way and return it. 883bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 884 // If we're not in a preprocessor directive, just return the // comment 885 // directly. 886 FormTokenWithChars(Result, CurPtr, tok::comment); 887 888 if (!ParsingPreprocessorDirective) 889 return true; 890 891 // If this BCPL-style comment is in a macro definition, transmogrify it into 892 // a C-style block comment. 893 std::string Spelling = PP->getSpelling(Result); 894 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); 895 Spelling[1] = '*'; // Change prefix to "/*". 896 Spelling += "*/"; // add suffix. 897 898 Result.setKind(tok::comment); 899 Result.setLocation(PP->CreateString(&Spelling[0], Spelling.size(), 900 Result.getLocation())); 901 Result.setLength(Spelling.size()); 902 return true; 903} 904 905/// isBlockCommentEndOfEscapedNewLine - Return true if the specified newline 906/// character (either \n or \r) is part of an escaped newline sequence. Issue a 907/// diagnostic if so. We know that the newline is inside of a block comment. 
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  // CurPtr must point at the newline itself; L supplies the language features
  // and is the sink for any diagnostics emitted below.
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.  Embedded nul characters are skipped the
  // same way.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a backslash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // It only ends the comment if the character before the backslash is '*'.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a backslash, is it the ??/ trigraph (preceded by '*')?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    // Point CurPtr at the first '?' so diagnostics land on the trigraph start.
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getFeatures().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

// Vector intrinsics used by the fast '/' scan in SkipBlockComment.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.
///
/// If KeepCommentMode is enabled, this forms a token from the comment and
/// returns true.  An unterminated comment is diagnosed (outside raw mode); it
/// is returned as an 'unknown' token in KeepWhitespaceMode, otherwise this
/// returns false with BufferPtr at the end of the buffer.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    // Hit the nul at the very end of the buffer right after the "/*".
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.  The chunked scans are only attempted
    // while more than 24 bytes remain before BufferEnd.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; stop on the first chunk with a
      // match or when fewer than 16 bytes remain.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      // Same 16-byte-at-a-time scan using AltiVec.
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // The chunked scan stopped either on a chunk that contains a '/' or
      // because it ran close to BufferEnd.  Read the next byte and let the
      // byte-wise loop below find the exact position.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    // Here C is the byte at CurPtr[-1], so CurPtr[-2] is the byte before it.
    if (C == '/') {
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    }
    // Either a '/' that did not end the comment or an embedded nul: keep going.
    C = *CurPtr++;
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
///
/// The returned string holds every character up to, but not including, the
/// terminating newline (or end of buffer).
std::string Lexer::ReadToEndOfLine() {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  std::string Result;
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (1) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      Result += Char;
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        // Nope, normal character, continue.
        Result += Char;
        break;
      }
      // FALL THROUGH.
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOM transition.
      Lex(Tmp);
      assert(Tmp.is(tok::eom) && "Unexpected token!");

      // Finally, we're done, return the string we found.
      return Result;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eom);

    // Restore comment saving mode, in case it was disabled for directive.
    SetCommentRetentionState(PP->getCommentRetentionState());
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  // Otherwise, issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error for every conditional that is
  // still open, innermost first.
  while (!ConditionalStack.empty()) {
    PP->Diag(ConditionalStack.back().IfLoc,
             diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
    Diag(BufferEnd, diag::ext_no_newline_eof);

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result);
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  const char *TmpBufferPtr = BufferPtr;

  // Peek: lex one token, then rewind BufferPtr below so it is not consumed.
  Token Tok;
  Tok.startToken();
  LexTokenInternal(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}


/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  The lexed token is delivered
/// through Result; the function itself returns nothing.  This returns a
/// preprocessing token, not a normal token, as such, it is an internal
/// interface.  It assumes that the Flags of result have been cleared before
/// calling this.
void Lexer::LexTokenInternal(Token &Result) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(0);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  // Cases either return directly (after delegating to a specialised lexing
  // routine), jump back to LexNextToken, or set Kind and break so the token is
  // formed at the bottom of the function.
  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd) {
      // Read the PP instance variable into an automatic variable, because
      // LexEndOfFile will often delete 'this'.
      Preprocessor *PPCache = PP;
      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
        return;   // Got a token to return.
      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
      return PPCache->Lex(Result);
    }

    // A nul that is not at the end of the buffer: diagnose it (outside raw
    // mode) and then treat it like whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

    goto LexNextToken;   // GCC isn't tail call eliminating.
  case '\n':
  case '\r':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOM token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      SetCommentRetentionState(PP->getCommentRetentionState());

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;

      Kind = tok::eom;
      break;
    }
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode
    goto LexNextToken;   // GCC isn't tail call eliminating.
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        Features.BCPLComment) {
      SkipBCPLComment(Result, CurPtr+2);
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      SkipBlockComment(Result, CurPtr+2);
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    goto LexNextToken;   // GCC isn't tail call eliminating.

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              true);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    // FALL THROUGH, treating L like the start of an identifier.

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (Features.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr, false);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (Features.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && Features.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // BCPL comment.
      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with BCPL comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      if (Features.BCPLComment ||
          getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
          return; // KeepCommentMode

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
        return; // KeepCommentMode
      goto LexNextToken;   // GCC isn't tail call eliminating.
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && Features.Microsoft) {  // %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::charize_microsoft_ext);
        Kind = tok::hashat;
      } else {
        Kind = tok::hash;       // '%:' -> '#'

        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // FIXME: -fpreprocessed mode??
        if (Result.isAtStartOfLine() && !LexingRawMode) {
          BufferPtr = CurPtr;
          PP->HandleDirective(Result);

          // As an optimization, if the preprocessor didn't switch lexers, tail
          // recurse.
          if (PP->isCurrentLexer(this)) {
            // Start a new token. If this is a #include or something, the PP may
            // want us starting at the beginning of the line again.  If so, set
            // the StartOfLine flag.
            if (IsAtStartOfLine) {
              Result.setFlag(Token::StartOfLine);
              IsAtStartOfLine = false;
            }
            goto LexNextToken;   // GCC isn't tail call eliminating.
          }

          return PP->Lex(Result);
        }
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // NOTE(review): CurPtr+SizeTmp is already past the character peeked after
      // the '<', so LexAngledStringLiteral starts scanning one character into
      // the filename — confirm this matches what that routine expects (e.g. for
      // an empty "<>").
      return LexAngledStringLiteral(Result, CurPtr+SizeTmp);
    } else if (Char == '<' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
      Kind = tok::lesslessequal;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else if (Char == '<') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessless;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::greatergreaterequal;
    } else if (Char == '>') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greatergreater;
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Features.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.CPlusPlus && Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && Features.Microsoft) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::charize_microsoft_ext);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::hash;
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // FIXME: -fpreprocessed mode??
      if (Result.isAtStartOfLine() && !LexingRawMode) {
        BufferPtr = CurPtr;
        PP->HandleDirective(Result);

        // As an optimization, if the preprocessor didn't switch lexers, tail
        // recurse.
        if (PP->isCurrentLexer(this)) {
          // Start a new token.  If this is a #include or something, the PP may
          // want us starting at the beginning of the line again.  If so, set
          // the StartOfLine flag.
          if (IsAtStartOfLine) {
            Result.setFlag(Token::StartOfLine);
            IsAtStartOfLine = false;
          }
          goto LexNextToken;   // GCC isn't tail call eliminating.
        }
        return PP->Lex(Result);
      }
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && Features.ObjC1)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  case '\\':
    // FIXME: UCN's.
    // FALL THROUGH.
  default:
    Kind = tok::unknown;
    break;
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
}