Lexer.cpp revision 025c3a66402fb713c2d9bf5dc174ff264765379a
//===--- Lexer.cpp - C Language Family Lexer ------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file implements the Lexer and Token interfaces.
//
//===----------------------------------------------------------------------===//
//
// TODO: GCC Diagnostics emitted by the lexer:
// PEDWARN: (form feed|vertical tab) in preprocessing directive
//
// Universal characters, unicode, char mapping:
// WARNING: `%.*s' is not in NFKC
// WARNING: `%.*s' is not in NFC
//
// Other:
// TODO: Options to support:
//   -fexec-charset,-fwide-exec-charset
//
//===----------------------------------------------------------------------===//

#include "clang/Lex/Lexer.h"
#include "clang/Lex/Preprocessor.h"
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryBuffer.h"
#include <cctype>
using namespace clang;

// Lazily populates the file-static CharInfo classification table (defined in
// the "Character information" section below).  Called from InitLexer so every
// constructed Lexer is guaranteed a filled table.
static void InitCharacterInfo();

//===----------------------------------------------------------------------===//
// Token Class Implementation
//===----------------------------------------------------------------------===//

/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
/// Returns false for tokens that carry no IdentifierInfo at all.
bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
  if (IdentifierInfo *II = getIdentifierInfo())
    return II->getObjCKeywordID() == objcKey;
  return false;
}

/// getObjCKeywordID - Return the ObjC keyword kind for this token, or
/// tok::objc_not_keyword if the token has no identifier info.
tok::ObjCKeywordKind Token::getObjCKeywordID() const {
  IdentifierInfo *specId = getIdentifierInfo();
  return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
}


//===----------------------------------------------------------------------===//
// Lexer Class Implementation
//===----------------------------------------------------------------------===//

/// InitLexer - Shared initialization for all the constructors below: record
/// the buffer bounds and reset all per-lexer state flags to their defaults.
/// BufPtr may point into the middle of [BufStart, BufEnd) for partial lexes.
void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
                      const char *BufEnd) {
  InitCharacterInfo();

  BufferStart = BufStart;
  BufferPtr = BufPtr;
  BufferEnd = BufEnd;

  // The fast scanning loops rely on a NUL sentinel instead of bounds checks.
  assert(BufEnd[0] == 0 &&
         "We assume that the input buffer has a null character at the end"
         " to simplify lexing!");

  Is_PragmaLexer = false;

  // Start of the file is a start of line.
  IsAtStartOfLine = true;

  // We are not after parsing a #.
  ParsingPreprocessorDirective = false;

  // We are not after parsing #include.
  ParsingFilename = false;

  // We are not in raw mode.  Raw mode disables diagnostics and interpretation
  // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
  // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
  // or otherwise skipping over tokens.
  LexingRawMode = false;

  // Default to not keeping comments.
  ExtendedTokenMode = 0;
}


/// Lexer constructor - Create a new lexer object for the specified buffer
/// with the specified preprocessor managing the lexing process.  This lexer
/// assumes that the associated file buffer and Preprocessor objects will
/// outlive it, so it doesn't take ownership of either of them.
Lexer::Lexer(SourceLocation fileloc, Preprocessor &PP,
             const char *BufPtr, const char *BufEnd)
  // FIXME: This is really horrible and only needed for _Pragma lexers, split
  // this out of the main lexer path!
  : PreprocessorLexer(&PP,
                      PP.getSourceManager().getCanonicalFileID(
                          PP.getSourceManager().getSpellingLoc(fileloc))),
    FileLoc(fileloc),
    Features(PP.getLangOptions()) {

  SourceManager &SourceMgr = PP.getSourceManager();
  const llvm::MemoryBuffer *InputFile = SourceMgr.getBuffer(getFileID());

  // BufferPtr and BufferEnd can start out somewhere inside the current buffer.
  // If unspecified (null), they start at the start/end of the buffer.
  const char *BufStart = InputFile->getBufferStart();
  if (BufPtr == 0) BufPtr = BufStart;
  if (BufEnd == 0) BufEnd = InputFile->getBufferEnd();

  InitLexer(BufStart, BufPtr, BufEnd);

  // Default to keeping comments if the preprocessor wants them.
  SetCommentRetentionState(PP.getCommentRetentionState());
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(SourceLocation fileloc, const LangOptions &features,
             const char *BufPtr, const char *BufEnd,
             const llvm::MemoryBuffer *FromFile)
  : FileLoc(fileloc), Features(features) {

  // If a MemoryBuffer was specified, use its start as BufferStart. This affects
  // the source location objects produced by this lexer.
  const char *BufStart = BufPtr;
  if (FromFile) BufStart = FromFile->getBufferStart();

  InitLexer(BufStart, BufPtr, BufEnd);

  // We *are* in raw mode: no diagnostics, no identifier interpretation.
  LexingRawMode = true;
}

/// Lexer constructor - Create a new raw lexer object.  This object is only
/// suitable for calls to 'LexRawToken'.  This lexer assumes that the text
/// range will outlive it, so it doesn't take ownership of it.
Lexer::Lexer(FileID FID, const SourceManager &SM, const LangOptions &features)
  : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) {
  const llvm::MemoryBuffer *FromFile = SM.getBuffer(FID);

  // Lex the whole buffer: BufferPtr starts at BufferStart.
  InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(),
            FromFile->getBufferEnd());

  // We *are* in raw mode: no diagnostics, no identifier interpretation.
  LexingRawMode = true;
}


/// Stringify - Convert the specified string into a C string, with surrounding
/// ""'s, and with escaped \ and " characters.  If Charify is true, escape '
/// instead of " (for character-literal context).
std::string Lexer::Stringify(const std::string &Str, bool Charify) {
  std::string Result = Str;
  char Quote = Charify ? '\'' : '"';
  for (unsigned i = 0, e = Result.size(); i != e; ++i) {
    if (Result[i] == '\\' || Result[i] == Quote) {
      Result.insert(Result.begin()+i, '\\');
      // Skip past the char we just escaped, and account for the growth of the
      // string so the loop still terminates at the (new) end.
      ++i; ++e;
    }
  }
  return Result;
}

/// Stringify - Convert the specified string into a C string by escaping '\'
/// and " characters.  This does not add surrounding ""'s to the string.
void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) {
  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
    if (Str[i] == '\\' || Str[i] == '"') {
      Str.insert(Str.begin()+i, '\\');
      // Step over the escaped character and track the new length.
      ++i; ++e;
    }
  }
}


/// MeasureTokenLength - Relex the token at the specified location and return
/// its length in bytes in the input file.  If the token needs cleaning (e.g.
/// includes a trigraph or an escaped newline) then this count includes bytes
/// that are part of that.
unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
                                   const SourceManager &SM) {
  // If this comes from a macro expansion, we really do want the macro name,
  // not the token this macro expanded to.
  Loc = SM.getInstantiationLoc(Loc);

  const char *StrData = SM.getCharacterData(Loc);

  // TODO: this could be special cased for common tokens like identifiers, ')',
  // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
  // all obviously single-char tokens.  This could use
  // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
  // something.
  const char *BufEnd = SM.getBufferData(Loc).second;

  // Create a langops struct and enable trigraphs.  This is sufficient for
  // measuring tokens.
  LangOptions LangOpts;
  LangOpts.Trigraphs = true;

  // Create a raw lexer starting at the beginning of this token, lex exactly
  // one token, and report its length.
  Lexer TheLexer(Loc, LangOpts, StrData, BufEnd);
  Token TheTok;
  TheLexer.LexFromRawLexer(TheTok);
  return TheTok.getLength();
}

//===----------------------------------------------------------------------===//
// Character information.
//===----------------------------------------------------------------------===//

// Classification bitmask per byte value; filled lazily by InitCharacterInfo.
static unsigned char CharInfo[256];

enum {
  CHAR_HORZ_WS  = 0x01,  // ' ', '\t', '\f', '\v'.  Note, no '\0'
  CHAR_VERT_WS  = 0x02,  // '\r', '\n'
  CHAR_LETTER   = 0x04,  // a-z,A-Z
  CHAR_NUMBER   = 0x08,  // 0-9
  CHAR_UNDER    = 0x10,  // _
  CHAR_PERIOD   = 0x20   // .
};

static void InitCharacterInfo() {
  // Only fill the table once, no matter how many lexers are constructed.
  static bool isInited = false;
  if (isInited) return;
  isInited = true;

  // Initialize the CharInfo table.
  // TODO: statically initialize this.
  CharInfo[(int)' '] = CharInfo[(int)'\t'] =
  CharInfo[(int)'\f'] = CharInfo[(int)'\v'] = CHAR_HORZ_WS;
  CharInfo[(int)'\n'] = CharInfo[(int)'\r'] = CHAR_VERT_WS;

  CharInfo[(int)'_'] = CHAR_UNDER;
  CharInfo[(int)'.'] = CHAR_PERIOD;
  // Mark each lowercase letter and its uppercase counterpart in one pass.
  for (unsigned i = 'a'; i <= 'z'; ++i)
    CharInfo[i] = CharInfo[i+'A'-'a'] = CHAR_LETTER;
  for (unsigned i = '0'; i <= '9'; ++i)
    CharInfo[i] = CHAR_NUMBER;
}

/// isIdentifierBody - Return true if this is the body character of an
/// identifier, which is [a-zA-Z0-9_].
static inline bool isIdentifierBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false;
}

/// isHorizontalWhitespace - Return true if this character is horizontal
/// whitespace: ' ', '\t', '\f', '\v'.  Note that this returns false for '\0'.
static inline bool isHorizontalWhitespace(unsigned char c) {
  return (CharInfo[c] & CHAR_HORZ_WS) ? true : false;
}

/// isWhitespace - Return true if this character is horizontal or vertical
/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'.  Note that this returns
/// false for '\0'.
static inline bool isWhitespace(unsigned char c) {
  return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false;
}

/// isNumberBody - Return true if this is the body character of an
/// preprocessing number, which is [a-zA-Z0-9_.].
static inline bool isNumberBody(unsigned char c) {
  return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ?
    true : false;
}


//===----------------------------------------------------------------------===//
// Diagnostics forwarding code.
//===----------------------------------------------------------------------===//

/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
/// lexer buffer was all instantiated at a single point, perform the mapping.
/// This is currently only used for _Pragma implementation, so it is the slow
/// path of the hot getSourceLocation method.  Do not allow it to be inlined.
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo) DISABLE_INLINE;
static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
                                        SourceLocation FileLoc,
                                        unsigned CharNo) {
  // Otherwise, we're lexing "mapped tokens".  This is used for things like
  // _Pragma handling.  Combine the instantiation location of FileLoc with the
  // spelling location.
  SourceManager &SourceMgr = PP.getSourceManager();

  // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose
  // characters come from spelling(FileLoc)+Offset.
  SourceLocation InstLoc = SourceMgr.getInstantiationLoc(FileLoc);
  SourceLocation SpellingLoc = SourceMgr.getSpellingLoc(FileLoc);
  SpellingLoc = SourceLocation::getFileLoc(SpellingLoc.getFileID(), CharNo);
  return SourceMgr.getInstantiationLoc(SpellingLoc, InstLoc);
}

/// getSourceLocation - Return a source location identifier for the specified
/// offset in the current file.
SourceLocation Lexer::getSourceLocation(const char *Loc) const {
  assert(Loc >= BufferStart && Loc <= BufferEnd &&
         "Location out of range for this buffer!");

  // In the normal case, we're just lexing from a simple file buffer, return
  // the file id from FileLoc with the offset specified.
  unsigned CharNo = Loc-BufferStart;
  if (FileLoc.isFileID())
    return SourceLocation::getFileLoc(FileLoc.getFileID(), CharNo);

  // Otherwise, this is the _Pragma lexer case, which pretends that all of the
  // tokens are lexed from where the _Pragma was defined.
  assert(PP && "This doesn't work on raw lexers");
  return GetMappedTokenLoc(*PP, FileLoc, CharNo);
}

/// Diag - Forwarding function for diagnostics.  This translates a source
/// position in the current buffer into a SourceLocation object for rendering.
DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
  return PP->Diag(getSourceLocation(Loc), DiagID);
}

//===----------------------------------------------------------------------===//
// Trigraph and Escaped Newline Handling Code.
//===----------------------------------------------------------------------===//

/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair,
/// return the decoded trigraph letter it corresponds to, or '\0' if nothing.
332static char GetTrigraphCharForLetter(char Letter) { 333 switch (Letter) { 334 default: return 0; 335 case '=': return '#'; 336 case ')': return ']'; 337 case '(': return '['; 338 case '!': return '|'; 339 case '\'': return '^'; 340 case '>': return '}'; 341 case '/': return '\\'; 342 case '<': return '{'; 343 case '-': return '~'; 344 } 345} 346 347/// DecodeTrigraphChar - If the specified character is a legal trigraph when 348/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 349/// return the result character. Finally, emit a warning about trigraph use 350/// whether trigraphs are enabled or not. 351static char DecodeTrigraphChar(const char *CP, Lexer *L) { 352 char Res = GetTrigraphCharForLetter(*CP); 353 if (!Res || !L) return Res; 354 355 if (!L->getFeatures().Trigraphs) { 356 if (!L->isLexingRawMode()) 357 L->Diag(CP-2, diag::trigraph_ignored); 358 return 0; 359 } 360 361 if (!L->isLexingRawMode()) 362 L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res; 363 return Res; 364} 365 366/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 367/// get its size, and return it. This is tricky in several cases: 368/// 1. If currently at the start of a trigraph, we warn about the trigraph, 369/// then either return the trigraph (skipping 3 chars) or the '?', 370/// depending on whether trigraphs are enabled or not. 371/// 2. If this is an escaped newline (potentially with whitespace between 372/// the backslash and newline), implicitly skip the newline and return 373/// the char after it. 374/// 3. If this is a UCN, return it. FIXME: C++ UCN's? 375/// 376/// This handles the slow/uncommon case of the getCharAndSize method. Here we 377/// know that we can accumulate into Size, and that we have already incremented 378/// Ptr by Size bytes. 379/// 380/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 381/// be updated to match. 
char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size,
                               Token *Tok) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    {
      unsigned SizeTmp = 0;
      do {
        ++SizeTmp;
        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {
          // Remember that this token needs to be cleaned.
          if (Tok) Tok->setFlag(Token::NeedsCleaning);

          // Warn if there was whitespace between the backslash and newline.
          if (SizeTmp != 1 && Tok && !isLexingRawMode())
            Diag(Ptr, diag::backslash_newline_space);

          // If this is a \r\n or \n\r, skip the second half of the pair too.
          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
              Ptr[SizeTmp-1] != Ptr[SizeTmp])
            ++SizeTmp;

          // Found backslash<whitespace><newline>.  Parse the char after it.
          Size += SizeTmp;
          Ptr += SizeTmp;
          // Recurse (slow version) so Size keeps accumulating correctly across
          // consecutive escaped newlines.
          return getCharAndSizeSlow(Ptr, Size, Tok);
        }
      } while (isWhitespace(Ptr[SizeTmp]));
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), emit
    // a trigraph warning.  If so, and if trigraphs are enabled, return it.
    // Passing a null Lexer suppresses diagnostics when no token is requested.
    if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) {
      // Remember that this token needs to be cleaned.
      if (Tok) Tok->setFlag(Token::NeedsCleaning);

      Ptr += 3;
      Size += 3;
      // ??/ decodes to '\', which may itself begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}


/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
/// getCharAndSizeNoWarn method.  Here we know that we can accumulate into
/// Size, and that we have already incremented Ptr by Size bytes.
///
/// NOTE: When this method is updated, getCharAndSizeSlow (above) should
/// be updated to match.
char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
                                     const LangOptions &Features) {
  // If we have a slash, look for an escaped newline.
  if (Ptr[0] == '\\') {
    ++Size;
    ++Ptr;
Slash:
    // Common case, backslash-char where the char is not whitespace.
    if (!isWhitespace(Ptr[0])) return '\\';

    // See if we have optional whitespace characters followed by a newline.
    {
      unsigned SizeTmp = 0;
      do {
        ++SizeTmp;
        if (Ptr[SizeTmp-1] == '\n' || Ptr[SizeTmp-1] == '\r') {

          // If this is a \r\n or \n\r, skip the second half of the pair too.
          if ((Ptr[SizeTmp] == '\r' || Ptr[SizeTmp] == '\n') &&
              Ptr[SizeTmp-1] != Ptr[SizeTmp])
            ++SizeTmp;

          // Found backslash<whitespace><newline>.  Parse the char after it.
          Size += SizeTmp;
          Ptr += SizeTmp;

          // Recurse (slow version) so Size keeps accumulating correctly.
          return getCharAndSizeSlowNoWarn(Ptr, Size, Features);
        }
      } while (isWhitespace(Ptr[SizeTmp]));
    }

    // Otherwise, this is not an escaped newline, just return the slash.
    return '\\';
  }

  // If this is a trigraph, process it.
  if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
    // If this is actually a legal trigraph (not something like "??x"), return
    // it.
    if (char C = GetTrigraphCharForLetter(Ptr[2])) {
      Ptr += 3;
      Size += 3;
      // ??/ decodes to '\', which may itself begin an escaped newline.
      if (C == '\\') goto Slash;
      return C;
    }
  }

  // If this is neither, return a single character.
  ++Size;
  return *Ptr;
}

//===----------------------------------------------------------------------===//
// Helper methods for lexing.
//===----------------------------------------------------------------------===//

/// LexIdentifier - Lex the remainder of an identifier.  On entry the first
/// character has already been consumed; CurPtr points just past it.  The fast
/// path scans plain [_A-Za-z0-9]* bytes; $, '\' and '?' divert to the slow
/// path, which decodes escaped newlines/trigraphs via getCharAndSize.
void Lexer::LexIdentifier(Token &Result, const char *CurPtr) {
  // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$]
  unsigned Size;
  unsigned char C = *CurPtr++;
  while (isIdentifierBody(C)) {
    C = *CurPtr++;
  }
  --CurPtr;   // Back up over the skipped character.

  // Fast path, no $,\,? in identifier found.  '\' might be an escaped newline
  // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN.
  // FIXME: UCNs.
  if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) {
FinishIdentifier:
    const char *IdStart = BufferPtr;
    FormTokenWithChars(Result, CurPtr, tok::identifier);

    // If we are in raw mode, return this identifier raw.  There is no need to
    // look up identifier information or attempt to macro expand it.
    if (LexingRawMode) return;

    // Fill in Result.IdentifierInfo, looking up the identifier in the
    // identifier table.
    PP->LookUpIdentifierInfo(Result, IdStart);

    // Finally, now that we know we have an identifier, pass this off to the
    // preprocessor, which may macro expand it or something.
    return PP->HandleIdentifier(Result);
  }

  // Otherwise, $,\,? in identifier found.  Enter slower path.

  C = getCharAndSize(CurPtr, Size);
  while (1) {
    if (C == '$') {
      // If we hit a $ and they are not supported in identifiers, we are done.
      if (!Features.DollarIdents) goto FinishIdentifier;

      // Otherwise, emit a diagnostic and continue.
      if (!isLexingRawMode())
        Diag(CurPtr, diag::ext_dollar_in_identifier);
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
      continue;
    } else if (!isIdentifierBody(C)) { // FIXME: UCNs.
      // Found end of identifier.
      goto FinishIdentifier;
    }

    // Otherwise, this character is good, consume it.
    CurPtr = ConsumeChar(CurPtr, Size, Result);

    // Consume any further plain identifier-body characters before re-checking
    // for $ / end-of-identifier.
    C = getCharAndSize(CurPtr, Size);
    while (isIdentifierBody(C)) { // FIXME: UCNs.
      CurPtr = ConsumeChar(CurPtr, Size, Result);
      C = getCharAndSize(CurPtr, Size);
    }
  }
}


/// LexNumericConstant - Lex the remainder of a integer or floating point
/// constant.  From[-1] is the first character lexed.  Return the end of the
/// constant.
void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
  unsigned Size;
  char C = getCharAndSize(CurPtr, Size);
  char PrevCh = 0;
  while (isNumberBody(C)) { // FIXME: UCNs?
    CurPtr = ConsumeChar(CurPtr, Size, Result);
    PrevCh = C;
    C = getCharAndSize(CurPtr, Size);
  }

  // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
  if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e'))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // If we have a hex FP constant (e.g. 0x1.8p3), continue past the exponent
  // sign.  Allowed in C99 hex-float mode or as a GNU extension.
  if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') &&
      (Features.HexFloats || !Features.NoExtensions))
    return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
}

/// LexStringLiteral - Lex the remainder of a string literal, after having
/// lexed either " or L".
void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '"') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_string);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    } else if (C == 0) {
      // Embedded nul: remember its position so we can warn once at the end.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of the token as well as the BufferPtr instance var.
  FormTokenWithChars(Result, CurPtr,
                     Wide ? tok::wide_string_literal : tok::string_literal);
}

/// LexAngledStringLiteral - Lex the remainder of an angled string literal,
/// after having lexed the '<' character.  This is used for #include filenames.
void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
  const char *NulCharacter = 0; // Does this string contain the \0 character?

  char C = getAndAdvanceChar(CurPtr, Result);
  while (C != '>') {
    // Skip escaped characters.
    if (C == '\\') {
      // Skip the escaped character.
      C = getAndAdvanceChar(CurPtr, Result);
    } else if (C == '\n' || C == '\r' ||             // Newline.
               (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_string);
      FormTokenWithChars(Result, CurPtr-1, tok::unknown);
      return;
    } else if (C == 0) {
      // Embedded nul: remember its position so we can warn once at the end.
      NulCharacter = CurPtr-1;
    }
    C = getAndAdvanceChar(CurPtr, Result);
  }

  // If a nul character existed in the string, warn about it.
  if (NulCharacter && !isLexingRawMode())
    Diag(NulCharacter, diag::null_in_string);

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, tok::angle_string_literal);
}


/// LexCharConstant - Lex the remainder of a character constant, after having
/// lexed either ' or L'.
661void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { 662 const char *NulCharacter = 0; // Does this character contain the \0 character? 663 664 // Handle the common case of 'x' and '\y' efficiently. 665 char C = getAndAdvanceChar(CurPtr, Result); 666 if (C == '\'') { 667 if (!isLexingRawMode()) 668 Diag(BufferPtr, diag::err_empty_character); 669 FormTokenWithChars(Result, CurPtr, tok::unknown); 670 return; 671 } else if (C == '\\') { 672 // Skip the escaped character. 673 // FIXME: UCN's. 674 C = getAndAdvanceChar(CurPtr, Result); 675 } 676 677 if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') { 678 ++CurPtr; 679 } else { 680 // Fall back on generic code for embedded nulls, newlines, wide chars. 681 do { 682 // Skip escaped characters. 683 if (C == '\\') { 684 // Skip the escaped character. 685 C = getAndAdvanceChar(CurPtr, Result); 686 } else if (C == '\n' || C == '\r' || // Newline. 687 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 688 if (!isLexingRawMode()) 689 Diag(BufferPtr, diag::err_unterminated_char); 690 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 691 return; 692 } else if (C == 0) { 693 NulCharacter = CurPtr-1; 694 } 695 C = getAndAdvanceChar(CurPtr, Result); 696 } while (C != '\''); 697 } 698 699 if (NulCharacter && !isLexingRawMode()) 700 Diag(NulCharacter, diag::null_in_char); 701 702 // Update the location of token as well as BufferPtr. 703 FormTokenWithChars(Result, CurPtr, tok::char_constant); 704} 705 706/// SkipWhitespace - Efficiently skip over a series of whitespace characters. 707/// Update BufferPtr to point to the next non-whitespace character and return. 708/// 709/// This method forms a token and returns true if KeepWhitespaceMode is enabled. 710/// 711bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { 712 // Whitespace - Skip it, then return the token after the whitespace. 713 unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. 
714 while (1) { 715 // Skip horizontal whitespace very aggressively. 716 while (isHorizontalWhitespace(Char)) 717 Char = *++CurPtr; 718 719 // Otherwise if we have something other than whitespace, we're done. 720 if (Char != '\n' && Char != '\r') 721 break; 722 723 if (ParsingPreprocessorDirective) { 724 // End of preprocessor directive line, let LexTokenInternal handle this. 725 BufferPtr = CurPtr; 726 return false; 727 } 728 729 // ok, but handle newline. 730 // The returned token is at the start of the line. 731 Result.setFlag(Token::StartOfLine); 732 // No leading whitespace seen so far. 733 Result.clearFlag(Token::LeadingSpace); 734 Char = *++CurPtr; 735 } 736 737 // If this isn't immediately after a newline, there is leading space. 738 char PrevChar = CurPtr[-1]; 739 if (PrevChar != '\n' && PrevChar != '\r') 740 Result.setFlag(Token::LeadingSpace); 741 742 // If the client wants us to return whitespace, return it now. 743 if (isKeepWhitespaceMode()) { 744 FormTokenWithChars(Result, CurPtr, tok::unknown); 745 return true; 746 } 747 748 BufferPtr = CurPtr; 749 return false; 750} 751 752// SkipBCPLComment - We have just read the // characters from input. Skip until 753// we find the newline character thats terminate the comment. Then update 754/// BufferPtr and return. If we're in KeepCommentMode, this will form the token 755/// and return true. 756bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { 757 // If BCPL comments aren't explicitly enabled for this language, emit an 758 // extension warning. 759 if (!Features.BCPLComment && !isLexingRawMode()) { 760 Diag(BufferPtr, diag::ext_bcpl_comment); 761 762 // Mark them enabled so we only emit one warning for this translation 763 // unit. 764 Features.BCPLComment = true; 765 } 766 767 // Scan over the body of the comment. The common case, when scanning, is that 768 // the comment contains normal ascii characters with nothing interesting in 769 // them. 
As such, optimize for this case with the inner loop. 770 char C; 771 do { 772 C = *CurPtr; 773 // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character. 774 // If we find a \n character, scan backwards, checking to see if it's an 775 // escaped newline, like we do for block comments. 776 777 // Skip over characters in the fast loop. 778 while (C != 0 && // Potentially EOF. 779 C != '\\' && // Potentially escaped newline. 780 C != '?' && // Potentially trigraph. 781 C != '\n' && C != '\r') // Newline or DOS-style newline. 782 C = *++CurPtr; 783 784 // If this is a newline, we're done. 785 if (C == '\n' || C == '\r') 786 break; // Found the newline? Break out! 787 788 // Otherwise, this is a hard case. Fall back on getAndAdvanceChar to 789 // properly decode the character. Read it in raw mode to avoid emitting 790 // diagnostics about things like trigraphs. If we see an escaped newline, 791 // we'll handle it below. 792 const char *OldPtr = CurPtr; 793 bool OldRawMode = isLexingRawMode(); 794 LexingRawMode = true; 795 C = getAndAdvanceChar(CurPtr, Result); 796 LexingRawMode = OldRawMode; 797 798 // If we read multiple characters, and one of those characters was a \r or 799 // \n, then we had an escaped newline within the comment. Emit diagnostic 800 // unless the next line is also a // comment. 801 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 802 for (; OldPtr != CurPtr; ++OldPtr) 803 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 804 // Okay, we found a // comment that ends in a newline, if the next 805 // line is also a // comment, but has spaces, don't emit a diagnostic. 806 if (isspace(C)) { 807 const char *ForwardPtr = CurPtr; 808 while (isspace(*ForwardPtr)) // Skip whitespace. 
809 ++ForwardPtr; 810 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 811 break; 812 } 813 814 if (!isLexingRawMode()) 815 Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); 816 break; 817 } 818 } 819 820 if (CurPtr == BufferEnd+1) { --CurPtr; break; } 821 } while (C != '\n' && C != '\r'); 822 823 // Found but did not consume the newline. 824 825 // If we are returning comments as tokens, return this comment as a token. 826 if (inKeepCommentMode()) 827 return SaveBCPLComment(Result, CurPtr); 828 829 // If we are inside a preprocessor directive and we see the end of line, 830 // return immediately, so that the lexer can return this as an EOM token. 831 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 832 BufferPtr = CurPtr; 833 return false; 834 } 835 836 // Otherwise, eat the \n character. We don't care if this is a \n\r or 837 // \r\n sequence. This is an efficiency hack (because we know the \n can't 838 // contribute to another token), it isn't needed for correctness. Note that 839 // this is ok even in KeepWhitespaceMode, because we would have returned the 840 /// comment above in that mode. 841 ++CurPtr; 842 843 // The next returned token is at the start of the line. 844 Result.setFlag(Token::StartOfLine); 845 // No leading whitespace seen so far. 846 Result.clearFlag(Token::LeadingSpace); 847 BufferPtr = CurPtr; 848 return false; 849} 850 851/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 852/// an appropriate way and return it. 853bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 854 // If we're not in a preprocessor directive, just return the // comment 855 // directly. 856 FormTokenWithChars(Result, CurPtr, tok::comment); 857 858 if (!ParsingPreprocessorDirective) 859 return true; 860 861 // If this BCPL-style comment is in a macro definition, transmogrify it into 862 // a C-style block comment. 
  std::string Spelling = PP->getSpelling(Result);
  assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?");
  Spelling[1] = '*';   // Change prefix to "/*".
  Spelling += "*/";    // add suffix.

  // Hand the rewritten text to the preprocessor as a scratch string and point
  // the token at the new location, so later spelling lookups see "/*...*/".
  Result.setKind(tok::comment);
  Result.setLocation(PP->CreateString(&Spelling[0], Spelling.size(),
                                      Result.getLocation()));
  Result.setLength(Spelling.size());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \n or \r) is part of an escaped newline sequence.  Issue a
/// diagnostic if so.  We know that the newline is inside of a block comment.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the slash and newline.  (NUL bytes are skipped as well —
  // presumably so stray NULs in the buffer don't hide the escape; confirm.)
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a backslash, this is an escaped newline; it only ends the
  // comment when the preceding character is the '*' of "*/".
  if (*CurPtr == '\\') {
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a backslash; is it the "??/" trigraph (which spells '\')?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getFeatures().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

// Pull in the platform's 16-byte vector support, if any, for the fast '/'
// scan in SkipBlockComment below.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool   // altivec.h may define 'bool' as a macro; remove it.
#endif

/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.
///
/// If KeepCommentMode is enabled, this forms a token from the comment and
/// returns true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  // Read the character after "/*" with full decoding (getCharAndSize handles
  // multi-byte spellings) so the degenerate /*/ case is seen correctly.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  // Immediate end-of-buffer: the comment was never terminated.
  if (C == 0 && CurPtr == BufferEnd+1) {
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'; stop at the first block that
      // contains one.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      __vector unsigned char Slashes = {
        '/', '/', '/', '/', '/', '/', '/', '/',
        '/', '/', '/', '/', '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      // A newline right before the '/' may be an escaped newline hiding the
      // closing "*/"; let the helper decide (it also emits diagnostics).
      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    }
    C = *CurPtr++;
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.
This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
std::string Lexer::ReadToEndOfLine() {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  std::string Result;
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (1) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      // Ordinary character: accumulate it into the returned string.
      Result += Char;
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        // Nope, normal character, continue.
        Result += Char;
        break;
      }
      // FALL THROUGH.  A NUL at BufferEnd terminates the line like a newline.
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOM transition.
      Lex(Tmp);
      assert(Tmp.is(tok::eom) && "Unexpected token!");

      // Finally, we're done, return the string we found.
      return Result;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.
Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eom);

    // Restore comment saving mode, in case it was disabled for directive.
    SetCommentRetentionState(PP->getCommentRetentionState());
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  // Otherwise, issue diagnostics for unterminated #if and missing newline.

  // If we are in a #if directive, emit an error for each unterminated
  // conditional on the stack.
  while (!ConditionalStack.empty()) {
    PP->Diag(ConditionalStack.back().IfLoc,
             diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
    Diag(BufferEnd, diag::ext_no_newline_eof);

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result);
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
  // NOTE(review): only BufferPtr is saved/restored here; any other member
  // state LexTokenInternal may touch is assumed unchanged in raw mode —
  // confirm.
  const char *TmpBufferPtr = BufferPtr;

  Token Tok;
  Tok.startToken();
  LexTokenInternal(Tok);

  // Restore state that may have changed.
  BufferPtr = TmpBufferPtr;

  // Restore the lexer back to non-skipping mode.
  LexingRawMode = false;

  if (Tok.is(tok::eof))
    return 2;
  return Tok.is(tok::l_paren);
}


/// LexTokenInternal - This implements a simple C family lexer.  It is an
/// extremely performance critical piece of code.  This assumes that the buffer
/// has a null character at the end of the file.  Return true if an error
/// occurred and compilation should terminate, false if normal.  This returns a
/// preprocessing token, not a normal token, as such, it is an internal
/// interface.  It assumes that the Flags of result have been cleared before
/// calling this.
void Lexer::LexTokenInternal(Token &Result) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(0);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return;
    }

    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  tok::TokenKind Kind;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd) {
      // Read the PP instance variable into an automatic variable, because
      // LexEndOfFile will often delete 'this'.
      Preprocessor *PPCache = PP;
      if (LexEndOfFile(Result, CurPtr-1))  // Retreat back into the file.
        return;   // Got a token to return.
      assert(PPCache && "Raw buffer::LexEndOfFile should return a token");
      return PPCache->Lex(Result);
    }

    // An embedded NUL that is not the end-of-buffer sentinel: diagnose it and
    // treat it as whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

    goto LexNextToken;   // GCC isn't tail call eliminating.
  case '\n':
  case '\r':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOM token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      SetCommentRetentionState(PP->getCommentRetentionState());

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;

      Kind = tok::eom;
      break;
    }
    // The returned token is at the start of the line.
    Result.setFlag(Token::StartOfLine);
    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode
    goto LexNextToken;   // GCC isn't tail call eliminating.
  case ' ':
  case '\t':
  case '\f':
  case '\v':
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr))
      return; // KeepWhitespaceMode

  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        Features.BCPLComment) {
      SkipBCPLComment(Result, CurPtr+2);
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      SkipBlockComment(Result, CurPtr+2);
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    goto LexNextToken;   // GCC isn't tail call eliminating.

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              true);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    // FALL THROUGH, treating L like the start of an identifier.

  // C99 6.4.2: Identifiers.
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (Features.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr, false);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // A '.' followed by a digit starts a floating constant (C99 6.4.4.2).
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (Features.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      CurPtr += SizeTmp;
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && Features.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // BCPL comment.
      // Even if BCPL comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
      // this as "foo / bar" and langauges with BCPL comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      if (Features.BCPLComment ||
          getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') {
        if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
          return; // KeepCommentMode

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going
        // through the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result)))
        return; // KeepCommentMode
      goto LexNextToken;   // GCC isn't tail call eliminating.
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.Digraphs && Char == '>') {
      Kind = tok::r_brace;    // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;   // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && Features.Microsoft) {  // %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::charize_microsoft_ext);
        Kind = tok::hashat;
      } else {
        Kind = tok::hash;       // '%:' -> '#'

        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // FIXME: -fpreprocessed mode??
        if (Result.isAtStartOfLine() && !LexingRawMode) {
          BufferPtr = CurPtr;
          PP->HandleDirective(Result);

          // As an optimization, if the preprocessor didn't switch lexers, tail
          // recurse.
          if (PP->isCurrentLexer(this)) {
            // Start a new token. If this is a #include or something, the PP may
            // want us starting at the beginning of the line again.  If so, set
            // the StartOfLine flag.
            if (IsAtStartOfLine) {
              Result.setFlag(Token::StartOfLine);
              IsAtStartOfLine = false;
            }
            goto LexNextToken;   // GCC isn't tail call eliminating.
          }

          return PP->Lex(Result);
        }
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // Inside a #include, '<' starts an angled filename, not an operator.
      return LexAngledStringLiteral(Result, CurPtr+SizeTmp);
    } else if (Char == '<' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
      Kind = tok::lesslessequal;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else if (Char == '<') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessless;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (Features.Digraphs && Char == ':') {     // '<:' -> '['
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (Features.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '=') {
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::greatergreaterequal;
    } else if (Char == '>') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greatergreater;
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Features.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Features.CPlusPlus && Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && Features.Microsoft) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::charize_microsoft_ext);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::hash;
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // FIXME: -fpreprocessed mode??
      if (Result.isAtStartOfLine() && !LexingRawMode) {
        BufferPtr = CurPtr;
        PP->HandleDirective(Result);

        // As an optimization, if the preprocessor didn't switch lexers, tail
        // recurse.
        if (PP->isCurrentLexer(this)) {
          // Start a new token. If this is a #include or something, the PP may
          // want us starting at the beginning of the line again.  If so, set
          // the StartOfLine flag.
          if (IsAtStartOfLine) {
            Result.setFlag(Token::StartOfLine);
            IsAtStartOfLine = false;
          }
          goto LexNextToken;   // GCC isn't tail call eliminating.
        }
        return PP->Lex(Result);
      }
    }
    break;

  case '@':
    // Objective C support.
    if (CurPtr[-1] == '@' && Features.ObjC1)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  case '\\':
    // FIXME: UCN's.
    // FALL THROUGH.
  default:
    Kind = tok::unknown;
    break;
  }

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
}