Lexer.cpp revision 515f43f9f23de50d155b481b8774ec40bdfd7ff2
1//===--- Lexer.cpp - C Language Family Lexer ------------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the Lexer and Token interfaces. 11// 12//===----------------------------------------------------------------------===// 13// 14// TODO: GCC Diagnostics emitted by the lexer: 15// PEDWARN: (form feed|vertical tab) in preprocessing directive 16// 17// Universal characters, unicode, char mapping: 18// WARNING: `%.*s' is not in NFKC 19// WARNING: `%.*s' is not in NFC 20// 21// Other: 22// TODO: Options to support: 23// -fexec-charset,-fwide-exec-charset 24// 25//===----------------------------------------------------------------------===// 26 27#include "clang/Lex/Lexer.h" 28#include "clang/Lex/Preprocessor.h" 29#include "clang/Lex/LexDiagnostic.h" 30#include "clang/Basic/SourceManager.h" 31#include "llvm/Support/Compiler.h" 32#include "llvm/Support/MemoryBuffer.h" 33#include <cctype> 34using namespace clang; 35 36static void InitCharacterInfo(); 37 38//===----------------------------------------------------------------------===// 39// Token Class Implementation 40//===----------------------------------------------------------------------===// 41 42/// isObjCAtKeyword - Return true if we have an ObjC keyword identifier. 43bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const { 44 if (IdentifierInfo *II = getIdentifierInfo()) 45 return II->getObjCKeywordID() == objcKey; 46 return false; 47} 48 49/// getObjCKeywordID - Return the ObjC keyword kind. 50tok::ObjCKeywordKind Token::getObjCKeywordID() const { 51 IdentifierInfo *specId = getIdentifierInfo(); 52 return specId ? 
specId->getObjCKeywordID() : tok::objc_not_keyword; 53} 54 55 56//===----------------------------------------------------------------------===// 57// Lexer Class Implementation 58//===----------------------------------------------------------------------===// 59 60void Lexer::InitLexer(const char *BufStart, const char *BufPtr, 61 const char *BufEnd) { 62 InitCharacterInfo(); 63 64 BufferStart = BufStart; 65 BufferPtr = BufPtr; 66 BufferEnd = BufEnd; 67 68 assert(BufEnd[0] == 0 && 69 "We assume that the input buffer has a null character at the end" 70 " to simplify lexing!"); 71 72 Is_PragmaLexer = false; 73 IsInConflictMarker = false; 74 75 // Start of the file is a start of line. 76 IsAtStartOfLine = true; 77 78 // We are not after parsing a #. 79 ParsingPreprocessorDirective = false; 80 81 // We are not after parsing #include. 82 ParsingFilename = false; 83 84 // We are not in raw mode. Raw mode disables diagnostics and interpretation 85 // of tokens (e.g. identifiers, thus disabling macro expansion). It is used 86 // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block 87 // or otherwise skipping over tokens. 88 LexingRawMode = false; 89 90 // Default to not keeping comments. 91 ExtendedTokenMode = 0; 92} 93 94/// Lexer constructor - Create a new lexer object for the specified buffer 95/// with the specified preprocessor managing the lexing process. This lexer 96/// assumes that the associated file buffer and Preprocessor objects will 97/// outlive it, so it doesn't take ownership of either of them. 98Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *InputFile, Preprocessor &PP) 99 : PreprocessorLexer(&PP, FID), 100 FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)), 101 Features(PP.getLangOptions()) { 102 103 InitLexer(InputFile->getBufferStart(), InputFile->getBufferStart(), 104 InputFile->getBufferEnd()); 105 106 // Default to keeping comments if the preprocessor wants them. 
107 SetCommentRetentionState(PP.getCommentRetentionState()); 108} 109 110/// Lexer constructor - Create a new raw lexer object. This object is only 111/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 112/// range will outlive it, so it doesn't take ownership of it. 113Lexer::Lexer(SourceLocation fileloc, const LangOptions &features, 114 const char *BufStart, const char *BufPtr, const char *BufEnd) 115 : FileLoc(fileloc), Features(features) { 116 117 InitLexer(BufStart, BufPtr, BufEnd); 118 119 // We *are* in raw mode. 120 LexingRawMode = true; 121} 122 123/// Lexer constructor - Create a new raw lexer object. This object is only 124/// suitable for calls to 'LexRawToken'. This lexer assumes that the text 125/// range will outlive it, so it doesn't take ownership of it. 126Lexer::Lexer(FileID FID, const llvm::MemoryBuffer *FromFile, 127 const SourceManager &SM, const LangOptions &features) 128 : FileLoc(SM.getLocForStartOfFile(FID)), Features(features) { 129 130 InitLexer(FromFile->getBufferStart(), FromFile->getBufferStart(), 131 FromFile->getBufferEnd()); 132 133 // We *are* in raw mode. 134 LexingRawMode = true; 135} 136 137/// Create_PragmaLexer: Lexer constructor - Create a new lexer object for 138/// _Pragma expansion. This has a variety of magic semantics that this method 139/// sets up. It returns a new'd Lexer that must be delete'd when done. 140/// 141/// On entrance to this routine, TokStartLoc is a macro location which has a 142/// spelling loc that indicates the bytes to be lexed for the token and an 143/// instantiation location that indicates where all lexed tokens should be 144/// "expanded from". 145/// 146/// FIXME: It would really be nice to make _Pragma just be a wrapper around a 147/// normal lexer that remaps tokens as they fly by. This would require making 148/// Preprocessor::Lex virtual. Given that, we could just dump in a magic lexer 149/// interface that could handle this stuff. 
This would pull GetMappedTokenLoc 150/// out of the critical path of the lexer! 151/// 152Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc, 153 SourceLocation InstantiationLocStart, 154 SourceLocation InstantiationLocEnd, 155 unsigned TokLen, Preprocessor &PP) { 156 SourceManager &SM = PP.getSourceManager(); 157 158 // Create the lexer as if we were going to lex the file normally. 159 FileID SpellingFID = SM.getFileID(SpellingLoc); 160 const llvm::MemoryBuffer *InputFile = SM.getBuffer(SpellingFID); 161 Lexer *L = new Lexer(SpellingFID, InputFile, PP); 162 163 // Now that the lexer is created, change the start/end locations so that we 164 // just lex the subsection of the file that we want. This is lexing from a 165 // scratch buffer. 166 const char *StrData = SM.getCharacterData(SpellingLoc); 167 168 L->BufferPtr = StrData; 169 L->BufferEnd = StrData+TokLen; 170 assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!"); 171 172 // Set the SourceLocation with the remapping information. This ensures that 173 // GetMappedTokenLoc will remap the tokens as they are lexed. 174 L->FileLoc = SM.createInstantiationLoc(SM.getLocForStartOfFile(SpellingFID), 175 InstantiationLocStart, 176 InstantiationLocEnd, TokLen); 177 178 // Ensure that the lexer thinks it is inside a directive, so that end \n will 179 // return an EOM token. 180 L->ParsingPreprocessorDirective = true; 181 182 // This lexer really is for _Pragma. 183 L->Is_PragmaLexer = true; 184 return L; 185} 186 187 188/// Stringify - Convert the specified string into a C string, with surrounding 189/// ""'s, and with escaped \ and " characters. 190std::string Lexer::Stringify(const std::string &Str, bool Charify) { 191 std::string Result = Str; 192 char Quote = Charify ? 
'\'' : '"'; 193 for (unsigned i = 0, e = Result.size(); i != e; ++i) { 194 if (Result[i] == '\\' || Result[i] == Quote) { 195 Result.insert(Result.begin()+i, '\\'); 196 ++i; ++e; 197 } 198 } 199 return Result; 200} 201 202/// Stringify - Convert the specified string into a C string by escaping '\' 203/// and " characters. This does not add surrounding ""'s to the string. 204void Lexer::Stringify(llvm::SmallVectorImpl<char> &Str) { 205 for (unsigned i = 0, e = Str.size(); i != e; ++i) { 206 if (Str[i] == '\\' || Str[i] == '"') { 207 Str.insert(Str.begin()+i, '\\'); 208 ++i; ++e; 209 } 210 } 211} 212 213static bool isWhitespace(unsigned char c); 214 215/// MeasureTokenLength - Relex the token at the specified location and return 216/// its length in bytes in the input file. If the token needs cleaning (e.g. 217/// includes a trigraph or an escaped newline) then this count includes bytes 218/// that are part of that. 219unsigned Lexer::MeasureTokenLength(SourceLocation Loc, 220 const SourceManager &SM, 221 const LangOptions &LangOpts) { 222 // TODO: this could be special cased for common tokens like identifiers, ')', 223 // etc to make this faster, if it mattered. Just look at StrData[0] to handle 224 // all obviously single-char tokens. This could use 225 // Lexer::isObviouslySimpleCharacter for example to handle identifiers or 226 // something. 227 228 // If this comes from a macro expansion, we really do want the macro name, not 229 // the token this macro expanded to. 230 Loc = SM.getInstantiationLoc(Loc); 231 std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc); 232 bool Invalid = false; 233 llvm::StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid); 234 if (Invalid) 235 return 0; 236 237 const char *StrData = Buffer.data()+LocInfo.second; 238 239 if (isWhitespace(StrData[0])) 240 return 0; 241 242 // Create a lexer starting at the beginning of this token. 
243 Lexer TheLexer(Loc, LangOpts, Buffer.begin(), StrData, Buffer.end()); 244 TheLexer.SetCommentRetentionState(true); 245 Token TheTok; 246 TheLexer.LexFromRawLexer(TheTok); 247 return TheTok.getLength(); 248} 249 250//===----------------------------------------------------------------------===// 251// Character information. 252//===----------------------------------------------------------------------===// 253 254enum { 255 CHAR_HORZ_WS = 0x01, // ' ', '\t', '\f', '\v'. Note, no '\0' 256 CHAR_VERT_WS = 0x02, // '\r', '\n' 257 CHAR_LETTER = 0x04, // a-z,A-Z 258 CHAR_NUMBER = 0x08, // 0-9 259 CHAR_UNDER = 0x10, // _ 260 CHAR_PERIOD = 0x20 // . 261}; 262 263// Statically initialize CharInfo table based on ASCII character set 264// Reference: FreeBSD 7.2 /usr/share/misc/ascii 265static const unsigned char CharInfo[256] = 266{ 267// 0 NUL 1 SOH 2 STX 3 ETX 268// 4 EOT 5 ENQ 6 ACK 7 BEL 269 0 , 0 , 0 , 0 , 270 0 , 0 , 0 , 0 , 271// 8 BS 9 HT 10 NL 11 VT 272//12 NP 13 CR 14 SO 15 SI 273 0 , CHAR_HORZ_WS, CHAR_VERT_WS, CHAR_HORZ_WS, 274 CHAR_HORZ_WS, CHAR_VERT_WS, 0 , 0 , 275//16 DLE 17 DC1 18 DC2 19 DC3 276//20 DC4 21 NAK 22 SYN 23 ETB 277 0 , 0 , 0 , 0 , 278 0 , 0 , 0 , 0 , 279//24 CAN 25 EM 26 SUB 27 ESC 280//28 FS 29 GS 30 RS 31 US 281 0 , 0 , 0 , 0 , 282 0 , 0 , 0 , 0 , 283//32 SP 33 ! 34 " 35 # 284//36 $ 37 % 38 & 39 ' 285 CHAR_HORZ_WS, 0 , 0 , 0 , 286 0 , 0 , 0 , 0 , 287//40 ( 41 ) 42 * 43 + 288//44 , 45 - 46 . 47 / 289 0 , 0 , 0 , 0 , 290 0 , 0 , CHAR_PERIOD , 0 , 291//48 0 49 1 50 2 51 3 292//52 4 53 5 54 6 55 7 293 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 294 CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , CHAR_NUMBER , 295//56 8 57 9 58 : 59 ; 296//60 < 61 = 62 > 63 ? 
297 CHAR_NUMBER , CHAR_NUMBER , 0 , 0 , 298 0 , 0 , 0 , 0 , 299//64 @ 65 A 66 B 67 C 300//68 D 69 E 70 F 71 G 301 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 302 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 303//72 H 73 I 74 J 75 K 304//76 L 77 M 78 N 79 O 305 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 306 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 307//80 P 81 Q 82 R 83 S 308//84 T 85 U 86 V 87 W 309 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 310 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 311//88 X 89 Y 90 Z 91 [ 312//92 \ 93 ] 94 ^ 95 _ 313 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , 314 0 , 0 , 0 , CHAR_UNDER , 315//96 ` 97 a 98 b 99 c 316//100 d 101 e 102 f 103 g 317 0 , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 318 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 319//104 h 105 i 106 j 107 k 320//108 l 109 m 110 n 111 o 321 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 322 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 323//112 p 113 q 114 r 115 s 324//116 t 117 u 118 v 119 w 325 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 326 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 327//120 x 121 y 122 z 123 { 328//124 | 125 } 126 ~ 127 DEL 329 CHAR_LETTER , CHAR_LETTER , CHAR_LETTER , 0 , 330 0 , 0 , 0 , 0 331}; 332 333static void InitCharacterInfo() { 334 static bool isInited = false; 335 if (isInited) return; 336 // check the statically-initialized CharInfo table 337 assert(CHAR_HORZ_WS == CharInfo[(int)' ']); 338 assert(CHAR_HORZ_WS == CharInfo[(int)'\t']); 339 assert(CHAR_HORZ_WS == CharInfo[(int)'\f']); 340 assert(CHAR_HORZ_WS == CharInfo[(int)'\v']); 341 assert(CHAR_VERT_WS == CharInfo[(int)'\n']); 342 assert(CHAR_VERT_WS == CharInfo[(int)'\r']); 343 assert(CHAR_UNDER == CharInfo[(int)'_']); 344 assert(CHAR_PERIOD == CharInfo[(int)'.']); 345 for (unsigned i = 'a'; i <= 'z'; ++i) { 346 assert(CHAR_LETTER == CharInfo[i]); 347 assert(CHAR_LETTER == 
CharInfo[i+'A'-'a']); 348 } 349 for (unsigned i = '0'; i <= '9'; ++i) 350 assert(CHAR_NUMBER == CharInfo[i]); 351 352 isInited = true; 353} 354 355 356/// isIdentifierBody - Return true if this is the body character of an 357/// identifier, which is [a-zA-Z0-9_]. 358static inline bool isIdentifierBody(unsigned char c) { 359 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER)) ? true : false; 360} 361 362/// isHorizontalWhitespace - Return true if this character is horizontal 363/// whitespace: ' ', '\t', '\f', '\v'. Note that this returns false for '\0'. 364static inline bool isHorizontalWhitespace(unsigned char c) { 365 return (CharInfo[c] & CHAR_HORZ_WS) ? true : false; 366} 367 368/// isWhitespace - Return true if this character is horizontal or vertical 369/// whitespace: ' ', '\t', '\f', '\v', '\n', '\r'. Note that this returns false 370/// for '\0'. 371static inline bool isWhitespace(unsigned char c) { 372 return (CharInfo[c] & (CHAR_HORZ_WS|CHAR_VERT_WS)) ? true : false; 373} 374 375/// isNumberBody - Return true if this is the body character of an 376/// preprocessing number, which is [a-zA-Z0-9_.]. 377static inline bool isNumberBody(unsigned char c) { 378 return (CharInfo[c] & (CHAR_LETTER|CHAR_NUMBER|CHAR_UNDER|CHAR_PERIOD)) ? 379 true : false; 380} 381 382 383//===----------------------------------------------------------------------===// 384// Diagnostics forwarding code. 385//===----------------------------------------------------------------------===// 386 387/// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the 388/// lexer buffer was all instantiated at a single point, perform the mapping. 389/// This is currently only used for _Pragma implementation, so it is the slow 390/// path of the hot getSourceLocation method. Do not allow it to be inlined. 
391static DISABLE_INLINE SourceLocation GetMappedTokenLoc(Preprocessor &PP, 392 SourceLocation FileLoc, 393 unsigned CharNo, 394 unsigned TokLen); 395static SourceLocation GetMappedTokenLoc(Preprocessor &PP, 396 SourceLocation FileLoc, 397 unsigned CharNo, unsigned TokLen) { 398 assert(FileLoc.isMacroID() && "Must be an instantiation"); 399 400 // Otherwise, we're lexing "mapped tokens". This is used for things like 401 // _Pragma handling. Combine the instantiation location of FileLoc with the 402 // spelling location. 403 SourceManager &SM = PP.getSourceManager(); 404 405 // Create a new SLoc which is expanded from Instantiation(FileLoc) but whose 406 // characters come from spelling(FileLoc)+Offset. 407 SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc); 408 SpellingLoc = SpellingLoc.getFileLocWithOffset(CharNo); 409 410 // Figure out the expansion loc range, which is the range covered by the 411 // original _Pragma(...) sequence. 412 std::pair<SourceLocation,SourceLocation> II = 413 SM.getImmediateInstantiationRange(FileLoc); 414 415 return SM.createInstantiationLoc(SpellingLoc, II.first, II.second, TokLen); 416} 417 418/// getSourceLocation - Return a source location identifier for the specified 419/// offset in the current file. 420SourceLocation Lexer::getSourceLocation(const char *Loc, 421 unsigned TokLen) const { 422 assert(Loc >= BufferStart && Loc <= BufferEnd && 423 "Location out of range for this buffer!"); 424 425 // In the normal case, we're just lexing from a simple file buffer, return 426 // the file id from FileLoc with the offset specified. 427 unsigned CharNo = Loc-BufferStart; 428 if (FileLoc.isFileID()) 429 return FileLoc.getFileLocWithOffset(CharNo); 430 431 // Otherwise, this is the _Pragma lexer case, which pretends that all of the 432 // tokens are lexed from where the _Pragma was defined. 
433 assert(PP && "This doesn't work on raw lexers"); 434 return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen); 435} 436 437/// Diag - Forwarding function for diagnostics. This translate a source 438/// position in the current buffer into a SourceLocation object for rendering. 439DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const { 440 return PP->Diag(getSourceLocation(Loc), DiagID); 441} 442 443//===----------------------------------------------------------------------===// 444// Trigraph and Escaped Newline Handling Code. 445//===----------------------------------------------------------------------===// 446 447/// GetTrigraphCharForLetter - Given a character that occurs after a ?? pair, 448/// return the decoded trigraph letter it corresponds to, or '\0' if nothing. 449static char GetTrigraphCharForLetter(char Letter) { 450 switch (Letter) { 451 default: return 0; 452 case '=': return '#'; 453 case ')': return ']'; 454 case '(': return '['; 455 case '!': return '|'; 456 case '\'': return '^'; 457 case '>': return '}'; 458 case '/': return '\\'; 459 case '<': return '{'; 460 case '-': return '~'; 461 } 462} 463 464/// DecodeTrigraphChar - If the specified character is a legal trigraph when 465/// prefixed with ??, emit a trigraph warning. If trigraphs are enabled, 466/// return the result character. Finally, emit a warning about trigraph use 467/// whether trigraphs are enabled or not. 468static char DecodeTrigraphChar(const char *CP, Lexer *L) { 469 char Res = GetTrigraphCharForLetter(*CP); 470 if (!Res || !L) return Res; 471 472 if (!L->getFeatures().Trigraphs) { 473 if (!L->isLexingRawMode()) 474 L->Diag(CP-2, diag::trigraph_ignored); 475 return 0; 476 } 477 478 if (!L->isLexingRawMode()) 479 L->Diag(CP-2, diag::trigraph_converted) << std::string()+Res; 480 return Res; 481} 482 483/// getEscapedNewLineSize - Return the size of the specified escaped newline, 484/// or 0 if it is not an escaped newline. 
P[-1] is known to be a "\" or a 485/// trigraph equivalent on entry to this function. 486unsigned Lexer::getEscapedNewLineSize(const char *Ptr) { 487 unsigned Size = 0; 488 while (isWhitespace(Ptr[Size])) { 489 ++Size; 490 491 if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r') 492 continue; 493 494 // If this is a \r\n or \n\r, skip the other half. 495 if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') && 496 Ptr[Size-1] != Ptr[Size]) 497 ++Size; 498 499 return Size; 500 } 501 502 // Not an escaped newline, must be a \t or something else. 503 return 0; 504} 505 506/// SkipEscapedNewLines - If P points to an escaped newline (or a series of 507/// them), skip over them and return the first non-escaped-newline found, 508/// otherwise return P. 509const char *Lexer::SkipEscapedNewLines(const char *P) { 510 while (1) { 511 const char *AfterEscape; 512 if (*P == '\\') { 513 AfterEscape = P+1; 514 } else if (*P == '?') { 515 // If not a trigraph for escape, bail out. 516 if (P[1] != '?' || P[2] != '/') 517 return P; 518 AfterEscape = P+3; 519 } else { 520 return P; 521 } 522 523 unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape); 524 if (NewLineSize == 0) return P; 525 P = AfterEscape+NewLineSize; 526 } 527} 528 529 530/// getCharAndSizeSlow - Peek a single 'character' from the specified buffer, 531/// get its size, and return it. This is tricky in several cases: 532/// 1. If currently at the start of a trigraph, we warn about the trigraph, 533/// then either return the trigraph (skipping 3 chars) or the '?', 534/// depending on whether trigraphs are enabled or not. 535/// 2. If this is an escaped newline (potentially with whitespace between 536/// the backslash and newline), implicitly skip the newline and return 537/// the char after it. 538/// 3. If this is a UCN, return it. FIXME: C++ UCN's? 539/// 540/// This handles the slow/uncommon case of the getCharAndSize method. 
Here we 541/// know that we can accumulate into Size, and that we have already incremented 542/// Ptr by Size bytes. 543/// 544/// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should 545/// be updated to match. 546/// 547char Lexer::getCharAndSizeSlow(const char *Ptr, unsigned &Size, 548 Token *Tok) { 549 // If we have a slash, look for an escaped newline. 550 if (Ptr[0] == '\\') { 551 ++Size; 552 ++Ptr; 553Slash: 554 // Common case, backslash-char where the char is not whitespace. 555 if (!isWhitespace(Ptr[0])) return '\\'; 556 557 // See if we have optional whitespace characters between the slash and 558 // newline. 559 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 560 // Remember that this token needs to be cleaned. 561 if (Tok) Tok->setFlag(Token::NeedsCleaning); 562 563 // Warn if there was whitespace between the backslash and newline. 564 if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode()) 565 Diag(Ptr, diag::backslash_newline_space); 566 567 // Found backslash<whitespace><newline>. Parse the char after it. 568 Size += EscapedNewLineSize; 569 Ptr += EscapedNewLineSize; 570 // Use slow version to accumulate a correct size field. 571 return getCharAndSizeSlow(Ptr, Size, Tok); 572 } 573 574 // Otherwise, this is not an escaped newline, just return the slash. 575 return '\\'; 576 } 577 578 // If this is a trigraph, process it. 579 if (Ptr[0] == '?' && Ptr[1] == '?') { 580 // If this is actually a legal trigraph (not something like "??x"), emit 581 // a trigraph warning. If so, and if trigraphs are enabled, return it. 582 if (char C = DecodeTrigraphChar(Ptr+2, Tok ? this : 0)) { 583 // Remember that this token needs to be cleaned. 584 if (Tok) Tok->setFlag(Token::NeedsCleaning); 585 586 Ptr += 3; 587 Size += 3; 588 if (C == '\\') goto Slash; 589 return C; 590 } 591 } 592 593 // If this is neither, return a single character. 
594 ++Size; 595 return *Ptr; 596} 597 598 599/// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the 600/// getCharAndSizeNoWarn method. Here we know that we can accumulate into Size, 601/// and that we have already incremented Ptr by Size bytes. 602/// 603/// NOTE: When this method is updated, getCharAndSizeSlow (above) should 604/// be updated to match. 605char Lexer::getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size, 606 const LangOptions &Features) { 607 // If we have a slash, look for an escaped newline. 608 if (Ptr[0] == '\\') { 609 ++Size; 610 ++Ptr; 611Slash: 612 // Common case, backslash-char where the char is not whitespace. 613 if (!isWhitespace(Ptr[0])) return '\\'; 614 615 // See if we have optional whitespace characters followed by a newline. 616 if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) { 617 // Found backslash<whitespace><newline>. Parse the char after it. 618 Size += EscapedNewLineSize; 619 Ptr += EscapedNewLineSize; 620 621 // Use slow version to accumulate a correct size field. 622 return getCharAndSizeSlowNoWarn(Ptr, Size, Features); 623 } 624 625 // Otherwise, this is not an escaped newline, just return the slash. 626 return '\\'; 627 } 628 629 // If this is a trigraph, process it. 630 if (Features.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') { 631 // If this is actually a legal trigraph (not something like "??x"), return 632 // it. 633 if (char C = GetTrigraphCharForLetter(Ptr[2])) { 634 Ptr += 3; 635 Size += 3; 636 if (C == '\\') goto Slash; 637 return C; 638 } 639 } 640 641 // If this is neither, return a single character. 642 ++Size; 643 return *Ptr; 644} 645 646//===----------------------------------------------------------------------===// 647// Helper methods for lexing. 
648//===----------------------------------------------------------------------===// 649 650void Lexer::LexIdentifier(Token &Result, const char *CurPtr) { 651 // Match [_A-Za-z0-9]*, we have already matched [_A-Za-z$] 652 unsigned Size; 653 unsigned char C = *CurPtr++; 654 while (isIdentifierBody(C)) 655 C = *CurPtr++; 656 657 --CurPtr; // Back up over the skipped character. 658 659 // Fast path, no $,\,? in identifier found. '\' might be an escaped newline 660 // or UCN, and ? might be a trigraph for '\', an escaped newline or UCN. 661 // FIXME: UCNs. 662 // 663 // TODO: Could merge these checks into a CharInfo flag to make the comparison 664 // cheaper 665 if (C != '\\' && C != '?' && (C != '$' || !Features.DollarIdents)) { 666FinishIdentifier: 667 const char *IdStart = BufferPtr; 668 FormTokenWithChars(Result, CurPtr, tok::identifier); 669 670 // If we are in raw mode, return this identifier raw. There is no need to 671 // look up identifier information or attempt to macro expand it. 672 if (LexingRawMode) return; 673 674 // Fill in Result.IdentifierInfo, looking up the identifier in the 675 // identifier table. 676 IdentifierInfo *II = PP->LookUpIdentifierInfo(Result, IdStart); 677 678 // Change the kind of this identifier to the appropriate token kind, e.g. 679 // turning "for" into a keyword. 680 Result.setKind(II->getTokenID()); 681 682 // Finally, now that we know we have an identifier, pass this off to the 683 // preprocessor, which may macro expand it or something. 684 if (II->isHandleIdentifierCase()) 685 PP->HandleIdentifier(Result); 686 return; 687 } 688 689 // Otherwise, $,\,? in identifier found. Enter slower path. 690 691 C = getCharAndSize(CurPtr, Size); 692 while (1) { 693 if (C == '$') { 694 // If we hit a $ and they are not supported in identifiers, we are done. 695 if (!Features.DollarIdents) goto FinishIdentifier; 696 697 // Otherwise, emit a diagnostic and continue. 
698 if (!isLexingRawMode()) 699 Diag(CurPtr, diag::ext_dollar_in_identifier); 700 CurPtr = ConsumeChar(CurPtr, Size, Result); 701 C = getCharAndSize(CurPtr, Size); 702 continue; 703 } else if (!isIdentifierBody(C)) { // FIXME: UCNs. 704 // Found end of identifier. 705 goto FinishIdentifier; 706 } 707 708 // Otherwise, this character is good, consume it. 709 CurPtr = ConsumeChar(CurPtr, Size, Result); 710 711 C = getCharAndSize(CurPtr, Size); 712 while (isIdentifierBody(C)) { // FIXME: UCNs. 713 CurPtr = ConsumeChar(CurPtr, Size, Result); 714 C = getCharAndSize(CurPtr, Size); 715 } 716 } 717} 718 719 720/// LexNumericConstant - Lex the remainder of a integer or floating point 721/// constant. From[-1] is the first character lexed. Return the end of the 722/// constant. 723void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { 724 unsigned Size; 725 char C = getCharAndSize(CurPtr, Size); 726 char PrevCh = 0; 727 while (isNumberBody(C)) { // FIXME: UCNs? 728 CurPtr = ConsumeChar(CurPtr, Size, Result); 729 PrevCh = C; 730 C = getCharAndSize(CurPtr, Size); 731 } 732 733 // If we fell out, check for a sign, due to 1e+12. If we have one, continue. 734 if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) 735 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 736 737 // If we have a hex FP constant, continue. 738 if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p') && 739 (!PP || !PP->getLangOptions().CPlusPlus0x)) 740 return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result)); 741 742 // Update the location of token as well as BufferPtr. 743 const char *TokStart = BufferPtr; 744 FormTokenWithChars(Result, CurPtr, tok::numeric_constant); 745 Result.setLiteralData(TokStart); 746} 747 748/// LexStringLiteral - Lex the remainder of a string literal, after having lexed 749/// either " or L". 
750void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { 751 const char *NulCharacter = 0; // Does this string contain the \0 character? 752 753 char C = getAndAdvanceChar(CurPtr, Result); 754 while (C != '"') { 755 // Skip escaped characters. 756 if (C == '\\') { 757 // Skip the escaped character. 758 C = getAndAdvanceChar(CurPtr, Result); 759 } else if (C == '\n' || C == '\r' || // Newline. 760 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 761 if (!isLexingRawMode() && !Features.AsmPreprocessor) 762 Diag(BufferPtr, diag::err_unterminated_string); 763 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 764 return; 765 } else if (C == 0) { 766 NulCharacter = CurPtr-1; 767 } 768 C = getAndAdvanceChar(CurPtr, Result); 769 } 770 771 // If a nul character existed in the string, warn about it. 772 if (NulCharacter && !isLexingRawMode()) 773 Diag(NulCharacter, diag::null_in_string); 774 775 // Update the location of the token as well as the BufferPtr instance var. 776 const char *TokStart = BufferPtr; 777 FormTokenWithChars(Result, CurPtr, 778 Wide ? tok::wide_string_literal : tok::string_literal); 779 Result.setLiteralData(TokStart); 780} 781 782/// LexAngledStringLiteral - Lex the remainder of an angled string literal, 783/// after having lexed the '<' character. This is used for #include filenames. 784void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) { 785 const char *NulCharacter = 0; // Does this string contain the \0 character? 786 const char *AfterLessPos = CurPtr; 787 char C = getAndAdvanceChar(CurPtr, Result); 788 while (C != '>') { 789 // Skip escaped characters. 790 if (C == '\\') { 791 // Skip the escaped character. 792 C = getAndAdvanceChar(CurPtr, Result); 793 } else if (C == '\n' || C == '\r' || // Newline. 794 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 795 // If the filename is unterminated, then it must just be a lone < 796 // character. Return this as such. 
797 FormTokenWithChars(Result, AfterLessPos, tok::less); 798 return; 799 } else if (C == 0) { 800 NulCharacter = CurPtr-1; 801 } 802 C = getAndAdvanceChar(CurPtr, Result); 803 } 804 805 // If a nul character existed in the string, warn about it. 806 if (NulCharacter && !isLexingRawMode()) 807 Diag(NulCharacter, diag::null_in_string); 808 809 // Update the location of token as well as BufferPtr. 810 const char *TokStart = BufferPtr; 811 FormTokenWithChars(Result, CurPtr, tok::angle_string_literal); 812 Result.setLiteralData(TokStart); 813} 814 815 816/// LexCharConstant - Lex the remainder of a character constant, after having 817/// lexed either ' or L'. 818void Lexer::LexCharConstant(Token &Result, const char *CurPtr) { 819 const char *NulCharacter = 0; // Does this character contain the \0 character? 820 821 // Handle the common case of 'x' and '\y' efficiently. 822 char C = getAndAdvanceChar(CurPtr, Result); 823 if (C == '\'') { 824 if (!isLexingRawMode() && !Features.AsmPreprocessor) 825 Diag(BufferPtr, diag::err_empty_character); 826 FormTokenWithChars(Result, CurPtr, tok::unknown); 827 return; 828 } else if (C == '\\') { 829 // Skip the escaped character. 830 // FIXME: UCN's. 831 C = getAndAdvanceChar(CurPtr, Result); 832 } 833 834 if (C && C != '\n' && C != '\r' && CurPtr[0] == '\'') { 835 ++CurPtr; 836 } else { 837 // Fall back on generic code for embedded nulls, newlines, wide chars. 838 do { 839 // Skip escaped characters. 840 if (C == '\\') { 841 // Skip the escaped character. 842 C = getAndAdvanceChar(CurPtr, Result); 843 } else if (C == '\n' || C == '\r' || // Newline. 844 (C == 0 && CurPtr-1 == BufferEnd)) { // End of file. 
845 if (!isLexingRawMode() && !Features.AsmPreprocessor) 846 Diag(BufferPtr, diag::err_unterminated_char); 847 FormTokenWithChars(Result, CurPtr-1, tok::unknown); 848 return; 849 } else if (C == 0) { 850 NulCharacter = CurPtr-1; 851 } 852 C = getAndAdvanceChar(CurPtr, Result); 853 } while (C != '\''); 854 } 855 856 if (NulCharacter && !isLexingRawMode()) 857 Diag(NulCharacter, diag::null_in_char); 858 859 // Update the location of token as well as BufferPtr. 860 const char *TokStart = BufferPtr; 861 FormTokenWithChars(Result, CurPtr, tok::char_constant); 862 Result.setLiteralData(TokStart); 863} 864 865/// SkipWhitespace - Efficiently skip over a series of whitespace characters. 866/// Update BufferPtr to point to the next non-whitespace character and return. 867/// 868/// This method forms a token and returns true if KeepWhitespaceMode is enabled. 869/// 870bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr) { 871 // Whitespace - Skip it, then return the token after the whitespace. 872 unsigned char Char = *CurPtr; // Skip consequtive spaces efficiently. 873 while (1) { 874 // Skip horizontal whitespace very aggressively. 875 while (isHorizontalWhitespace(Char)) 876 Char = *++CurPtr; 877 878 // Otherwise if we have something other than whitespace, we're done. 879 if (Char != '\n' && Char != '\r') 880 break; 881 882 if (ParsingPreprocessorDirective) { 883 // End of preprocessor directive line, let LexTokenInternal handle this. 884 BufferPtr = CurPtr; 885 return false; 886 } 887 888 // ok, but handle newline. 889 // The returned token is at the start of the line. 890 Result.setFlag(Token::StartOfLine); 891 // No leading whitespace seen so far. 892 Result.clearFlag(Token::LeadingSpace); 893 Char = *++CurPtr; 894 } 895 896 // If this isn't immediately after a newline, there is leading space. 
897 char PrevChar = CurPtr[-1]; 898 if (PrevChar != '\n' && PrevChar != '\r') 899 Result.setFlag(Token::LeadingSpace); 900 901 // If the client wants us to return whitespace, return it now. 902 if (isKeepWhitespaceMode()) { 903 FormTokenWithChars(Result, CurPtr, tok::unknown); 904 return true; 905 } 906 907 BufferPtr = CurPtr; 908 return false; 909} 910 911// SkipBCPLComment - We have just read the // characters from input. Skip until 912// we find the newline character thats terminate the comment. Then update 913/// BufferPtr and return. 914/// 915/// If we're in KeepCommentMode or any CommentHandler has inserted 916/// some tokens, this will store the first token and return true. 917bool Lexer::SkipBCPLComment(Token &Result, const char *CurPtr) { 918 // If BCPL comments aren't explicitly enabled for this language, emit an 919 // extension warning. 920 if (!Features.BCPLComment && !isLexingRawMode()) { 921 Diag(BufferPtr, diag::ext_bcpl_comment); 922 923 // Mark them enabled so we only emit one warning for this translation 924 // unit. 925 Features.BCPLComment = true; 926 } 927 928 // Scan over the body of the comment. The common case, when scanning, is that 929 // the comment contains normal ascii characters with nothing interesting in 930 // them. As such, optimize for this case with the inner loop. 931 char C; 932 do { 933 C = *CurPtr; 934 // FIXME: Speedup BCPL comment lexing. Just scan for a \n or \r character. 935 // If we find a \n character, scan backwards, checking to see if it's an 936 // escaped newline, like we do for block comments. 937 938 // Skip over characters in the fast loop. 939 while (C != 0 && // Potentially EOF. 940 C != '\\' && // Potentially escaped newline. 941 C != '?' && // Potentially trigraph. 942 C != '\n' && C != '\r') // Newline or DOS-style newline. 943 C = *++CurPtr; 944 945 // If this is a newline, we're done. 946 if (C == '\n' || C == '\r') 947 break; // Found the newline? Break out! 948 949 // Otherwise, this is a hard case. 
Fall back on getAndAdvanceChar to 950 // properly decode the character. Read it in raw mode to avoid emitting 951 // diagnostics about things like trigraphs. If we see an escaped newline, 952 // we'll handle it below. 953 const char *OldPtr = CurPtr; 954 bool OldRawMode = isLexingRawMode(); 955 LexingRawMode = true; 956 C = getAndAdvanceChar(CurPtr, Result); 957 LexingRawMode = OldRawMode; 958 959 // If the char that we finally got was a \n, then we must have had something 960 // like \<newline><newline>. We don't want to have consumed the second 961 // newline, we want CurPtr, to end up pointing to it down below. 962 if (C == '\n' || C == '\r') { 963 --CurPtr; 964 C = 'x'; // doesn't matter what this is. 965 } 966 967 // If we read multiple characters, and one of those characters was a \r or 968 // \n, then we had an escaped newline within the comment. Emit diagnostic 969 // unless the next line is also a // comment. 970 if (CurPtr != OldPtr+1 && C != '/' && CurPtr[0] != '/') { 971 for (; OldPtr != CurPtr; ++OldPtr) 972 if (OldPtr[0] == '\n' || OldPtr[0] == '\r') { 973 // Okay, we found a // comment that ends in a newline, if the next 974 // line is also a // comment, but has spaces, don't emit a diagnostic. 975 if (isspace(C)) { 976 const char *ForwardPtr = CurPtr; 977 while (isspace(*ForwardPtr)) // Skip whitespace. 978 ++ForwardPtr; 979 if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/') 980 break; 981 } 982 983 if (!isLexingRawMode()) 984 Diag(OldPtr-1, diag::ext_multi_line_bcpl_comment); 985 break; 986 } 987 } 988 989 if (CurPtr == BufferEnd+1) { --CurPtr; break; } 990 } while (C != '\n' && C != '\r'); 991 992 // Found but did not consume the newline. Notify comment handlers about the 993 // comment unless we're in a #if 0 block. 994 if (PP && !isLexingRawMode() && 995 PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr), 996 getSourceLocation(CurPtr)))) { 997 BufferPtr = CurPtr; 998 return true; // A token has to be returned. 
999 } 1000 1001 // If we are returning comments as tokens, return this comment as a token. 1002 if (inKeepCommentMode()) 1003 return SaveBCPLComment(Result, CurPtr); 1004 1005 // If we are inside a preprocessor directive and we see the end of line, 1006 // return immediately, so that the lexer can return this as an EOM token. 1007 if (ParsingPreprocessorDirective || CurPtr == BufferEnd) { 1008 BufferPtr = CurPtr; 1009 return false; 1010 } 1011 1012 // Otherwise, eat the \n character. We don't care if this is a \n\r or 1013 // \r\n sequence. This is an efficiency hack (because we know the \n can't 1014 // contribute to another token), it isn't needed for correctness. Note that 1015 // this is ok even in KeepWhitespaceMode, because we would have returned the 1016 /// comment above in that mode. 1017 ++CurPtr; 1018 1019 // The next returned token is at the start of the line. 1020 Result.setFlag(Token::StartOfLine); 1021 // No leading whitespace seen so far. 1022 Result.clearFlag(Token::LeadingSpace); 1023 BufferPtr = CurPtr; 1024 return false; 1025} 1026 1027/// SaveBCPLComment - If in save-comment mode, package up this BCPL comment in 1028/// an appropriate way and return it. 1029bool Lexer::SaveBCPLComment(Token &Result, const char *CurPtr) { 1030 // If we're not in a preprocessor directive, just return the // comment 1031 // directly. 1032 FormTokenWithChars(Result, CurPtr, tok::comment); 1033 1034 if (!ParsingPreprocessorDirective) 1035 return true; 1036 1037 // If this BCPL-style comment is in a macro definition, transmogrify it into 1038 // a C-style block comment. 1039 bool Invalid = false; 1040 std::string Spelling = PP->getSpelling(Result, &Invalid); 1041 if (Invalid) 1042 return true; 1043 1044 assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not bcpl comment?"); 1045 Spelling[1] = '*'; // Change prefix to "/*". 1046 Spelling += "*/"; // add suffix. 

  Result.setKind(tok::comment);
  PP->CreateString(&Spelling[0], Spelling.size(), Result,
                   Result.getLocation());
  return true;
}

/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \n or \r) is part of an escaped newline sequence that
/// sits between the '*' and '/' ending a block comment.  Issue a diagnostic if
/// so.  We know that the newline is inside of a block comment.
///
/// CurPtr points at the newline character; L is the lexer used to query the
/// language features / raw mode and to emit diagnostics.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr,
                                                  Lexer *L) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Back up off the newline.
  --CurPtr;

  // If this is a two-character newline sequence, skip the other character.
  if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
    // \n\n or \r\r -> not escaped newline.
    if (CurPtr[0] == CurPtr[1])
      return false;
    // \n\r or \r\n -> skip the newline.
    --CurPtr;
  }

  // If we have horizontal whitespace, skip over it.  We allow whitespace
  // between the backslash and newline.  Embedded nul characters are skipped
  // here as well.
  bool HasSpace = false;
  while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
    --CurPtr;
    HasSpace = true;
  }

  // If we have a backslash, we know this is an escaped newline.
  if (*CurPtr == '\\') {
    // It only ends the comment if the backslash directly follows a '*'.
    if (CurPtr[-1] != '*') return false;
  } else {
    // It isn't a backslash, is it the ??/ trigraph?
    if (CurPtr[0] != '/' || CurPtr[-1] != '?' || CurPtr[-2] != '?' ||
        CurPtr[-3] != '*')
      return false;

    // This is the trigraph ending the comment.  Emit a stern warning!
    // Point CurPtr at the first '?' of the trigraph for the diagnostics below.
    CurPtr -= 2;

    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!L->getFeatures().Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(CurPtr, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(CurPtr, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (HasSpace && !L->isLexingRawMode())
    L->Diag(CurPtr, diag::backslash_newline_space);

  return true;
}

// Vector intrinsics used by the fast '/' scan in SkipBlockComment.
#ifdef __SSE2__
#include <emmintrin.h>
#elif __ALTIVEC__
#include <altivec.h>
#undef bool
#endif

/// SkipBlockComment - We have just read the /* characters from input.  Read
/// until we find the */ characters that terminate the comment.  Note that we
/// don't bother decoding trigraphs or escaped newlines in block comments,
/// because they cannot cause the comment to end.  The only thing that can
/// happen is the comment could end with an escaped newline between the */ end
/// of comment.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr) {
  // Scan one character past where we should, looking for a '/' character.  Once
  // we find it, check to see if it was preceded by a *.  This common
  // optimization helps people who like to put a lot of * characters in their
  // comments.

  // The first character we get with newlines and trigraphs skipped to handle
  // the degenerate /*/ case below correctly if the * has an escaped newline
  // after it.
  unsigned CharSize;
  unsigned char C = getCharAndSize(CurPtr, CharSize);
  CurPtr += CharSize;
  if (C == 0 && CurPtr == BufferEnd+1) {
    // The buffer ends right after the opening /*.
    if (!isLexingRawMode())
      Diag(BufferPtr, diag::err_unterminated_block_comment);
    --CurPtr;

    // KeepWhitespaceMode should return this broken comment as a token.  Since
    // it isn't a well formed comment, just return it as an 'unknown' token.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      return true;
    }

    BufferPtr = CurPtr;
    return false;
  }

  // Check to see if the first character after the '/*' is another /.  If so,
  // then this slash does not end the block comment, it is part of it.
  if (C == '/')
    C = *CurPtr++;

  while (1) {
    // Skip over all non-interesting characters until we find end of buffer or a
    // (probably ending) '/' character.  The bulk scan is only attempted when
    // more than 24 bytes remain in the buffer.
    if (CurPtr + 24 < BufferEnd) {
      // While not aligned to a 16-byte boundary.
      while (C != '/' && ((intptr_t)CurPtr & 0x0F) != 0)
        C = *CurPtr++;

      if (C == '/') goto FoundSlash;

#ifdef __SSE2__
      // Compare 16 bytes at a time against '/'.
      __m128i Slashes = _mm_set_epi8('/', '/', '/', '/', '/', '/', '/', '/',
                                     '/', '/', '/', '/', '/', '/', '/', '/');
      while (CurPtr+16 <= BufferEnd &&
             _mm_movemask_epi8(_mm_cmpeq_epi8(*(__m128i*)CurPtr, Slashes)) == 0)
        CurPtr += 16;
#elif __ALTIVEC__
      // Compare 16 bytes at a time against '/'.
      __vector unsigned char Slashes = {
        '/', '/', '/', '/',  '/', '/', '/', '/',
        '/', '/', '/', '/',  '/', '/', '/', '/'
      };
      while (CurPtr+16 <= BufferEnd &&
             !vec_any_eq(*(vector unsigned char*)CurPtr, Slashes))
        CurPtr += 16;
#else
      // Scan for '/' quickly.  Many block comments are very large.
      while (CurPtr[0] != '/' &&
             CurPtr[1] != '/' &&
             CurPtr[2] != '/' &&
             CurPtr[3] != '/' &&
             CurPtr+4 < BufferEnd) {
        CurPtr += 4;
      }
#endif

      // It has to be one of the bytes scanned, increment to it and read one.
      C = *CurPtr++;
    }

    // Loop to scan the remainder.
    while (C != '/' && C != '\0')
      C = *CurPtr++;

  FoundSlash:
    if (C == '/') {
      // CurPtr is one past the '/', so CurPtr[-2] is the character before it.
      if (CurPtr[-2] == '*')  // We found the final */.  We're done!
        break;

      if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
        if (isEndOfBlockCommentWithEscapedNewLine(CurPtr-2, this)) {
          // We found the final */, though it had an escaped newline between the
          // * and /.  We're done!
          break;
        }
      }
      if (CurPtr[0] == '*' && CurPtr[1] != '/') {
        // If this is a /* inside of the comment, emit a warning.  Don't do this
        // if this is a /*/, which will end the comment.  This misses cases with
        // embedded escaped newlines, but oh well.
        if (!isLexingRawMode())
          Diag(CurPtr-1, diag::warn_nested_block_comment);
      }
    } else if (C == 0 && CurPtr == BufferEnd+1) {
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::err_unterminated_block_comment);
      // Note: the user probably forgot a */.  We could continue immediately
      // after the /*, but this would involve lexing a lot of what really is the
      // comment, which surely would confuse the parser.
      --CurPtr;

      // KeepWhitespaceMode should return this broken comment as a token.  Since
      // it isn't a well formed comment, just return it as an 'unknown' token.
      if (isKeepWhitespaceMode()) {
        FormTokenWithChars(Result, CurPtr, tok::unknown);
        return true;
      }

      BufferPtr = CurPtr;
      return false;
    }
    C = *CurPtr++;
  }

  // Notify comment handlers about the comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode()) {
    FormTokenWithChars(Result, CurPtr, tok::comment);
    return true;
  }

  // It is common for the tokens immediately after a /**/ comment to be
  // whitespace.  Instead of going through the big switch, handle it
  // efficiently now.  This is safe even in KeepWhitespaceMode because we would
  // have already returned above with the comment as a token.
  if (isHorizontalWhitespace(*CurPtr)) {
    Result.setFlag(Token::LeadingSpace);
    SkipWhitespace(Result, CurPtr+1);
    return false;
  }

  // Otherwise, just return so that the next character will be lexed as a token.
  BufferPtr = CurPtr;
  Result.setFlag(Token::LeadingSpace);
  return false;
}

//===----------------------------------------------------------------------===//
// Primary Lexing Entry Points
//===----------------------------------------------------------------------===//

/// ReadToEndOfLine - Read the rest of the current preprocessor line as an
/// uninterpreted string.  This switches the lexer out of directive mode.
std::string Lexer::ReadToEndOfLine() {
  assert(ParsingPreprocessorDirective && ParsingFilename == false &&
         "Must be in a preprocessing directive!");
  std::string Result;
  Token Tmp;

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;
  while (1) {
    char Char = getAndAdvanceChar(CurPtr, Tmp);
    switch (Char) {
    default:
      Result += Char;
      break;
    case 0:  // Null.
      // Found end of file?
      if (CurPtr-1 != BufferEnd) {
        // Nope, normal character, continue.
        Result += Char;
        break;
      }
      // FALL THROUGH.
    case '\r':
    case '\n':
      // Okay, we found the end of the line. First, back up past the \0, \r, \n.
      assert(CurPtr[-1] == Char && "Trigraphs for newline?");
      BufferPtr = CurPtr-1;

      // Next, lex the character, which should handle the EOM transition.
      Lex(Tmp);
      assert(Tmp.is(tok::eom) && "Unexpected token!");

      // Finally, we're done, return the string we found.
      return Result;
    }
  }
}

/// LexEndOfFile - CurPtr points to the end of this file.  Handle this
/// condition, reporting diagnostics and handling other edge cases as required.
/// This returns true if Result contains a token, false if PP.Lex should be
/// called again.
///
/// Depending on the lexer state, the token produced is tok::eom (still inside
/// a preprocessor directive), tok::eof (raw mode) or tok::code_completion
/// (this file is the code-completion file); otherwise the result is whatever
/// Preprocessor::HandleEndOfFile returns.
bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
  // If we hit the end of the file while parsing a preprocessor directive,
  // end the preprocessor directive first.  The next token returned will
  // then be the end of file.
  if (ParsingPreprocessorDirective) {
    // Done parsing the "line".
    ParsingPreprocessorDirective = false;
    // Update the location of token as well as BufferPtr.
    FormTokenWithChars(Result, CurPtr, tok::eom);

    // Restore comment saving mode, in case it was disabled for directive.
    SetCommentRetentionState(PP->getCommentRetentionState());
    return true;  // Have a token.
  }

  // If we are in raw mode, return this event as an EOF token.  Let the caller
  // that put us in raw mode handle the event.
  if (isLexingRawMode()) {
    Result.startToken();
    BufferPtr = BufferEnd;
    FormTokenWithChars(Result, BufferEnd, tok::eof);
    return true;
  }

  // Otherwise, check if we are code-completing, then issue diagnostics for
  // unterminated #if and missing newline.

  if (PP && PP->isCodeCompletionFile(FileLoc)) {
    // We're at the end of the file, but we've been asked to consider the
    // end of the file to be a code-completion token.  Return the
    // code-completion token.
    Result.startToken();
    FormTokenWithChars(Result, CurPtr, tok::code_completion);

    // Only do the eof -> code_completion translation once.
    PP->SetCodeCompletionPoint(0, 0, 0);
    return true;
  }

  // If we are in a #if directive, emit an error.  One error is emitted for
  // each conditional that is still open, innermost first.
  while (!ConditionalStack.empty()) {
    PP->Diag(ConditionalStack.back().IfLoc,
             diag::err_pp_unterminated_conditional);
    ConditionalStack.pop_back();
  }

  // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
  // a pedwarn.
  if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r'))
    Diag(BufferEnd, diag::ext_no_newline_eof)
      << FixItHint::CreateInsertion(getSourceLocation(BufferEnd), "\n");

  BufferPtr = CurPtr;

  // Finally, let the preprocessor handle this.
  return PP->HandleEndOfFile(Result);
}

/// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
/// the specified lexer will return a tok::l_paren token, 0 if it is something
/// else and 2 if there are no more tokens in the buffer controlled by the
/// lexer.
unsigned Lexer::isNextPPTokenLParen() {
  assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");

  // Switch to 'skipping' mode.  This will ensure that we can lex a token
  // without emitting diagnostics, disables macro expansion, and will cause EOF
  // to return an EOF token instead of popping the include stack.
  LexingRawMode = true;

  // Save state that can be changed while lexing so that we can restore it.
1396 const char *TmpBufferPtr = BufferPtr; 1397 bool inPPDirectiveMode = ParsingPreprocessorDirective; 1398 1399 Token Tok; 1400 Tok.startToken(); 1401 LexTokenInternal(Tok); 1402 1403 // Restore state that may have changed. 1404 BufferPtr = TmpBufferPtr; 1405 ParsingPreprocessorDirective = inPPDirectiveMode; 1406 1407 // Restore the lexer back to non-skipping mode. 1408 LexingRawMode = false; 1409 1410 if (Tok.is(tok::eof)) 1411 return 2; 1412 return Tok.is(tok::l_paren); 1413} 1414 1415/// FindConflictEnd - Find the end of a version control conflict marker. 1416static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd) { 1417 llvm::StringRef RestOfBuffer(CurPtr+7, BufferEnd-CurPtr-7); 1418 size_t Pos = RestOfBuffer.find(">>>>>>>"); 1419 while (Pos != llvm::StringRef::npos) { 1420 // Must occur at start of line. 1421 if (RestOfBuffer[Pos-1] != '\r' && 1422 RestOfBuffer[Pos-1] != '\n') { 1423 RestOfBuffer = RestOfBuffer.substr(Pos+7); 1424 continue; 1425 } 1426 return RestOfBuffer.data()+Pos; 1427 } 1428 return 0; 1429} 1430 1431/// IsStartOfConflictMarker - If the specified pointer is the start of a version 1432/// control conflict marker like '<<<<<<<', recognize it as such, emit an error 1433/// and recover nicely. This returns true if it is a conflict marker and false 1434/// if not. 1435bool Lexer::IsStartOfConflictMarker(const char *CurPtr) { 1436 // Only a conflict marker if it starts at the beginning of a line. 1437 if (CurPtr != BufferStart && 1438 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 1439 return false; 1440 1441 // Check to see if we have <<<<<<<. 1442 if (BufferEnd-CurPtr < 8 || 1443 llvm::StringRef(CurPtr, 7) != "<<<<<<<") 1444 return false; 1445 1446 // If we have a situation where we don't care about conflict markers, ignore 1447 // it. 
1448 if (IsInConflictMarker || isLexingRawMode()) 1449 return false; 1450 1451 // Check to see if there is a >>>>>>> somewhere in the buffer at the start of 1452 // a line to terminate this conflict marker. 1453 if (FindConflictEnd(CurPtr+7, BufferEnd)) { 1454 // We found a match. We are really in a conflict marker. 1455 // Diagnose this, and ignore to the end of line. 1456 Diag(CurPtr, diag::err_conflict_marker); 1457 IsInConflictMarker = true; 1458 1459 // Skip ahead to the end of line. We know this exists because the 1460 // end-of-conflict marker starts with \r or \n. 1461 while (*CurPtr != '\r' && *CurPtr != '\n') { 1462 assert(CurPtr != BufferEnd && "Didn't find end of line"); 1463 ++CurPtr; 1464 } 1465 BufferPtr = CurPtr; 1466 return true; 1467 } 1468 1469 // No end of conflict marker found. 1470 return false; 1471} 1472 1473 1474/// HandleEndOfConflictMarker - If this is a '=======' or '|||||||' or '>>>>>>>' 1475/// marker, then it is the end of a conflict marker. Handle it by ignoring up 1476/// until the end of the line. This returns true if it is a conflict marker and 1477/// false if not. 1478bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) { 1479 // Only a conflict marker if it starts at the beginning of a line. 1480 if (CurPtr != BufferStart && 1481 CurPtr[-1] != '\n' && CurPtr[-1] != '\r') 1482 return false; 1483 1484 // If we have a situation where we don't care about conflict markers, ignore 1485 // it. 1486 if (!IsInConflictMarker || isLexingRawMode()) 1487 return false; 1488 1489 // Check to see if we have the marker (7 characters in a row). 1490 for (unsigned i = 1; i != 7; ++i) 1491 if (CurPtr[i] != CurPtr[0]) 1492 return false; 1493 1494 // If we do have it, search for the end of the conflict marker. This could 1495 // fail if it got skipped with a '#if 0' or something. Note that CurPtr might 1496 // be the end of conflict marker. 
1497 if (const char *End = FindConflictEnd(CurPtr, BufferEnd)) { 1498 CurPtr = End; 1499 1500 // Skip ahead to the end of line. 1501 while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n') 1502 ++CurPtr; 1503 1504 BufferPtr = CurPtr; 1505 1506 // No longer in the conflict marker. 1507 IsInConflictMarker = false; 1508 return true; 1509 } 1510 1511 return false; 1512} 1513 1514 1515/// LexTokenInternal - This implements a simple C family lexer. It is an 1516/// extremely performance critical piece of code. This assumes that the buffer 1517/// has a null character at the end of the file. This returns a preprocessing 1518/// token, not a normal token, as such, it is an internal interface. It assumes 1519/// that the Flags of result have been cleared before calling this. 1520void Lexer::LexTokenInternal(Token &Result) { 1521LexNextToken: 1522 // New token, can't need cleaning yet. 1523 Result.clearFlag(Token::NeedsCleaning); 1524 Result.setIdentifierInfo(0); 1525 1526 // CurPtr - Cache BufferPtr in an automatic variable. 1527 const char *CurPtr = BufferPtr; 1528 1529 // Small amounts of horizontal whitespace is very common between tokens. 1530 if ((*CurPtr == ' ') || (*CurPtr == '\t')) { 1531 ++CurPtr; 1532 while ((*CurPtr == ' ') || (*CurPtr == '\t')) 1533 ++CurPtr; 1534 1535 // If we are keeping whitespace and other tokens, just return what we just 1536 // skipped. The next lexer invocation will return the token after the 1537 // whitespace. 1538 if (isKeepWhitespaceMode()) { 1539 FormTokenWithChars(Result, CurPtr, tok::unknown); 1540 return; 1541 } 1542 1543 BufferPtr = CurPtr; 1544 Result.setFlag(Token::LeadingSpace); 1545 } 1546 1547 unsigned SizeTmp, SizeTmp2; // Temporaries for use in cases below. 1548 1549 // Read a character, advancing over it. 1550 char Char = getAndAdvanceChar(CurPtr, Result); 1551 tok::TokenKind Kind; 1552 1553 switch (Char) { 1554 case 0: // Null. 1555 // Found end of file? 
1556 if (CurPtr-1 == BufferEnd) { 1557 // Read the PP instance variable into an automatic variable, because 1558 // LexEndOfFile will often delete 'this'. 1559 Preprocessor *PPCache = PP; 1560 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 1561 return; // Got a token to return. 1562 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 1563 return PPCache->Lex(Result); 1564 } 1565 1566 if (!isLexingRawMode()) 1567 Diag(CurPtr-1, diag::null_in_file); 1568 Result.setFlag(Token::LeadingSpace); 1569 if (SkipWhitespace(Result, CurPtr)) 1570 return; // KeepWhitespaceMode 1571 1572 goto LexNextToken; // GCC isn't tail call eliminating. 1573 1574 case 26: // DOS & CP/M EOF: "^Z". 1575 // If we're in Microsoft extensions mode, treat this as end of file. 1576 if (Features.Microsoft) { 1577 // Read the PP instance variable into an automatic variable, because 1578 // LexEndOfFile will often delete 'this'. 1579 Preprocessor *PPCache = PP; 1580 if (LexEndOfFile(Result, CurPtr-1)) // Retreat back into the file. 1581 return; // Got a token to return. 1582 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 1583 return PPCache->Lex(Result); 1584 } 1585 // If Microsoft extensions are disabled, this is just random garbage. 1586 Kind = tok::unknown; 1587 break; 1588 1589 case '\n': 1590 case '\r': 1591 // If we are inside a preprocessor directive and we see the end of line, 1592 // we know we are done with the directive, so return an EOM token. 1593 if (ParsingPreprocessorDirective) { 1594 // Done parsing the "line". 1595 ParsingPreprocessorDirective = false; 1596 1597 // Restore comment saving mode, in case it was disabled for directive. 1598 SetCommentRetentionState(PP->getCommentRetentionState()); 1599 1600 // Since we consumed a newline, we are back at the start of a line. 1601 IsAtStartOfLine = true; 1602 1603 Kind = tok::eom; 1604 break; 1605 } 1606 // The returned token is at the start of the line. 
1607 Result.setFlag(Token::StartOfLine); 1608 // No leading whitespace seen so far. 1609 Result.clearFlag(Token::LeadingSpace); 1610 1611 if (SkipWhitespace(Result, CurPtr)) 1612 return; // KeepWhitespaceMode 1613 goto LexNextToken; // GCC isn't tail call eliminating. 1614 case ' ': 1615 case '\t': 1616 case '\f': 1617 case '\v': 1618 SkipHorizontalWhitespace: 1619 Result.setFlag(Token::LeadingSpace); 1620 if (SkipWhitespace(Result, CurPtr)) 1621 return; // KeepWhitespaceMode 1622 1623 SkipIgnoredUnits: 1624 CurPtr = BufferPtr; 1625 1626 // If the next token is obviously a // or /* */ comment, skip it efficiently 1627 // too (without going through the big switch stmt). 1628 if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() && 1629 Features.BCPLComment) { 1630 if (SkipBCPLComment(Result, CurPtr+2)) 1631 return; // There is a token to return. 1632 goto SkipIgnoredUnits; 1633 } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) { 1634 if (SkipBlockComment(Result, CurPtr+2)) 1635 return; // There is a token to return. 1636 goto SkipIgnoredUnits; 1637 } else if (isHorizontalWhitespace(*CurPtr)) { 1638 goto SkipHorizontalWhitespace; 1639 } 1640 goto LexNextToken; // GCC isn't tail call eliminating. 1641 1642 // C99 6.4.4.1: Integer Constants. 1643 // C99 6.4.4.2: Floating Constants. 1644 case '0': case '1': case '2': case '3': case '4': 1645 case '5': case '6': case '7': case '8': case '9': 1646 // Notify MIOpt that we read a non-whitespace/non-comment token. 1647 MIOpt.ReadToken(); 1648 return LexNumericConstant(Result, CurPtr); 1649 1650 case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz"). 1651 // Notify MIOpt that we read a non-whitespace/non-comment token. 1652 MIOpt.ReadToken(); 1653 Char = getCharAndSize(CurPtr, SizeTmp); 1654 1655 // Wide string literal. 1656 if (Char == '"') 1657 return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result), 1658 true); 1659 1660 // Wide character constant. 
1661 if (Char == '\'') 1662 return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 1663 // FALL THROUGH, treating L like the start of an identifier. 1664 1665 // C99 6.4.2: Identifiers. 1666 case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G': 1667 case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N': 1668 case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U': 1669 case 'V': case 'W': case 'X': case 'Y': case 'Z': 1670 case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g': 1671 case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n': 1672 case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u': 1673 case 'v': case 'w': case 'x': case 'y': case 'z': 1674 case '_': 1675 // Notify MIOpt that we read a non-whitespace/non-comment token. 1676 MIOpt.ReadToken(); 1677 return LexIdentifier(Result, CurPtr); 1678 1679 case '$': // $ in identifiers. 1680 if (Features.DollarIdents) { 1681 if (!isLexingRawMode()) 1682 Diag(CurPtr-1, diag::ext_dollar_in_identifier); 1683 // Notify MIOpt that we read a non-whitespace/non-comment token. 1684 MIOpt.ReadToken(); 1685 return LexIdentifier(Result, CurPtr); 1686 } 1687 1688 Kind = tok::unknown; 1689 break; 1690 1691 // C99 6.4.4: Character Constants. 1692 case '\'': 1693 // Notify MIOpt that we read a non-whitespace/non-comment token. 1694 MIOpt.ReadToken(); 1695 return LexCharConstant(Result, CurPtr); 1696 1697 // C99 6.4.5: String Literals. 1698 case '"': 1699 // Notify MIOpt that we read a non-whitespace/non-comment token. 1700 MIOpt.ReadToken(); 1701 return LexStringLiteral(Result, CurPtr, false); 1702 1703 // C99 6.4.6: Punctuators. 
1704 case '?': 1705 Kind = tok::question; 1706 break; 1707 case '[': 1708 Kind = tok::l_square; 1709 break; 1710 case ']': 1711 Kind = tok::r_square; 1712 break; 1713 case '(': 1714 Kind = tok::l_paren; 1715 break; 1716 case ')': 1717 Kind = tok::r_paren; 1718 break; 1719 case '{': 1720 Kind = tok::l_brace; 1721 break; 1722 case '}': 1723 Kind = tok::r_brace; 1724 break; 1725 case '.': 1726 Char = getCharAndSize(CurPtr, SizeTmp); 1727 if (Char >= '0' && Char <= '9') { 1728 // Notify MIOpt that we read a non-whitespace/non-comment token. 1729 MIOpt.ReadToken(); 1730 1731 return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result)); 1732 } else if (Features.CPlusPlus && Char == '*') { 1733 Kind = tok::periodstar; 1734 CurPtr += SizeTmp; 1735 } else if (Char == '.' && 1736 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') { 1737 Kind = tok::ellipsis; 1738 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1739 SizeTmp2, Result); 1740 } else { 1741 Kind = tok::period; 1742 } 1743 break; 1744 case '&': 1745 Char = getCharAndSize(CurPtr, SizeTmp); 1746 if (Char == '&') { 1747 Kind = tok::ampamp; 1748 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1749 } else if (Char == '=') { 1750 Kind = tok::ampequal; 1751 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1752 } else { 1753 Kind = tok::amp; 1754 } 1755 break; 1756 case '*': 1757 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 1758 Kind = tok::starequal; 1759 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1760 } else { 1761 Kind = tok::star; 1762 } 1763 break; 1764 case '+': 1765 Char = getCharAndSize(CurPtr, SizeTmp); 1766 if (Char == '+') { 1767 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1768 Kind = tok::plusplus; 1769 } else if (Char == '=') { 1770 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1771 Kind = tok::plusequal; 1772 } else { 1773 Kind = tok::plus; 1774 } 1775 break; 1776 case '-': 1777 Char = getCharAndSize(CurPtr, SizeTmp); 1778 if (Char == '-') { // -- 1779 CurPtr = ConsumeChar(CurPtr, 
SizeTmp, Result); 1780 Kind = tok::minusminus; 1781 } else if (Char == '>' && Features.CPlusPlus && 1782 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') { // C++ ->* 1783 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1784 SizeTmp2, Result); 1785 Kind = tok::arrowstar; 1786 } else if (Char == '>') { // -> 1787 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1788 Kind = tok::arrow; 1789 } else if (Char == '=') { // -= 1790 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1791 Kind = tok::minusequal; 1792 } else { 1793 Kind = tok::minus; 1794 } 1795 break; 1796 case '~': 1797 Kind = tok::tilde; 1798 break; 1799 case '!': 1800 if (getCharAndSize(CurPtr, SizeTmp) == '=') { 1801 Kind = tok::exclaimequal; 1802 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1803 } else { 1804 Kind = tok::exclaim; 1805 } 1806 break; 1807 case '/': 1808 // 6.4.9: Comments 1809 Char = getCharAndSize(CurPtr, SizeTmp); 1810 if (Char == '/') { // BCPL comment. 1811 // Even if BCPL comments are disabled (e.g. in C89 mode), we generally 1812 // want to lex this as a comment. There is one problem with this though, 1813 // that in one particular corner case, this can change the behavior of the 1814 // resultant program. For example, In "foo //**/ bar", C89 would lex 1815 // this as "foo / bar" and langauges with BCPL comments would lex it as 1816 // "foo". Check to see if the character after the second slash is a '*'. 1817 // If so, we will lex that as a "/" instead of the start of a comment. 1818 if (Features.BCPLComment || 1819 getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*') { 1820 if (SkipBCPLComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 1821 return; // There is a token to return. 1822 1823 // It is common for the tokens immediately after a // comment to be 1824 // whitespace (indentation for the next line). Instead of going through 1825 // the big switch, handle it efficiently now. 1826 goto SkipIgnoredUnits; 1827 } 1828 } 1829 1830 if (Char == '*') { // /**/ comment. 
1831 if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result))) 1832 return; // There is a token to return. 1833 goto LexNextToken; // GCC isn't tail call eliminating. 1834 } 1835 1836 if (Char == '=') { 1837 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1838 Kind = tok::slashequal; 1839 } else { 1840 Kind = tok::slash; 1841 } 1842 break; 1843 case '%': 1844 Char = getCharAndSize(CurPtr, SizeTmp); 1845 if (Char == '=') { 1846 Kind = tok::percentequal; 1847 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1848 } else if (Features.Digraphs && Char == '>') { 1849 Kind = tok::r_brace; // '%>' -> '}' 1850 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1851 } else if (Features.Digraphs && Char == ':') { 1852 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1853 Char = getCharAndSize(CurPtr, SizeTmp); 1854 if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') { 1855 Kind = tok::hashhash; // '%:%:' -> '##' 1856 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1857 SizeTmp2, Result); 1858 } else if (Char == '@' && Features.Microsoft) { // %:@ -> #@ -> Charize 1859 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1860 if (!isLexingRawMode()) 1861 Diag(BufferPtr, diag::charize_microsoft_ext); 1862 Kind = tok::hashat; 1863 } else { // '%:' -> '#' 1864 // We parsed a # character. If this occurs at the start of the line, 1865 // it's actually the start of a preprocessing directive. Callback to 1866 // the preprocessor to handle it. 1867 // FIXME: -fpreprocessed mode?? 1868 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 1869 FormTokenWithChars(Result, CurPtr, tok::hash); 1870 PP->HandleDirective(Result); 1871 1872 // As an optimization, if the preprocessor didn't switch lexers, tail 1873 // recurse. 1874 if (PP->isCurrentLexer(this)) { 1875 // Start a new token. If this is a #include or something, the PP may 1876 // want us starting at the beginning of the line again. If so, set 1877 // the StartOfLine flag and clear LeadingSpace. 
1878 if (IsAtStartOfLine) { 1879 Result.setFlag(Token::StartOfLine); 1880 Result.clearFlag(Token::LeadingSpace); 1881 IsAtStartOfLine = false; 1882 } 1883 goto LexNextToken; // GCC isn't tail call eliminating. 1884 } 1885 1886 return PP->Lex(Result); 1887 } 1888 1889 Kind = tok::hash; 1890 } 1891 } else { 1892 Kind = tok::percent; 1893 } 1894 break; 1895 case '<': 1896 Char = getCharAndSize(CurPtr, SizeTmp); 1897 if (ParsingFilename) { 1898 return LexAngledStringLiteral(Result, CurPtr); 1899 } else if (Char == '<') { 1900 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 1901 if (After == '=') { 1902 Kind = tok::lesslessequal; 1903 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1904 SizeTmp2, Result); 1905 } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) { 1906 // If this is actually a '<<<<<<<' version control conflict marker, 1907 // recognize it as such and recover nicely. 1908 goto LexNextToken; 1909 } else { 1910 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1911 Kind = tok::lessless; 1912 } 1913 } else if (Char == '=') { 1914 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1915 Kind = tok::lessequal; 1916 } else if (Features.Digraphs && Char == ':') { // '<:' -> '[' 1917 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1918 Kind = tok::l_square; 1919 } else if (Features.Digraphs && Char == '%') { // '<%' -> '{' 1920 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1921 Kind = tok::l_brace; 1922 } else { 1923 Kind = tok::less; 1924 } 1925 break; 1926 case '>': 1927 Char = getCharAndSize(CurPtr, SizeTmp); 1928 if (Char == '=') { 1929 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1930 Kind = tok::greaterequal; 1931 } else if (Char == '>') { 1932 char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2); 1933 if (After == '=') { 1934 CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result), 1935 SizeTmp2, Result); 1936 Kind = tok::greatergreaterequal; 1937 } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) { 1938 // If this is 
'>>>>>>>' and we're in a conflict marker, ignore it. 1939 goto LexNextToken; 1940 } else { 1941 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1942 Kind = tok::greatergreater; 1943 } 1944 1945 } else { 1946 Kind = tok::greater; 1947 } 1948 break; 1949 case '^': 1950 Char = getCharAndSize(CurPtr, SizeTmp); 1951 if (Char == '=') { 1952 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1953 Kind = tok::caretequal; 1954 } else { 1955 Kind = tok::caret; 1956 } 1957 break; 1958 case '|': 1959 Char = getCharAndSize(CurPtr, SizeTmp); 1960 if (Char == '=') { 1961 Kind = tok::pipeequal; 1962 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1963 } else if (Char == '|') { 1964 // If this is '|||||||' and we're in a conflict marker, ignore it. 1965 if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1)) 1966 goto LexNextToken; 1967 Kind = tok::pipepipe; 1968 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1969 } else { 1970 Kind = tok::pipe; 1971 } 1972 break; 1973 case ':': 1974 Char = getCharAndSize(CurPtr, SizeTmp); 1975 if (Features.Digraphs && Char == '>') { 1976 Kind = tok::r_square; // ':>' -> ']' 1977 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1978 } else if (Features.CPlusPlus && Char == ':') { 1979 Kind = tok::coloncolon; 1980 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1981 } else { 1982 Kind = tok::colon; 1983 } 1984 break; 1985 case ';': 1986 Kind = tok::semi; 1987 break; 1988 case '=': 1989 Char = getCharAndSize(CurPtr, SizeTmp); 1990 if (Char == '=') { 1991 // If this is '=======' and we're in a conflict marker, ignore it. 
1992 if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1)) 1993 goto LexNextToken; 1994 1995 Kind = tok::equalequal; 1996 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 1997 } else { 1998 Kind = tok::equal; 1999 } 2000 break; 2001 case ',': 2002 Kind = tok::comma; 2003 break; 2004 case '#': 2005 Char = getCharAndSize(CurPtr, SizeTmp); 2006 if (Char == '#') { 2007 Kind = tok::hashhash; 2008 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2009 } else if (Char == '@' && Features.Microsoft) { // #@ -> Charize 2010 Kind = tok::hashat; 2011 if (!isLexingRawMode()) 2012 Diag(BufferPtr, diag::charize_microsoft_ext); 2013 CurPtr = ConsumeChar(CurPtr, SizeTmp, Result); 2014 } else { 2015 // We parsed a # character. If this occurs at the start of the line, 2016 // it's actually the start of a preprocessing directive. Callback to 2017 // the preprocessor to handle it. 2018 // FIXME: -fpreprocessed mode?? 2019 if (Result.isAtStartOfLine() && !LexingRawMode && !Is_PragmaLexer) { 2020 FormTokenWithChars(Result, CurPtr, tok::hash); 2021 PP->HandleDirective(Result); 2022 2023 // As an optimization, if the preprocessor didn't switch lexers, tail 2024 // recurse. 2025 if (PP->isCurrentLexer(this)) { 2026 // Start a new token. If this is a #include or something, the PP may 2027 // want us starting at the beginning of the line again. If so, set 2028 // the StartOfLine flag and clear LeadingSpace. 2029 if (IsAtStartOfLine) { 2030 Result.setFlag(Token::StartOfLine); 2031 Result.clearFlag(Token::LeadingSpace); 2032 IsAtStartOfLine = false; 2033 } 2034 goto LexNextToken; // GCC isn't tail call eliminating. 2035 } 2036 return PP->Lex(Result); 2037 } 2038 2039 Kind = tok::hash; 2040 } 2041 break; 2042 2043 case '@': 2044 // Objective C support. 2045 if (CurPtr[-1] == '@' && Features.ObjC1) 2046 Kind = tok::at; 2047 else 2048 Kind = tok::unknown; 2049 break; 2050 2051 case '\\': 2052 // FIXME: UCN's. 2053 // FALL THROUGH. 
2054 default: 2055 Kind = tok::unknown; 2056 break; 2057 } 2058 2059 // Notify MIOpt that we read a non-whitespace/non-comment token. 2060 MIOpt.ReadToken(); 2061 2062 // Update the location of token as well as BufferPtr. 2063 FormTokenWithChars(Result, CurPtr, Kind); 2064} 2065