PTHLexer.cpp revision 8a6aec620dbec1f292fe4116c0373ac81ab90234
1//===--- PTHLexer.cpp - Lex from a token stream ---------------------------===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file is distributed under the University of Illinois Open Source 6// License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// This file implements the PTHLexer interface. 11// 12//===----------------------------------------------------------------------===// 13 14#include "clang/Basic/TokenKinds.h" 15#include "clang/Basic/FileManager.h" 16#include "clang/Basic/IdentifierTable.h" 17#include "clang/Lex/PTHLexer.h" 18#include "clang/Lex/Preprocessor.h" 19#include "clang/Lex/PTHManager.h" 20#include "clang/Lex/Token.h" 21#include "clang/Lex/Preprocessor.h" 22#include "llvm/ADT/StringMap.h" 23#include "llvm/ADT/OwningPtr.h" 24#include "llvm/Support/Compiler.h" 25#include "llvm/Support/MathExtras.h" 26#include "llvm/Support/MemoryBuffer.h" 27#include "llvm/System/Host.h" 28using namespace clang; 29 30#define DISK_TOKEN_SIZE (1+1+2+4+4) 31 32//===----------------------------------------------------------------------===// 33// Utility methods for reading from the mmap'ed PTH file. 34//===----------------------------------------------------------------------===// 35 36static inline uint16_t ReadUnalignedLE16(const unsigned char *&Data) { 37 uint16_t V = ((uint16_t)Data[0] << 0) | 38 ((uint16_t)Data[1] << 8); 39 Data += 2; 40 return V; 41} 42 43static inline uint32_t ReadLE32(const unsigned char *&Data) { 44 // Hosts that directly support little-endian 32-bit loads can just 45 // use them. Big-endian hosts need a bswap. 46 uint32_t V = *((uint32_t*)Data); 47 if (llvm::sys::isBigEndianHost()) 48 V = llvm::ByteSwap_32(V); 49 Data += 4; 50 return V; 51} 52 53 54//===----------------------------------------------------------------------===// 55// PTHLexer methods. 56//===----------------------------------------------------------------------===// 57 58PTHLexer::PTHLexer(Preprocessor &PP, FileID FID, const unsigned char *D, 59 const unsigned char *ppcond, PTHManager &PM) 60 : PreprocessorLexer(&PP, FID), TokBuf(D), CurPtr(D), LastHashTokPtr(0), 61 PPCond(ppcond), CurPPCondPtr(ppcond), PTHMgr(PM) { 62 63 FileStartLoc = PP.getSourceManager().getLocForStartOfFile(FID); 64} 65 66void PTHLexer::Lex(Token& Tok) { 67LexNextToken: 68 69 //===--------------------------------------==// 70 // Read the raw token data. 71 //===--------------------------------------==// 72 73 // Shadow CurPtr into an automatic variable. 74 const unsigned char *CurPtrShadow = CurPtr; 75 76 // Read in the data for the token. 77 unsigned Word0 = ReadLE32(CurPtrShadow); 78 uint32_t IdentifierID = ReadLE32(CurPtrShadow); 79 uint32_t FileOffset = ReadLE32(CurPtrShadow); 80 81 tok::TokenKind TKind = (tok::TokenKind) (Word0 & 0xFF); 82 Token::TokenFlags TFlags = (Token::TokenFlags) ((Word0 >> 8) & 0xFF); 83 uint32_t Len = Word0 >> 16; 84 85 CurPtr = CurPtrShadow; 86 87 //===--------------------------------------==// 88 // Construct the token itself. 89 //===--------------------------------------==// 90 91 Tok.startToken(); 92 Tok.setKind(TKind); 93 Tok.setFlag(TFlags); 94 assert(!LexingRawMode); 95 Tok.setLocation(FileStartLoc.getFileLocWithOffset(FileOffset)); 96 Tok.setLength(Len); 97 98 // Handle identifiers. 99 if (Tok.isLiteral()) { 100 Tok.setLiteralData((const char*) (PTHMgr.SpellingBase + IdentifierID)); 101 } 102 else if (IdentifierID) { 103 MIOpt.ReadToken(); 104 IdentifierInfo *II = PTHMgr.GetIdentifierInfo(IdentifierID-1); 105 106 Tok.setIdentifierInfo(II); 107 108 // Change the kind of this identifier to the appropriate token kind, e.g. 109 // turning "for" into a keyword. 110 Tok.setKind(II->getTokenID()); 111 112 if (II->isHandleIdentifierCase()) 113 PP->HandleIdentifier(Tok); 114 return; 115 } 116 117 //===--------------------------------------==// 118 // Process the token. 119 //===--------------------------------------==// 120#if 0 121 SourceManager& SM = PP->getSourceManager(); 122 llvm::cerr << SM.getFileEntryForID(FileID)->getName() 123 << ':' << SM.getLogicalLineNumber(Tok.getLocation()) 124 << ':' << SM.getLogicalColumnNumber(Tok.getLocation()) 125 << '\n'; 126#endif 127 128 if (TKind == tok::eof) { 129 // Save the end-of-file token. 130 EofToken = Tok; 131 132 Preprocessor *PPCache = PP; 133 134 assert(!ParsingPreprocessorDirective); 135 assert(!LexingRawMode); 136 137 // FIXME: Issue diagnostics similar to Lexer. 138 if (PP->HandleEndOfFile(Tok, false)) 139 return; 140 141 assert(PPCache && "Raw buffer::LexEndOfFile should return a token"); 142 return PPCache->Lex(Tok); 143 } 144 145 if (TKind == tok::hash && Tok.isAtStartOfLine()) { 146 LastHashTokPtr = CurPtr - DISK_TOKEN_SIZE; 147 assert(!LexingRawMode); 148 PP->HandleDirective(Tok); 149 150 if (PP->isCurrentLexer(this)) 151 goto LexNextToken; 152 153 return PP->Lex(Tok); 154 } 155 156 if (TKind == tok::eom) { 157 assert(ParsingPreprocessorDirective); 158 ParsingPreprocessorDirective = false; 159 return; 160 } 161 162 MIOpt.ReadToken(); 163} 164 165// FIXME: We can just grab the last token instead of storing a copy 166// into EofToken. 167void PTHLexer::getEOF(Token& Tok) { 168 assert(EofToken.is(tok::eof)); 169 Tok = EofToken; 170} 171 172void PTHLexer::DiscardToEndOfLine() { 173 assert(ParsingPreprocessorDirective && ParsingFilename == false && 174 "Must be in a preprocessing directive!"); 175 176 // We assume that if the preprocessor wishes to discard to the end of 177 // the line that it also means to end the current preprocessor directive. 178 ParsingPreprocessorDirective = false; 179 180 // Skip tokens by only peeking at their token kind and the flags. 181 // We don't need to actually reconstruct full tokens from the token buffer. 182 // This saves some copies and it also reduces IdentifierInfo* lookup. 183 const unsigned char* p = CurPtr; 184 while (1) { 185 // Read the token kind. Are we at the end of the file? 186 tok::TokenKind x = (tok::TokenKind) (uint8_t) *p; 187 if (x == tok::eof) break; 188 189 // Read the token flags. Are we at the start of the next line? 190 Token::TokenFlags y = (Token::TokenFlags) (uint8_t) p[1]; 191 if (y & Token::StartOfLine) break; 192 193 // Skip to the next token. 194 p += DISK_TOKEN_SIZE; 195 } 196 197 CurPtr = p; 198} 199 200/// SkipBlock - Used by Preprocessor to skip the current conditional block. 201bool PTHLexer::SkipBlock() { 202 assert(CurPPCondPtr && "No cached PP conditional information."); 203 assert(LastHashTokPtr && "No known '#' token."); 204 205 const unsigned char* HashEntryI = 0; 206 uint32_t Offset; 207 uint32_t TableIdx; 208 209 do { 210 // Read the token offset from the side-table. 211 Offset = ReadLE32(CurPPCondPtr); 212 213 // Read the target table index from the side-table. 214 TableIdx = ReadLE32(CurPPCondPtr); 215 216 // Compute the actual memory address of the '#' token data for this entry. 217 HashEntryI = TokBuf + Offset; 218 219 // Optmization: "Sibling jumping". #if...#else...#endif blocks can 220 // contain nested blocks. In the side-table we can jump over these 221 // nested blocks instead of doing a linear search if the next "sibling" 222 // entry is not at a location greater than LastHashTokPtr. 223 if (HashEntryI < LastHashTokPtr && TableIdx) { 224 // In the side-table we are still at an entry for a '#' token that 225 // is earlier than the last one we saw. Check if the location we would 226 // stride gets us closer. 227 const unsigned char* NextPPCondPtr = 228 PPCond + TableIdx*(sizeof(uint32_t)*2); 229 assert(NextPPCondPtr >= CurPPCondPtr); 230 // Read where we should jump to. 231 uint32_t TmpOffset = ReadLE32(NextPPCondPtr); 232 const unsigned char* HashEntryJ = TokBuf + TmpOffset; 233 234 if (HashEntryJ <= LastHashTokPtr) { 235 // Jump directly to the next entry in the side table. 236 HashEntryI = HashEntryJ; 237 Offset = TmpOffset; 238 TableIdx = ReadLE32(NextPPCondPtr); 239 CurPPCondPtr = NextPPCondPtr; 240 } 241 } 242 } 243 while (HashEntryI < LastHashTokPtr); 244 assert(HashEntryI == LastHashTokPtr && "No PP-cond entry found for '#'"); 245 assert(TableIdx && "No jumping from #endifs."); 246 247 // Update our side-table iterator. 248 const unsigned char* NextPPCondPtr = PPCond + TableIdx*(sizeof(uint32_t)*2); 249 assert(NextPPCondPtr >= CurPPCondPtr); 250 CurPPCondPtr = NextPPCondPtr; 251 252 // Read where we should jump to. 253 HashEntryI = TokBuf + ReadLE32(NextPPCondPtr); 254 uint32_t NextIdx = ReadLE32(NextPPCondPtr); 255 256 // By construction NextIdx will be zero if this is a #endif. This is useful 257 // to know to obviate lexing another token. 258 bool isEndif = NextIdx == 0; 259 260 // This case can occur when we see something like this: 261 // 262 // #if ... 263 // /* a comment or nothing */ 264 // #elif 265 // 266 // If we are skipping the first #if block it will be the case that CurPtr 267 // already points 'elif'. Just return. 268 269 if (CurPtr > HashEntryI) { 270 assert(CurPtr == HashEntryI + DISK_TOKEN_SIZE); 271 // Did we reach a #endif? If so, go ahead and consume that token as well. 272 if (isEndif) 273 CurPtr += DISK_TOKEN_SIZE*2; 274 else 275 LastHashTokPtr = HashEntryI; 276 277 return isEndif; 278 } 279 280 // Otherwise, we need to advance. Update CurPtr to point to the '#' token. 281 CurPtr = HashEntryI; 282 283 // Update the location of the last observed '#'. This is useful if we 284 // are skipping multiple blocks. 285 LastHashTokPtr = CurPtr; 286 287 // Skip the '#' token. 288 assert(((tok::TokenKind)*CurPtr) == tok::hash); 289 CurPtr += DISK_TOKEN_SIZE; 290 291 // Did we reach a #endif? If so, go ahead and consume that token as well. 292 if (isEndif) { CurPtr += DISK_TOKEN_SIZE*2; } 293 294 return isEndif; 295} 296 297SourceLocation PTHLexer::getSourceLocation() { 298 // getSourceLocation is not on the hot path. It is used to get the location 299 // of the next token when transitioning back to this lexer when done 300 // handling a #included file. Just read the necessary data from the token 301 // data buffer to construct the SourceLocation object. 302 // NOTE: This is a virtual function; hence it is defined out-of-line. 303 const unsigned char *OffsetPtr = CurPtr + (DISK_TOKEN_SIZE - 4); 304 uint32_t Offset = ReadLE32(OffsetPtr); 305 return FileStartLoc.getFileLocWithOffset(Offset); 306} 307 308//===----------------------------------------------------------------------===// 309// Internal Data Structures for PTH file lookup and resolving identifiers. 310//===----------------------------------------------------------------------===// 311 312 313/// PTHFileLookup - This internal data structure is used by the PTHManager 314/// to map from FileEntry objects managed by FileManager to offsets within 315/// the PTH file. 316namespace { 317class VISIBILITY_HIDDEN PTHFileLookup { 318public: 319 class Val { 320 uint32_t TokenOff; 321 uint32_t PPCondOff; 322 public: 323 Val() : TokenOff(~0) {} 324 Val(uint32_t toff, uint32_t poff) 325 : TokenOff(toff), PPCondOff(poff) {} 326 327 bool isValid() const { return TokenOff != ~((uint32_t)0); } 328 329 uint32_t getTokenOffset() const { 330 assert(isValid() && "PTHFileLookup entry initialized."); 331 return TokenOff; 332 } 333 334 uint32_t getPPCondOffset() const { 335 assert(isValid() && "PTHFileLookup entry initialized."); 336 return PPCondOff; 337 } 338 }; 339 340private: 341 llvm::StringMap<Val> FileMap; 342 343public: 344 PTHFileLookup() {}; 345 346 bool isEmpty() const { 347 return FileMap.empty(); 348 } 349 350 Val Lookup(const FileEntry* FE) { 351 const char* s = FE->getName(); 352 unsigned size = strlen(s); 353 return FileMap.GetOrCreateValue(s, s+size).getValue(); 354 } 355 356 void ReadTable(const unsigned char* D) { 357 uint32_t N = ReadLE32(D); // Read the length of the table. 358 359 for ( ; N > 0; --N) { // The rest of the data is the table itself. 360 uint32_t Len = ReadLE32(D); 361 const char* s = (const char *)D; 362 D += Len; 363 364 uint32_t TokenOff = ReadLE32(D); 365 uint32_t PPCondOff = ReadLE32(D); 366 367 FileMap.GetOrCreateValue(s, s+Len).getValue() = 368 Val(TokenOff, PPCondOff); 369 } 370 } 371}; 372} // end anonymous namespace 373 374//===----------------------------------------------------------------------===// 375// PTHManager methods. 376//===----------------------------------------------------------------------===// 377 378PTHManager::PTHManager(const llvm::MemoryBuffer* buf, void* fileLookup, 379 const unsigned char* idDataTable, 380 IdentifierInfo** perIDCache, 381 const unsigned char* sortedIdTable, unsigned numIds, 382 const unsigned char* spellingBase) 383: Buf(buf), PerIDCache(perIDCache), FileLookup(fileLookup), 384 IdDataTable(idDataTable), SortedIdTable(sortedIdTable), 385 NumIds(numIds), PP(0), SpellingBase(spellingBase) {} 386 387PTHManager::~PTHManager() { 388 delete Buf; 389 delete (PTHFileLookup*) FileLookup; 390 free(PerIDCache); 391} 392 393PTHManager* PTHManager::Create(const std::string& file, Diagnostic* Diags) { 394 // Memory map the PTH file. 395 llvm::OwningPtr<llvm::MemoryBuffer> 396 File(llvm::MemoryBuffer::getFile(file.c_str())); 397 398 if (!File) { 399 if (Diags) { 400 unsigned DiagID = Diags->getCustomDiagID(Diagnostic::Note, 401 "PTH file %0 could not be read"); 402 Diags->Report(FullSourceLoc(), DiagID) << file; 403 } 404 405 return 0; 406 } 407 408 // Get the buffer ranges and check if there are at least three 32-bit 409 // words at the end of the file. 410 const unsigned char* BufBeg = (unsigned char*)File->getBufferStart(); 411 const unsigned char* BufEnd = (unsigned char*)File->getBufferEnd(); 412 413 // Check the prologue of the file. 414 if ((BufEnd - BufBeg) < (signed) (sizeof("cfe-pth") + 3 + 4) || 415 memcmp(BufBeg, "cfe-pth", sizeof("cfe-pth") - 1) != 0) 416 return 0; 417 418 // Read the PTH version. 419 const unsigned char *p = BufBeg + (sizeof("cfe-pth") - 1); 420 unsigned Version = ReadLE32(p); 421 422 if (Version != PTHManager::Version) 423 return 0; 424 425 // Compute the address of the index table at the end of the PTH file. 426 const unsigned char *EndTable = BufBeg + ReadLE32(p); 427 428 if (EndTable >= BufEnd) 429 return 0; 430 431 // Construct the file lookup table. This will be used for mapping from 432 // FileEntry*'s to cached tokens. 433 const unsigned char* FileTableOffset = EndTable + sizeof(uint32_t)*3; 434 const unsigned char* FileTable = BufBeg + ReadLE32(FileTableOffset); 435 436 if (!(FileTable > BufBeg && FileTable < BufEnd)) { 437 assert(false && "Invalid PTH file."); 438 return 0; // FIXME: Proper error diagnostic? 439 } 440 441 llvm::OwningPtr<PTHFileLookup> FL(new PTHFileLookup()); 442 FL->ReadTable(FileTable); 443 444 if (FL->isEmpty()) 445 return 0; 446 447 // Get the location of the table mapping from persistent ids to the 448 // data needed to reconstruct identifiers. 449 const unsigned char* IDTableOffset = EndTable + sizeof(uint32_t)*1; 450 const unsigned char* IData = BufBeg + ReadLE32(IDTableOffset); 451 452 if (!(IData >= BufBeg && IData < BufEnd)) { 453 assert(false && "Invalid PTH file."); 454 return 0; // FIXME: Proper error diagnostic? 455 } 456 457 // Get the location of the lexigraphically-sorted table of persistent IDs. 458 const unsigned char* SortedIdTableOffset = EndTable + sizeof(uint32_t)*2; 459 const unsigned char* SortedIdTable = BufBeg + ReadLE32(SortedIdTableOffset); 460 if (!(SortedIdTable >= BufBeg && SortedIdTable < BufEnd)) { 461 assert(false && "Invalid PTH file."); 462 return 0; // FIXME: Proper error diagnostic? 463 } 464 465 // Get the location of the spelling cache. 466 const unsigned char* spellingBaseOffset = EndTable + sizeof(uint32_t)*4; 467 const unsigned char* spellingBase = BufBeg + ReadLE32(spellingBaseOffset); 468 if (!(spellingBase >= BufBeg && spellingBase < BufEnd)) { 469 assert(false && "Invalid PTH file."); 470 return 0; 471 } 472 473 // Get the number of IdentifierInfos and pre-allocate the identifier cache. 474 uint32_t NumIds = ReadLE32(IData); 475 476 // Pre-allocate the peristent ID -> IdentifierInfo* cache. We use calloc() 477 // so that we in the best case only zero out memory once when the OS returns 478 // us new pages. 479 IdentifierInfo** PerIDCache = 0; 480 481 if (NumIds) { 482 PerIDCache = (IdentifierInfo**)calloc(NumIds, sizeof(*PerIDCache)); 483 if (!PerIDCache) { 484 assert(false && "Could not allocate Persistent ID cache."); 485 return 0; 486 } 487 } 488 489 // Create the new PTHManager. 490 return new PTHManager(File.take(), FL.take(), IData, PerIDCache, 491 SortedIdTable, NumIds, spellingBase); 492} 493IdentifierInfo* PTHManager::LazilyCreateIdentifierInfo(unsigned PersistentID) { 494 // Look in the PTH file for the string data for the IdentifierInfo object. 495 const unsigned char* TableEntry = IdDataTable + sizeof(uint32_t)*PersistentID; 496 const unsigned char* IDData = 497 (const unsigned char*)Buf->getBufferStart() + ReadLE32(TableEntry); 498 assert(IDData < (const unsigned char*)Buf->getBufferEnd()); 499 500 // Allocate the object. 501 std::pair<IdentifierInfo,const unsigned char*> *Mem = 502 Alloc.Allocate<std::pair<IdentifierInfo,const unsigned char*> >(); 503 504 Mem->second = IDData; 505 IdentifierInfo *II = new ((void*) Mem) IdentifierInfo(); 506 507 // Store the new IdentifierInfo in the cache. 508 PerIDCache[PersistentID] = II; 509 return II; 510} 511 512IdentifierInfo* PTHManager::get(const char *NameStart, const char *NameEnd) { 513 unsigned min = 0; 514 unsigned max = NumIds; 515 unsigned Len = NameEnd - NameStart; 516 517 do { 518 unsigned i = (max - min) / 2 + min; 519 const unsigned char *Ptr = SortedIdTable + (i * 4); 520 521 // Read the persistentID. 522 unsigned perID = ReadLE32(Ptr); 523 524 // Get the IdentifierInfo. 525 IdentifierInfo* II = GetIdentifierInfo(perID); 526 527 // First compare the lengths. 528 unsigned IILen = II->getLength(); 529 if (Len < IILen) goto IsLess; 530 if (Len > IILen) goto IsGreater; 531 532 // Now compare the strings! 533 { 534 signed comp = strncmp(NameStart, II->getName(), Len); 535 if (comp < 0) goto IsLess; 536 if (comp > 0) goto IsGreater; 537 } 538 // We found a match! 539 return II; 540 541 IsGreater: 542 if (i == min) break; 543 min = i; 544 continue; 545 546 IsLess: 547 max = i; 548 assert(!(max == min) || (min == i)); 549 } 550 while (min != max); 551 552 return 0; 553} 554 555 556PTHLexer *PTHManager::CreateLexer(FileID FID) { 557 const FileEntry *FE = PP->getSourceManager().getFileEntryForID(FID); 558 if (!FE) 559 return 0; 560 561 // Lookup the FileEntry object in our file lookup data structure. It will 562 // return a variant that indicates whether or not there is an offset within 563 // the PTH file that contains cached tokens. 564 PTHFileLookup::Val FileData = ((PTHFileLookup*)FileLookup)->Lookup(FE); 565 566 if (!FileData.isValid()) // No tokens available. 567 return 0; 568 569 const unsigned char *BufStart = (const unsigned char *)Buf->getBufferStart(); 570 // Compute the offset of the token data within the buffer. 571 const unsigned char* data = BufStart + FileData.getTokenOffset(); 572 573 // Get the location of pp-conditional table. 574 const unsigned char* ppcond = BufStart + FileData.getPPCondOffset(); 575 uint32_t Len = ReadLE32(ppcond); 576 if (Len == 0) ppcond = 0; 577 578 assert(PP && "No preprocessor set yet!"); 579 return new PTHLexer(*PP, FID, data, ppcond, *this); 580} 581