ArchiveReader.cpp revision 20c348978ea0d3ec7177fab801d619ccbe7d815f
1//===-- ArchiveReader.cpp - Read LLVM archive files -------------*- C++ -*-===// 2// 3// The LLVM Compiler Infrastructure 4// 5// This file was developed by Reid Spencer and is distributed under the 6// University of Illinois Open Source License. See LICENSE.TXT for details. 7// 8//===----------------------------------------------------------------------===// 9// 10// Builds up standard unix archive files (.a) containing LLVM bytecode. 11// 12//===----------------------------------------------------------------------===// 13 14#include "ArchiveInternals.h" 15#include "llvm/Bytecode/Reader.h" 16#include "llvm/Support/Compressor.h" 17#include <memory> 18using namespace llvm; 19 20/// Read a variable-bit-rate encoded unsigned integer 21inline unsigned readInteger(const char*&At, const char*End){ 22 unsigned Shift = 0; 23 unsigned Result = 0; 24 25 do { 26 if (At == End) 27 return Result; 28 Result |= (unsigned)((*At++) & 0x7F) << Shift; 29 Shift += 7; 30 } while (At[-1] & 0x80); 31 return Result; 32} 33 34// Completely parse the Archive's symbol table and populate symTab member var. 35bool 36Archive::parseSymbolTable(const void* data, unsigned size, std::string* error) { 37 const char* At = (const char*) data; 38 const char* End = At + size; 39 while (At < End) { 40 unsigned offset = readInteger(At, End); 41 if (At == End) { 42 if (error) 43 *error = "Ran out of data reading vbr_uint for symtab offset!"; 44 return false; 45 } 46 unsigned length = readInteger(At, End); 47 if (At == End) { 48 if (error) 49 *error = "Ran out of data reading vbr_uint for symtab length!"; 50 return false; 51 } 52 if (At + length > End) { 53 if (error) 54 *error = "Malformed symbol table: length not consistent with size"; 55 return false; 56 } 57 // we don't care if it can't be inserted (duplicate entry) 58 symTab.insert(std::make_pair(std::string(At, length), offset)); 59 At += length; 60 } 61 symTabSize = size; 62 return true; 63} 64 65// This member parses an ArchiveMemberHeader that is presumed to be pointed to 66// by At. The At pointer is updated to the byte just after the header, which 67// can be variable in size. 68ArchiveMember* 69Archive::parseMemberHeader(const char*& At, const char* End, std::string* error) 70{ 71 if (At + sizeof(ArchiveMemberHeader) >= End) { 72 if (error) 73 *error = "Unexpected end of file"; 74 return 0; 75 } 76 77 // Cast archive member header 78 ArchiveMemberHeader* Hdr = (ArchiveMemberHeader*)At; 79 At += sizeof(ArchiveMemberHeader); 80 81 // Extract the size and determine if the file is 82 // compressed or not (negative length). 83 int flags = 0; 84 int MemberSize = atoi(Hdr->size); 85 if (MemberSize < 0) { 86 flags |= ArchiveMember::CompressedFlag; 87 MemberSize = -MemberSize; 88 } 89 90 // Check the size of the member for sanity 91 if (At + MemberSize > End) { 92 if (error) 93 *error = "invalid member length in archive file"; 94 return 0; 95 } 96 97 // Check the member signature 98 if (!Hdr->checkSignature()) { 99 if (error) 100 *error = "invalid file member signature"; 101 return 0; 102 } 103 104 // Convert and check the member name 105 // The empty name ( '/' and 15 blanks) is for a foreign (non-LLVM) symbol 106 // table. The special name "//" and 14 blanks is for a string table, used 107 // for long file names. This library doesn't generate either of those but 108 // it will accept them. If the name starts with #1/ and the remainder is 109 // digits, then those digits specify the length of the name that is 110 // stored immediately following the header. The special name 111 // __LLVM_SYM_TAB__ identifies the symbol table for LLVM bytecode. 112 // Anything else is a regular, short filename that is terminated with 113 // a '/' and blanks. 114 115 std::string pathname; 116 switch (Hdr->name[0]) { 117 case '#': 118 if (Hdr->name[1] == '1' && Hdr->name[2] == '/') { 119 if (isdigit(Hdr->name[3])) { 120 unsigned len = atoi(&Hdr->name[3]); 121 pathname.assign(At, len); 122 At += len; 123 MemberSize -= len; 124 flags |= ArchiveMember::HasLongFilenameFlag; 125 } else { 126 if (error) 127 *error = "invalid long filename"; 128 return 0; 129 } 130 } else if (Hdr->name[1] == '_' && 131 (0 == memcmp(Hdr->name, ARFILE_LLVM_SYMTAB_NAME, 16))) { 132 // The member is using a long file name (>15 chars) format. 133 // This format is standard for 4.4BSD and Mac OSX operating 134 // systems. LLVM uses it similarly. In this format, the 135 // remainder of the name field (after #1/) specifies the 136 // length of the file name which occupy the first bytes of 137 // the member's data. The pathname already has the #1/ stripped. 138 pathname.assign(ARFILE_LLVM_SYMTAB_NAME); 139 flags |= ArchiveMember::LLVMSymbolTableFlag; 140 } 141 break; 142 case '/': 143 if (Hdr->name[1]== '/') { 144 if (0 == memcmp(Hdr->name, ARFILE_STRTAB_NAME, 16)) { 145 pathname.assign(ARFILE_STRTAB_NAME); 146 flags |= ArchiveMember::StringTableFlag; 147 } else { 148 if (error) 149 *error = "invalid string table name"; 150 return 0; 151 } 152 } else if (Hdr->name[1] == ' ') { 153 if (0 == memcmp(Hdr->name, ARFILE_SVR4_SYMTAB_NAME, 16)) { 154 pathname.assign(ARFILE_SVR4_SYMTAB_NAME); 155 flags |= ArchiveMember::SVR4SymbolTableFlag; 156 } else { 157 if (error) 158 *error = "invalid SVR4 symbol table name"; 159 return 0; 160 } 161 } else if (isdigit(Hdr->name[1])) { 162 unsigned index = atoi(&Hdr->name[1]); 163 if (index < strtab.length()) { 164 const char* namep = strtab.c_str() + index; 165 const char* endp = strtab.c_str() + strtab.length(); 166 const char* p = namep; 167 const char* last_p = p; 168 while (p < endp) { 169 if (*p == '\n' && *last_p == '/') { 170 pathname.assign(namep, last_p - namep); 171 flags |= ArchiveMember::HasLongFilenameFlag; 172 break; 173 } 174 last_p = p; 175 p++; 176 } 177 if (p >= endp) { 178 if (error) 179 *error = "missing name termiantor in string table"; 180 return 0; 181 } 182 } else { 183 if (error) 184 *error = "name index beyond string table"; 185 return 0; 186 } 187 } 188 break; 189 case '_': 190 if (Hdr->name[1] == '_' && 191 (0 == memcmp(Hdr->name, ARFILE_BSD4_SYMTAB_NAME, 16))) { 192 pathname.assign(ARFILE_BSD4_SYMTAB_NAME); 193 flags |= ArchiveMember::BSD4SymbolTableFlag; 194 break; 195 } 196 /* FALL THROUGH */ 197 198 default: 199 char* slash = (char*) memchr(Hdr->name, '/', 16); 200 if (slash == 0) 201 slash = Hdr->name + 16; 202 pathname.assign(Hdr->name, slash - Hdr->name); 203 break; 204 } 205 206 // Determine if this is a bytecode file 207 switch (sys::IdentifyFileType(At, 4)) { 208 case sys::Bytecode_FileType: 209 flags |= ArchiveMember::BytecodeFlag; 210 break; 211 case sys::CompressedBytecode_FileType: 212 flags |= ArchiveMember::CompressedBytecodeFlag; 213 flags &= ~ArchiveMember::CompressedFlag; 214 break; 215 default: 216 flags &= ~(ArchiveMember::BytecodeFlag| 217 ArchiveMember::CompressedBytecodeFlag); 218 break; 219 } 220 221 // Instantiate the ArchiveMember to be filled 222 ArchiveMember* member = new ArchiveMember(this); 223 224 // Fill in fields of the ArchiveMember 225 member->next = 0; 226 member->prev = 0; 227 member->parent = this; 228 member->path.set(pathname); 229 member->info.fileSize = MemberSize; 230 member->info.modTime.fromEpochTime(atoi(Hdr->date)); 231 unsigned int mode; 232 sscanf(Hdr->mode, "%o", &mode); 233 member->info.mode = mode; 234 member->info.user = atoi(Hdr->uid); 235 member->info.group = atoi(Hdr->gid); 236 member->flags = flags; 237 member->data = At; 238 239 return member; 240} 241 242bool 243Archive::checkSignature(std::string* error) { 244 // Check the magic string at file's header 245 if (mapfile->size() < 8 || memcmp(base, ARFILE_MAGIC, 8)) { 246 if (error) 247 *error = "invalid signature for an archive file"; 248 return false; 249 } 250 return true; 251} 252 253// This function loads the entire archive and fully populates its ilist with 254// the members of the archive file. This is typically used in preparation for 255// editing the contents of the archive. 256bool 257Archive::loadArchive(std::string* error) { 258 259 // Set up parsing 260 members.clear(); 261 symTab.clear(); 262 const char *At = base; 263 const char *End = base + mapfile->size(); 264 265 if (!checkSignature(error)) 266 return false; 267 268 At += 8; // Skip the magic string. 269 270 bool seenSymbolTable = false; 271 bool foundFirstFile = false; 272 while (At < End) { 273 // parse the member header 274 const char* Save = At; 275 ArchiveMember* mbr = parseMemberHeader(At, End, error); 276 if (!mbr) 277 return false; 278 279 // check if this is the foreign symbol table 280 if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) { 281 // We just save this but don't do anything special 282 // with it. It doesn't count as the "first file". 283 if (foreignST) { 284 // What? Multiple foreign symbol tables? Just chuck it 285 // and retain the last one found. 286 delete foreignST; 287 } 288 foreignST = mbr; 289 At += mbr->getSize(); 290 if ((intptr_t(At) & 1) == 1) 291 At++; 292 } else if (mbr->isStringTable()) { 293 // Simply suck the entire string table into a string 294 // variable. This will be used to get the names of the 295 // members that use the "/ddd" format for their names 296 // (SVR4 style long names). 297 strtab.assign(At, mbr->getSize()); 298 At += mbr->getSize(); 299 if ((intptr_t(At) & 1) == 1) 300 At++; 301 delete mbr; 302 } else if (mbr->isLLVMSymbolTable()) { 303 // This is the LLVM symbol table for the archive. If we've seen it 304 // already, its an error. Otherwise, parse the symbol table and move on. 305 if (seenSymbolTable) { 306 if (error) 307 *error = "invalid archive: multiple symbol tables"; 308 return false; 309 } 310 if (!parseSymbolTable(mbr->getData(), mbr->getSize(), error)) 311 return false; 312 seenSymbolTable = true; 313 At += mbr->getSize(); 314 if ((intptr_t(At) & 1) == 1) 315 At++; 316 delete mbr; // We don't need this member in the list of members. 317 } else { 318 // This is just a regular file. If its the first one, save its offset. 319 // Otherwise just push it on the list and move on to the next file. 320 if (!foundFirstFile) { 321 firstFileOffset = Save - base; 322 foundFirstFile = true; 323 } 324 members.push_back(mbr); 325 At += mbr->getSize(); 326 if ((intptr_t(At) & 1) == 1) 327 At++; 328 } 329 } 330 return true; 331} 332 333// Open and completely load the archive file. 334Archive* 335Archive::OpenAndLoad(const sys::Path& file, std::string* ErrorMessage) 336{ 337 std::auto_ptr<Archive> result ( new Archive(file)); 338 if (result->mapToMemory(ErrorMessage)) 339 return 0; 340 if (!result->loadArchive(ErrorMessage)) 341 return 0; 342 return result.release(); 343} 344 345// Get all the bytecode modules from the archive 346bool 347Archive::getAllModules(std::vector<Module*>& Modules, std::string* ErrMessage) { 348 349 for (iterator I=begin(), E=end(); I != E; ++I) { 350 if (I->isBytecode() || I->isCompressedBytecode()) { 351 std::string FullMemberName = archPath.toString() + 352 "(" + I->getPath().toString() + ")"; 353 Module* M = ParseBytecodeBuffer((const unsigned char*)I->getData(), 354 I->getSize(), FullMemberName, 355 Compressor::decompressToNewBuffer, 356 ErrMessage); 357 if (!M) 358 return true; 359 360 Modules.push_back(M); 361 } 362 } 363 return false; 364} 365 366// Load just the symbol table from the archive file 367bool 368Archive::loadSymbolTable(std::string* ErrorMsg) { 369 370 // Set up parsing 371 members.clear(); 372 symTab.clear(); 373 const char *At = base; 374 const char *End = base + mapfile->size(); 375 376 // Make sure we're dealing with an archive 377 if (!checkSignature(ErrorMsg)) 378 return false; 379 380 At += 8; // Skip signature 381 382 // Parse the first file member header 383 const char* FirstFile = At; 384 ArchiveMember* mbr = parseMemberHeader(At, End, ErrorMsg); 385 if (!mbr) 386 return false; 387 388 if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) { 389 // Skip the foreign symbol table, we don't do anything with it 390 At += mbr->getSize(); 391 if ((intptr_t(At) & 1) == 1) 392 At++; 393 delete mbr; 394 395 // Read the next one 396 FirstFile = At; 397 mbr = parseMemberHeader(At, End, ErrorMsg); 398 if (!mbr) { 399 delete mbr; 400 return false; 401 } 402 } 403 404 if (mbr->isStringTable()) { 405 // Process the string table entry 406 strtab.assign((const char*)mbr->getData(), mbr->getSize()); 407 At += mbr->getSize(); 408 if ((intptr_t(At) & 1) == 1) 409 At++; 410 delete mbr; 411 // Get the next one 412 FirstFile = At; 413 mbr = parseMemberHeader(At, End, ErrorMsg); 414 if (!mbr) { 415 delete mbr; 416 return false; 417 } 418 } 419 420 // See if its the symbol table 421 if (mbr->isLLVMSymbolTable()) { 422 if (!parseSymbolTable(mbr->getData(), mbr->getSize(), ErrorMsg)) { 423 delete mbr; 424 return false; 425 } 426 427 At += mbr->getSize(); 428 if ((intptr_t(At) & 1) == 1) 429 At++; 430 delete mbr; 431 // Can't be any more symtab headers so just advance 432 FirstFile = At; 433 } else { 434 // There's no symbol table in the file. We have to rebuild it from scratch 435 // because the intent of this method is to get the symbol table loaded so 436 // it can be searched efficiently. 437 // Add the member to the members list 438 members.push_back(mbr); 439 } 440 441 firstFileOffset = FirstFile - base; 442 return true; 443} 444 445// Open the archive and load just the symbol tables 446Archive* 447Archive::OpenAndLoadSymbols(const sys::Path& file, std::string* ErrorMessage) { 448 std::auto_ptr<Archive> result ( new Archive(file) ); 449 if (result->mapToMemory(ErrorMessage)) 450 return 0; 451 if (!result->loadSymbolTable(ErrorMessage)) 452 return 0; 453 return result.release(); 454} 455 456// Look up one symbol in the symbol table and return a ModuleProvider for the 457// module that defines that symbol. 458ModuleProvider* 459Archive::findModuleDefiningSymbol(const std::string& symbol, 460 std::string* ErrMsg) { 461 SymTabType::iterator SI = symTab.find(symbol); 462 if (SI == symTab.end()) 463 return 0; 464 465 // The symbol table was previously constructed assuming that the members were 466 // written without the symbol table header. Because VBR encoding is used, the 467 // values could not be adjusted to account for the offset of the symbol table 468 // because that could affect the size of the symbol table due to VBR encoding. 469 // We now have to account for this by adjusting the offset by the size of the 470 // symbol table and its header. 471 unsigned fileOffset = 472 SI->second + // offset in symbol-table-less file 473 firstFileOffset; // add offset to first "real" file in archive 474 475 // See if the module is already loaded 476 ModuleMap::iterator MI = modules.find(fileOffset); 477 if (MI != modules.end()) 478 return MI->second.first; 479 480 // Module hasn't been loaded yet, we need to load it 481 const char* modptr = base + fileOffset; 482 ArchiveMember* mbr = parseMemberHeader(modptr, base + mapfile->size(),ErrMsg); 483 if (!mbr) 484 return 0; 485 486 // Now, load the bytecode module to get the ModuleProvider 487 std::string FullMemberName = archPath.toString() + "(" + 488 mbr->getPath().toString() + ")"; 489 ModuleProvider* mp = getBytecodeBufferModuleProvider( 490 (const unsigned char*) mbr->getData(), mbr->getSize(), 491 FullMemberName, Decompressor, ErrMsg, 0); 492 if (!mp) 493 return 0; 494 495 modules.insert(std::make_pair(fileOffset, std::make_pair(mp, mbr))); 496 497 return mp; 498} 499 500// Look up multiple symbols in the symbol table and return a set of 501// ModuleProviders that define those symbols. 502bool 503Archive::findModulesDefiningSymbols(std::set<std::string>& symbols, 504 std::set<ModuleProvider*>& result, 505 std::string* error) { 506 if (!mapfile || !base) { 507 if (error) 508 *error = "Empty archive invalid for finding modules defining symbols"; 509 return false; 510 } 511 512 if (symTab.empty()) { 513 // We don't have a symbol table, so we must build it now but lets also 514 // make sure that we populate the modules table as we do this to ensure 515 // that we don't load them twice when findModuleDefiningSymbol is called 516 // below. 517 518 // Get a pointer to the first file 519 const char* At = ((const char*)base) + firstFileOffset; 520 const char* End = ((const char*)base) + mapfile->size(); 521 522 while ( At < End) { 523 // Compute the offset to be put in the symbol table 524 unsigned offset = At - base - firstFileOffset; 525 526 // Parse the file's header 527 ArchiveMember* mbr = parseMemberHeader(At, End, error); 528 if (!mbr) 529 return false; 530 531 // If it contains symbols 532 if (mbr->isBytecode() || mbr->isCompressedBytecode()) { 533 // Get the symbols 534 std::vector<std::string> symbols; 535 std::string FullMemberName = archPath.toString() + "(" + 536 mbr->getPath().toString() + ")"; 537 ModuleProvider* MP = 538 GetBytecodeSymbols((const unsigned char*)At, mbr->getSize(), 539 FullMemberName, symbols, 540 Compressor::decompressToNewBuffer, error); 541 542 if (MP) { 543 // Insert the module's symbols into the symbol table 544 for (std::vector<std::string>::iterator I = symbols.begin(), 545 E=symbols.end(); I != E; ++I ) { 546 symTab.insert(std::make_pair(*I, offset)); 547 } 548 // Insert the ModuleProvider and the ArchiveMember into the table of 549 // modules. 550 modules.insert(std::make_pair(offset, std::make_pair(MP, mbr))); 551 } else { 552 if (error) 553 *error = "Can't parse bytecode member: " + 554 mbr->getPath().toString() + ": " + *error; 555 delete mbr; 556 return false; 557 } 558 } 559 560 // Go to the next file location 561 At += mbr->getSize(); 562 if ((intptr_t(At) & 1) == 1) 563 At++; 564 } 565 } 566 567 // At this point we have a valid symbol table (one way or another) so we 568 // just use it to quickly find the symbols requested. 569 570 for (std::set<std::string>::iterator I=symbols.begin(), 571 E=symbols.end(); I != E;) { 572 // See if this symbol exists 573 ModuleProvider* mp = findModuleDefiningSymbol(*I,error); 574 if (mp) { 575 // The symbol exists, insert the ModuleProvider into our result, 576 // duplicates wil be ignored 577 result.insert(mp); 578 579 // Remove the symbol now that its been resolved, being careful to 580 // post-increment the iterator. 581 symbols.erase(I++); 582 } else { 583 ++I; 584 } 585 } 586 return true; 587} 588 589bool Archive::isBytecodeArchive() { 590 // Make sure the symTab has been loaded. In most cases this should have been 591 // done when the archive was constructed, but still, this is just in case. 592 if (!symTab.size()) 593 if (!loadSymbolTable(0)) 594 return false; 595 596 // Now that we know it's been loaded, return true 597 // if it has a size 598 if (symTab.size()) return true; 599 600 //We still can't be sure it isn't a bytecode archive 601 if (!loadArchive(0)) 602 return false; 603 604 std::vector<Module *> Modules; 605 std::string ErrorMessage; 606 607 // Scan the archive, trying to load a bytecode member. We only load one to 608 // see if this works. 609 for (iterator I = begin(), E = end(); I != E; ++I) { 610 if (!I->isBytecode() && !I->isCompressedBytecode()) 611 continue; 612 613 std::string FullMemberName = 614 archPath.toString() + "(" + I->getPath().toString() + ")"; 615 Module* M = ParseBytecodeBuffer((const unsigned char*)I->getData(), 616 I->getSize(), FullMemberName); 617 if (!M) 618 return false; // Couldn't parse bytecode, not a bytecode archive. 619 delete M; 620 return true; 621 } 622 623 return false; 624} 625