ArchiveReader.cpp revision 20c348978ea0d3ec7177fab801d619ccbe7d815f
1//===-- ArchiveReader.cpp - Read LLVM archive files -------------*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file was developed by Reid Spencer and is distributed under the
6// University of Illinois Open Source License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
// Reads standard unix archive files (.a) containing LLVM bytecode.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ArchiveInternals.h"
15#include "llvm/Bytecode/Reader.h"
16#include "llvm/Support/Compressor.h"
17#include <memory>
18using namespace llvm;
19
/// Read a variable-bit-rate (VBR) encoded unsigned integer.
///
/// Each byte contributes its low seven bits to the result, least-significant
/// group first; a set high bit (0x80) means another byte follows.
///
/// @param At   Cursor into the buffer; advanced past every byte consumed.
/// @param End  One past the last readable byte.
/// @returns the decoded value. If the buffer is exhausted before a byte
/// without the continuation bit is seen, the bits gathered so far are
/// returned and At == End on exit.
inline unsigned readInteger(const char*&At, const char*End){
  // Shifting an unsigned by its full width or more is undefined behavior, so
  // groups that no longer fit are still consumed but contribute nothing.
  const unsigned MaxShift = sizeof(unsigned) * 8;
  unsigned Shift = 0;
  unsigned Result = 0;

  do {
    if (At == End)
      return Result;
    const unsigned Bits = (unsigned)((*At++) & 0x7F);
    if (Shift < MaxShift) {
      Result |= Bits << Shift;
      Shift += 7;
    }
  } while (At[-1] & 0x80);
  return Result;
}
33
34// Completely parse the Archive's symbol table and populate symTab member var.
35bool
36Archive::parseSymbolTable(const void* data, unsigned size, std::string* error) {
37  const char* At = (const char*) data;
38  const char* End = At + size;
39  while (At < End) {
40    unsigned offset = readInteger(At, End);
41    if (At == End) {
42      if (error)
43        *error = "Ran out of data reading vbr_uint for symtab offset!";
44      return false;
45    }
46    unsigned length = readInteger(At, End);
47    if (At == End) {
48      if (error)
49        *error = "Ran out of data reading vbr_uint for symtab length!";
50      return false;
51    }
52    if (At + length > End) {
53      if (error)
54        *error = "Malformed symbol table: length not consistent with size";
55      return false;
56    }
57    // we don't care if it can't be inserted (duplicate entry)
58    symTab.insert(std::make_pair(std::string(At, length), offset));
59    At += length;
60  }
61  symTabSize = size;
62  return true;
63}
64
65// This member parses an ArchiveMemberHeader that is presumed to be pointed to
66// by At. The At pointer is updated to the byte just after the header, which
67// can be variable in size.
68ArchiveMember*
69Archive::parseMemberHeader(const char*& At, const char* End, std::string* error)
70{
71  if (At + sizeof(ArchiveMemberHeader) >= End) {
72    if (error)
73      *error = "Unexpected end of file";
74    return 0;
75  }
76
77  // Cast archive member header
78  ArchiveMemberHeader* Hdr = (ArchiveMemberHeader*)At;
79  At += sizeof(ArchiveMemberHeader);
80
81  // Extract the size and determine if the file is
82  // compressed or not (negative length).
83  int flags = 0;
84  int MemberSize = atoi(Hdr->size);
85  if (MemberSize < 0) {
86    flags |= ArchiveMember::CompressedFlag;
87    MemberSize = -MemberSize;
88  }
89
90  // Check the size of the member for sanity
91  if (At + MemberSize > End) {
92    if (error)
93      *error = "invalid member length in archive file";
94    return 0;
95  }
96
97  // Check the member signature
98  if (!Hdr->checkSignature()) {
99    if (error)
100      *error = "invalid file member signature";
101    return 0;
102  }
103
104  // Convert and check the member name
105  // The empty name ( '/' and 15 blanks) is for a foreign (non-LLVM) symbol
106  // table. The special name "//" and 14 blanks is for a string table, used
107  // for long file names. This library doesn't generate either of those but
108  // it will accept them. If the name starts with #1/ and the remainder is
109  // digits, then those digits specify the length of the name that is
110  // stored immediately following the header. The special name
111  // __LLVM_SYM_TAB__ identifies the symbol table for LLVM bytecode.
112  // Anything else is a regular, short filename that is terminated with
113  // a '/' and blanks.
114
115  std::string pathname;
116  switch (Hdr->name[0]) {
117    case '#':
118      if (Hdr->name[1] == '1' && Hdr->name[2] == '/') {
119        if (isdigit(Hdr->name[3])) {
120          unsigned len = atoi(&Hdr->name[3]);
121          pathname.assign(At, len);
122          At += len;
123          MemberSize -= len;
124          flags |= ArchiveMember::HasLongFilenameFlag;
125        } else {
126          if (error)
127            *error = "invalid long filename";
128          return 0;
129        }
130      } else if (Hdr->name[1] == '_' &&
131                 (0 == memcmp(Hdr->name, ARFILE_LLVM_SYMTAB_NAME, 16))) {
132        // The member is using a long file name (>15 chars) format.
133        // This format is standard for 4.4BSD and Mac OSX operating
134        // systems. LLVM uses it similarly. In this format, the
135        // remainder of the name field (after #1/) specifies the
136        // length of the file name which occupy the first bytes of
137        // the member's data. The pathname already has the #1/ stripped.
138        pathname.assign(ARFILE_LLVM_SYMTAB_NAME);
139        flags |= ArchiveMember::LLVMSymbolTableFlag;
140      }
141      break;
142    case '/':
143      if (Hdr->name[1]== '/') {
144        if (0 == memcmp(Hdr->name, ARFILE_STRTAB_NAME, 16)) {
145          pathname.assign(ARFILE_STRTAB_NAME);
146          flags |= ArchiveMember::StringTableFlag;
147        } else {
148          if (error)
149            *error = "invalid string table name";
150          return 0;
151        }
152      } else if (Hdr->name[1] == ' ') {
153        if (0 == memcmp(Hdr->name, ARFILE_SVR4_SYMTAB_NAME, 16)) {
154          pathname.assign(ARFILE_SVR4_SYMTAB_NAME);
155          flags |= ArchiveMember::SVR4SymbolTableFlag;
156        } else {
157          if (error)
158            *error = "invalid SVR4 symbol table name";
159          return 0;
160        }
161      } else if (isdigit(Hdr->name[1])) {
162        unsigned index = atoi(&Hdr->name[1]);
163        if (index < strtab.length()) {
164          const char* namep = strtab.c_str() + index;
165          const char* endp = strtab.c_str() + strtab.length();
166          const char* p = namep;
167          const char* last_p = p;
168          while (p < endp) {
169            if (*p == '\n' && *last_p == '/') {
170              pathname.assign(namep, last_p - namep);
171              flags |= ArchiveMember::HasLongFilenameFlag;
172              break;
173            }
174            last_p = p;
175            p++;
176          }
177          if (p >= endp) {
178            if (error)
179              *error = "missing name termiantor in string table";
180            return 0;
181          }
182        } else {
183          if (error)
184            *error = "name index beyond string table";
185          return 0;
186        }
187      }
188      break;
189    case '_':
190      if (Hdr->name[1] == '_' &&
191          (0 == memcmp(Hdr->name, ARFILE_BSD4_SYMTAB_NAME, 16))) {
192        pathname.assign(ARFILE_BSD4_SYMTAB_NAME);
193        flags |= ArchiveMember::BSD4SymbolTableFlag;
194        break;
195      }
196      /* FALL THROUGH */
197
198    default:
199      char* slash = (char*) memchr(Hdr->name, '/', 16);
200      if (slash == 0)
201        slash = Hdr->name + 16;
202      pathname.assign(Hdr->name, slash - Hdr->name);
203      break;
204  }
205
206  // Determine if this is a bytecode file
207  switch (sys::IdentifyFileType(At, 4)) {
208    case sys::Bytecode_FileType:
209      flags |= ArchiveMember::BytecodeFlag;
210      break;
211    case sys::CompressedBytecode_FileType:
212      flags |= ArchiveMember::CompressedBytecodeFlag;
213      flags &= ~ArchiveMember::CompressedFlag;
214      break;
215    default:
216      flags &= ~(ArchiveMember::BytecodeFlag|
217                 ArchiveMember::CompressedBytecodeFlag);
218      break;
219  }
220
221  // Instantiate the ArchiveMember to be filled
222  ArchiveMember* member = new ArchiveMember(this);
223
224  // Fill in fields of the ArchiveMember
225  member->next = 0;
226  member->prev = 0;
227  member->parent = this;
228  member->path.set(pathname);
229  member->info.fileSize = MemberSize;
230  member->info.modTime.fromEpochTime(atoi(Hdr->date));
231  unsigned int mode;
232  sscanf(Hdr->mode, "%o", &mode);
233  member->info.mode = mode;
234  member->info.user = atoi(Hdr->uid);
235  member->info.group = atoi(Hdr->gid);
236  member->flags = flags;
237  member->data = At;
238
239  return member;
240}
241
242bool
243Archive::checkSignature(std::string* error) {
244  // Check the magic string at file's header
245  if (mapfile->size() < 8 || memcmp(base, ARFILE_MAGIC, 8)) {
246    if (error)
247      *error = "invalid signature for an archive file";
248    return false;
249  }
250  return true;
251}
252
253// This function loads the entire archive and fully populates its ilist with
254// the members of the archive file. This is typically used in preparation for
255// editing the contents of the archive.
256bool
257Archive::loadArchive(std::string* error) {
258
259  // Set up parsing
260  members.clear();
261  symTab.clear();
262  const char *At = base;
263  const char *End = base + mapfile->size();
264
265  if (!checkSignature(error))
266    return false;
267
268  At += 8;  // Skip the magic string.
269
270  bool seenSymbolTable = false;
271  bool foundFirstFile = false;
272  while (At < End) {
273    // parse the member header
274    const char* Save = At;
275    ArchiveMember* mbr = parseMemberHeader(At, End, error);
276    if (!mbr)
277      return false;
278
279    // check if this is the foreign symbol table
280    if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
281      // We just save this but don't do anything special
282      // with it. It doesn't count as the "first file".
283      if (foreignST) {
284        // What? Multiple foreign symbol tables? Just chuck it
285        // and retain the last one found.
286        delete foreignST;
287      }
288      foreignST = mbr;
289      At += mbr->getSize();
290      if ((intptr_t(At) & 1) == 1)
291        At++;
292    } else if (mbr->isStringTable()) {
293      // Simply suck the entire string table into a string
294      // variable. This will be used to get the names of the
295      // members that use the "/ddd" format for their names
296      // (SVR4 style long names).
297      strtab.assign(At, mbr->getSize());
298      At += mbr->getSize();
299      if ((intptr_t(At) & 1) == 1)
300        At++;
301      delete mbr;
302    } else if (mbr->isLLVMSymbolTable()) {
303      // This is the LLVM symbol table for the archive. If we've seen it
304      // already, its an error. Otherwise, parse the symbol table and move on.
305      if (seenSymbolTable) {
306        if (error)
307          *error = "invalid archive: multiple symbol tables";
308        return false;
309      }
310      if (!parseSymbolTable(mbr->getData(), mbr->getSize(), error))
311        return false;
312      seenSymbolTable = true;
313      At += mbr->getSize();
314      if ((intptr_t(At) & 1) == 1)
315        At++;
316      delete mbr; // We don't need this member in the list of members.
317    } else {
318      // This is just a regular file. If its the first one, save its offset.
319      // Otherwise just push it on the list and move on to the next file.
320      if (!foundFirstFile) {
321        firstFileOffset = Save - base;
322        foundFirstFile = true;
323      }
324      members.push_back(mbr);
325      At += mbr->getSize();
326      if ((intptr_t(At) & 1) == 1)
327        At++;
328    }
329  }
330  return true;
331}
332
333// Open and completely load the archive file.
334Archive*
335Archive::OpenAndLoad(const sys::Path& file, std::string* ErrorMessage)
336{
337  std::auto_ptr<Archive> result ( new Archive(file));
338  if (result->mapToMemory(ErrorMessage))
339    return 0;
340  if (!result->loadArchive(ErrorMessage))
341    return 0;
342  return result.release();
343}
344
345// Get all the bytecode modules from the archive
346bool
347Archive::getAllModules(std::vector<Module*>& Modules, std::string* ErrMessage) {
348
349  for (iterator I=begin(), E=end(); I != E; ++I) {
350    if (I->isBytecode() || I->isCompressedBytecode()) {
351      std::string FullMemberName = archPath.toString() +
352        "(" + I->getPath().toString() + ")";
353      Module* M = ParseBytecodeBuffer((const unsigned char*)I->getData(),
354                                      I->getSize(), FullMemberName,
355                                      Compressor::decompressToNewBuffer,
356                                      ErrMessage);
357      if (!M)
358        return true;
359
360      Modules.push_back(M);
361    }
362  }
363  return false;
364}
365
366// Load just the symbol table from the archive file
367bool
368Archive::loadSymbolTable(std::string* ErrorMsg) {
369
370  // Set up parsing
371  members.clear();
372  symTab.clear();
373  const char *At = base;
374  const char *End = base + mapfile->size();
375
376  // Make sure we're dealing with an archive
377  if (!checkSignature(ErrorMsg))
378    return false;
379
380  At += 8; // Skip signature
381
382  // Parse the first file member header
383  const char* FirstFile = At;
384  ArchiveMember* mbr = parseMemberHeader(At, End, ErrorMsg);
385  if (!mbr)
386    return false;
387
388  if (mbr->isSVR4SymbolTable() || mbr->isBSD4SymbolTable()) {
389    // Skip the foreign symbol table, we don't do anything with it
390    At += mbr->getSize();
391    if ((intptr_t(At) & 1) == 1)
392      At++;
393    delete mbr;
394
395    // Read the next one
396    FirstFile = At;
397    mbr = parseMemberHeader(At, End, ErrorMsg);
398    if (!mbr) {
399      delete mbr;
400      return false;
401    }
402  }
403
404  if (mbr->isStringTable()) {
405    // Process the string table entry
406    strtab.assign((const char*)mbr->getData(), mbr->getSize());
407    At += mbr->getSize();
408    if ((intptr_t(At) & 1) == 1)
409      At++;
410    delete mbr;
411    // Get the next one
412    FirstFile = At;
413    mbr = parseMemberHeader(At, End, ErrorMsg);
414    if (!mbr) {
415      delete mbr;
416      return false;
417    }
418  }
419
420  // See if its the symbol table
421  if (mbr->isLLVMSymbolTable()) {
422    if (!parseSymbolTable(mbr->getData(), mbr->getSize(), ErrorMsg)) {
423      delete mbr;
424      return false;
425    }
426
427    At += mbr->getSize();
428    if ((intptr_t(At) & 1) == 1)
429      At++;
430    delete mbr;
431    // Can't be any more symtab headers so just advance
432    FirstFile = At;
433  } else {
434    // There's no symbol table in the file. We have to rebuild it from scratch
435    // because the intent of this method is to get the symbol table loaded so
436    // it can be searched efficiently.
437    // Add the member to the members list
438    members.push_back(mbr);
439  }
440
441  firstFileOffset = FirstFile - base;
442  return true;
443}
444
445// Open the archive and load just the symbol tables
446Archive*
447Archive::OpenAndLoadSymbols(const sys::Path& file, std::string* ErrorMessage) {
448  std::auto_ptr<Archive> result ( new Archive(file) );
449  if (result->mapToMemory(ErrorMessage))
450    return 0;
451  if (!result->loadSymbolTable(ErrorMessage))
452    return 0;
453  return result.release();
454}
455
456// Look up one symbol in the symbol table and return a ModuleProvider for the
457// module that defines that symbol.
458ModuleProvider*
459Archive::findModuleDefiningSymbol(const std::string& symbol,
460                                  std::string* ErrMsg) {
461  SymTabType::iterator SI = symTab.find(symbol);
462  if (SI == symTab.end())
463    return 0;
464
465  // The symbol table was previously constructed assuming that the members were
466  // written without the symbol table header. Because VBR encoding is used, the
467  // values could not be adjusted to account for the offset of the symbol table
468  // because that could affect the size of the symbol table due to VBR encoding.
469  // We now have to account for this by adjusting the offset by the size of the
470  // symbol table and its header.
471  unsigned fileOffset =
472    SI->second +                // offset in symbol-table-less file
473    firstFileOffset;            // add offset to first "real" file in archive
474
475  // See if the module is already loaded
476  ModuleMap::iterator MI = modules.find(fileOffset);
477  if (MI != modules.end())
478    return MI->second.first;
479
480  // Module hasn't been loaded yet, we need to load it
481  const char* modptr = base + fileOffset;
482  ArchiveMember* mbr = parseMemberHeader(modptr, base + mapfile->size(),ErrMsg);
483  if (!mbr)
484    return 0;
485
486  // Now, load the bytecode module to get the ModuleProvider
487  std::string FullMemberName = archPath.toString() + "(" +
488    mbr->getPath().toString() + ")";
489  ModuleProvider* mp = getBytecodeBufferModuleProvider(
490      (const unsigned char*) mbr->getData(), mbr->getSize(),
491      FullMemberName, Decompressor, ErrMsg, 0);
492  if (!mp)
493    return 0;
494
495  modules.insert(std::make_pair(fileOffset, std::make_pair(mp, mbr)));
496
497  return mp;
498}
499
500// Look up multiple symbols in the symbol table and return a set of
501// ModuleProviders that define those symbols.
502bool
503Archive::findModulesDefiningSymbols(std::set<std::string>& symbols,
504                                    std::set<ModuleProvider*>& result,
505                                    std::string* error) {
506  if (!mapfile || !base) {
507    if (error)
508      *error = "Empty archive invalid for finding modules defining symbols";
509    return false;
510  }
511
512  if (symTab.empty()) {
513    // We don't have a symbol table, so we must build it now but lets also
514    // make sure that we populate the modules table as we do this to ensure
515    // that we don't load them twice when findModuleDefiningSymbol is called
516    // below.
517
518    // Get a pointer to the first file
519    const char* At  = ((const char*)base) + firstFileOffset;
520    const char* End = ((const char*)base) + mapfile->size();
521
522    while ( At < End) {
523      // Compute the offset to be put in the symbol table
524      unsigned offset = At - base - firstFileOffset;
525
526      // Parse the file's header
527      ArchiveMember* mbr = parseMemberHeader(At, End, error);
528      if (!mbr)
529        return false;
530
531      // If it contains symbols
532      if (mbr->isBytecode() || mbr->isCompressedBytecode()) {
533        // Get the symbols
534        std::vector<std::string> symbols;
535        std::string FullMemberName = archPath.toString() + "(" +
536          mbr->getPath().toString() + ")";
537        ModuleProvider* MP =
538          GetBytecodeSymbols((const unsigned char*)At, mbr->getSize(),
539                             FullMemberName, symbols,
540                             Compressor::decompressToNewBuffer, error);
541
542        if (MP) {
543          // Insert the module's symbols into the symbol table
544          for (std::vector<std::string>::iterator I = symbols.begin(),
545               E=symbols.end(); I != E; ++I ) {
546            symTab.insert(std::make_pair(*I, offset));
547          }
548          // Insert the ModuleProvider and the ArchiveMember into the table of
549          // modules.
550          modules.insert(std::make_pair(offset, std::make_pair(MP, mbr)));
551        } else {
552          if (error)
553            *error = "Can't parse bytecode member: " +
554              mbr->getPath().toString() + ": " + *error;
555          delete mbr;
556          return false;
557        }
558      }
559
560      // Go to the next file location
561      At += mbr->getSize();
562      if ((intptr_t(At) & 1) == 1)
563        At++;
564    }
565  }
566
567  // At this point we have a valid symbol table (one way or another) so we
568  // just use it to quickly find the symbols requested.
569
570  for (std::set<std::string>::iterator I=symbols.begin(),
571       E=symbols.end(); I != E;) {
572    // See if this symbol exists
573    ModuleProvider* mp = findModuleDefiningSymbol(*I,error);
574    if (mp) {
575      // The symbol exists, insert the ModuleProvider into our result,
576      // duplicates wil be ignored
577      result.insert(mp);
578
579      // Remove the symbol now that its been resolved, being careful to
580      // post-increment the iterator.
581      symbols.erase(I++);
582    } else {
583      ++I;
584    }
585  }
586  return true;
587}
588
589bool Archive::isBytecodeArchive() {
590  // Make sure the symTab has been loaded. In most cases this should have been
591  // done when the archive was constructed, but still,  this is just in case.
592  if (!symTab.size())
593    if (!loadSymbolTable(0))
594      return false;
595
596  // Now that we know it's been loaded, return true
597  // if it has a size
598  if (symTab.size()) return true;
599
600  //We still can't be sure it isn't a bytecode archive
601  if (!loadArchive(0))
602    return false;
603
604  std::vector<Module *> Modules;
605  std::string ErrorMessage;
606
607  // Scan the archive, trying to load a bytecode member.  We only load one to
608  // see if this works.
609  for (iterator I = begin(), E = end(); I != E; ++I) {
610    if (!I->isBytecode() && !I->isCompressedBytecode())
611      continue;
612
613    std::string FullMemberName =
614      archPath.toString() + "(" + I->getPath().toString() + ")";
615    Module* M = ParseBytecodeBuffer((const unsigned char*)I->getData(),
616                                    I->getSize(), FullMemberName);
617    if (!M)
618      return false;  // Couldn't parse bytecode, not a bytecode archive.
619    delete M;
620    return true;
621  }
622
623  return false;
624}
625