1//===- lib/MC/MCObjectDisassembler.cpp ------------------------------------===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "llvm/MC/MCObjectDisassembler.h"
11#include "llvm/ADT/SetVector.h"
12#include "llvm/ADT/SmallPtrSet.h"
13#include "llvm/ADT/StringExtras.h"
14#include "llvm/ADT/StringRef.h"
15#include "llvm/ADT/Twine.h"
16#include "llvm/MC/MCAnalysis/MCAtom.h"
17#include "llvm/MC/MCAnalysis/MCFunction.h"
18#include "llvm/MC/MCAnalysis/MCModule.h"
19#include "llvm/MC/MCDisassembler.h"
20#include "llvm/MC/MCInstrAnalysis.h"
21#include "llvm/MC/MCObjectSymbolizer.h"
22#include "llvm/Object/MachO.h"
23#include "llvm/Object/ObjectFile.h"
24#include "llvm/Support/Debug.h"
25#include "llvm/Support/MachO.h"
26#include "llvm/Support/MemoryObject.h"
27#include "llvm/Support/StringRefMemoryObject.h"
28#include "llvm/Support/raw_ostream.h"
29#include <map>
30
31using namespace llvm;
32using namespace object;
33
34#define DEBUG_TYPE "mc"
35
36MCObjectDisassembler::MCObjectDisassembler(const ObjectFile &Obj,
37                                           const MCDisassembler &Dis,
38                                           const MCInstrAnalysis &MIA)
39    : Obj(Obj), Dis(Dis), MIA(MIA), MOS(nullptr) {}
40
41uint64_t MCObjectDisassembler::getEntrypoint() {
42  for (const SymbolRef &Symbol : Obj.symbols()) {
43    StringRef Name;
44    Symbol.getName(Name);
45    if (Name == "main" || Name == "_main") {
46      uint64_t Entrypoint;
47      Symbol.getAddress(Entrypoint);
48      return getEffectiveLoadAddr(Entrypoint);
49    }
50  }
51  return 0;
52}
53
54ArrayRef<uint64_t> MCObjectDisassembler::getStaticInitFunctions() {
55  return ArrayRef<uint64_t>();
56}
57
58ArrayRef<uint64_t> MCObjectDisassembler::getStaticExitFunctions() {
59  return ArrayRef<uint64_t>();
60}
61
62MemoryObject *MCObjectDisassembler::getRegionFor(uint64_t Addr) {
63  // FIXME: Keep track of object sections.
64  return FallbackRegion.get();
65}
66
67uint64_t MCObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
68  return Addr;
69}
70
71uint64_t MCObjectDisassembler::getOriginalLoadAddr(uint64_t Addr) {
72  return Addr;
73}
74
75MCModule *MCObjectDisassembler::buildEmptyModule() {
76  MCModule *Module = new MCModule;
77  Module->Entrypoint = getEntrypoint();
78  return Module;
79}
80
81MCModule *MCObjectDisassembler::buildModule(bool withCFG) {
82  MCModule *Module = buildEmptyModule();
83
84  buildSectionAtoms(Module);
85  if (withCFG)
86    buildCFG(Module);
87  return Module;
88}
89
90void MCObjectDisassembler::buildSectionAtoms(MCModule *Module) {
91  for (const SectionRef &Section : Obj.sections()) {
92    bool isText;
93    Section.isText(isText);
94    bool isData;
95    Section.isData(isData);
96    if (!isData && !isText)
97      continue;
98
99    uint64_t StartAddr;
100    Section.getAddress(StartAddr);
101    uint64_t SecSize;
102    Section.getSize(SecSize);
103    if (StartAddr == UnknownAddressOrSize || SecSize == UnknownAddressOrSize)
104      continue;
105    StartAddr = getEffectiveLoadAddr(StartAddr);
106
107    StringRef Contents;
108    Section.getContents(Contents);
109    StringRefMemoryObject memoryObject(Contents, StartAddr);
110
111    // We don't care about things like non-file-backed sections yet.
112    if (Contents.size() != SecSize || !SecSize)
113      continue;
114    uint64_t EndAddr = StartAddr + SecSize - 1;
115
116    StringRef SecName;
117    Section.getName(SecName);
118
119    if (isText) {
120      MCTextAtom *Text = nullptr;
121      MCDataAtom *InvalidData = nullptr;
122
123      uint64_t InstSize;
124      for (uint64_t Index = 0; Index < SecSize; Index += InstSize) {
125        const uint64_t CurAddr = StartAddr + Index;
126        MCInst Inst;
127        if (Dis.getInstruction(Inst, InstSize, memoryObject, CurAddr, nulls(),
128                               nulls())) {
129          if (!Text) {
130            Text = Module->createTextAtom(CurAddr, CurAddr);
131            Text->setName(SecName);
132          }
133          Text->addInst(Inst, InstSize);
134          InvalidData = nullptr;
135        } else {
136          assert(InstSize && "getInstruction() consumed no bytes");
137          if (!InvalidData) {
138            Text = nullptr;
139            InvalidData = Module->createDataAtom(CurAddr, CurAddr+InstSize - 1);
140          }
141          for (uint64_t I = 0; I < InstSize; ++I)
142            InvalidData->addData(Contents[Index+I]);
143        }
144      }
145    } else {
146      MCDataAtom *Data = Module->createDataAtom(StartAddr, EndAddr);
147      Data->setName(SecName);
148      for (uint64_t Index = 0; Index < SecSize; ++Index)
149        Data->addData(Contents[Index]);
150    }
151  }
152}
153
154namespace {
155  struct BBInfo;
156  typedef SmallPtrSet<BBInfo*, 2> BBInfoSetTy;
157
158  struct BBInfo {
159    MCTextAtom *Atom;
160    MCBasicBlock *BB;
161    BBInfoSetTy Succs;
162    BBInfoSetTy Preds;
163    MCObjectDisassembler::AddressSetTy SuccAddrs;
164
165    BBInfo() : Atom(nullptr), BB(nullptr) {}
166
167    void addSucc(BBInfo &Succ) {
168      Succs.insert(&Succ);
169      Succ.Preds.insert(this);
170    }
171  };
172}
173
174static void RemoveDupsFromAddressVector(MCObjectDisassembler::AddressSetTy &V) {
175  std::sort(V.begin(), V.end());
176  V.erase(std::unique(V.begin(), V.end()), V.end());
177}
178
179void MCObjectDisassembler::buildCFG(MCModule *Module) {
180  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
181  BBInfoByAddrTy BBInfos;
182  AddressSetTy Splits;
183  AddressSetTy Calls;
184
185  for (const SymbolRef &Symbol : Obj.symbols()) {
186    SymbolRef::Type SymType;
187    Symbol.getType(SymType);
188    if (SymType == SymbolRef::ST_Function) {
189      uint64_t SymAddr;
190      Symbol.getAddress(SymAddr);
191      SymAddr = getEffectiveLoadAddr(SymAddr);
192      Calls.push_back(SymAddr);
193      Splits.push_back(SymAddr);
194    }
195  }
196
197  assert(Module->func_begin() == Module->func_end()
198         && "Module already has a CFG!");
199
200  // First, determine the basic block boundaries and call targets.
201  for (MCModule::atom_iterator AI = Module->atom_begin(),
202                               AE = Module->atom_end();
203       AI != AE; ++AI) {
204    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
205    if (!TA) continue;
206    Calls.push_back(TA->getBeginAddr());
207    BBInfos[TA->getBeginAddr()].Atom = TA;
208    for (MCTextAtom::const_iterator II = TA->begin(), IE = TA->end();
209         II != IE; ++II) {
210      if (MIA.isTerminator(II->Inst))
211        Splits.push_back(II->Address + II->Size);
212      uint64_t Target;
213      if (MIA.evaluateBranch(II->Inst, II->Address, II->Size, Target)) {
214        if (MIA.isCall(II->Inst))
215          Calls.push_back(Target);
216        Splits.push_back(Target);
217      }
218    }
219  }
220
221  RemoveDupsFromAddressVector(Splits);
222  RemoveDupsFromAddressVector(Calls);
223
224  // Split text atoms into basic block atoms.
225  for (AddressSetTy::const_iterator SI = Splits.begin(), SE = Splits.end();
226       SI != SE; ++SI) {
227    MCAtom *A = Module->findAtomContaining(*SI);
228    if (!A) continue;
229    MCTextAtom *TA = cast<MCTextAtom>(A);
230    if (TA->getBeginAddr() == *SI)
231      continue;
232    MCTextAtom *NewAtom = TA->split(*SI);
233    BBInfos[NewAtom->getBeginAddr()].Atom = NewAtom;
234    StringRef BBName = TA->getName();
235    BBName = BBName.substr(0, BBName.find_last_of(':'));
236    NewAtom->setName((BBName + ":" + utohexstr(*SI)).str());
237  }
238
239  // Compute succs/preds.
240  for (MCModule::atom_iterator AI = Module->atom_begin(),
241                               AE = Module->atom_end();
242                               AI != AE; ++AI) {
243    MCTextAtom *TA = dyn_cast<MCTextAtom>(*AI);
244    if (!TA) continue;
245    BBInfo &CurBB = BBInfos[TA->getBeginAddr()];
246    const MCDecodedInst &LI = TA->back();
247    if (MIA.isBranch(LI.Inst)) {
248      uint64_t Target;
249      if (MIA.evaluateBranch(LI.Inst, LI.Address, LI.Size, Target))
250        CurBB.addSucc(BBInfos[Target]);
251      if (MIA.isConditionalBranch(LI.Inst))
252        CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
253    } else if (!MIA.isTerminator(LI.Inst))
254      CurBB.addSucc(BBInfos[LI.Address + LI.Size]);
255  }
256
257
258  // Create functions and basic blocks.
259  for (AddressSetTy::const_iterator CI = Calls.begin(), CE = Calls.end();
260       CI != CE; ++CI) {
261    BBInfo &BBI = BBInfos[*CI];
262    if (!BBI.Atom) continue;
263
264    MCFunction &MCFN = *Module->createFunction(BBI.Atom->getName());
265
266    // Create MCBBs.
267    SmallSetVector<BBInfo*, 16> Worklist;
268    Worklist.insert(&BBI);
269    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
270      BBInfo *BBI = Worklist[wi];
271      if (!BBI->Atom)
272        continue;
273      BBI->BB = &MCFN.createBlock(*BBI->Atom);
274      // Add all predecessors and successors to the worklist.
275      for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
276                                 SI != SE; ++SI)
277        Worklist.insert(*SI);
278      for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
279                                 PI != PE; ++PI)
280        Worklist.insert(*PI);
281    }
282
283    // Set preds/succs.
284    for (size_t wi = 0; wi < Worklist.size(); ++wi) {
285      BBInfo *BBI = Worklist[wi];
286      MCBasicBlock *MCBB = BBI->BB;
287      if (!MCBB)
288        continue;
289      for (BBInfoSetTy::iterator SI = BBI->Succs.begin(), SE = BBI->Succs.end();
290           SI != SE; ++SI)
291        if ((*SI)->BB)
292          MCBB->addSuccessor((*SI)->BB);
293      for (BBInfoSetTy::iterator PI = BBI->Preds.begin(), PE = BBI->Preds.end();
294           PI != PE; ++PI)
295        if ((*PI)->BB)
296          MCBB->addPredecessor((*PI)->BB);
297    }
298  }
299}
300
301// Basic idea of the disassembly + discovery:
302//
303// start with the wanted address, insert it in the worklist
304// while worklist not empty, take next address in the worklist:
305// - check if atom exists there
306//   - if middle of atom:
307//     - split basic blocks referencing the atom
308//     - look for an already encountered BBInfo (using a map<atom, bbinfo>)
309//       - if there is, split it (new one, fallthrough, move succs, etc..)
310//   - if start of atom: nothing else to do
311//   - if no atom: create new atom and new bbinfo
312// - look at the last instruction in the atom, add succs to worklist
313// for all elements in the worklist:
314// - create basic block, update preds/succs, etc..
315//
316MCBasicBlock *MCObjectDisassembler::getBBAt(MCModule *Module, MCFunction *MCFN,
317                                            uint64_t BBBeginAddr,
318                                            AddressSetTy &CallTargets,
319                                            AddressSetTy &TailCallTargets) {
320  typedef std::map<uint64_t, BBInfo> BBInfoByAddrTy;
321  typedef SmallSetVector<uint64_t, 16> AddrWorklistTy;
322  BBInfoByAddrTy BBInfos;
323  AddrWorklistTy Worklist;
324
325  Worklist.insert(BBBeginAddr);
326  for (size_t wi = 0; wi < Worklist.size(); ++wi) {
327    const uint64_t BeginAddr = Worklist[wi];
328    BBInfo *BBI = &BBInfos[BeginAddr];
329
330    MCTextAtom *&TA = BBI->Atom;
331    assert(!TA && "Discovered basic block already has an associated atom!");
332
333    // Look for an atom at BeginAddr.
334    if (MCAtom *A = Module->findAtomContaining(BeginAddr)) {
335      // FIXME: We don't care about mixed atoms, see above.
336      TA = cast<MCTextAtom>(A);
337
338      // The found atom doesn't begin at BeginAddr, we have to split it.
339      if (TA->getBeginAddr() != BeginAddr) {
340        // FIXME: Handle overlapping atoms: middle-starting instructions, etc..
341        MCTextAtom *NewTA = TA->split(BeginAddr);
342
343        // Look for an already encountered basic block that needs splitting
344        BBInfoByAddrTy::iterator It = BBInfos.find(TA->getBeginAddr());
345        if (It != BBInfos.end() && It->second.Atom) {
346          BBI->SuccAddrs = It->second.SuccAddrs;
347          It->second.SuccAddrs.clear();
348          It->second.SuccAddrs.push_back(BeginAddr);
349        }
350        TA = NewTA;
351      }
352      BBI->Atom = TA;
353    } else {
354      // If we didn't find an atom, then we have to disassemble to create one!
355
356      MemoryObject *Region = getRegionFor(BeginAddr);
357      if (!Region)
358        llvm_unreachable(("Couldn't find suitable region for disassembly at " +
359                          utostr(BeginAddr)).c_str());
360
361      uint64_t InstSize;
362      uint64_t EndAddr = Region->getBase() + Region->getExtent();
363
364      // We want to stop before the next atom and have a fallthrough to it.
365      if (MCTextAtom *NextAtom =
366              cast_or_null<MCTextAtom>(Module->findFirstAtomAfter(BeginAddr)))
367        EndAddr = std::min(EndAddr, NextAtom->getBeginAddr());
368
369      for (uint64_t Addr = BeginAddr; Addr < EndAddr; Addr += InstSize) {
370        MCInst Inst;
371        if (Dis.getInstruction(Inst, InstSize, *Region, Addr, nulls(),
372                               nulls())) {
373          if (!TA)
374            TA = Module->createTextAtom(Addr, Addr);
375          TA->addInst(Inst, InstSize);
376        } else {
377          // We don't care about splitting mixed atoms either.
378          llvm_unreachable("Couldn't disassemble instruction in atom.");
379        }
380
381        uint64_t BranchTarget;
382        if (MIA.evaluateBranch(Inst, Addr, InstSize, BranchTarget)) {
383          if (MIA.isCall(Inst))
384            CallTargets.push_back(BranchTarget);
385        }
386
387        if (MIA.isTerminator(Inst))
388          break;
389      }
390      BBI->Atom = TA;
391    }
392
393    assert(TA && "Couldn't disassemble atom, none was created!");
394    assert(TA->begin() != TA->end() && "Empty atom!");
395
396    MemoryObject *Region = getRegionFor(TA->getBeginAddr());
397    assert(Region && "Couldn't find region for already disassembled code!");
398    uint64_t EndRegion = Region->getBase() + Region->getExtent();
399
400    // Now we have a basic block atom, add successors.
401    // Add the fallthrough block.
402    if ((MIA.isConditionalBranch(TA->back().Inst) ||
403         !MIA.isTerminator(TA->back().Inst)) &&
404        (TA->getEndAddr() + 1 < EndRegion)) {
405      BBI->SuccAddrs.push_back(TA->getEndAddr() + 1);
406      Worklist.insert(TA->getEndAddr() + 1);
407    }
408
409    // If the terminator is a branch, add the target block.
410    if (MIA.isBranch(TA->back().Inst)) {
411      uint64_t BranchTarget;
412      if (MIA.evaluateBranch(TA->back().Inst, TA->back().Address,
413                             TA->back().Size, BranchTarget)) {
414        StringRef ExtFnName;
415        if (MOS)
416          ExtFnName =
417              MOS->findExternalFunctionAt(getOriginalLoadAddr(BranchTarget));
418        if (!ExtFnName.empty()) {
419          TailCallTargets.push_back(BranchTarget);
420          CallTargets.push_back(BranchTarget);
421        } else {
422          BBI->SuccAddrs.push_back(BranchTarget);
423          Worklist.insert(BranchTarget);
424        }
425      }
426    }
427  }
428
429  for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
430    const uint64_t BeginAddr = Worklist[wi];
431    BBInfo *BBI = &BBInfos[BeginAddr];
432
433    assert(BBI->Atom && "Found a basic block without an associated atom!");
434
435    // Look for a basic block at BeginAddr.
436    BBI->BB = MCFN->find(BeginAddr);
437    if (BBI->BB) {
438      // FIXME: check that the succs/preds are the same
439      continue;
440    }
441    // If there was none, we have to create one from the atom.
442    BBI->BB = &MCFN->createBlock(*BBI->Atom);
443  }
444
445  for (size_t wi = 0, we = Worklist.size(); wi != we; ++wi) {
446    const uint64_t BeginAddr = Worklist[wi];
447    BBInfo *BBI = &BBInfos[BeginAddr];
448    MCBasicBlock *BB = BBI->BB;
449
450    RemoveDupsFromAddressVector(BBI->SuccAddrs);
451    for (AddressSetTy::const_iterator SI = BBI->SuccAddrs.begin(),
452         SE = BBI->SuccAddrs.end();
453         SE != SE; ++SI) {
454      MCBasicBlock *Succ = BBInfos[*SI].BB;
455      BB->addSuccessor(Succ);
456      Succ->addPredecessor(BB);
457    }
458  }
459
460  assert(BBInfos[Worklist[0]].BB &&
461         "No basic block created at requested address?");
462
463  return BBInfos[Worklist[0]].BB;
464}
465
466MCFunction *
467MCObjectDisassembler::createFunction(MCModule *Module, uint64_t BeginAddr,
468                                     AddressSetTy &CallTargets,
469                                     AddressSetTy &TailCallTargets) {
470  // First, check if this is an external function.
471  StringRef ExtFnName;
472  if (MOS)
473    ExtFnName = MOS->findExternalFunctionAt(getOriginalLoadAddr(BeginAddr));
474  if (!ExtFnName.empty())
475    return Module->createFunction(ExtFnName);
476
477  // If it's not, look for an existing function.
478  for (MCModule::func_iterator FI = Module->func_begin(),
479                               FE = Module->func_end();
480       FI != FE; ++FI) {
481    if ((*FI)->empty())
482      continue;
483    // FIXME: MCModule should provide a findFunctionByAddr()
484    if ((*FI)->getEntryBlock()->getInsts()->getBeginAddr() == BeginAddr)
485      return FI->get();
486  }
487
488  // Finally, just create a new one.
489  MCFunction *MCFN = Module->createFunction("");
490  getBBAt(Module, MCFN, BeginAddr, CallTargets, TailCallTargets);
491  return MCFN;
492}
493
494// MachO MCObjectDisassembler implementation.
495
496MCMachOObjectDisassembler::MCMachOObjectDisassembler(
497    const MachOObjectFile &MOOF, const MCDisassembler &Dis,
498    const MCInstrAnalysis &MIA, uint64_t VMAddrSlide,
499    uint64_t HeaderLoadAddress)
500    : MCObjectDisassembler(MOOF, Dis, MIA), MOOF(MOOF),
501      VMAddrSlide(VMAddrSlide), HeaderLoadAddress(HeaderLoadAddress) {
502
503  for (const SectionRef &Section : MOOF.sections()) {
504    StringRef Name;
505    Section.getName(Name);
506    // FIXME: We should use the S_ section type instead of the name.
507    if (Name == "__mod_init_func") {
508      DEBUG(dbgs() << "Found __mod_init_func section!\n");
509      Section.getContents(ModInitContents);
510    } else if (Name == "__mod_exit_func") {
511      DEBUG(dbgs() << "Found __mod_exit_func section!\n");
512      Section.getContents(ModExitContents);
513    }
514  }
515}
516
517// FIXME: Only do the translations for addresses actually inside the object.
518uint64_t MCMachOObjectDisassembler::getEffectiveLoadAddr(uint64_t Addr) {
519  return Addr + VMAddrSlide;
520}
521
522uint64_t
523MCMachOObjectDisassembler::getOriginalLoadAddr(uint64_t EffectiveAddr) {
524  return EffectiveAddr - VMAddrSlide;
525}
526
527uint64_t MCMachOObjectDisassembler::getEntrypoint() {
528  uint64_t EntryFileOffset = 0;
529
530  // Look for LC_MAIN.
531  {
532    uint32_t LoadCommandCount = MOOF.getHeader().ncmds;
533    MachOObjectFile::LoadCommandInfo Load = MOOF.getFirstLoadCommandInfo();
534    for (unsigned I = 0;; ++I) {
535      if (Load.C.cmd == MachO::LC_MAIN) {
536        EntryFileOffset =
537            ((const MachO::entry_point_command *)Load.Ptr)->entryoff;
538        break;
539      }
540
541      if (I == LoadCommandCount - 1)
542        break;
543      else
544        Load = MOOF.getNextLoadCommandInfo(Load);
545    }
546  }
547
548  // If we didn't find anything, default to the common implementation.
549  // FIXME: Maybe we could also look at LC_UNIXTHREAD and friends?
550  if (EntryFileOffset)
551    return MCObjectDisassembler::getEntrypoint();
552
553  return EntryFileOffset + HeaderLoadAddress;
554}
555
556ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticInitFunctions() {
557  // FIXME: We only handle 64bit mach-o
558  assert(MOOF.is64Bit());
559
560  size_t EntrySize = 8;
561  size_t EntryCount = ModInitContents.size() / EntrySize;
562  return ArrayRef<uint64_t>(
563      reinterpret_cast<const uint64_t *>(ModInitContents.data()), EntryCount);
564}
565
566ArrayRef<uint64_t> MCMachOObjectDisassembler::getStaticExitFunctions() {
567  // FIXME: We only handle 64bit mach-o
568  assert(MOOF.is64Bit());
569
570  size_t EntrySize = 8;
571  size_t EntryCount = ModExitContents.size() / EntrySize;
572  return ArrayRef<uint64_t>(
573      reinterpret_cast<const uint64_t *>(ModExitContents.data()), EntryCount);
574}
575