1//===------ macho2yaml.cpp - obj2yaml conversion tool -----------*- C++ -*-===//
2//
3//                     The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9
10#include "Error.h"
11#include "obj2yaml.h"
12#include "llvm/Object/MachOUniversal.h"
13#include "llvm/ObjectYAML/ObjectYAML.h"
14#include "llvm/Support/ErrorHandling.h"
15#include "llvm/Support/LEB128.h"
16
17#include <string.h> // for memcpy
18
19using namespace llvm;
20
21class MachODumper {
22
23  template <typename StructType>
24  const char *processLoadCommandData(
25      MachOYAML::LoadCommand &LC,
26      const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd);
27
28  const object::MachOObjectFile &Obj;
29  void dumpHeader(std::unique_ptr<MachOYAML::Object> &Y);
30  void dumpLoadCommands(std::unique_ptr<MachOYAML::Object> &Y);
31  void dumpLinkEdit(std::unique_ptr<MachOYAML::Object> &Y);
32  void dumpRebaseOpcodes(std::unique_ptr<MachOYAML::Object> &Y);
33  void dumpBindOpcodes(std::vector<MachOYAML::BindOpcode> &BindOpcodes,
34                       ArrayRef<uint8_t> OpcodeBuffer, bool Lazy = false);
35  void dumpExportTrie(std::unique_ptr<MachOYAML::Object> &Y);
36  void dumpSymbols(std::unique_ptr<MachOYAML::Object> &Y);
37
38public:
39  MachODumper(const object::MachOObjectFile &O) : Obj(O) {}
40  Expected<std::unique_ptr<MachOYAML::Object>> dump();
41};
42
// Expands to one `case` of the load-command switch. It relies on the names
// `LC`, `LoadCmd`, `EndPtr` and `Obj` being in scope at the expansion site:
// the fixed-size struct is copied out of the file, byte-swapped when the
// file's endianness differs from the host's, and then the struct-specific
// handler reports where the command's parsed data ends.
#define HANDLE_LOAD_COMMAND(LCName, LCValue, LCStruct)                         \
  case MachO::LCName: {                                                        \
    memcpy(static_cast<void *>(&LC.Data.LCStruct##_data), LoadCmd.Ptr,         \
           sizeof(MachO::LCStruct));                                           \
    if (Obj.isLittleEndian() != sys::IsLittleEndianHost) {                     \
      MachO::swapStruct(LC.Data.LCStruct##_data);                              \
    }                                                                          \
    EndPtr = processLoadCommandData<MachO::LCStruct>(LC, LoadCmd);             \
    break;                                                                     \
  }
51
52template <typename SectionType>
53MachOYAML::Section constructSectionCommon(SectionType Sec) {
54  MachOYAML::Section TempSec;
55  memcpy(reinterpret_cast<void *>(&TempSec.sectname[0]), &Sec.sectname[0], 16);
56  memcpy(reinterpret_cast<void *>(&TempSec.segname[0]), &Sec.segname[0], 16);
57  TempSec.addr = Sec.addr;
58  TempSec.size = Sec.size;
59  TempSec.offset = Sec.offset;
60  TempSec.align = Sec.align;
61  TempSec.reloff = Sec.reloff;
62  TempSec.nreloc = Sec.nreloc;
63  TempSec.flags = Sec.flags;
64  TempSec.reserved1 = Sec.reserved1;
65  TempSec.reserved2 = Sec.reserved2;
66  TempSec.reserved3 = 0;
67  return TempSec;
68}
69
70template <typename SectionType>
71MachOYAML::Section constructSection(SectionType Sec);
72
73template <> MachOYAML::Section constructSection(MachO::section Sec) {
74  MachOYAML::Section TempSec = constructSectionCommon(Sec);
75  TempSec.reserved3 = 0;
76  return TempSec;
77}
78
79template <> MachOYAML::Section constructSection(MachO::section_64 Sec) {
80  MachOYAML::Section TempSec = constructSectionCommon(Sec);
81  TempSec.reserved3 = Sec.reserved3;
82  return TempSec;
83}
84
85template <typename SectionType, typename SegmentType>
86const char *
87extractSections(const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd,
88                std::vector<MachOYAML::Section> &Sections,
89                bool IsLittleEndian) {
90  auto End = LoadCmd.Ptr + LoadCmd.C.cmdsize;
91  const SectionType *Curr =
92      reinterpret_cast<const SectionType *>(LoadCmd.Ptr + sizeof(SegmentType));
93  for (; reinterpret_cast<const void *>(Curr) < End; Curr++) {
94    if (IsLittleEndian != sys::IsLittleEndianHost) {
95      SectionType Sec;
96      memcpy((void *)&Sec, Curr, sizeof(SectionType));
97      MachO::swapStruct(Sec);
98      Sections.push_back(constructSection(Sec));
99    } else {
100      Sections.push_back(constructSection(*Curr));
101    }
102  }
103  return reinterpret_cast<const char *>(Curr);
104}
105
106template <typename StructType>
107const char *MachODumper::processLoadCommandData(
108    MachOYAML::LoadCommand &LC,
109    const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd) {
110  return LoadCmd.Ptr + sizeof(StructType);
111}
112
113template <>
114const char *MachODumper::processLoadCommandData<MachO::segment_command>(
115    MachOYAML::LoadCommand &LC,
116    const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd) {
117  return extractSections<MachO::section, MachO::segment_command>(
118      LoadCmd, LC.Sections, Obj.isLittleEndian());
119}
120
121template <>
122const char *MachODumper::processLoadCommandData<MachO::segment_command_64>(
123    MachOYAML::LoadCommand &LC,
124    const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd) {
125  return extractSections<MachO::section_64, MachO::segment_command_64>(
126      LoadCmd, LC.Sections, Obj.isLittleEndian());
127}
128
129template <typename StructType>
130const char *
131readString(MachOYAML::LoadCommand &LC,
132           const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd) {
133  auto Start = LoadCmd.Ptr + sizeof(StructType);
134  auto MaxSize = LoadCmd.C.cmdsize - sizeof(StructType);
135  auto Size = strnlen(Start, MaxSize);
136  LC.PayloadString = StringRef(Start, Size).str();
137  return Start + Size;
138}
139
140template <>
141const char *MachODumper::processLoadCommandData<MachO::dylib_command>(
142    MachOYAML::LoadCommand &LC,
143    const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd) {
144  return readString<MachO::dylib_command>(LC, LoadCmd);
145}
146
147template <>
148const char *MachODumper::processLoadCommandData<MachO::dylinker_command>(
149    MachOYAML::LoadCommand &LC,
150    const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd) {
151  return readString<MachO::dylinker_command>(LC, LoadCmd);
152}
153
154template <>
155const char *MachODumper::processLoadCommandData<MachO::rpath_command>(
156    MachOYAML::LoadCommand &LC,
157    const llvm::object::MachOObjectFile::LoadCommandInfo &LoadCmd) {
158  return readString<MachO::rpath_command>(LC, LoadCmd);
159}
160
161Expected<std::unique_ptr<MachOYAML::Object>> MachODumper::dump() {
162  auto Y = make_unique<MachOYAML::Object>();
163  dumpHeader(Y);
164  dumpLoadCommands(Y);
165  dumpLinkEdit(Y);
166  return std::move(Y);
167}
168
169void MachODumper::dumpHeader(std::unique_ptr<MachOYAML::Object> &Y) {
170  Y->Header.magic = Obj.getHeader().magic;
171  Y->Header.cputype = Obj.getHeader().cputype;
172  Y->Header.cpusubtype = Obj.getHeader().cpusubtype;
173  Y->Header.filetype = Obj.getHeader().filetype;
174  Y->Header.ncmds = Obj.getHeader().ncmds;
175  Y->Header.sizeofcmds = Obj.getHeader().sizeofcmds;
176  Y->Header.flags = Obj.getHeader().flags;
177  Y->Header.reserved = 0;
178}
179
180void MachODumper::dumpLoadCommands(std::unique_ptr<MachOYAML::Object> &Y) {
181  for (auto LoadCmd : Obj.load_commands()) {
182    MachOYAML::LoadCommand LC;
183    const char *EndPtr = LoadCmd.Ptr;
184    switch (LoadCmd.C.cmd) {
185    default:
186      memcpy((void *)&(LC.Data.load_command_data), LoadCmd.Ptr,
187             sizeof(MachO::load_command));
188      if (Obj.isLittleEndian() != sys::IsLittleEndianHost)
189        MachO::swapStruct(LC.Data.load_command_data);
190      EndPtr = processLoadCommandData<MachO::load_command>(LC, LoadCmd);
191      break;
192#include "llvm/Support/MachO.def"
193    }
194    auto RemainingBytes = LoadCmd.C.cmdsize - (EndPtr - LoadCmd.Ptr);
195    if (!std::all_of(EndPtr, &EndPtr[RemainingBytes],
196                     [](const char C) { return C == 0; })) {
197      LC.PayloadBytes.insert(LC.PayloadBytes.end(), EndPtr,
198                             &EndPtr[RemainingBytes]);
199      RemainingBytes = 0;
200    }
201    LC.ZeroPadBytes = RemainingBytes;
202    Y->LoadCommands.push_back(std::move(LC));
203  }
204}
205
206void MachODumper::dumpLinkEdit(std::unique_ptr<MachOYAML::Object> &Y) {
207  dumpRebaseOpcodes(Y);
208  dumpBindOpcodes(Y->LinkEdit.BindOpcodes, Obj.getDyldInfoBindOpcodes());
209  dumpBindOpcodes(Y->LinkEdit.WeakBindOpcodes,
210                  Obj.getDyldInfoWeakBindOpcodes());
211  dumpBindOpcodes(Y->LinkEdit.LazyBindOpcodes, Obj.getDyldInfoLazyBindOpcodes(),
212                  true);
213  dumpExportTrie(Y);
214  dumpSymbols(Y);
215}
216
217void MachODumper::dumpRebaseOpcodes(std::unique_ptr<MachOYAML::Object> &Y) {
218  MachOYAML::LinkEditData &LEData = Y->LinkEdit;
219
220  auto RebaseOpcodes = Obj.getDyldInfoRebaseOpcodes();
221  for (auto OpCode = RebaseOpcodes.begin(); OpCode != RebaseOpcodes.end();
222       ++OpCode) {
223    MachOYAML::RebaseOpcode RebaseOp;
224    RebaseOp.Opcode =
225        static_cast<MachO::RebaseOpcode>(*OpCode & MachO::REBASE_OPCODE_MASK);
226    RebaseOp.Imm = *OpCode & MachO::REBASE_IMMEDIATE_MASK;
227
228    unsigned Count;
229    uint64_t ULEB = 0;
230
231    switch (RebaseOp.Opcode) {
232    case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB:
233
234      ULEB = decodeULEB128(OpCode + 1, &Count);
235      RebaseOp.ExtraData.push_back(ULEB);
236      OpCode += Count;
237    // Intentionally no break here -- This opcode has two ULEB values
238    case MachO::REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
239    case MachO::REBASE_OPCODE_ADD_ADDR_ULEB:
240    case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES:
241    case MachO::REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB:
242
243      ULEB = decodeULEB128(OpCode + 1, &Count);
244      RebaseOp.ExtraData.push_back(ULEB);
245      OpCode += Count;
246      break;
247    default:
248      break;
249    }
250
251    LEData.RebaseOpcodes.push_back(RebaseOp);
252
253    if (RebaseOp.Opcode == MachO::REBASE_OPCODE_DONE)
254      break;
255  }
256}
257
258StringRef ReadStringRef(const uint8_t *Start) {
259  const uint8_t *Itr = Start;
260  for (; *Itr; ++Itr)
261    ;
262  return StringRef(reinterpret_cast<const char *>(Start), Itr - Start);
263}
264
265void MachODumper::dumpBindOpcodes(
266    std::vector<MachOYAML::BindOpcode> &BindOpcodes,
267    ArrayRef<uint8_t> OpcodeBuffer, bool Lazy) {
268  for (auto OpCode = OpcodeBuffer.begin(); OpCode != OpcodeBuffer.end();
269       ++OpCode) {
270    MachOYAML::BindOpcode BindOp;
271    BindOp.Opcode =
272        static_cast<MachO::BindOpcode>(*OpCode & MachO::BIND_OPCODE_MASK);
273    BindOp.Imm = *OpCode & MachO::BIND_IMMEDIATE_MASK;
274
275    unsigned Count;
276    uint64_t ULEB = 0;
277    int64_t SLEB = 0;
278
279    switch (BindOp.Opcode) {
280    case MachO::BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB:
281      ULEB = decodeULEB128(OpCode + 1, &Count);
282      BindOp.ULEBExtraData.push_back(ULEB);
283      OpCode += Count;
284    // Intentionally no break here -- this opcode has two ULEB values
285
286    case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB:
287    case MachO::BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
288    case MachO::BIND_OPCODE_ADD_ADDR_ULEB:
289    case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB:
290      ULEB = decodeULEB128(OpCode + 1, &Count);
291      BindOp.ULEBExtraData.push_back(ULEB);
292      OpCode += Count;
293      break;
294
295    case MachO::BIND_OPCODE_SET_ADDEND_SLEB:
296      SLEB = decodeSLEB128(OpCode + 1, &Count);
297      BindOp.SLEBExtraData.push_back(SLEB);
298      OpCode += Count;
299      break;
300
301    case MachO::BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM:
302      BindOp.Symbol = ReadStringRef(OpCode + 1);
303      OpCode += BindOp.Symbol.size() + 1;
304      break;
305    default:
306      break;
307    }
308
309    BindOpcodes.push_back(BindOp);
310
311    // Lazy bindings have DONE opcodes between operations, so we need to keep
312    // processing after a DONE.
313    if (!Lazy && BindOp.Opcode == MachO::BIND_OPCODE_DONE)
314      break;
315  }
316}
317
318/*!
 * \brief processes a node from the export trie, and its children.
320 *
321 * To my knowledge there is no documentation of the encoded format of this data
322 * other than in the heads of the Apple linker engineers. To that end hopefully
323 * this comment and the implementation below can serve to light the way for
324 * anyone crazy enough to come down this path in the future.
325 *
326 * This function reads and preserves the trie structure of the export trie. To
327 * my knowledge there is no code anywhere else that reads the data and preserves
328 * the Trie. LD64 (sources available at opensource.apple.com) has a similar
329 * implementation that parses the export trie into a vector. That code as well
330 * as LLVM's libObject MachO implementation were the basis for this.
331 *
332 * The export trie is an encoded trie. The node serialization is a bit awkward.
333 * The below pseudo-code is the best description I've come up with for it.
334 *
335 * struct SerializedNode {
336 *   ULEB128 TerminalSize;
337 *   struct TerminalData { <-- This is only present if TerminalSize > 0
338 *     ULEB128 Flags;
339 *     ULEB128 Address; <-- Present if (! Flags & REEXPORT )
340 *     ULEB128 Other; <-- Present if ( Flags & REEXPORT ||
341 *                                     Flags & STUB_AND_RESOLVER )
342 *     char[] ImportName; <-- Present if ( Flags & REEXPORT )
343 *   }
344 *   uint8_t ChildrenCount;
345 *   Pair<char[], ULEB128> ChildNameOffsetPair[ChildrenCount];
346 *   SerializedNode Children[ChildrenCount]
347 * }
348 *
349 * Terminal nodes are nodes that represent actual exports. They can appear
350 * anywhere in the tree other than at the root; they do not need to be leaf
351 * nodes. When reading the data out of the trie this routine reads it in-order,
352 * but it puts the child names and offsets directly into the child nodes. This
353 * results in looping over the children twice during serialization and
354 * de-serialization, but it makes the YAML representation more human readable.
355 *
356 * Below is an example of the graph from a "Hello World" executable:
357 *
358 * -------
359 * | ''  |
360 * -------
361 *    |
362 * -------
363 * | '_' |
364 * -------
365 *    |
366 *    |----------------------------------------|
367 *    |                                        |
368 *  ------------------------      ---------------------
369 *  | '_mh_execute_header' |      | 'main'            |
370 *  | Flags: 0x00000000    |      | Flags: 0x00000000 |
371 *  | Addr:  0x00000000    |      | Addr:  0x00001160 |
372 *  ------------------------      ---------------------
373 *
374 * This graph represents the trie for the exports "__mh_execute_header" and
375 * "_main". In the graph only the "_main" and "__mh_execute_header" nodes are
376 * terminal.
377*/
378
379const uint8_t *processExportNode(const uint8_t *CurrPtr,
380                                 const uint8_t *const End,
381                                 MachOYAML::ExportEntry &Entry) {
382  if (CurrPtr >= End)
383    return CurrPtr;
384  unsigned Count = 0;
385  Entry.TerminalSize = decodeULEB128(CurrPtr, &Count);
386  CurrPtr += Count;
387  if (Entry.TerminalSize != 0) {
388    Entry.Flags = decodeULEB128(CurrPtr, &Count);
389    CurrPtr += Count;
390    if (Entry.Flags & MachO::EXPORT_SYMBOL_FLAGS_REEXPORT) {
391      Entry.Address = 0;
392      Entry.Other = decodeULEB128(CurrPtr, &Count);
393      CurrPtr += Count;
394      Entry.ImportName = std::string(reinterpret_cast<const char *>(CurrPtr));
395    } else {
396      Entry.Address = decodeULEB128(CurrPtr, &Count);
397      CurrPtr += Count;
398      if (Entry.Flags & MachO::EXPORT_SYMBOL_FLAGS_STUB_AND_RESOLVER) {
399        Entry.Other = decodeULEB128(CurrPtr, &Count);
400        CurrPtr += Count;
401      } else
402        Entry.Other = 0;
403    }
404  }
405  uint8_t childrenCount = *CurrPtr++;
406  if (childrenCount == 0)
407    return CurrPtr;
408
409  Entry.Children.insert(Entry.Children.begin(), (size_t)childrenCount,
410                        MachOYAML::ExportEntry());
411  for (auto &Child : Entry.Children) {
412    Child.Name = std::string(reinterpret_cast<const char *>(CurrPtr));
413    CurrPtr += Child.Name.length() + 1;
414    Child.NodeOffset = decodeULEB128(CurrPtr, &Count);
415    CurrPtr += Count;
416  }
417  for (auto &Child : Entry.Children) {
418    CurrPtr = processExportNode(CurrPtr, End, Child);
419  }
420  return CurrPtr;
421}
422
423void MachODumper::dumpExportTrie(std::unique_ptr<MachOYAML::Object> &Y) {
424  MachOYAML::LinkEditData &LEData = Y->LinkEdit;
425  auto ExportsTrie = Obj.getDyldInfoExportsTrie();
426  processExportNode(ExportsTrie.begin(), ExportsTrie.end(), LEData.ExportTrie);
427}
428
429template <typename nlist_t>
430MachOYAML::NListEntry constructNameList(const nlist_t &nlist) {
431  MachOYAML::NListEntry NL;
432  NL.n_strx = nlist.n_strx;
433  NL.n_type = nlist.n_type;
434  NL.n_sect = nlist.n_sect;
435  NL.n_desc = nlist.n_desc;
436  NL.n_value = nlist.n_value;
437  return NL;
438}
439
440void MachODumper::dumpSymbols(std::unique_ptr<MachOYAML::Object> &Y) {
441  MachOYAML::LinkEditData &LEData = Y->LinkEdit;
442
443  for (auto Symbol : Obj.symbols()) {
444    MachOYAML::NListEntry NLE =
445        Obj.is64Bit() ? constructNameList<MachO::nlist_64>(
446                            *reinterpret_cast<const MachO::nlist_64 *>(
447                                Symbol.getRawDataRefImpl().p))
448                      : constructNameList<MachO::nlist>(
449                            *reinterpret_cast<const MachO::nlist *>(
450                                Symbol.getRawDataRefImpl().p));
451    LEData.NameList.push_back(NLE);
452  }
453
454  StringRef RemainingTable = Obj.getStringTableData();
455  while (RemainingTable.size() > 0) {
456    auto SymbolPair = RemainingTable.split('\0');
457    RemainingTable = SymbolPair.second;
458    if (SymbolPair.first.empty())
459      break;
460    LEData.StringTable.push_back(SymbolPair.first);
461  }
462}
463
464Error macho2yaml(raw_ostream &Out, const object::MachOObjectFile &Obj) {
465  MachODumper Dumper(Obj);
466  Expected<std::unique_ptr<MachOYAML::Object>> YAML = Dumper.dump();
467  if (!YAML)
468    return YAML.takeError();
469
470  yaml::YamlObjectFile YAMLFile;
471  YAMLFile.MachO = std::move(YAML.get());
472
473  yaml::Output Yout(Out);
474  Yout << YAMLFile;
475  return Error::success();
476}
477
478Error macho2yaml(raw_ostream &Out, const object::MachOUniversalBinary &Obj) {
479  yaml::YamlObjectFile YAMLFile;
480  YAMLFile.FatMachO.reset(new MachOYAML::UniversalBinary());
481  MachOYAML::UniversalBinary &YAML = *YAMLFile.FatMachO;
482  YAML.Header.magic = Obj.getMagic();
483  YAML.Header.nfat_arch = Obj.getNumberOfObjects();
484
485  for (auto Slice : Obj.objects()) {
486    MachOYAML::FatArch arch;
487    arch.cputype = Slice.getCPUType();
488    arch.cpusubtype = Slice.getCPUSubType();
489    arch.offset = Slice.getOffset();
490    arch.size = Slice.getSize();
491    arch.align = Slice.getAlign();
492    arch.reserved = Slice.getReserved();
493    YAML.FatArchs.push_back(arch);
494
495    auto SliceObj = Slice.getAsObjectFile();
496    if (!SliceObj)
497      return SliceObj.takeError();
498
499    MachODumper Dumper(*SliceObj.get());
500    Expected<std::unique_ptr<MachOYAML::Object>> YAMLObj = Dumper.dump();
501    if (!YAMLObj)
502      return YAMLObj.takeError();
503    YAML.Slices.push_back(*YAMLObj.get());
504  }
505
506  yaml::Output Yout(Out);
507  Yout << YAML;
508  return Error::success();
509}
510
511std::error_code macho2yaml(raw_ostream &Out, const object::Binary &Binary) {
512  if (const auto *MachOObj = dyn_cast<object::MachOUniversalBinary>(&Binary)) {
513    if (auto Err = macho2yaml(Out, *MachOObj)) {
514      return errorToErrorCode(std::move(Err));
515    }
516    return obj2yaml_error::success;
517  }
518
519  if (const auto *MachOObj = dyn_cast<object::MachOObjectFile>(&Binary)) {
520    if (auto Err = macho2yaml(Out, *MachOObj)) {
521      return errorToErrorCode(std::move(Err));
522    }
523    return obj2yaml_error::success;
524  }
525
526  return obj2yaml_error::unsupported_obj_file_format;
527}
528