1#ifndef LIBDISASM_H
2#define LIBDISASM_H
3
4#ifdef WIN32
5#include <windows.h>
6#endif
7
8#include <stdint.h>
9
/* "NEW" types
11 * __________________________________________________________________________*/
/* qword_t : signed 64-bit integer type used for quad-word operand data.
 * The LIBDISASM_QWORD_H guard keeps this definition from interfering
 * with qword.h. */
#if !defined(LIBDISASM_QWORD_H)
#define LIBDISASM_QWORD_H
#if defined(_MSC_VER)
typedef __int64 qword_t;        /* compiler-specific 64-bit integer */
#else
typedef int64_t qword_t;        /* fixed-width type from <stdint.h> */
#endif
#endif /* LIBDISASM_QWORD_H */
20
21#include <sys/types.h>
22
23#ifdef __cplusplus
24extern "C" {
25#endif
26
/* "NEW" x86 API
28 * __________________________________________________________________________*/
29
30
31/* ========================================= Error Reporting */
/* REPORT CODES
 *      One of these codes is handed to the reporter function that was
 *      supplied at initialization. The code determines the type of the
 *      'data' argument the reporter receives, which lets the reporter
 *      either recover from the error or simply log it.
 */
enum x86_report_codes {
    /* RVA OUT OF BOUNDS: the supplied RVA could not be disassembled
     * because it lies outside the range of the buffer. The application
     * should store the address, work out which section of the binary it
     * belongs to, and then disassemble the address from the bytes in
     * that section.
     *      data: uint32_t rva */
    report_disasm_bounds = 0,

    /* INSTRUCTION OUT OF BOUNDS: the instruction could not be
     * disassembled because it would require bytes beyond the end of the
     * current buffer. This usually indicates garbage bytes at the end
     * of a buffer, or an incorrectly-sized buffer.
     *      data: uint32_t rva */
    report_insn_bounds = 1,

    /* INVALID INSTRUCTION: the instruction could not be disassembled
     * because it has an invalid combination of opcodes and operands.
     * This stops automated disassembly; the application can restart the
     * disassembly after the invalid instruction.
     *      data: uint32_t rva */
    report_invalid_insn = 2,

    report_unknown = 3
};
62
/* DISASM_REPORTER : prototype of the caller-supplied error reporter.
 *      code : supplied by libdisasm; one of enum x86_report_codes
 *      data : supplied by libdisasm; context-specific, its meaning is
 *             given per report code in enum x86_report_codes
 *      arg  : optional arbitrary data provided by the code that passed
 *             the callback -- for example 'this' or 'self' in OOP code */
typedef void (*DISASM_REPORTER)(enum x86_report_codes code,
                                void *data,
                                void *arg);
69
70
/* x86_report_error : invoke the registered reporter to report an error.
 *      code : the report code to pass to the reporter
 *      data : the context-specific data that accompanies 'code' */
void x86_report_error(enum x86_report_codes code, void *data);
73
74/* ========================================= Libdisasm Management Routines */
/* Disassembler options. Every non-zero value is a distinct bit, so the
 * options can be ORed together. No trailing comma follows the last
 * enumerator: that keeps the header acceptable to C90 and C++98
 * compilers and matches the other enums declared here. */
enum x86_options {
        opt_none          = 0,
        opt_ignore_nulls  = 1,  /* ignore sequences of > 4 NULL bytes */
        opt_16_bit        = 2,  /* 16-bit/DOS disassembly */
        opt_att_mnemonics = 4   /* use AT&T syntax names for alternate opcode mnemonics */
};
81
82/* management routines */
83/* 'arg' is caller-specific data which is passed as the first argument
84 * to the reporter callback routine */
85int x86_init( enum x86_options options, DISASM_REPORTER reporter, void *arg);
86void x86_set_reporter( DISASM_REPORTER reporter, void *arg);
87void x86_set_options( enum x86_options options );
88enum x86_options x86_get_options( void );
89int x86_cleanup(void);
90
91
92/* ========================================= Instruction Representation */
/* Buffer-size limits. These macros exist only so that the array
 * declarations in this header have named sizes. */
#define MAX_REGNAME         8

#define MAX_PREFIX_STR      32
#define MAX_MNEM_STR        16
#define MAX_INSN_SIZE       20      /* same as in i386.h */
#define MAX_OP_STRING       32      /* max possible operand size in string form */
#define MAX_OP_RAW_STRING   64      /* max possible operand size in raw form */
#define MAX_OP_XML_STRING   256     /* max possible operand size in xml form */
#define MAX_NUM_OPERANDS    8       /* max # implicit and explicit operands */

/* Whole-instruction string sizes. The factor of 2 is arbitrary: the
 * maximum number of operands is expected to need more space than the
 * rest of the insn. */
#define MAX_INSN_STRING     512     /* 2 * 8 * MAX_OP_STRING */
#define MAX_INSN_RAW_STRING 1024    /* 2 * 8 * MAX_OP_RAW_STRING */
#define MAX_INSN_XML_STRING 4096    /* 2 * 8 * MAX_OP_XML_STRING */
108
/* Register roles. Each value is a single bit, so several roles may be
 * ORed together. Bit 15 (0x08000) is not assigned. */
enum x86_reg_type {
    reg_gen     = 1 << 0,   /* general purpose */
    reg_in      = 1 << 1,   /* incoming args, ala RISC */
    reg_out     = 1 << 2,   /* args to calls, ala RISC */
    reg_local   = 1 << 3,   /* local vars, ala RISC */
    reg_fpu     = 1 << 4,   /* FPU data register */
    reg_seg     = 1 << 5,   /* segment register */
    reg_simd    = 1 << 6,   /* SIMD/MMX reg */
    reg_sys     = 1 << 7,   /* restricted/system register */
    reg_sp      = 1 << 8,   /* stack pointer */
    reg_fp      = 1 << 9,   /* frame pointer */
    reg_pc      = 1 << 10,  /* program counter */
    reg_retaddr = 1 << 11,  /* return addr for func */
    reg_cond    = 1 << 12,  /* condition code / flags */
    reg_zero    = 1 << 13,  /* zero register, ala RISC */
    reg_ret     = 1 << 14,  /* return value */
    reg_src     = 1 << 16,  /* array/rep source */
    reg_dest    = 1 << 17,  /* array/rep destination */
    reg_count   = 1 << 18   /* array/rep/loop counter */
};
129
130/* x86_reg_t : an X86 CPU register */
131typedef struct {
132        char name[MAX_REGNAME];
133        enum x86_reg_type type;         /* what register is used for */
134        unsigned int size;              /* size of register in bytes */
135        unsigned int id;                /* register ID #, for quick compares */
136	unsigned int alias;		/* ID of reg this is an alias for */
137	unsigned int shift;		/* amount to shift aliased reg by */
138} x86_reg_t;
139
140/* x86_ea_t : an X86 effective address (address expression) */
141typedef struct {
142        unsigned int     scale;         /* scale factor */
143        x86_reg_t        index, base;   /* index, base registers */
144        int32_t          disp;          /* displacement */
145        char             disp_sign;     /* is negative? 1/0 */
146        char             disp_size;     /* 0, 1, 2, 4 */
147} x86_ea_t;
148
/* x86_absolute_t : an X86 segment:offset address (descriptor).
 * Exactly one member of 'offset' is meaningful: off16 or off32. */
typedef struct {
    unsigned short segment;         /* loaded directly into CS */
    union {
        unsigned short off16;       /* loaded directly into IP */
        uint32_t       off32;       /* loaded directly into EIP */
    } offset;
} x86_absolute_t;
157
/* Operand kinds; the values are mutually exclusive. The numeric order is
 * significant: x86_optype_is_memory() treats every value strictly between
 * op_immediate and op_unknown as a memory operand. */
enum x86_op_type {
        op_unused = 0,          /* empty/unused operand: should never occur */
        op_register = 1,        /* CPU register */
        op_immediate = 2,       /* Immediate Value */
        op_relative_near = 3,   /* Relative offset from IP */
        op_relative_far = 4,    /* Relative offset from IP */
        op_absolute = 5,        /* Absolute address (ptr16:32) */
        op_expression = 6,      /* Address expression (scale/index/base/disp) */
        op_offset = 7,          /* Offset from start of segment (m32) */
        op_unknown
};

/* Classification helpers for enum x86_op_type values.
 * The argument is parenthesized in every use so that an expression such
 * as 't & 0xF' or 'a ? b : c' is compared as a whole instead of being
 * re-associated by operator precedence. The argument is still evaluated
 * twice, so it must not have side effects. */

/* non-zero if 'optype' is op_absolute or op_offset */
#define x86_optype_is_address( optype ) \
	( (optype) == op_absolute || (optype) == op_offset )
/* non-zero if 'optype' is op_relative_near or op_relative_far */
#define x86_optype_is_relative( optype ) \
	( (optype) == op_relative_near || (optype) == op_relative_far )
/* non-zero if 'optype' lies strictly between op_immediate and op_unknown */
#define x86_optype_is_memory( optype ) \
	( (optype) > op_immediate && (optype) < op_unknown )
176
/* Operand data types (these use Intel's terminology). The values are
 * plain identifiers, not bit flags. No trailing comma follows the last
 * enumerator so the header stays acceptable to C90 and C++98 compilers. */
enum x86_op_datatype {
        op_byte = 1,            /* 1 byte integer */
        op_word = 2,            /* 2 byte integer */
        op_dword = 3,           /* 4 byte integer */
        op_qword = 4,           /* 8 byte integer */
        op_dqword = 5,          /* 16 byte integer */
        op_sreal = 6,           /* 4 byte real (single real) */
        op_dreal = 7,           /* 8 byte real (double real) */
        op_extreal = 8,         /* 10 byte real (extended real) */
        op_bcd = 9,             /* 10 byte binary-coded decimal */
        op_ssimd = 10,          /* 16 byte : 4 packed single FP (SIMD, MMX) */
        op_dsimd = 11,          /* 16 byte : 2 packed double FP (SIMD, MMX) */
        op_sssimd = 12,         /* 4 byte : scalar single FP (SIMD, MMX) */
        op_sdsimd = 13,         /* 8 byte : scalar double FP (SIMD, MMX) */
        op_descr32 = 14,        /* 6 byte Intel descriptor 2:4 */
        op_descr16 = 15,        /* 4 byte Intel descriptor 2:2 */
        op_pdescr32 = 16,       /* 6 byte Intel pseudo-descriptor 32:16 */
        op_pdescr16 = 17,       /* 6 byte Intel pseudo-descriptor 8:24:16 */
        op_bounds16 = 18,       /* signed 16:16 lower:upper bounds */
        op_bounds32 = 19,       /* signed 32:32 lower:upper bounds */
        op_fpuenv16 = 20,       /* 14 byte FPU control/environment data */
        op_fpuenv32 = 21,       /* 28 byte FPU control/environment data */
        op_fpustate16 = 22,     /* 94 byte FPU state (env & reg stack) */
        op_fpustate32 = 23,     /* 108 byte FPU state (env & reg stack) */
        op_fpregset = 24,       /* 512 bytes: register set */
        op_fpreg = 25,          /* FPU register */
        op_none = 0xFF          /* operand without a datatype (INVLPG) */
};
205
/* How an operand is accessed. One bit per access kind; the values are
 * ORed together. */
enum x86_op_access {
    op_read    = 1 << 0,
    op_write   = 1 << 1,
    op_execute = 1 << 2
};
211
/* Miscellaneous operand flags. The values below 0x100 are single bits
 * and are ORed together. The segment overrides are mutually exclusive
 * and are numbered 1..6 in bits 8-11 rather than given one bit each
 * (op_ss_seg == op_es_seg | op_cs_seg), so a segment has to be compared
 * as a whole value and not tested bit by bit. */
enum x86_op_flags {
    op_signed   = 0x001,    /* signed integer */
    op_string   = 0x002,    /* possible string or array */
    op_constant = 0x004,    /* symbolic constant */
    op_pointer  = 0x008,    /* operand points to a memory address */
    op_sysref   = 0x010,    /* operand is a syscall number */
    op_implied  = 0x020,    /* operand is implicit in the insn */
    op_hardcode = 0x040,    /* operand is hardcoded in insn definition */
    /* NOTE: an 'implied' operand can be considered a side effect of the
     * insn, e.g. %esp being modified by PUSH or POP. A 'hard-coded'
     * operand is specified in the instruction definition itself, e.g.
     * %es:%edi in MOVSB or 1 in ROL Eb, 1. Hard-coded operands are
     * printed by disassemblers and are required to re-assemble, whereas
     * implicit operands are invisible. */
    op_es_seg   = 0x100,    /* ES segment override */
    op_cs_seg   = 0x200,    /* CS segment override */
    op_ss_seg   = 0x300,    /* SS segment override */
    op_ds_seg   = 0x400,    /* DS segment override */
    op_fs_seg   = 0x500,    /* FS segment override */
    op_gs_seg   = 0x600     /* GS segment override */
};
233
234/* x86_op_t : an X86 instruction operand */
235typedef struct {
236        enum x86_op_type        type;           /* operand type */
237        enum x86_op_datatype    datatype;       /* operand size */
238        enum x86_op_access      access;         /* operand access [RWX] */
239        enum x86_op_flags       flags;          /* misc flags */
240        union {
241		/* sizeof will have to work on these union members! */
242                /* immediate values */
243                char            sbyte;
244                short           sword;
245                int32_t         sdword;
246                qword_t         sqword;
247                unsigned char   byte;
248                unsigned short  word;
249                uint32_t        dword;
250                qword_t         qword;
251                float           sreal;
252                double          dreal;
253                /* misc large/non-native types */
254                unsigned char   extreal[10];
255                unsigned char   bcd[10];
256                qword_t         dqword[2];
257                unsigned char   simd[16];
258                unsigned char   fpuenv[28];
259                /* offset from segment */
260                uint32_t        offset;
261                /* ID of CPU register */
262                x86_reg_t       reg;
263                /* offsets from current insn */
264                char            relative_near;
265                int32_t         relative_far;
266		/* segment:offset */
267		x86_absolute_t	absolute;
268                /* effective address [expression] */
269                x86_ea_t        expression;
270        } data;
271	/* this is needed to make formatting operands more sane */
272	void * insn;		/* pointer to x86_insn_t owning operand */
273} x86_op_t;
274
275/* Linked list of x86_op_t; provided for manual traversal of the operand
276 * list in an insn. Users wishing to add operands to this list, e.g. to add
277 * implicit operands, should use x86_operand_new in x86_operand_list.h */
278typedef struct x86_operand_list {
279	x86_op_t op;
280	struct x86_operand_list *next;
281} x86_oplist_t;
282
/* Instruction groups (meta-types). Each group number is also the top
 * hex digit of the enum x86_insn_type values belonging to that group,
 * e.g. insn_controlflow == 1 and insn_jmp == 0x1001. The values 11 and
 * 12 are not assigned. */
enum x86_insn_group {
    insn_none        = 0,       /* invalid instruction */
    insn_controlflow = 1,
    insn_arithmetic  = 2,
    insn_logic       = 3,
    insn_stack       = 4,
    insn_comparison  = 5,
    insn_move        = 6,
    insn_string      = 7,
    insn_bit_manip   = 8,
    insn_flag_manip  = 9,
    insn_fpu         = 10,
    insn_interrupt   = 13,
    insn_system      = 14,
    insn_other       = 15
};
299
/* Instruction types. The top hex digit of each value is the number of
 * the instruction group the type belongs to; the remaining digits
 * identify the type within that group. */
enum x86_insn_type {
    insn_invalid      = 0,          /* invalid instruction */

    /* insn_controlflow : 0x1xxx */
    insn_jmp          = 0x1001,
    insn_jcc          = 0x1002,
    insn_call         = 0x1003,
    insn_callcc       = 0x1004,
    insn_return       = 0x1005,

    /* insn_arithmetic : 0x2xxx */
    insn_add          = 0x2001,
    insn_sub          = 0x2002,
    insn_mul          = 0x2003,
    insn_div          = 0x2004,
    insn_inc          = 0x2005,
    insn_dec          = 0x2006,
    insn_shl          = 0x2007,
    insn_shr          = 0x2008,
    insn_rol          = 0x2009,
    insn_ror          = 0x200A,

    /* insn_logic : 0x3xxx */
    insn_and          = 0x3001,
    insn_or           = 0x3002,
    insn_xor          = 0x3003,
    insn_not          = 0x3004,
    insn_neg          = 0x3005,

    /* insn_stack : 0x4xxx */
    insn_push         = 0x4001,
    insn_pop          = 0x4002,
    insn_pushregs     = 0x4003,
    insn_popregs      = 0x4004,
    insn_pushflags    = 0x4005,
    insn_popflags     = 0x4006,
    insn_enter        = 0x4007,
    insn_leave        = 0x4008,

    /* insn_comparison : 0x5xxx */
    insn_test         = 0x5001,
    insn_cmp          = 0x5002,

    /* insn_move : 0x6xxx */
    insn_mov          = 0x6001,     /* move */
    insn_movcc        = 0x6002,     /* conditional move */
    insn_xchg         = 0x6003,     /* exchange */
    insn_xchgcc       = 0x6004,     /* conditional exchange */

    /* insn_string : 0x7xxx */
    insn_strcmp       = 0x7001,
    insn_strload      = 0x7002,
    insn_strmov       = 0x7003,
    insn_strstore     = 0x7004,
    insn_translate    = 0x7005,     /* xlat */

    /* insn_bit_manip : 0x8xxx */
    insn_bittest      = 0x8001,
    insn_bitset       = 0x8002,
    insn_bitclear     = 0x8003,

    /* insn_flag_manip : 0x9xxx */
    insn_clear_carry  = 0x9001,
    insn_clear_zero   = 0x9002,
    insn_clear_oflow  = 0x9003,
    insn_clear_dir    = 0x9004,
    insn_clear_sign   = 0x9005,
    insn_clear_parity = 0x9006,
    insn_set_carry    = 0x9007,
    insn_set_zero     = 0x9008,
    insn_set_oflow    = 0x9009,
    insn_set_dir      = 0x900A,
    insn_set_sign     = 0x900B,
    insn_set_parity   = 0x900C,
    insn_tog_carry    = 0x9010,
    insn_tog_zero     = 0x9020,
    insn_tog_oflow    = 0x9030,
    insn_tog_dir      = 0x9040,
    insn_tog_sign     = 0x9050,
    insn_tog_parity   = 0x9060,

    /* insn_fpu : 0xAxxx (0xA00B is not assigned) */
    insn_fmov         = 0xA001,
    insn_fmovcc       = 0xA002,
    insn_fneg         = 0xA003,
    insn_fabs         = 0xA004,
    insn_fadd         = 0xA005,
    insn_fsub         = 0xA006,
    insn_fmul         = 0xA007,
    insn_fdiv         = 0xA008,
    insn_fsqrt        = 0xA009,
    insn_fcmp         = 0xA00A,
    insn_fcos         = 0xA00C,
    insn_fldpi        = 0xA00D,
    insn_fldz         = 0xA00E,
    insn_ftan         = 0xA00F,
    insn_fsine        = 0xA010,
    insn_fsys         = 0xA020,

    /* insn_interrupt : 0xDxxx */
    insn_int          = 0xD001,
    insn_intcc        = 0xD002,     /* not present in x86 ISA */
    insn_iret         = 0xD003,
    insn_bound        = 0xD004,
    insn_debug        = 0xD005,
    insn_trace        = 0xD006,
    insn_invalid_op   = 0xD007,
    insn_oflow        = 0xD008,

    /* insn_system : 0xExxx */
    insn_halt         = 0xE001,
    insn_in           = 0xE002,     /* input from port/bus */
    insn_out          = 0xE003,     /* output to port/bus */
    insn_cpuid        = 0xE004,

    /* insn_other : 0xFxxx */
    insn_nop          = 0xF001,
    insn_bcdconv      = 0xF002,     /* convert to or from BCD */
    insn_szconv       = 0xF003      /* change size of operand */
};
407
/* These flags specify special characteristics of the instruction, such as
 * whether the instruction is privileged or whether it serializes the
 * pipeline. Each value is a single bit.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed.
 * No trailing comma follows the last enumerator so the header stays
 * acceptable to C90 and C++98 compilers. */
enum x86_insn_note {
	insn_note_ring0		= 1,	/* Only available in ring 0 */
	insn_note_smm		= 2,	/* "" in System Management Mode */
	insn_note_serial	= 4,	/* Serializing instruction */
	insn_note_nonswap	= 8,	/* Does not swap arguments in att-style formatting */
	insn_note_nosuffix	= 16	/* Does not have size suffix in att-style formatting */
};
420
/* Effects an instruction has on the %eflags register. Every value is a
 * single bit. */
enum x86_flag_status {
    insn_carry_set                 = 1 << 0,    /* CF */
    insn_zero_set                  = 1 << 1,    /* ZF */
    insn_oflow_set                 = 1 << 2,    /* OF */
    insn_dir_set                   = 1 << 3,    /* DF */
    insn_sign_set                  = 1 << 4,    /* SF */
    insn_parity_set                = 1 << 5,    /* PF */
    insn_carry_or_zero_set         = 1 << 6,
    insn_zero_set_or_sign_ne_oflow = 1 << 7,
    insn_carry_clear               = 1 << 8,
    insn_zero_clear                = 1 << 9,
    insn_oflow_clear               = 1 << 10,
    insn_dir_clear                 = 1 << 11,
    insn_sign_clear                = 1 << 12,
    insn_parity_clear              = 1 << 13,
    insn_sign_eq_oflow             = 1 << 14,
    insn_sign_ne_oflow             = 1 << 15
};
440
/* The CPU model in which the instruction first appeared. This can be
 * used to mask out instructions appearing in earlier or later models,
 * or to check the portability of a binary.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed. */
enum x86_insn_cpu {
    /* Intel */
    cpu_8086       = 1,
    cpu_80286      = 2,
    cpu_80386      = 3,
    cpu_80387      = 4,
    cpu_80486      = 5,
    cpu_pentium    = 6,
    cpu_pentiumpro = 7,
    cpu_pentium2   = 8,
    cpu_pentium3   = 9,
    cpu_pentium4   = 10,
    /* AMD */
    cpu_k6         = 16,
    cpu_k7         = 32,
    cpu_athlon     = 48
};
461
/* CPU ISA subsets. These are derived from the Instruction Groups in
 * Intel Vol 1 Chapter 5. They represent subsets of the IA32 ISA, but
 * they do not reflect the 'type' of the instruction in the way that
 * x86_insn_group does; in short, they are AMD/Intel's own designations.
 * NOTE : These may not be accurate for all instructions; updates to the
 * opcode tables have not been completed. */
enum x86_insn_isa {
    isa_gp     = 1,     /* general purpose */
    isa_fp     = 2,     /* floating point */
    isa_fpumgt = 3,     /* FPU/SIMD management */
    isa_mmx    = 4,     /* Intel MMX */
    isa_sse1   = 5,     /* Intel SSE SIMD */
    isa_sse2   = 6,     /* Intel SSE2 SIMD */
    isa_sse3   = 7,     /* Intel SSE3 SIMD */
    isa_3dnow  = 8,     /* AMD 3DNow! SIMD */
    isa_sys    = 9      /* system instructions */
};
480
/* Instruction prefixes. The non-zero values are single bits; the
 * 'prefix' field of x86_insn_t holds them ORed together. */
enum x86_insn_prefix {
    insn_no_prefix   = 0,
    insn_rep_zero    = 1 << 0,  /* REPZ and REPE */
    insn_rep_notzero = 1 << 1,  /* REPNZ and REPNE */
    insn_lock        = 1 << 2   /* LOCK: */
};
487
488/* TODO: maybe provide insn_new/free(), and have disasm return new insn_t */
489/* x86_insn_t : an X86 instruction */
490typedef struct {
491        /* information about the instruction */
492        uint32_t addr;             /* load address */
493        uint32_t offset;           /* offset into file/buffer */
494        enum x86_insn_group group;      /* meta-type, e.g. INS_EXEC */
495        enum x86_insn_type type;        /* type, e.g. INS_BRANCH */
496	enum x86_insn_note note;	/* note, e.g. RING0 */
497        unsigned char bytes[MAX_INSN_SIZE];
498        unsigned char size;             /* size of insn in bytes */
499	/* 16/32-bit mode settings */
500	unsigned char addr_size;	/* default address size : 2 or 4 */
501	unsigned char op_size;		/* default operand size : 2 or 4 */
502	/* CPU/instruction set */
503	enum x86_insn_cpu cpu;
504	enum x86_insn_isa isa;
505	/* flags */
506        enum x86_flag_status flags_set; /* flags set or tested by insn */
507        enum x86_flag_status flags_tested;
508	/* stack */
509	unsigned char stack_mod;	/* 0 or 1 : is the stack modified? */
510	int32_t stack_mod_val;		/* val stack is modified by if known */
511
512        /* the instruction proper */
513        enum x86_insn_prefix prefix;	/* prefixes ORed together */
514        char prefix_string[MAX_PREFIX_STR]; /* prefixes [might be truncated] */
515        char mnemonic[MAX_MNEM_STR];
516        x86_oplist_t *operands;		/* list of explicit/implicit operands */
517	size_t operand_count;		/* total number of operands */
518	size_t explicit_count;		/* number of explicit operands */
519        /* convenience fields for user */
520        void *block;                    /* code block containing this insn */
521        void *function;                 /* function containing this insn */
522        int tag;			/* tag the insn as seen/processed */
523} x86_insn_t;
524
525
526/* returns 0 if an instruction is invalid, 1 if valid */
527int x86_insn_is_valid( x86_insn_t *insn );
528
529/* DISASSEMBLY ROUTINES
530 *      Canonical order of arguments is
531 *        (buf, buf_len, buf_rva, offset, len, insn, func, arg, resolve_func)
532 *      ...but of course all of these are not used at the same time.
533 */
534
535
536/* Function prototype for caller-supplied callback routine
537 *      These callbacks are intended to process 'insn' further, e.g. by
538 *      adding it to a linked list, database, etc */
539typedef void (*DISASM_CALLBACK)( x86_insn_t *insn, void * arg );
540
541/* Function prototype for caller-supplied address resolver.
542 *      This routine is used to determine the rva to disassemble next, given
543 *      the 'dest' operand of a jump/call. This allows the caller to resolve
544 *      jump/call targets stored in a register or on the stack, and also allows
545 *      the caller to prevent endless loops by checking if an address has
546 *      already been disassembled. If an address cannot be resolved from the
547 *      operand, or if the address has already been disassembled, this routine
548 *      should return -1; in all other cases the RVA to be disassembled next
549 *      should be returned. */
550typedef int32_t (*DISASM_RESOLVER)( x86_op_t *op, x86_insn_t * current_insn,
551				 void *arg );
552
553
554/* x86_disasm: Disassemble a single instruction from a buffer of bytes.
555 *             Returns size of instruction in bytes.
556 *             Caller is responsible for calling x86_oplist_free() on
557 *             a reused "insn" to avoid leaking memory when calling this
558 *             function repeatedly.
559 *      buf     : Buffer of bytes to disassemble
560 *      buf_len : Length of the buffer
561 *      buf_rva : Load address of the start of the buffer
562 *      offset  : Offset in buffer to disassemble
563 *      insn    : Structure to fill with disassembled instruction
564 */
565unsigned int x86_disasm( unsigned char *buf, unsigned int buf_len,
566                	 uint32_t buf_rva, unsigned int offset,
567                	 x86_insn_t * insn );
568
569/* x86_disasm_range: Sequential disassembly of a range of bytes in a buffer,
570 *                   invoking a callback function each time an instruction
571 *                   is successfully disassembled. The 'range' refers to the
572 *                   bytes between 'offset' and 'offset + len' in the buffer;
573 *                   'len' is assumed to be less than the length of the buffer.
574 *                   Returns number of instructions processed.
575 *      buf     : Buffer of bytes to disassemble (e.g. .text section)
576 *      buf_rva : Load address of buffer (e.g. ELF Virtual Address)
577 *      offset  : Offset in buffer to start disassembly at
578 *      len     : Number of bytes to disassemble
579 *      func    : Callback function to invoke (may be NULL)
580 *      arg     : Arbitrary data to pass to callback (may be NULL)
581 */
582unsigned int x86_disasm_range( unsigned char *buf, uint32_t buf_rva,
583	                       unsigned int offset, unsigned int len,
584	                       DISASM_CALLBACK func, void *arg );
585
586/* x86_disasm_forward: Flow-of-execution disassembly of the bytes in a buffer,
587 *                     invoking a callback function each time an instruction
588 *                     is successfully disassembled.
589 *      buf     : Buffer to disassemble (e.g. .text section)
590 *      buf_len : Number of bytes in buffer
591 *      buf_rva : Load address of buffer (e.g. ELF Virtual Address)
592 *      offset  : Offset in buffer to start disassembly at (e.g. entry point)
593 *      func    : Callback function to invoke (may be NULL)
594 *      arg     : Arbitrary data to pass to callback (may be NULL)
595 *      resolver: Caller-supplied address resolver. If no resolver is
596 *                supplied, a default internal one is used -- however the
597 *                internal resolver does NOT catch loops and could end up
598 *                disassembling forever..
599 *      r_arg	: Arbitrary data to pass to resolver (may be NULL)
600 */
601unsigned int x86_disasm_forward( unsigned char *buf, unsigned int buf_len,
602	                         uint32_t buf_rva, unsigned int offset,
603	                         DISASM_CALLBACK func, void *arg,
604	                         DISASM_RESOLVER resolver, void *r_arg );
605
/* Instruction operands: these are stored as a list of explicit and
 * implicit operands. It is recommended that the 'foreach' routines
 * be used when examining operands for purposes of data flow analysis */
609
610/* Operand FOREACH callback: 'arg' is an abritrary parameter passed to the
611 * foreach routine, 'insn' is the x86_insn_t whose operands are being
612 * iterated over, and 'op' is the current x86_op_t */
613typedef void (*x86_operand_fn)(x86_op_t *op, x86_insn_t *insn, void *arg);
614
/* FOREACH types: selectors that limit the foreach results to operands
 * of a certain "type" (implicit or explicit) or operands accessed in a
 * certain way (e.g. read or write). A foreach works on the operand list
 * of a single instruction, so selecting by the 'real' operand type
 * (register, memory, etc) is not useful. By definition Execute Access
 * implies Read Access and implies Not Write Access.
 * The "type" (implicit or explicit) and the access method can be ORed
 * together, e.g. op_wo | op_explicit. The access selectors themselves
 * are consecutive numbers, not bits. */
enum x86_op_foreach_type {
    /* access selectors */
    op_any      = 0,        /* ALL operands (explicit, implicit, rwx) */
    op_dest     = 1,        /* operands with Write access */
    op_src      = 2,        /* operands with Read access */
    op_ro       = 3,        /* operands with Read but not Write access */
    op_wo       = 4,        /* operands with Write but not Read access */
    op_xo       = 5,        /* operands with Execute access */
    op_rw       = 6,        /* operands with Read AND Write access */
    /* "type" selectors */
    op_implicit = 0x10,     /* operands that are implied by the opcode */
    op_explicit = 0x20      /* operands that are not side-effects */
};
635
636
637/* free the operand list associated with an instruction -- useful for
638 * preventing memory leaks when free()ing an x86_insn_t */
639void x86_oplist_free( x86_insn_t *insn );
640
641/* Operand foreach: invokes 'func' with 'insn' and 'arg' as arguments. The
642 * 'type' parameter is used to select only operands matching specific
643 * criteria. */
644int x86_operand_foreach( x86_insn_t *insn, x86_operand_fn func, void *arg,
645	       	  	 enum x86_op_foreach_type type);
646
647/* convenience routine: returns count of operands matching 'type' */
648size_t x86_operand_count( x86_insn_t *insn, enum x86_op_foreach_type type );
649
650/* accessor functions for the operands */
651x86_op_t * x86_operand_1st( x86_insn_t *insn );
652x86_op_t * x86_operand_2nd( x86_insn_t *insn );
653x86_op_t * x86_operand_3rd( x86_insn_t *insn );
654
655/* these allow libdisasm 2.0 accessor functions to still be used */
656#define x86_get_dest_operand( insn ) x86_operand_1st( insn )
657#define x86_get_src_operand( insn ) x86_operand_2nd( insn )
658#define x86_get_imm_operand( insn ) x86_operand_3rd( insn )
659
660/* get size of operand data in bytes */
661unsigned int x86_operand_size( x86_op_t *op );
662
/* Operand Convenience Routines: the following routines are common
 * operations on operands, intended to ease the burden of the programmer. */
665
666/* Get Address: return the value of an offset operand, or the offset of
667 * a segment:offset absolute address */
668uint32_t x86_get_address( x86_insn_t *insn );
669
670/* Get Relative Offset: return as a sign-extended int32_t the near or far
671 * relative offset operand, or 0 if there is none. There can be only one
672 * relaive offset operand in an instruction. */
673int32_t x86_get_rel_offset( x86_insn_t *insn );
674
675/* Get Branch Target: return the x86_op_t containing the target of
676 * a jump or call operand, or NULL if there is no branch target.
677 * Internally, a 'branch target' is defined as any operand with
678 * Execute Access set. There can be only one branch target per instruction. */
679x86_op_t * x86_get_branch_target( x86_insn_t *insn );
680
681/* Get Immediate: return the x86_op_t containing the immediate operand
682 * for this instruction, or NULL if there is no immediate operand. There
683 * can be only one immediate operand per instruction */
684x86_op_t * x86_get_imm( x86_insn_t *insn );
685
686/* Get Raw Immediate Data: returns a pointer to the immediate data encoded
687 * in the instruction. This is useful for large data types [>32 bits] currently
688 * not supported by libdisasm, or for determining if the disassembler
689 * screwed up the conversion of the immediate data. Note that 'imm' in this
690 * context refers to immediate data encoded at the end of an instruction as
691 * detailed in the Intel Manual Vol II Chapter 2; it does not refer to the
692 * 'op_imm' operand (the third operand in instructions like 'mul' */
693unsigned char * x86_get_raw_imm( x86_insn_t *insn );
694
695
/* More accessor functions, this time for user-defined info... */
697/* set the address (usually RVA) of the insn */
698void x86_set_insn_addr( x86_insn_t *insn, uint32_t addr );
699
700/* set the offset (usually offset into file) of the insn */
701void x86_set_insn_offset( x86_insn_t *insn, unsigned int offset );
702
703/* set a pointer to the function owning the instruction. The
704 * type of 'func' is user-defined; libdisasm does not use the func field. */
705void x86_set_insn_function( x86_insn_t *insn, void * func );
706
707/* set a pointer to the block of code owning the instruction. The
708 * type of 'block' is user-defined; libdisasm does not use the block field. */
709void x86_set_insn_block( x86_insn_t *insn, void * block );
710
711/* instruction tagging: these routines allow the programmer to mark
712 * instructions as "seen" in a DFS, for example. libdisasm does not use
713 * the tag field.*/
714/* set insn->tag to 1 */
715void x86_tag_insn( x86_insn_t *insn );
716/* set insn->tag to 0 */
717void x86_untag_insn( x86_insn_t *insn );
718/* return insn->tag */
719int x86_insn_is_tagged( x86_insn_t *insn );
720
721
/* Disassembly formats:
 *      native_syntax   tab-delimited: "RVA\tbytes\tmnemonic\tdest\tsrc\timm"
 *      intel_syntax    standard MASM/NASM/TASM: "mnemonic\tdest,src, imm"
 *      att_syntax      standard AS/GAS-style: "mnemonic\tsrc, dest, imm"
 *      xml_syntax      your typical <insn> ... </insn>
 *      raw_syntax      addr|offset|size|bytes|prefix... see libdisasm_formats.7
 */
enum x86_asm_format {
	unknown_syntax = 0,	/* never use! */
	native_syntax  = 1,	/* header: 35 bytes */
	intel_syntax   = 2,	/* header: 23 bytes */
	att_syntax     = 3,	/* header: 23 bytes */
	xml_syntax     = 4,	/* header: 679 bytes */
	raw_syntax     = 5	/* header: 172 bytes */
};
737
738/* format (sprintf) an operand into 'buf' using specified syntax */
739int x86_format_operand(x86_op_t *op, char *buf, int len,
740                  enum x86_asm_format format);
741
742/* format (sprintf) an instruction mnemonic into 'buf' using specified syntax */
743int x86_format_mnemonic(x86_insn_t *insn, char *buf, int len,
744                        enum x86_asm_format format);
745
746/* format (sprintf) an instruction into 'buf' using specified syntax;
747 * this includes formatting all operands */
748int x86_format_insn(x86_insn_t *insn, char *buf, int len, enum x86_asm_format);
749
/* Write into 'buf' a description of the syntax used by 'format'. */
int x86_format_header(char *buf, int len, enum x86_asm_format format);
752
/* Endianness of an x86 CPU, encoded as 0 = big, 1 = little; the result is
 * always 1. */
unsigned int x86_endian(void);
755
/* Default sizes, in bytes, of an address and of an operand. */
unsigned int x86_addr_size(void);
unsigned int x86_op_size(void);
759
/* Number of bytes in a machine word. */
unsigned int x86_word_size(void);
762
/* Maximum size of a code instruction.
 *
 * x86_max_inst_size is the older spelling, kept for existing callers. It is
 * an object-like alias: 'x86_max_inst_size()' still expands to a call of
 * x86_max_insn_size(), but no longer depends on invoking a function-like
 * macro with an empty argument (not permitted by C89 / C++98), and the old
 * name can now also be used wherever the function itself can, e.g. to take
 * its address. */
#define x86_max_inst_size x86_max_insn_size
unsigned int x86_max_insn_size(void);
766
/* IDs of the special registers: Stack pointer, Frame pointer, Instruction
 * pointer and Flags register. */
unsigned int x86_sp_reg(void);
unsigned int x86_fp_reg(void);
unsigned int x86_ip_reg(void);
unsigned int x86_flag_reg(void);
772
773/* fill 'reg' struct with details of register 'id' */
774void x86_reg_from_id( unsigned int id, x86_reg_t * reg );
775
/* convenience macro demonstrating how to get an aliased register; proto is
 *   void x86_get_aliased_reg( x86_reg_t *alias_reg, x86_reg_t *output_reg )
 * where 'alias_reg' is a reg operand and 'output_reg' is filled with the
 * register that the operand is an alias for.
 * Both arguments are parenthesized in the expansion so that an argument
 * expression such as '&reg' or 'regs + i' is taken as a whole before
 * '->alias' is applied to it. */
#define x86_get_aliased_reg( alias_reg, output_reg )			\
	x86_reg_from_id( (alias_reg)->alias, (output_reg) )
782
783
784/* ================================== Invariant Instruction Representation */
785/* Invariant instructions are used for generating binary signatures;
786 * the instruction is modified so that all variant bytes in an instruction
787 * are replaced with a wildcard byte.
788 *
789 * A 'variant byte' is one that is expected to be modified by either the
790 * static or the dynamic linker: for example, an address encoded in an
791 * instruction.
792 *
793 * By comparing the invariant representation of one instruction [or of a
794 * sequence of instructions] with the invariant representation of another,
 * one can determine whether the two invariant representations are from the same
796 * relocatable object [.o] file. Thus one can use binary signatures [which
797 * are just sequences of invariant instruction representations] to look for
798 * library routines which have been statically-linked into a binary.
799 *
800 * The invariant routines are faster and smaller than the disassembly
801 * routines; they can be used to determine the size of an instruction
802 * without all of the overhead of a full instruction disassembly.
803 */
804
/* Placeholder value substituted for each variant byte */
#define X86_WILDCARD_BYTE	0xF4
807
808typedef struct {
809        enum x86_op_type        type;           /* operand type */
810        enum x86_op_datatype    datatype;       /* operand size */
811        enum x86_op_access      access;         /* operand access [RWX] */
812        enum x86_op_flags       flags;          /* misc flags */
813} x86_invariant_op_t;
814
815typedef struct {
816	unsigned char bytes[64];	/* invariant representation */
817	unsigned int  size;		/* number of bytes in insn */
818        enum x86_insn_group group;      /* meta-type, e.g. INS_EXEC */
819        enum x86_insn_type type;        /* type, e.g. INS_BRANCH */
820	x86_invariant_op_t operands[3];	/* operands: dest, src, imm */
821} x86_invariant_t;
822
823
824/* return a version of the instruction with the variant bytes masked out */
825size_t x86_invariant_disasm( unsigned char *buf, int buf_len,
826			  x86_invariant_t *inv );
/* Yields the size, in bytes, of the instruction that 'buf' points to. This
 * uses x86_invariant_disasm, since it is faster than x86_disasm. */
size_t x86_size_disasm(unsigned char *buf, unsigned int buf_len);
830
831#ifdef __cplusplus
832}
833#endif
834
835
836#endif
837