X86DisassemblerDecoder.c revision 5117709a1d71fc34225decde0c7fe6a3ae29c063
1/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==*
2 *
3 *                     The LLVM Compiler Infrastructure
4 *
5 * This file is distributed under the University of Illinois Open Source
6 * License. See LICENSE.TXT for details.
7 *
8 *===----------------------------------------------------------------------===*
9 *
10 * This file is part of the X86 Disassembler.
11 * It contains the implementation of the instruction decoder.
12 * Documentation for the disassembler can be found in X86Disassembler.h.
13 *
14 *===----------------------------------------------------------------------===*/
15
16#include <stdarg.h>   /* for va_*()       */
17#include <stdio.h>    /* for vsnprintf()  */
18#include <stdlib.h>   /* for exit()       */
19#include <string.h>   /* for memset()     */
20
21#include "X86DisassemblerDecoder.h"
22
23#include "X86GenDisassemblerTables.inc"
24
/* Truth values used together with the BOOL type by the decoder routines. */
#define TRUE  1
#define FALSE 0

typedef int8_t bool;

/*
 * debug - Passes a diagnostic string, tagged with the current file and line,
 *   to x86DisassemblerDebug().  Expands to an empty statement when NDEBUG is
 *   defined.
 */
#ifdef NDEBUG
#define debug(s) do { } while (0)
#else
#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
#endif
35
36
37/*
38 * contextForAttrs - Client for the instruction context table.  Takes a set of
39 *   attributes and returns the appropriate decode context.
40 *
41 * @param attrMask  - Attributes, from the enumeration attributeBits.
42 * @return          - The InstructionContext to use when looking up an
43 *                    an instruction with these attributes.
44 */
45static InstructionContext contextForAttrs(uint8_t attrMask) {
46  return CONTEXTS_SYM[attrMask];
47}
48
49/*
50 * modRMRequired - Reads the appropriate instruction table to determine whether
51 *   the ModR/M byte is required to decode a particular instruction.
52 *
53 * @param type        - The opcode type (i.e., how many bytes it has).
54 * @param insnContext - The context for the instruction, as returned by
55 *                      contextForAttrs.
56 * @param opcode      - The last byte of the instruction's opcode, not counting
57 *                      ModR/M extensions and escapes.
58 * @return            - TRUE if the ModR/M byte is required, FALSE otherwise.
59 */
60static int modRMRequired(OpcodeType type,
61                                InstructionContext insnContext,
62                                uint8_t opcode) {
63  const struct ContextDecision* decision = 0;
64
65  switch (type) {
66  case ONEBYTE:
67    decision = &ONEBYTE_SYM;
68    break;
69  case TWOBYTE:
70    decision = &TWOBYTE_SYM;
71    break;
72  case THREEBYTE_38:
73    decision = &THREEBYTE38_SYM;
74    break;
75  case THREEBYTE_3A:
76    decision = &THREEBYTE3A_SYM;
77    break;
78  }
79
80  return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
81    modrm_type != MODRM_ONEENTRY;
82
83  return 0;
84}
85
86/*
87 * decode - Reads the appropriate instruction table to obtain the unique ID of
88 *   an instruction.
89 *
90 * @param type        - See modRMRequired().
91 * @param insnContext - See modRMRequired().
92 * @param opcode      - See modRMRequired().
93 * @param modRM       - The ModR/M byte if required, or any value if not.
94 * @return            - The UID of the instruction, or 0 on failure.
95 */
96static InstrUID decode(OpcodeType type,
97                       InstructionContext insnContext,
98                       uint8_t opcode,
99                       uint8_t modRM) {
100  struct ModRMDecision* dec;
101
102  switch (type) {
103  default:
104    debug("Unknown opcode type");
105    return 0;
106  case ONEBYTE:
107    dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
108    break;
109  case TWOBYTE:
110    dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
111    break;
112  case THREEBYTE_38:
113    dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
114    break;
115  case THREEBYTE_3A:
116    dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
117    break;
118  }
119
120  switch (dec->modrm_type) {
121  default:
122    debug("Corrupt table!  Unknown modrm_type");
123    return 0;
124  case MODRM_ONEENTRY:
125    return dec->instructionIDs[0];
126  case MODRM_SPLITRM:
127    if (modFromModRM(modRM) == 0x3)
128      return dec->instructionIDs[1];
129    else
130      return dec->instructionIDs[0];
131  case MODRM_FULL:
132    return dec->instructionIDs[modRM];
133  }
134}
135
136/*
137 * specifierForUID - Given a UID, returns the name and operand specification for
138 *   that instruction.
139 *
140 * @param uid - The unique ID for the instruction.  This should be returned by
141 *              decode(); specifierForUID will not check bounds.
142 * @return    - A pointer to the specification for that instruction.
143 */
144static struct InstructionSpecifier* specifierForUID(InstrUID uid) {
145  return &INSTRUCTIONS_SYM[uid];
146}
147
148/*
149 * consumeByte - Uses the reader function provided by the user to consume one
150 *   byte from the instruction's memory and advance the cursor.
151 *
152 * @param insn  - The instruction with the reader function to use.  The cursor
153 *                for this instruction is advanced.
154 * @param byte  - A pointer to a pre-allocated memory buffer to be populated
155 *                with the data read.
156 * @return      - 0 if the read was successful; nonzero otherwise.
157 */
158static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
159  int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
160
161  if (!ret)
162    ++(insn->readerCursor);
163
164  return ret;
165}
166
167/*
168 * lookAtByte - Like consumeByte, but does not advance the cursor.
169 *
170 * @param insn  - See consumeByte().
171 * @param byte  - See consumeByte().
172 * @return      - See consumeByte().
173 */
174static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
175  return insn->reader(insn->readerArg, byte, insn->readerCursor);
176}
177
178static void unconsumeByte(struct InternalInstruction* insn) {
179  insn->readerCursor--;
180}
181
182#define CONSUME_FUNC(name, type)                                  \
183  static int name(struct InternalInstruction* insn, type* ptr) {  \
184    type combined = 0;                                            \
185    unsigned offset;                                              \
186    for (offset = 0; offset < sizeof(type); ++offset) {           \
187      uint8_t byte;                                               \
188      int ret = insn->reader(insn->readerArg,                     \
189                             &byte,                               \
190                             insn->readerCursor + offset);        \
191      if (ret)                                                    \
192        return ret;                                               \
193      combined = combined | ((type)byte << ((type)offset * 8));   \
194    }                                                             \
195    *ptr = combined;                                              \
196    insn->readerCursor += sizeof(type);                           \
197    return 0;                                                     \
198  }
199
200/*
201 * consume* - Use the reader function provided by the user to consume data
202 *   values of various sizes from the instruction's memory and advance the
203 *   cursor appropriately.  These readers perform endian conversion.
204 *
205 * @param insn    - See consumeByte().
206 * @param ptr     - A pointer to a pre-allocated memory of appropriate size to
207 *                  be populated with the data read.
208 * @return        - See consumeByte().
209 */
210CONSUME_FUNC(consumeInt8, int8_t)
211CONSUME_FUNC(consumeInt16, int16_t)
212CONSUME_FUNC(consumeInt32, int32_t)
213CONSUME_FUNC(consumeUInt16, uint16_t)
214CONSUME_FUNC(consumeUInt32, uint32_t)
215CONSUME_FUNC(consumeUInt64, uint64_t)
216
217/*
218 * dbgprintf - Uses the logging function provided by the user to log a single
219 *   message, typically without a carriage-return.
220 *
221 * @param insn    - The instruction containing the logging function.
222 * @param format  - See printf().
223 * @param ...     - See printf().
224 */
225static void dbgprintf(struct InternalInstruction* insn,
226                      const char* format,
227                      ...) {
228  char buffer[256];
229  va_list ap;
230
231  if (!insn->dlog)
232    return;
233
234  va_start(ap, format);
235  (void)vsnprintf(buffer, sizeof(buffer), format, ap);
236  va_end(ap);
237
238  insn->dlog(insn->dlogArg, buffer);
239
240  return;
241}
242
243/*
244 * setPrefixPresent - Marks that a particular prefix is present at a particular
245 *   location.
246 *
247 * @param insn      - The instruction to be marked as having the prefix.
248 * @param prefix    - The prefix that is present.
249 * @param location  - The location where the prefix is located (in the address
250 *                    space of the instruction's reader).
251 */
252static void setPrefixPresent(struct InternalInstruction* insn,
253                                    uint8_t prefix,
254                                    uint64_t location)
255{
256  insn->prefixPresent[prefix] = 1;
257  insn->prefixLocations[prefix] = location;
258}
259
260/*
261 * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
262 *   present at a given location.
263 *
264 * @param insn      - The instruction to be queried.
265 * @param prefix    - The prefix.
266 * @param location  - The location to query.
267 * @return          - Whether the prefix is at that location.
268 */
269static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
270                               uint8_t prefix,
271                               uint64_t location)
272{
273  if (insn->prefixPresent[prefix] == 1 &&
274     insn->prefixLocations[prefix] == location)
275    return TRUE;
276  else
277    return FALSE;
278}
279
280/*
281 * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
282 *   instruction as having them.  Also sets the instruction's default operand,
283 *   address, and other relevant data sizes to report operands correctly.
284 *
285 * @param insn  - The instruction whose prefixes are to be read.
286 * @return      - 0 if the instruction could be read until the end of the prefix
287 *                bytes, and no prefixes conflicted; nonzero otherwise.
288 */
289static int readPrefixes(struct InternalInstruction* insn) {
290  BOOL isPrefix = TRUE;
291  BOOL prefixGroups[4] = { FALSE };
292  uint64_t prefixLocation;
293  uint8_t byte;
294
295  BOOL hasAdSize = FALSE;
296  BOOL hasOpSize = FALSE;
297
298  dbgprintf(insn, "readPrefixes()");
299
300  while (isPrefix) {
301    prefixLocation = insn->readerCursor;
302
303    if (consumeByte(insn, &byte))
304      return -1;
305
306    switch (byte) {
307    case 0xf0:  /* LOCK */
308    case 0xf2:  /* REPNE/REPNZ */
309    case 0xf3:  /* REP or REPE/REPZ */
310      if (prefixGroups[0])
311        dbgprintf(insn, "Redundant Group 1 prefix");
312      prefixGroups[0] = TRUE;
313      setPrefixPresent(insn, byte, prefixLocation);
314      break;
315    case 0x2e:  /* CS segment override -OR- Branch not taken */
316    case 0x36:  /* SS segment override -OR- Branch taken */
317    case 0x3e:  /* DS segment override */
318    case 0x26:  /* ES segment override */
319    case 0x64:  /* FS segment override */
320    case 0x65:  /* GS segment override */
321      switch (byte) {
322      case 0x2e:
323        insn->segmentOverride = SEG_OVERRIDE_CS;
324        break;
325      case 0x36:
326        insn->segmentOverride = SEG_OVERRIDE_SS;
327        break;
328      case 0x3e:
329        insn->segmentOverride = SEG_OVERRIDE_DS;
330        break;
331      case 0x26:
332        insn->segmentOverride = SEG_OVERRIDE_ES;
333        break;
334      case 0x64:
335        insn->segmentOverride = SEG_OVERRIDE_FS;
336        break;
337      case 0x65:
338        insn->segmentOverride = SEG_OVERRIDE_GS;
339        break;
340      default:
341        debug("Unhandled override");
342        return -1;
343      }
344      if (prefixGroups[1])
345        dbgprintf(insn, "Redundant Group 2 prefix");
346      prefixGroups[1] = TRUE;
347      setPrefixPresent(insn, byte, prefixLocation);
348      break;
349    case 0x66:  /* Operand-size override */
350      if (prefixGroups[2])
351        dbgprintf(insn, "Redundant Group 3 prefix");
352      prefixGroups[2] = TRUE;
353      hasOpSize = TRUE;
354      setPrefixPresent(insn, byte, prefixLocation);
355      break;
356    case 0x67:  /* Address-size override */
357      if (prefixGroups[3])
358        dbgprintf(insn, "Redundant Group 4 prefix");
359      prefixGroups[3] = TRUE;
360      hasAdSize = TRUE;
361      setPrefixPresent(insn, byte, prefixLocation);
362      break;
363    default:    /* Not a prefix byte */
364      isPrefix = FALSE;
365      break;
366    }
367
368    if (isPrefix)
369      dbgprintf(insn, "Found prefix 0x%hhx", byte);
370  }
371
372  if (insn->mode == MODE_64BIT) {
373    if ((byte & 0xf0) == 0x40) {
374      uint8_t opcodeByte;
375
376      if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
377        dbgprintf(insn, "Redundant REX prefix");
378        return -1;
379      }
380
381      insn->rexPrefix = byte;
382      insn->necessaryPrefixLocation = insn->readerCursor - 2;
383
384      dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
385    } else {
386      unconsumeByte(insn);
387      insn->necessaryPrefixLocation = insn->readerCursor - 1;
388    }
389  } else {
390    unconsumeByte(insn);
391  }
392
393  if (insn->mode == MODE_16BIT) {
394    insn->registerSize       = (hasOpSize ? 4 : 2);
395    insn->addressSize        = (hasAdSize ? 4 : 2);
396    insn->displacementSize   = (hasAdSize ? 4 : 2);
397    insn->immediateSize      = (hasOpSize ? 4 : 2);
398  } else if (insn->mode == MODE_32BIT) {
399    insn->registerSize       = (hasOpSize ? 2 : 4);
400    insn->addressSize        = (hasAdSize ? 2 : 4);
401    insn->displacementSize   = (hasAdSize ? 2 : 4);
402    insn->immediateSize      = (hasOpSize ? 2 : 4);
403  } else if (insn->mode == MODE_64BIT) {
404    if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
405      insn->registerSize       = 8;
406      insn->addressSize        = (hasAdSize ? 4 : 8);
407      insn->displacementSize   = 4;
408      insn->immediateSize      = 4;
409    } else if (insn->rexPrefix) {
410      insn->registerSize       = (hasOpSize ? 2 : 4);
411      insn->addressSize        = (hasAdSize ? 4 : 8);
412      insn->displacementSize   = (hasOpSize ? 2 : 4);
413      insn->immediateSize      = (hasOpSize ? 2 : 4);
414    } else {
415      insn->registerSize       = (hasOpSize ? 2 : 4);
416      insn->addressSize        = (hasAdSize ? 4 : 8);
417      insn->displacementSize   = (hasOpSize ? 2 : 4);
418      insn->immediateSize      = (hasOpSize ? 2 : 4);
419    }
420  }
421
422  return 0;
423}
424
425/*
426 * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
427 *   extended or escape opcodes).
428 *
429 * @param insn  - The instruction whose opcode is to be read.
430 * @return      - 0 if the opcode could be read successfully; nonzero otherwise.
431 */
432static int readOpcode(struct InternalInstruction* insn) {
433  /* Determine the length of the primary opcode */
434
435  uint8_t current;
436
437  dbgprintf(insn, "readOpcode()");
438
439  insn->opcodeType = ONEBYTE;
440  if (consumeByte(insn, &current))
441    return -1;
442
443  if (current == 0x0f) {
444    dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
445
446    insn->twoByteEscape = current;
447
448    if (consumeByte(insn, &current))
449      return -1;
450
451    if (current == 0x38) {
452      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
453
454      insn->threeByteEscape = current;
455
456      if (consumeByte(insn, &current))
457        return -1;
458
459      insn->opcodeType = THREEBYTE_38;
460    } else if (current == 0x3a) {
461      dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
462
463      insn->threeByteEscape = current;
464
465      if (consumeByte(insn, &current))
466        return -1;
467
468      insn->opcodeType = THREEBYTE_3A;
469    } else {
470      dbgprintf(insn, "Didn't find a three-byte escape prefix");
471
472      insn->opcodeType = TWOBYTE;
473    }
474  }
475
476  /*
477   * At this point we have consumed the full opcode.
478   * Anything we consume from here on must be unconsumed.
479   */
480
481  insn->opcode = current;
482
483  return 0;
484}
485
486static int readModRM(struct InternalInstruction* insn);
487
488/*
489 * getIDWithAttrMask - Determines the ID of an instruction, consuming
490 *   the ModR/M byte as appropriate for extended and escape opcodes,
491 *   and using a supplied attribute mask.
492 *
493 * @param instructionID - A pointer whose target is filled in with the ID of the
494 *                        instruction.
495 * @param insn          - The instruction whose ID is to be determined.
496 * @param attrMask      - The attribute mask to search.
497 * @return              - 0 if the ModR/M could be read when needed or was not
498 *                        needed; nonzero otherwise.
499 */
500static int getIDWithAttrMask(uint16_t* instructionID,
501                             struct InternalInstruction* insn,
502                             uint8_t attrMask) {
503  BOOL hasModRMExtension;
504
505  uint8_t instructionClass;
506
507  instructionClass = contextForAttrs(attrMask);
508
509  hasModRMExtension = modRMRequired(insn->opcodeType,
510                                    instructionClass,
511                                    insn->opcode);
512
513  if (hasModRMExtension) {
514    readModRM(insn);
515
516    *instructionID = decode(insn->opcodeType,
517                            instructionClass,
518                            insn->opcode,
519                            insn->modRM);
520  } else {
521    *instructionID = decode(insn->opcodeType,
522                            instructionClass,
523                            insn->opcode,
524                            0);
525  }
526
527  return 0;
528}
529
530/*
531 * is16BitEquivalent - Determines whether two instruction names refer to
532 * equivalent instructions but one is 16-bit whereas the other is not.
533 *
534 * @param orig  - The instruction that is not 16-bit
535 * @param equiv - The instruction that is 16-bit
536 */
537static BOOL is16BitEquvalent(const char* orig, const char* equiv) {
538  off_t i;
539
540  for (i = 0;; i++) {
541    if (orig[i] == '\0' && equiv[i] == '\0')
542      return TRUE;
543    if (orig[i] == '\0' || equiv[i] == '\0')
544      return FALSE;
545    if (orig[i] != equiv[i]) {
546      if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
547        continue;
548      if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
549        continue;
550      if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
551        continue;
552      return FALSE;
553    }
554  }
555}
556
557/*
558 * is64BitEquivalent - Determines whether two instruction names refer to
559 * equivalent instructions but one is 64-bit whereas the other is not.
560 *
561 * @param orig  - The instruction that is not 64-bit
562 * @param equiv - The instruction that is 64-bit
563 */
564static BOOL is64BitEquivalent(const char* orig, const char* equiv) {
565  off_t i;
566
567  for (i = 0;; i++) {
568    if (orig[i] == '\0' && equiv[i] == '\0')
569      return TRUE;
570    if (orig[i] == '\0' || equiv[i] == '\0')
571      return FALSE;
572    if (orig[i] != equiv[i]) {
573      if ((orig[i] == 'W' || orig[i] == 'L') && equiv[i] == 'Q')
574        continue;
575      if ((orig[i] == '1' || orig[i] == '3') && equiv[i] == '6')
576        continue;
577      if ((orig[i] == '6' || orig[i] == '2') && equiv[i] == '4')
578        continue;
579      return FALSE;
580    }
581  }
582}
583
584
585/*
586 * getID - Determines the ID of an instruction, consuming the ModR/M byte as
587 *   appropriate for extended and escape opcodes.  Determines the attributes and
588 *   context for the instruction before doing so.
589 *
590 * @param insn  - The instruction whose ID is to be determined.
591 * @return      - 0 if the ModR/M could be read when needed or was not needed;
592 *                nonzero otherwise.
593 */
594static int getID(struct InternalInstruction* insn) {
595  uint8_t attrMask;
596  uint16_t instructionID;
597
598  dbgprintf(insn, "getID()");
599
600  attrMask = ATTR_NONE;
601
602  if (insn->mode == MODE_64BIT)
603    attrMask |= ATTR_64BIT;
604
605  if (insn->rexPrefix & 0x08)
606    attrMask |= ATTR_REXW;
607
608  if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
609    attrMask |= ATTR_OPSIZE;
610  else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
611    attrMask |= ATTR_XS;
612  else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
613    attrMask |= ATTR_XD;
614
615  if (getIDWithAttrMask(&instructionID, insn, attrMask))
616    return -1;
617
618  /* The following clauses compensate for limitations of the tables. */
619
620  if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) {
621    /*
622     * Although for SSE instructions it is usually necessary to treat REX.W+F2
623     * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is
624     * an occasional instruction where F2 is incidental and REX.W is the more
625     * significant.  If the decoded instruction is 32-bit and adding REX.W
626     * instead of F2 changes a 32 to a 64, we adopt the new encoding.
627     */
628
629    struct InstructionSpecifier* spec;
630    uint16_t instructionIDWithREXw;
631    struct InstructionSpecifier* specWithREXw;
632
633    spec = specifierForUID(instructionID);
634
635    if (getIDWithAttrMask(&instructionIDWithREXw,
636                          insn,
637                          attrMask & (~ATTR_XD))) {
638      /*
639       * Decoding with REX.w would yield nothing; give up and return original
640       * decode.
641       */
642
643      insn->instructionID = instructionID;
644      insn->spec = spec;
645      return 0;
646    }
647
648    specWithREXw = specifierForUID(instructionIDWithREXw);
649
650    if (is64BitEquivalent(spec->name, specWithREXw->name)) {
651      insn->instructionID = instructionIDWithREXw;
652      insn->spec = specWithREXw;
653    } else {
654      insn->instructionID = instructionID;
655      insn->spec = spec;
656    }
657    return 0;
658  }
659
660  if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) {
661    /*
662     * The instruction tables make no distinction between instructions that
663     * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
664     * particular spot (i.e., many MMX operations).  In general we're
665     * conservative, but in the specific case where OpSize is present but not
666     * in the right place we check if there's a 16-bit operation.
667     */
668
669    struct InstructionSpecifier* spec;
670    uint16_t instructionIDWithOpsize;
671    struct InstructionSpecifier* specWithOpsize;
672
673    spec = specifierForUID(instructionID);
674
675    if (getIDWithAttrMask(&instructionIDWithOpsize,
676                          insn,
677                          attrMask | ATTR_OPSIZE)) {
678      /*
679       * ModRM required with OpSize but not present; give up and return version
680       * without OpSize set
681       */
682
683      insn->instructionID = instructionID;
684      insn->spec = spec;
685      return 0;
686    }
687
688    specWithOpsize = specifierForUID(instructionIDWithOpsize);
689
690    if (is16BitEquvalent(spec->name, specWithOpsize->name)) {
691      insn->instructionID = instructionIDWithOpsize;
692      insn->spec = specWithOpsize;
693    } else {
694      insn->instructionID = instructionID;
695      insn->spec = spec;
696    }
697    return 0;
698  }
699
700  insn->instructionID = instructionID;
701  insn->spec = specifierForUID(insn->instructionID);
702
703  return 0;
704}
705
706/*
707 * readSIB - Consumes the SIB byte to determine addressing information for an
708 *   instruction.
709 *
710 * @param insn  - The instruction whose SIB byte is to be read.
711 * @return      - 0 if the SIB byte was successfully read; nonzero otherwise.
712 */
713static int readSIB(struct InternalInstruction* insn) {
714  SIBIndex sibIndexBase = 0;
715  SIBBase sibBaseBase = 0;
716  uint8_t index, base;
717
718  dbgprintf(insn, "readSIB()");
719
720  if (insn->consumedSIB)
721    return 0;
722
723  insn->consumedSIB = TRUE;
724
725  switch (insn->addressSize) {
726  case 2:
727    dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
728    return -1;
729    break;
730  case 4:
731    sibIndexBase = SIB_INDEX_EAX;
732    sibBaseBase = SIB_BASE_EAX;
733    break;
734  case 8:
735    sibIndexBase = SIB_INDEX_RAX;
736    sibBaseBase = SIB_BASE_RAX;
737    break;
738  }
739
740  if (consumeByte(insn, &insn->sib))
741    return -1;
742
743  index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
744
745  switch (index) {
746  case 0x4:
747    insn->sibIndex = SIB_INDEX_NONE;
748    break;
749  default:
750    insn->sibIndex = (EABase)(sibIndexBase + index);
751    if (insn->sibIndex == SIB_INDEX_sib ||
752        insn->sibIndex == SIB_INDEX_sib64)
753      insn->sibIndex = SIB_INDEX_NONE;
754    break;
755  }
756
757  switch (scaleFromSIB(insn->sib)) {
758  case 0:
759    insn->sibScale = 1;
760    break;
761  case 1:
762    insn->sibScale = 2;
763    break;
764  case 2:
765    insn->sibScale = 4;
766    break;
767  case 3:
768    insn->sibScale = 8;
769    break;
770  }
771
772  base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
773
774  switch (base) {
775  case 0x5:
776    switch (modFromModRM(insn->modRM)) {
777    case 0x0:
778      insn->eaDisplacement = EA_DISP_32;
779      insn->sibBase = SIB_BASE_NONE;
780      break;
781    case 0x1:
782      insn->eaDisplacement = EA_DISP_8;
783      insn->sibBase = (insn->addressSize == 4 ?
784                       SIB_BASE_EBP : SIB_BASE_RBP);
785      break;
786    case 0x2:
787      insn->eaDisplacement = EA_DISP_32;
788      insn->sibBase = (insn->addressSize == 4 ?
789                       SIB_BASE_EBP : SIB_BASE_RBP);
790      break;
791    case 0x3:
792      debug("Cannot have Mod = 0b11 and a SIB byte");
793      return -1;
794    }
795    break;
796  default:
797    insn->sibBase = (EABase)(sibBaseBase + base);
798    break;
799  }
800
801  return 0;
802}
803
804/*
805 * readDisplacement - Consumes the displacement of an instruction.
806 *
807 * @param insn  - The instruction whose displacement is to be read.
808 * @return      - 0 if the displacement byte was successfully read; nonzero
809 *                otherwise.
810 */
811static int readDisplacement(struct InternalInstruction* insn) {
812  int8_t d8;
813  int16_t d16;
814  int32_t d32;
815
816  dbgprintf(insn, "readDisplacement()");
817
818  if (insn->consumedDisplacement)
819    return 0;
820
821  insn->consumedDisplacement = TRUE;
822
823  switch (insn->eaDisplacement) {
824  case EA_DISP_NONE:
825    insn->consumedDisplacement = FALSE;
826    break;
827  case EA_DISP_8:
828    if (consumeInt8(insn, &d8))
829      return -1;
830    insn->displacement = d8;
831    break;
832  case EA_DISP_16:
833    if (consumeInt16(insn, &d16))
834      return -1;
835    insn->displacement = d16;
836    break;
837  case EA_DISP_32:
838    if (consumeInt32(insn, &d32))
839      return -1;
840    insn->displacement = d32;
841    break;
842  }
843
844  insn->consumedDisplacement = TRUE;
845  return 0;
846}
847
848/*
849 * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
850 *   displacement) for an instruction and interprets it.
851 *
852 * @param insn  - The instruction whose addressing information is to be read.
853 * @return      - 0 if the information was successfully read; nonzero otherwise.
854 */
855static int readModRM(struct InternalInstruction* insn) {
856  uint8_t mod, rm, reg;
857
858  dbgprintf(insn, "readModRM()");
859
860  if (insn->consumedModRM)
861    return 0;
862
863  consumeByte(insn, &insn->modRM);
864  insn->consumedModRM = TRUE;
865
866  mod     = modFromModRM(insn->modRM);
867  rm      = rmFromModRM(insn->modRM);
868  reg     = regFromModRM(insn->modRM);
869
870  /*
871   * This goes by insn->registerSize to pick the correct register, which messes
872   * up if we're using (say) XMM or 8-bit register operands.  That gets fixed in
873   * fixupReg().
874   */
875  switch (insn->registerSize) {
876  case 2:
877    insn->regBase = MODRM_REG_AX;
878    insn->eaRegBase = EA_REG_AX;
879    break;
880  case 4:
881    insn->regBase = MODRM_REG_EAX;
882    insn->eaRegBase = EA_REG_EAX;
883    break;
884  case 8:
885    insn->regBase = MODRM_REG_RAX;
886    insn->eaRegBase = EA_REG_RAX;
887    break;
888  }
889
890  reg |= rFromREX(insn->rexPrefix) << 3;
891  rm  |= bFromREX(insn->rexPrefix) << 3;
892
893  insn->reg = (Reg)(insn->regBase + reg);
894
895  switch (insn->addressSize) {
896  case 2:
897    insn->eaBaseBase = EA_BASE_BX_SI;
898
899    switch (mod) {
900    case 0x0:
901      if (rm == 0x6) {
902        insn->eaBase = EA_BASE_NONE;
903        insn->eaDisplacement = EA_DISP_16;
904        if (readDisplacement(insn))
905          return -1;
906      } else {
907        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
908        insn->eaDisplacement = EA_DISP_NONE;
909      }
910      break;
911    case 0x1:
912      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
913      insn->eaDisplacement = EA_DISP_8;
914      if (readDisplacement(insn))
915        return -1;
916      break;
917    case 0x2:
918      insn->eaBase = (EABase)(insn->eaBaseBase + rm);
919      insn->eaDisplacement = EA_DISP_16;
920      if (readDisplacement(insn))
921        return -1;
922      break;
923    case 0x3:
924      insn->eaBase = (EABase)(insn->eaRegBase + rm);
925      if (readDisplacement(insn))
926        return -1;
927      break;
928    }
929    break;
930  case 4:
931  case 8:
932    insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
933
934    switch (mod) {
935    case 0x0:
936      insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
937      switch (rm) {
938      case 0x4:
939      case 0xc:   /* in case REXW.b is set */
940        insn->eaBase = (insn->addressSize == 4 ?
941                        EA_BASE_sib : EA_BASE_sib64);
942        readSIB(insn);
943        if (readDisplacement(insn))
944          return -1;
945        break;
946      case 0x5:
947        insn->eaBase = EA_BASE_NONE;
948        insn->eaDisplacement = EA_DISP_32;
949        if (readDisplacement(insn))
950          return -1;
951        break;
952      default:
953        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
954        break;
955      }
956      break;
957    case 0x1:
958    case 0x2:
959      insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
960      switch (rm) {
961      case 0x4:
962      case 0xc:   /* in case REXW.b is set */
963        insn->eaBase = EA_BASE_sib;
964        readSIB(insn);
965        if (readDisplacement(insn))
966          return -1;
967        break;
968      default:
969        insn->eaBase = (EABase)(insn->eaBaseBase + rm);
970        if (readDisplacement(insn))
971          return -1;
972        break;
973      }
974      break;
975    case 0x3:
976      insn->eaDisplacement = EA_DISP_NONE;
977      insn->eaBase = (EABase)(insn->eaRegBase + rm);
978      break;
979    }
980    break;
981  } /* switch (insn->addressSize) */
982
983  return 0;
984}
985
986#define GENERIC_FIXUP_FUNC(name, base, prefix)            \
987  static uint8_t name(struct InternalInstruction *insn,   \
988                      OperandType type,                   \
989                      uint8_t index,                      \
990                      uint8_t *valid) {                   \
991    *valid = 1;                                           \
992    switch (type) {                                       \
993    default:                                              \
994      debug("Unhandled register type");                   \
995      *valid = 0;                                         \
996      return 0;                                           \
997    case TYPE_Rv:                                         \
998      return base + index;                                \
999    case TYPE_R8:                                         \
1000      if (insn->rexPrefix &&                              \
1001         index >= 4 && index <= 7) {                      \
1002        return prefix##_SPL + (index - 4);                \
1003      } else {                                            \
1004        return prefix##_AL + index;                       \
1005      }                                                   \
1006    case TYPE_R16:                                        \
1007      return prefix##_AX + index;                         \
1008    case TYPE_R32:                                        \
1009      return prefix##_EAX + index;                        \
1010    case TYPE_R64:                                        \
1011      return prefix##_RAX + index;                        \
1012    case TYPE_XMM128:                                     \
1013    case TYPE_XMM64:                                      \
1014    case TYPE_XMM32:                                      \
1015    case TYPE_XMM:                                        \
1016      return prefix##_XMM0 + index;                       \
1017    case TYPE_MM64:                                       \
1018    case TYPE_MM32:                                       \
1019    case TYPE_MM:                                         \
1020      if (index > 7)                                      \
1021        *valid = 0;                                       \
1022      return prefix##_MM0 + index;                        \
1023    case TYPE_SEGMENTREG:                                 \
1024      if (index > 5)                                      \
1025        *valid = 0;                                       \
1026      return prefix##_ES + index;                         \
1027    case TYPE_DEBUGREG:                                   \
1028      if (index > 7)                                      \
1029        *valid = 0;                                       \
1030      return prefix##_DR0 + index;                        \
1031    case TYPE_CONTROLREG:                                 \
1032      if (index > 8)                                      \
1033        *valid = 0;                                       \
1034      return prefix##_CR0 + index;                        \
1035    }                                                     \
1036  }
1037
1038/*
1039 * fixup*Value - Consults an operand type to determine the meaning of the
1040 *   reg or R/M field.  If the operand is an XMM operand, for example, an
1041 *   operand would be XMM0 instead of AX, which readModRM() would otherwise
1042 *   misinterpret it as.
1043 *
1044 * @param insn  - The instruction containing the operand.
1045 * @param type  - The operand type.
1046 * @param index - The existing value of the field as reported by readModRM().
1047 * @param valid - The address of a uint8_t.  The target is set to 1 if the
1048 *                field is valid for the register class; 0 if not.
1049 * @return      - The proper value.
1050 */
1051GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase,    MODRM_REG)
1052GENERIC_FIXUP_FUNC(fixupRMValue,  insn->eaRegBase,  EA_REG)
1053
1054/*
1055 * fixupReg - Consults an operand specifier to determine which of the
1056 *   fixup*Value functions to use in correcting readModRM()'ss interpretation.
1057 *
1058 * @param insn  - See fixup*Value().
1059 * @param op    - The operand specifier.
1060 * @return      - 0 if fixup was successful; -1 if the register returned was
1061 *                invalid for its class.
1062 */
1063static int fixupReg(struct InternalInstruction *insn,
1064                    struct OperandSpecifier *op) {
1065  uint8_t valid;
1066
1067  dbgprintf(insn, "fixupReg()");
1068
1069  switch ((OperandEncoding)op->encoding) {
1070  default:
1071    debug("Expected a REG or R/M encoding in fixupReg");
1072    return -1;
1073  case ENCODING_REG:
1074    insn->reg = (Reg)fixupRegValue(insn,
1075                                   (OperandType)op->type,
1076                                   insn->reg - insn->regBase,
1077                                   &valid);
1078    if (!valid)
1079      return -1;
1080    break;
1081  case ENCODING_RM:
1082    if (insn->eaBase >= insn->eaRegBase) {
1083      insn->eaBase = (EABase)fixupRMValue(insn,
1084                                          (OperandType)op->type,
1085                                          insn->eaBase - insn->eaRegBase,
1086                                          &valid);
1087      if (!valid)
1088        return -1;
1089    }
1090    break;
1091  }
1092
1093  return 0;
1094}
1095
1096/*
1097 * readOpcodeModifier - Reads an operand from the opcode field of an
1098 *   instruction.  Handles AddRegFrm instructions.
1099 *
1100 * @param insn    - The instruction whose opcode field is to be read.
1101 * @param inModRM - Indicates that the opcode field is to be read from the
1102 *                  ModR/M extension; useful for escape opcodes
1103 * @return        - 0 on success; nonzero otherwise.
1104 */
1105static int readOpcodeModifier(struct InternalInstruction* insn) {
1106  dbgprintf(insn, "readOpcodeModifier()");
1107
1108  if (insn->consumedOpcodeModifier)
1109    return 0;
1110
1111  insn->consumedOpcodeModifier = TRUE;
1112
1113  switch (insn->spec->modifierType) {
1114  default:
1115    debug("Unknown modifier type.");
1116    return -1;
1117  case MODIFIER_NONE:
1118    debug("No modifier but an operand expects one.");
1119    return -1;
1120  case MODIFIER_OPCODE:
1121    insn->opcodeModifier = insn->opcode - insn->spec->modifierBase;
1122    return 0;
1123  case MODIFIER_MODRM:
1124    insn->opcodeModifier = insn->modRM - insn->spec->modifierBase;
1125    return 0;
1126  }
1127}
1128
1129/*
1130 * readOpcodeRegister - Reads an operand from the opcode field of an
1131 *   instruction and interprets it appropriately given the operand width.
1132 *   Handles AddRegFrm instructions.
1133 *
1134 * @param insn  - See readOpcodeModifier().
1135 * @param size  - The width (in bytes) of the register being specified.
1136 *                1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
1137 *                RAX.
1138 * @return      - 0 on success; nonzero otherwise.
1139 */
1140static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
1141  dbgprintf(insn, "readOpcodeRegister()");
1142
1143  if (readOpcodeModifier(insn))
1144    return -1;
1145
1146  if (size == 0)
1147    size = insn->registerSize;
1148
1149  switch (size) {
1150  case 1:
1151    insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
1152                                                  | insn->opcodeModifier));
1153    if (insn->rexPrefix &&
1154        insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
1155        insn->opcodeRegister < MODRM_REG_AL + 0x8) {
1156      insn->opcodeRegister = (Reg)(MODRM_REG_SPL
1157                                   + (insn->opcodeRegister - MODRM_REG_AL - 4));
1158    }
1159
1160    break;
1161  case 2:
1162    insn->opcodeRegister = (Reg)(MODRM_REG_AX
1163                                 + ((bFromREX(insn->rexPrefix) << 3)
1164                                    | insn->opcodeModifier));
1165    break;
1166  case 4:
1167    insn->opcodeRegister = (Reg)(MODRM_REG_EAX
1168                                 + ((bFromREX(insn->rexPrefix) << 3)
1169                                    | insn->opcodeModifier));
1170    break;
1171  case 8:
1172    insn->opcodeRegister = (Reg)(MODRM_REG_RAX
1173                                 + ((bFromREX(insn->rexPrefix) << 3)
1174                                    | insn->opcodeModifier));
1175    break;
1176  }
1177
1178  return 0;
1179}
1180
1181/*
1182 * readImmediate - Consumes an immediate operand from an instruction, given the
1183 *   desired operand size.
1184 *
1185 * @param insn  - The instruction whose operand is to be read.
1186 * @param size  - The width (in bytes) of the operand.
1187 * @return      - 0 if the immediate was successfully consumed; nonzero
1188 *                otherwise.
1189 */
1190static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
1191  uint8_t imm8;
1192  uint16_t imm16;
1193  uint32_t imm32;
1194  uint64_t imm64;
1195
1196  dbgprintf(insn, "readImmediate()");
1197
1198  if (insn->numImmediatesConsumed == 2) {
1199    debug("Already consumed two immediates");
1200    return -1;
1201  }
1202
1203  if (size == 0)
1204    size = insn->immediateSize;
1205  else
1206    insn->immediateSize = size;
1207
1208  switch (size) {
1209  case 1:
1210    if (consumeByte(insn, &imm8))
1211      return -1;
1212    insn->immediates[insn->numImmediatesConsumed] = imm8;
1213    break;
1214  case 2:
1215    if (consumeUInt16(insn, &imm16))
1216      return -1;
1217    insn->immediates[insn->numImmediatesConsumed] = imm16;
1218    break;
1219  case 4:
1220    if (consumeUInt32(insn, &imm32))
1221      return -1;
1222    insn->immediates[insn->numImmediatesConsumed] = imm32;
1223    break;
1224  case 8:
1225    if (consumeUInt64(insn, &imm64))
1226      return -1;
1227    insn->immediates[insn->numImmediatesConsumed] = imm64;
1228    break;
1229  }
1230
1231  insn->numImmediatesConsumed++;
1232
1233  return 0;
1234}
1235
1236/*
1237 * readOperands - Consults the specifier for an instruction and consumes all
1238 *   operands for that instruction, interpreting them as it goes.
1239 *
1240 * @param insn  - The instruction whose operands are to be read and interpreted.
1241 * @return      - 0 if all operands could be read; nonzero otherwise.
1242 */
1243static int readOperands(struct InternalInstruction* insn) {
1244  int index;
1245
1246  dbgprintf(insn, "readOperands()");
1247
1248  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
1249    switch (insn->spec->operands[index].encoding) {
1250    case ENCODING_NONE:
1251      break;
1252    case ENCODING_REG:
1253    case ENCODING_RM:
1254      if (readModRM(insn))
1255        return -1;
1256      if (fixupReg(insn, &insn->spec->operands[index]))
1257        return -1;
1258      break;
1259    case ENCODING_CB:
1260    case ENCODING_CW:
1261    case ENCODING_CD:
1262    case ENCODING_CP:
1263    case ENCODING_CO:
1264    case ENCODING_CT:
1265      dbgprintf(insn, "We currently don't hande code-offset encodings");
1266      return -1;
1267    case ENCODING_IB:
1268      if (readImmediate(insn, 1))
1269        return -1;
1270      if (insn->spec->operands[index].type == TYPE_IMM3 &&
1271          insn->immediates[insn->numImmediatesConsumed - 1] > 7)
1272        return -1;
1273      break;
1274    case ENCODING_IW:
1275      if (readImmediate(insn, 2))
1276        return -1;
1277      break;
1278    case ENCODING_ID:
1279      if (readImmediate(insn, 4))
1280        return -1;
1281      break;
1282    case ENCODING_IO:
1283      if (readImmediate(insn, 8))
1284        return -1;
1285      break;
1286    case ENCODING_Iv:
1287      if (readImmediate(insn, insn->immediateSize))
1288        return -1;
1289      break;
1290    case ENCODING_Ia:
1291      if (readImmediate(insn, insn->addressSize))
1292        return -1;
1293      break;
1294    case ENCODING_RB:
1295      if (readOpcodeRegister(insn, 1))
1296        return -1;
1297      break;
1298    case ENCODING_RW:
1299      if (readOpcodeRegister(insn, 2))
1300        return -1;
1301      break;
1302    case ENCODING_RD:
1303      if (readOpcodeRegister(insn, 4))
1304        return -1;
1305      break;
1306    case ENCODING_RO:
1307      if (readOpcodeRegister(insn, 8))
1308        return -1;
1309      break;
1310    case ENCODING_Rv:
1311      if (readOpcodeRegister(insn, 0))
1312        return -1;
1313      break;
1314    case ENCODING_I:
1315      if (readOpcodeModifier(insn))
1316        return -1;
1317    case ENCODING_DUP:
1318      break;
1319    default:
1320      dbgprintf(insn, "Encountered an operand with an unknown encoding.");
1321      return -1;
1322    }
1323  }
1324
1325  return 0;
1326}
1327
1328/*
1329 * decodeInstruction - Reads and interprets a full instruction provided by the
1330 *   user.
1331 *
1332 * @param insn      - A pointer to the instruction to be populated.  Must be
1333 *                    pre-allocated.
1334 * @param reader    - The function to be used to read the instruction's bytes.
1335 * @param readerArg - A generic argument to be passed to the reader to store
1336 *                    any internal state.
1337 * @param logger    - If non-NULL, the function to be used to write log messages
1338 *                    and warnings.
1339 * @param loggerArg - A generic argument to be passed to the logger to store
1340 *                    any internal state.
1341 * @param startLoc  - The address (in the reader's address space) of the first
1342 *                    byte in the instruction.
1343 * @param mode      - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
1344 *                    decode the instruction in.
1345 * @return          - 0 if the instruction's memory could be read; nonzero if
1346 *                    not.
1347 */
1348int decodeInstruction(struct InternalInstruction* insn,
1349                      byteReader_t reader,
1350                      void* readerArg,
1351                      dlog_t logger,
1352                      void* loggerArg,
1353                      uint64_t startLoc,
1354                      DisassemblerMode mode) {
1355  memset(insn, 0, sizeof(struct InternalInstruction));
1356
1357  insn->reader = reader;
1358  insn->readerArg = readerArg;
1359  insn->dlog = logger;
1360  insn->dlogArg = loggerArg;
1361  insn->startLocation = startLoc;
1362  insn->readerCursor = startLoc;
1363  insn->mode = mode;
1364  insn->numImmediatesConsumed = 0;
1365
1366  if (readPrefixes(insn)       ||
1367      readOpcode(insn)         ||
1368      getID(insn)              ||
1369      insn->instructionID == 0 ||
1370      readOperands(insn))
1371    return -1;
1372
1373  insn->length = insn->readerCursor - insn->startLocation;
1374
1375  dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
1376            startLoc, insn->readerCursor, insn->length);
1377
1378  if (insn->length > 15)
1379    dbgprintf(insn, "Instruction exceeds 15-byte limit");
1380
1381  return 0;
1382}
1383