/* X86DisassemblerDecoder.c revision 9e9bb0871d8f6d026afed6c127373222eab233e3 */
1/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==* 2 * 3 * The LLVM Compiler Infrastructure 4 * 5 * This file is distributed under the University of Illinois Open Source 6 * License. See LICENSE.TXT for details. 7 * 8 *===----------------------------------------------------------------------===* 9 * 10 * This file is part of the X86 Disassembler. 11 * It contains the implementation of the instruction decoder. 12 * Documentation for the disassembler can be found in X86Disassembler.h. 13 * 14 *===----------------------------------------------------------------------===*/ 15 16#include <stdarg.h> /* for va_*() */ 17#include <stdio.h> /* for vsnprintf() */ 18#include <stdlib.h> /* for exit() */ 19#include <string.h> /* for memset() */ 20 21#include "X86DisassemblerDecoder.h" 22 23#include "X86GenDisassemblerTables.inc" 24 25#define TRUE 1 26#define FALSE 0 27 28typedef int8_t bool; 29 30#ifndef NDEBUG 31#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0) 32#else 33#define debug(s) do { } while (0) 34#endif 35 36 37/* 38 * contextForAttrs - Client for the instruction context table. Takes a set of 39 * attributes and returns the appropriate decode context. 40 * 41 * @param attrMask - Attributes, from the enumeration attributeBits. 42 * @return - The InstructionContext to use when looking up an 43 * an instruction with these attributes. 44 */ 45static InstructionContext contextForAttrs(uint8_t attrMask) { 46 return CONTEXTS_SYM[attrMask]; 47} 48 49/* 50 * modRMRequired - Reads the appropriate instruction table to determine whether 51 * the ModR/M byte is required to decode a particular instruction. 52 * 53 * @param type - The opcode type (i.e., how many bytes it has). 54 * @param insnContext - The context for the instruction, as returned by 55 * contextForAttrs. 56 * @param opcode - The last byte of the instruction's opcode, not counting 57 * ModR/M extensions and escapes. 
58 * @return - TRUE if the ModR/M byte is required, FALSE otherwise. 59 */ 60static int modRMRequired(OpcodeType type, 61 InstructionContext insnContext, 62 uint8_t opcode) { 63 const struct ContextDecision* decision = 0; 64 65 switch (type) { 66 case ONEBYTE: 67 decision = &ONEBYTE_SYM; 68 break; 69 case TWOBYTE: 70 decision = &TWOBYTE_SYM; 71 break; 72 case THREEBYTE_38: 73 decision = &THREEBYTE38_SYM; 74 break; 75 case THREEBYTE_3A: 76 decision = &THREEBYTE3A_SYM; 77 break; 78 } 79 80 return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. 81 modrm_type != MODRM_ONEENTRY; 82 83 return 0; 84} 85 86/* 87 * decode - Reads the appropriate instruction table to obtain the unique ID of 88 * an instruction. 89 * 90 * @param type - See modRMRequired(). 91 * @param insnContext - See modRMRequired(). 92 * @param opcode - See modRMRequired(). 93 * @param modRM - The ModR/M byte if required, or any value if not. 94 * @return - The UID of the instruction, or 0 on failure. 95 */ 96static InstrUID decode(OpcodeType type, 97 InstructionContext insnContext, 98 uint8_t opcode, 99 uint8_t modRM) { 100 const struct ModRMDecision* dec; 101 102 switch (type) { 103 default: 104 debug("Unknown opcode type"); 105 return 0; 106 case ONEBYTE: 107 dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 108 break; 109 case TWOBYTE: 110 dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 111 break; 112 case THREEBYTE_38: 113 dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 114 break; 115 case THREEBYTE_3A: 116 dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 117 break; 118 } 119 120 switch (dec->modrm_type) { 121 default: 122 debug("Corrupt table! 
Unknown modrm_type"); 123 return 0; 124 case MODRM_ONEENTRY: 125 return dec->instructionIDs[0]; 126 case MODRM_SPLITRM: 127 if (modFromModRM(modRM) == 0x3) 128 return dec->instructionIDs[1]; 129 else 130 return dec->instructionIDs[0]; 131 case MODRM_FULL: 132 return dec->instructionIDs[modRM]; 133 } 134} 135 136/* 137 * specifierForUID - Given a UID, returns the name and operand specification for 138 * that instruction. 139 * 140 * @param uid - The unique ID for the instruction. This should be returned by 141 * decode(); specifierForUID will not check bounds. 142 * @return - A pointer to the specification for that instruction. 143 */ 144static const struct InstructionSpecifier *specifierForUID(InstrUID uid) { 145 return &INSTRUCTIONS_SYM[uid]; 146} 147 148/* 149 * consumeByte - Uses the reader function provided by the user to consume one 150 * byte from the instruction's memory and advance the cursor. 151 * 152 * @param insn - The instruction with the reader function to use. The cursor 153 * for this instruction is advanced. 154 * @param byte - A pointer to a pre-allocated memory buffer to be populated 155 * with the data read. 156 * @return - 0 if the read was successful; nonzero otherwise. 157 */ 158static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { 159 int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); 160 161 if (!ret) 162 ++(insn->readerCursor); 163 164 return ret; 165} 166 167/* 168 * lookAtByte - Like consumeByte, but does not advance the cursor. 169 * 170 * @param insn - See consumeByte(). 171 * @param byte - See consumeByte(). 172 * @return - See consumeByte(). 
173 */ 174static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { 175 return insn->reader(insn->readerArg, byte, insn->readerCursor); 176} 177 178static void unconsumeByte(struct InternalInstruction* insn) { 179 insn->readerCursor--; 180} 181 182#define CONSUME_FUNC(name, type) \ 183 static int name(struct InternalInstruction* insn, type* ptr) { \ 184 type combined = 0; \ 185 unsigned offset; \ 186 for (offset = 0; offset < sizeof(type); ++offset) { \ 187 uint8_t byte; \ 188 int ret = insn->reader(insn->readerArg, \ 189 &byte, \ 190 insn->readerCursor + offset); \ 191 if (ret) \ 192 return ret; \ 193 combined = combined | ((type)byte << ((type)offset * 8)); \ 194 } \ 195 *ptr = combined; \ 196 insn->readerCursor += sizeof(type); \ 197 return 0; \ 198 } 199 200/* 201 * consume* - Use the reader function provided by the user to consume data 202 * values of various sizes from the instruction's memory and advance the 203 * cursor appropriately. These readers perform endian conversion. 204 * 205 * @param insn - See consumeByte(). 206 * @param ptr - A pointer to a pre-allocated memory of appropriate size to 207 * be populated with the data read. 208 * @return - See consumeByte(). 209 */ 210CONSUME_FUNC(consumeInt8, int8_t) 211CONSUME_FUNC(consumeInt16, int16_t) 212CONSUME_FUNC(consumeInt32, int32_t) 213CONSUME_FUNC(consumeUInt16, uint16_t) 214CONSUME_FUNC(consumeUInt32, uint32_t) 215CONSUME_FUNC(consumeUInt64, uint64_t) 216 217/* 218 * dbgprintf - Uses the logging function provided by the user to log a single 219 * message, typically without a carriage-return. 220 * 221 * @param insn - The instruction containing the logging function. 222 * @param format - See printf(). 223 * @param ... - See printf(). 224 */ 225static void dbgprintf(struct InternalInstruction* insn, 226 const char* format, 227 ...) 
{ 228 char buffer[256]; 229 va_list ap; 230 231 if (!insn->dlog) 232 return; 233 234 va_start(ap, format); 235 (void)vsnprintf(buffer, sizeof(buffer), format, ap); 236 va_end(ap); 237 238 insn->dlog(insn->dlogArg, buffer); 239 240 return; 241} 242 243/* 244 * setPrefixPresent - Marks that a particular prefix is present at a particular 245 * location. 246 * 247 * @param insn - The instruction to be marked as having the prefix. 248 * @param prefix - The prefix that is present. 249 * @param location - The location where the prefix is located (in the address 250 * space of the instruction's reader). 251 */ 252static void setPrefixPresent(struct InternalInstruction* insn, 253 uint8_t prefix, 254 uint64_t location) 255{ 256 insn->prefixPresent[prefix] = 1; 257 insn->prefixLocations[prefix] = location; 258} 259 260/* 261 * isPrefixAtLocation - Queries an instruction to determine whether a prefix is 262 * present at a given location. 263 * 264 * @param insn - The instruction to be queried. 265 * @param prefix - The prefix. 266 * @param location - The location to query. 267 * @return - Whether the prefix is at that location. 268 */ 269static BOOL isPrefixAtLocation(struct InternalInstruction* insn, 270 uint8_t prefix, 271 uint64_t location) 272{ 273 if (insn->prefixPresent[prefix] == 1 && 274 insn->prefixLocations[prefix] == location) 275 return TRUE; 276 else 277 return FALSE; 278} 279 280/* 281 * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the 282 * instruction as having them. Also sets the instruction's default operand, 283 * address, and other relevant data sizes to report operands correctly. 284 * 285 * @param insn - The instruction whose prefixes are to be read. 286 * @return - 0 if the instruction could be read until the end of the prefix 287 * bytes, and no prefixes conflicted; nonzero otherwise. 
288 */ 289static int readPrefixes(struct InternalInstruction* insn) { 290 BOOL isPrefix = TRUE; 291 BOOL prefixGroups[4] = { FALSE }; 292 uint64_t prefixLocation; 293 uint8_t byte = 0; 294 295 BOOL hasAdSize = FALSE; 296 BOOL hasOpSize = FALSE; 297 298 dbgprintf(insn, "readPrefixes()"); 299 300 while (isPrefix) { 301 prefixLocation = insn->readerCursor; 302 303 if (consumeByte(insn, &byte)) 304 return -1; 305 306 switch (byte) { 307 case 0xf0: /* LOCK */ 308 case 0xf2: /* REPNE/REPNZ */ 309 case 0xf3: /* REP or REPE/REPZ */ 310 if (prefixGroups[0]) 311 dbgprintf(insn, "Redundant Group 1 prefix"); 312 prefixGroups[0] = TRUE; 313 setPrefixPresent(insn, byte, prefixLocation); 314 break; 315 case 0x2e: /* CS segment override -OR- Branch not taken */ 316 case 0x36: /* SS segment override -OR- Branch taken */ 317 case 0x3e: /* DS segment override */ 318 case 0x26: /* ES segment override */ 319 case 0x64: /* FS segment override */ 320 case 0x65: /* GS segment override */ 321 switch (byte) { 322 case 0x2e: 323 insn->segmentOverride = SEG_OVERRIDE_CS; 324 break; 325 case 0x36: 326 insn->segmentOverride = SEG_OVERRIDE_SS; 327 break; 328 case 0x3e: 329 insn->segmentOverride = SEG_OVERRIDE_DS; 330 break; 331 case 0x26: 332 insn->segmentOverride = SEG_OVERRIDE_ES; 333 break; 334 case 0x64: 335 insn->segmentOverride = SEG_OVERRIDE_FS; 336 break; 337 case 0x65: 338 insn->segmentOverride = SEG_OVERRIDE_GS; 339 break; 340 default: 341 debug("Unhandled override"); 342 return -1; 343 } 344 if (prefixGroups[1]) 345 dbgprintf(insn, "Redundant Group 2 prefix"); 346 prefixGroups[1] = TRUE; 347 setPrefixPresent(insn, byte, prefixLocation); 348 break; 349 case 0x66: /* Operand-size override */ 350 if (prefixGroups[2]) 351 dbgprintf(insn, "Redundant Group 3 prefix"); 352 prefixGroups[2] = TRUE; 353 hasOpSize = TRUE; 354 setPrefixPresent(insn, byte, prefixLocation); 355 break; 356 case 0x67: /* Address-size override */ 357 if (prefixGroups[3]) 358 dbgprintf(insn, "Redundant Group 4 prefix"); 
359 prefixGroups[3] = TRUE; 360 hasAdSize = TRUE; 361 setPrefixPresent(insn, byte, prefixLocation); 362 break; 363 default: /* Not a prefix byte */ 364 isPrefix = FALSE; 365 break; 366 } 367 368 if (isPrefix) 369 dbgprintf(insn, "Found prefix 0x%hhx", byte); 370 } 371 372 if (insn->mode == MODE_64BIT) { 373 if ((byte & 0xf0) == 0x40) { 374 uint8_t opcodeByte; 375 376 if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { 377 dbgprintf(insn, "Redundant REX prefix"); 378 return -1; 379 } 380 381 insn->rexPrefix = byte; 382 insn->necessaryPrefixLocation = insn->readerCursor - 2; 383 384 dbgprintf(insn, "Found REX prefix 0x%hhx", byte); 385 } else { 386 unconsumeByte(insn); 387 insn->necessaryPrefixLocation = insn->readerCursor - 1; 388 } 389 } else { 390 unconsumeByte(insn); 391 insn->necessaryPrefixLocation = insn->readerCursor - 1; 392 } 393 394 if (insn->mode == MODE_16BIT) { 395 insn->registerSize = (hasOpSize ? 4 : 2); 396 insn->addressSize = (hasAdSize ? 4 : 2); 397 insn->displacementSize = (hasAdSize ? 4 : 2); 398 insn->immediateSize = (hasOpSize ? 4 : 2); 399 } else if (insn->mode == MODE_32BIT) { 400 insn->registerSize = (hasOpSize ? 2 : 4); 401 insn->addressSize = (hasAdSize ? 2 : 4); 402 insn->displacementSize = (hasAdSize ? 2 : 4); 403 insn->immediateSize = (hasOpSize ? 2 : 4); 404 } else if (insn->mode == MODE_64BIT) { 405 if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { 406 insn->registerSize = 8; 407 insn->addressSize = (hasAdSize ? 4 : 8); 408 insn->displacementSize = 4; 409 insn->immediateSize = 4; 410 } else if (insn->rexPrefix) { 411 insn->registerSize = (hasOpSize ? 2 : 4); 412 insn->addressSize = (hasAdSize ? 4 : 8); 413 insn->displacementSize = (hasOpSize ? 2 : 4); 414 insn->immediateSize = (hasOpSize ? 2 : 4); 415 } else { 416 insn->registerSize = (hasOpSize ? 2 : 4); 417 insn->addressSize = (hasAdSize ? 4 : 8); 418 insn->displacementSize = (hasOpSize ? 2 : 4); 419 insn->immediateSize = (hasOpSize ? 
2 : 4); 420 } 421 } 422 423 return 0; 424} 425 426/* 427 * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of 428 * extended or escape opcodes). 429 * 430 * @param insn - The instruction whose opcode is to be read. 431 * @return - 0 if the opcode could be read successfully; nonzero otherwise. 432 */ 433static int readOpcode(struct InternalInstruction* insn) { 434 /* Determine the length of the primary opcode */ 435 436 uint8_t current; 437 438 dbgprintf(insn, "readOpcode()"); 439 440 insn->opcodeType = ONEBYTE; 441 if (consumeByte(insn, ¤t)) 442 return -1; 443 444 if (current == 0x0f) { 445 dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current); 446 447 insn->twoByteEscape = current; 448 449 if (consumeByte(insn, ¤t)) 450 return -1; 451 452 if (current == 0x38) { 453 dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); 454 455 insn->threeByteEscape = current; 456 457 if (consumeByte(insn, ¤t)) 458 return -1; 459 460 insn->opcodeType = THREEBYTE_38; 461 } else if (current == 0x3a) { 462 dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); 463 464 insn->threeByteEscape = current; 465 466 if (consumeByte(insn, ¤t)) 467 return -1; 468 469 insn->opcodeType = THREEBYTE_3A; 470 } else { 471 dbgprintf(insn, "Didn't find a three-byte escape prefix"); 472 473 insn->opcodeType = TWOBYTE; 474 } 475 } 476 477 /* 478 * At this point we have consumed the full opcode. 479 * Anything we consume from here on must be unconsumed. 480 */ 481 482 insn->opcode = current; 483 484 return 0; 485} 486 487static int readModRM(struct InternalInstruction* insn); 488 489/* 490 * getIDWithAttrMask - Determines the ID of an instruction, consuming 491 * the ModR/M byte as appropriate for extended and escape opcodes, 492 * and using a supplied attribute mask. 493 * 494 * @param instructionID - A pointer whose target is filled in with the ID of the 495 * instruction. 496 * @param insn - The instruction whose ID is to be determined. 
497 * @param attrMask - The attribute mask to search. 498 * @return - 0 if the ModR/M could be read when needed or was not 499 * needed; nonzero otherwise. 500 */ 501static int getIDWithAttrMask(uint16_t* instructionID, 502 struct InternalInstruction* insn, 503 uint8_t attrMask) { 504 BOOL hasModRMExtension; 505 506 uint8_t instructionClass; 507 508 instructionClass = contextForAttrs(attrMask); 509 510 hasModRMExtension = modRMRequired(insn->opcodeType, 511 instructionClass, 512 insn->opcode); 513 514 if (hasModRMExtension) { 515 if (readModRM(insn)) 516 return -1; 517 518 *instructionID = decode(insn->opcodeType, 519 instructionClass, 520 insn->opcode, 521 insn->modRM); 522 } else { 523 *instructionID = decode(insn->opcodeType, 524 instructionClass, 525 insn->opcode, 526 0); 527 } 528 529 return 0; 530} 531 532/* 533 * is16BitEquivalent - Determines whether two instruction names refer to 534 * equivalent instructions but one is 16-bit whereas the other is not. 535 * 536 * @param orig - The instruction that is not 16-bit 537 * @param equiv - The instruction that is 16-bit 538 */ 539static BOOL is16BitEquvalent(const char* orig, const char* equiv) { 540 off_t i; 541 542 for (i = 0;; i++) { 543 if (orig[i] == '\0' && equiv[i] == '\0') 544 return TRUE; 545 if (orig[i] == '\0' || equiv[i] == '\0') 546 return FALSE; 547 if (orig[i] != equiv[i]) { 548 if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') 549 continue; 550 if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') 551 continue; 552 if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') 553 continue; 554 return FALSE; 555 } 556 } 557} 558 559/* 560 * is64BitEquivalent - Determines whether two instruction names refer to 561 * equivalent instructions but one is 64-bit whereas the other is not. 
562 * 563 * @param orig - The instruction that is not 64-bit 564 * @param equiv - The instruction that is 64-bit 565 */ 566static BOOL is64BitEquivalent(const char* orig, const char* equiv) { 567 off_t i; 568 569 for (i = 0;; i++) { 570 if (orig[i] == '\0' && equiv[i] == '\0') 571 return TRUE; 572 if (orig[i] == '\0' || equiv[i] == '\0') 573 return FALSE; 574 if (orig[i] != equiv[i]) { 575 if ((orig[i] == 'W' || orig[i] == 'L') && equiv[i] == 'Q') 576 continue; 577 if ((orig[i] == '1' || orig[i] == '3') && equiv[i] == '6') 578 continue; 579 if ((orig[i] == '6' || orig[i] == '2') && equiv[i] == '4') 580 continue; 581 return FALSE; 582 } 583 } 584} 585 586 587/* 588 * getID - Determines the ID of an instruction, consuming the ModR/M byte as 589 * appropriate for extended and escape opcodes. Determines the attributes and 590 * context for the instruction before doing so. 591 * 592 * @param insn - The instruction whose ID is to be determined. 593 * @return - 0 if the ModR/M could be read when needed or was not needed; 594 * nonzero otherwise. 595 */ 596static int getID(struct InternalInstruction* insn) { 597 uint8_t attrMask; 598 uint16_t instructionID; 599 600 dbgprintf(insn, "getID()"); 601 602 attrMask = ATTR_NONE; 603 604 if (insn->mode == MODE_64BIT) 605 attrMask |= ATTR_64BIT; 606 607 if (insn->rexPrefix & 0x08) 608 attrMask |= ATTR_REXW; 609 610 if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) 611 attrMask |= ATTR_OPSIZE; 612 else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) 613 attrMask |= ATTR_XS; 614 else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) 615 attrMask |= ATTR_XD; 616 617 if (getIDWithAttrMask(&instructionID, insn, attrMask)) 618 return -1; 619 620 /* The following clauses compensate for limitations of the tables. 
*/ 621 622 if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) { 623 /* 624 * Although for SSE instructions it is usually necessary to treat REX.W+F2 625 * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is 626 * an occasional instruction where F2 is incidental and REX.W is the more 627 * significant. If the decoded instruction is 32-bit and adding REX.W 628 * instead of F2 changes a 32 to a 64, we adopt the new encoding. 629 */ 630 631 const struct InstructionSpecifier *spec; 632 uint16_t instructionIDWithREXw; 633 const struct InstructionSpecifier *specWithREXw; 634 635 spec = specifierForUID(instructionID); 636 637 if (getIDWithAttrMask(&instructionIDWithREXw, 638 insn, 639 attrMask & (~ATTR_XD))) { 640 /* 641 * Decoding with REX.w would yield nothing; give up and return original 642 * decode. 643 */ 644 645 insn->instructionID = instructionID; 646 insn->spec = spec; 647 return 0; 648 } 649 650 specWithREXw = specifierForUID(instructionIDWithREXw); 651 652 if (is64BitEquivalent(spec->name, specWithREXw->name)) { 653 insn->instructionID = instructionIDWithREXw; 654 insn->spec = specWithREXw; 655 } else { 656 insn->instructionID = instructionID; 657 insn->spec = spec; 658 } 659 return 0; 660 } 661 662 if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) { 663 /* 664 * The instruction tables make no distinction between instructions that 665 * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a 666 * particular spot (i.e., many MMX operations). In general we're 667 * conservative, but in the specific case where OpSize is present but not 668 * in the right place we check if there's a 16-bit operation. 
669 */ 670 671 const struct InstructionSpecifier *spec; 672 uint16_t instructionIDWithOpsize; 673 const struct InstructionSpecifier *specWithOpsize; 674 675 spec = specifierForUID(instructionID); 676 677 if (getIDWithAttrMask(&instructionIDWithOpsize, 678 insn, 679 attrMask | ATTR_OPSIZE)) { 680 /* 681 * ModRM required with OpSize but not present; give up and return version 682 * without OpSize set 683 */ 684 685 insn->instructionID = instructionID; 686 insn->spec = spec; 687 return 0; 688 } 689 690 specWithOpsize = specifierForUID(instructionIDWithOpsize); 691 692 if (is16BitEquvalent(spec->name, specWithOpsize->name)) { 693 insn->instructionID = instructionIDWithOpsize; 694 insn->spec = specWithOpsize; 695 } else { 696 insn->instructionID = instructionID; 697 insn->spec = spec; 698 } 699 return 0; 700 } 701 702 insn->instructionID = instructionID; 703 insn->spec = specifierForUID(insn->instructionID); 704 705 return 0; 706} 707 708/* 709 * readSIB - Consumes the SIB byte to determine addressing information for an 710 * instruction. 711 * 712 * @param insn - The instruction whose SIB byte is to be read. 713 * @return - 0 if the SIB byte was successfully read; nonzero otherwise. 
714 */ 715static int readSIB(struct InternalInstruction* insn) { 716 SIBIndex sibIndexBase = 0; 717 SIBBase sibBaseBase = 0; 718 uint8_t index, base; 719 720 dbgprintf(insn, "readSIB()"); 721 722 if (insn->consumedSIB) 723 return 0; 724 725 insn->consumedSIB = TRUE; 726 727 switch (insn->addressSize) { 728 case 2: 729 dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); 730 return -1; 731 break; 732 case 4: 733 sibIndexBase = SIB_INDEX_EAX; 734 sibBaseBase = SIB_BASE_EAX; 735 break; 736 case 8: 737 sibIndexBase = SIB_INDEX_RAX; 738 sibBaseBase = SIB_BASE_RAX; 739 break; 740 } 741 742 if (consumeByte(insn, &insn->sib)) 743 return -1; 744 745 index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); 746 747 switch (index) { 748 case 0x4: 749 insn->sibIndex = SIB_INDEX_NONE; 750 break; 751 default: 752 insn->sibIndex = (SIBIndex)(sibIndexBase + index); 753 if (insn->sibIndex == SIB_INDEX_sib || 754 insn->sibIndex == SIB_INDEX_sib64) 755 insn->sibIndex = SIB_INDEX_NONE; 756 break; 757 } 758 759 switch (scaleFromSIB(insn->sib)) { 760 case 0: 761 insn->sibScale = 1; 762 break; 763 case 1: 764 insn->sibScale = 2; 765 break; 766 case 2: 767 insn->sibScale = 4; 768 break; 769 case 3: 770 insn->sibScale = 8; 771 break; 772 } 773 774 base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); 775 776 switch (base) { 777 case 0x5: 778 switch (modFromModRM(insn->modRM)) { 779 case 0x0: 780 insn->eaDisplacement = EA_DISP_32; 781 insn->sibBase = SIB_BASE_NONE; 782 break; 783 case 0x1: 784 insn->eaDisplacement = EA_DISP_8; 785 insn->sibBase = (insn->addressSize == 4 ? 786 SIB_BASE_EBP : SIB_BASE_RBP); 787 break; 788 case 0x2: 789 insn->eaDisplacement = EA_DISP_32; 790 insn->sibBase = (insn->addressSize == 4 ? 
791 SIB_BASE_EBP : SIB_BASE_RBP); 792 break; 793 case 0x3: 794 debug("Cannot have Mod = 0b11 and a SIB byte"); 795 return -1; 796 } 797 break; 798 default: 799 insn->sibBase = (SIBBase)(sibBaseBase + base); 800 break; 801 } 802 803 return 0; 804} 805 806/* 807 * readDisplacement - Consumes the displacement of an instruction. 808 * 809 * @param insn - The instruction whose displacement is to be read. 810 * @return - 0 if the displacement byte was successfully read; nonzero 811 * otherwise. 812 */ 813static int readDisplacement(struct InternalInstruction* insn) { 814 int8_t d8; 815 int16_t d16; 816 int32_t d32; 817 818 dbgprintf(insn, "readDisplacement()"); 819 820 if (insn->consumedDisplacement) 821 return 0; 822 823 insn->consumedDisplacement = TRUE; 824 825 switch (insn->eaDisplacement) { 826 case EA_DISP_NONE: 827 insn->consumedDisplacement = FALSE; 828 break; 829 case EA_DISP_8: 830 if (consumeInt8(insn, &d8)) 831 return -1; 832 insn->displacement = d8; 833 break; 834 case EA_DISP_16: 835 if (consumeInt16(insn, &d16)) 836 return -1; 837 insn->displacement = d16; 838 break; 839 case EA_DISP_32: 840 if (consumeInt32(insn, &d32)) 841 return -1; 842 insn->displacement = d32; 843 break; 844 } 845 846 insn->consumedDisplacement = TRUE; 847 return 0; 848} 849 850/* 851 * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and 852 * displacement) for an instruction and interprets it. 853 * 854 * @param insn - The instruction whose addressing information is to be read. 855 * @return - 0 if the information was successfully read; nonzero otherwise. 
856 */ 857static int readModRM(struct InternalInstruction* insn) { 858 uint8_t mod, rm, reg; 859 860 dbgprintf(insn, "readModRM()"); 861 862 if (insn->consumedModRM) 863 return 0; 864 865 if (consumeByte(insn, &insn->modRM)) 866 return -1; 867 insn->consumedModRM = TRUE; 868 869 mod = modFromModRM(insn->modRM); 870 rm = rmFromModRM(insn->modRM); 871 reg = regFromModRM(insn->modRM); 872 873 /* 874 * This goes by insn->registerSize to pick the correct register, which messes 875 * up if we're using (say) XMM or 8-bit register operands. That gets fixed in 876 * fixupReg(). 877 */ 878 switch (insn->registerSize) { 879 case 2: 880 insn->regBase = MODRM_REG_AX; 881 insn->eaRegBase = EA_REG_AX; 882 break; 883 case 4: 884 insn->regBase = MODRM_REG_EAX; 885 insn->eaRegBase = EA_REG_EAX; 886 break; 887 case 8: 888 insn->regBase = MODRM_REG_RAX; 889 insn->eaRegBase = EA_REG_RAX; 890 break; 891 } 892 893 reg |= rFromREX(insn->rexPrefix) << 3; 894 rm |= bFromREX(insn->rexPrefix) << 3; 895 896 insn->reg = (Reg)(insn->regBase + reg); 897 898 switch (insn->addressSize) { 899 case 2: 900 insn->eaBaseBase = EA_BASE_BX_SI; 901 902 switch (mod) { 903 case 0x0: 904 if (rm == 0x6) { 905 insn->eaBase = EA_BASE_NONE; 906 insn->eaDisplacement = EA_DISP_16; 907 if (readDisplacement(insn)) 908 return -1; 909 } else { 910 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 911 insn->eaDisplacement = EA_DISP_NONE; 912 } 913 break; 914 case 0x1: 915 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 916 insn->eaDisplacement = EA_DISP_8; 917 if (readDisplacement(insn)) 918 return -1; 919 break; 920 case 0x2: 921 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 922 insn->eaDisplacement = EA_DISP_16; 923 if (readDisplacement(insn)) 924 return -1; 925 break; 926 case 0x3: 927 insn->eaBase = (EABase)(insn->eaRegBase + rm); 928 if (readDisplacement(insn)) 929 return -1; 930 break; 931 } 932 break; 933 case 4: 934 case 8: 935 insn->eaBaseBase = (insn->addressSize == 4 ? 
EA_BASE_EAX : EA_BASE_RAX); 936 937 switch (mod) { 938 case 0x0: 939 insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ 940 switch (rm) { 941 case 0x4: 942 case 0xc: /* in case REXW.b is set */ 943 insn->eaBase = (insn->addressSize == 4 ? 944 EA_BASE_sib : EA_BASE_sib64); 945 readSIB(insn); 946 if (readDisplacement(insn)) 947 return -1; 948 break; 949 case 0x5: 950 insn->eaBase = EA_BASE_NONE; 951 insn->eaDisplacement = EA_DISP_32; 952 if (readDisplacement(insn)) 953 return -1; 954 break; 955 default: 956 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 957 break; 958 } 959 break; 960 case 0x1: 961 case 0x2: 962 insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32); 963 switch (rm) { 964 case 0x4: 965 case 0xc: /* in case REXW.b is set */ 966 insn->eaBase = EA_BASE_sib; 967 readSIB(insn); 968 if (readDisplacement(insn)) 969 return -1; 970 break; 971 default: 972 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 973 if (readDisplacement(insn)) 974 return -1; 975 break; 976 } 977 break; 978 case 0x3: 979 insn->eaDisplacement = EA_DISP_NONE; 980 insn->eaBase = (EABase)(insn->eaRegBase + rm); 981 break; 982 } 983 break; 984 } /* switch (insn->addressSize) */ 985 986 return 0; 987} 988 989#define GENERIC_FIXUP_FUNC(name, base, prefix) \ 990 static uint8_t name(struct InternalInstruction *insn, \ 991 OperandType type, \ 992 uint8_t index, \ 993 uint8_t *valid) { \ 994 *valid = 1; \ 995 switch (type) { \ 996 default: \ 997 debug("Unhandled register type"); \ 998 *valid = 0; \ 999 return 0; \ 1000 case TYPE_Rv: \ 1001 return base + index; \ 1002 case TYPE_R8: \ 1003 if (insn->rexPrefix && \ 1004 index >= 4 && index <= 7) { \ 1005 return prefix##_SPL + (index - 4); \ 1006 } else { \ 1007 return prefix##_AL + index; \ 1008 } \ 1009 case TYPE_R16: \ 1010 return prefix##_AX + index; \ 1011 case TYPE_R32: \ 1012 return prefix##_EAX + index; \ 1013 case TYPE_R64: \ 1014 return prefix##_RAX + index; \ 1015 case TYPE_XMM128: \ 1016 case TYPE_XMM64: \ 1017 
case TYPE_XMM32: \ 1018 case TYPE_XMM: \ 1019 return prefix##_XMM0 + index; \ 1020 case TYPE_MM64: \ 1021 case TYPE_MM32: \ 1022 case TYPE_MM: \ 1023 if (index > 7) \ 1024 *valid = 0; \ 1025 return prefix##_MM0 + index; \ 1026 case TYPE_SEGMENTREG: \ 1027 if (index > 5) \ 1028 *valid = 0; \ 1029 return prefix##_ES + index; \ 1030 case TYPE_DEBUGREG: \ 1031 if (index > 7) \ 1032 *valid = 0; \ 1033 return prefix##_DR0 + index; \ 1034 case TYPE_CONTROLREG: \ 1035 if (index > 8) \ 1036 *valid = 0; \ 1037 return prefix##_CR0 + index; \ 1038 } \ 1039 } 1040 1041/* 1042 * fixup*Value - Consults an operand type to determine the meaning of the 1043 * reg or R/M field. If the operand is an XMM operand, for example, an 1044 * operand would be XMM0 instead of AX, which readModRM() would otherwise 1045 * misinterpret it as. 1046 * 1047 * @param insn - The instruction containing the operand. 1048 * @param type - The operand type. 1049 * @param index - The existing value of the field as reported by readModRM(). 1050 * @param valid - The address of a uint8_t. The target is set to 1 if the 1051 * field is valid for the register class; 0 if not. 1052 * @return - The proper value. 1053 */ 1054GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG) 1055GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) 1056 1057/* 1058 * fixupReg - Consults an operand specifier to determine which of the 1059 * fixup*Value functions to use in correcting readModRM()'ss interpretation. 1060 * 1061 * @param insn - See fixup*Value(). 1062 * @param op - The operand specifier. 1063 * @return - 0 if fixup was successful; -1 if the register returned was 1064 * invalid for its class. 
1065 */ 1066static int fixupReg(struct InternalInstruction *insn, 1067 const struct OperandSpecifier *op) { 1068 uint8_t valid; 1069 1070 dbgprintf(insn, "fixupReg()"); 1071 1072 switch ((OperandEncoding)op->encoding) { 1073 default: 1074 debug("Expected a REG or R/M encoding in fixupReg"); 1075 return -1; 1076 case ENCODING_REG: 1077 insn->reg = (Reg)fixupRegValue(insn, 1078 (OperandType)op->type, 1079 insn->reg - insn->regBase, 1080 &valid); 1081 if (!valid) 1082 return -1; 1083 break; 1084 case ENCODING_RM: 1085 if (insn->eaBase >= insn->eaRegBase) { 1086 insn->eaBase = (EABase)fixupRMValue(insn, 1087 (OperandType)op->type, 1088 insn->eaBase - insn->eaRegBase, 1089 &valid); 1090 if (!valid) 1091 return -1; 1092 } 1093 break; 1094 } 1095 1096 return 0; 1097} 1098 1099/* 1100 * readOpcodeModifier - Reads an operand from the opcode field of an 1101 * instruction. Handles AddRegFrm instructions. 1102 * 1103 * @param insn - The instruction whose opcode field is to be read. 1104 * @param inModRM - Indicates that the opcode field is to be read from the 1105 * ModR/M extension; useful for escape opcodes 1106 * @return - 0 on success; nonzero otherwise. 1107 */ 1108static int readOpcodeModifier(struct InternalInstruction* insn) { 1109 dbgprintf(insn, "readOpcodeModifier()"); 1110 1111 if (insn->consumedOpcodeModifier) 1112 return 0; 1113 1114 insn->consumedOpcodeModifier = TRUE; 1115 1116 switch (insn->spec->modifierType) { 1117 default: 1118 debug("Unknown modifier type."); 1119 return -1; 1120 case MODIFIER_NONE: 1121 debug("No modifier but an operand expects one."); 1122 return -1; 1123 case MODIFIER_OPCODE: 1124 insn->opcodeModifier = insn->opcode - insn->spec->modifierBase; 1125 return 0; 1126 case MODIFIER_MODRM: 1127 insn->opcodeModifier = insn->modRM - insn->spec->modifierBase; 1128 return 0; 1129 } 1130} 1131 1132/* 1133 * readOpcodeRegister - Reads an operand from the opcode field of an 1134 * instruction and interprets it appropriately given the operand width. 
1135 * Handles AddRegFrm instructions. 1136 * 1137 * @param insn - See readOpcodeModifier(). 1138 * @param size - The width (in bytes) of the register being specified. 1139 * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means 1140 * RAX. 1141 * @return - 0 on success; nonzero otherwise. 1142 */ 1143static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { 1144 dbgprintf(insn, "readOpcodeRegister()"); 1145 1146 if (readOpcodeModifier(insn)) 1147 return -1; 1148 1149 if (size == 0) 1150 size = insn->registerSize; 1151 1152 switch (size) { 1153 case 1: 1154 insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) 1155 | insn->opcodeModifier)); 1156 if (insn->rexPrefix && 1157 insn->opcodeRegister >= MODRM_REG_AL + 0x4 && 1158 insn->opcodeRegister < MODRM_REG_AL + 0x8) { 1159 insn->opcodeRegister = (Reg)(MODRM_REG_SPL 1160 + (insn->opcodeRegister - MODRM_REG_AL - 4)); 1161 } 1162 1163 break; 1164 case 2: 1165 insn->opcodeRegister = (Reg)(MODRM_REG_AX 1166 + ((bFromREX(insn->rexPrefix) << 3) 1167 | insn->opcodeModifier)); 1168 break; 1169 case 4: 1170 insn->opcodeRegister = (Reg)(MODRM_REG_EAX 1171 + ((bFromREX(insn->rexPrefix) << 3) 1172 | insn->opcodeModifier)); 1173 break; 1174 case 8: 1175 insn->opcodeRegister = (Reg)(MODRM_REG_RAX 1176 + ((bFromREX(insn->rexPrefix) << 3) 1177 | insn->opcodeModifier)); 1178 break; 1179 } 1180 1181 return 0; 1182} 1183 1184/* 1185 * readImmediate - Consumes an immediate operand from an instruction, given the 1186 * desired operand size. 1187 * 1188 * @param insn - The instruction whose operand is to be read. 1189 * @param size - The width (in bytes) of the operand. 1190 * @return - 0 if the immediate was successfully consumed; nonzero 1191 * otherwise. 
1192 */ 1193static int readImmediate(struct InternalInstruction* insn, uint8_t size) { 1194 uint8_t imm8; 1195 uint16_t imm16; 1196 uint32_t imm32; 1197 uint64_t imm64; 1198 1199 dbgprintf(insn, "readImmediate()"); 1200 1201 if (insn->numImmediatesConsumed == 2) { 1202 debug("Already consumed two immediates"); 1203 return -1; 1204 } 1205 1206 if (size == 0) 1207 size = insn->immediateSize; 1208 else 1209 insn->immediateSize = size; 1210 1211 switch (size) { 1212 case 1: 1213 if (consumeByte(insn, &imm8)) 1214 return -1; 1215 insn->immediates[insn->numImmediatesConsumed] = imm8; 1216 break; 1217 case 2: 1218 if (consumeUInt16(insn, &imm16)) 1219 return -1; 1220 insn->immediates[insn->numImmediatesConsumed] = imm16; 1221 break; 1222 case 4: 1223 if (consumeUInt32(insn, &imm32)) 1224 return -1; 1225 insn->immediates[insn->numImmediatesConsumed] = imm32; 1226 break; 1227 case 8: 1228 if (consumeUInt64(insn, &imm64)) 1229 return -1; 1230 insn->immediates[insn->numImmediatesConsumed] = imm64; 1231 break; 1232 } 1233 1234 insn->numImmediatesConsumed++; 1235 1236 return 0; 1237} 1238 1239/* 1240 * readOperands - Consults the specifier for an instruction and consumes all 1241 * operands for that instruction, interpreting them as it goes. 1242 * 1243 * @param insn - The instruction whose operands are to be read and interpreted. 1244 * @return - 0 if all operands could be read; nonzero otherwise. 
1245 */ 1246static int readOperands(struct InternalInstruction* insn) { 1247 int index; 1248 1249 dbgprintf(insn, "readOperands()"); 1250 1251 for (index = 0; index < X86_MAX_OPERANDS; ++index) { 1252 switch (insn->spec->operands[index].encoding) { 1253 case ENCODING_NONE: 1254 break; 1255 case ENCODING_REG: 1256 case ENCODING_RM: 1257 if (readModRM(insn)) 1258 return -1; 1259 if (fixupReg(insn, &insn->spec->operands[index])) 1260 return -1; 1261 break; 1262 case ENCODING_CB: 1263 case ENCODING_CW: 1264 case ENCODING_CD: 1265 case ENCODING_CP: 1266 case ENCODING_CO: 1267 case ENCODING_CT: 1268 dbgprintf(insn, "We currently don't hande code-offset encodings"); 1269 return -1; 1270 case ENCODING_IB: 1271 if (readImmediate(insn, 1)) 1272 return -1; 1273 if (insn->spec->operands[index].type == TYPE_IMM3 && 1274 insn->immediates[insn->numImmediatesConsumed - 1] > 7) 1275 return -1; 1276 break; 1277 case ENCODING_IW: 1278 if (readImmediate(insn, 2)) 1279 return -1; 1280 break; 1281 case ENCODING_ID: 1282 if (readImmediate(insn, 4)) 1283 return -1; 1284 break; 1285 case ENCODING_IO: 1286 if (readImmediate(insn, 8)) 1287 return -1; 1288 break; 1289 case ENCODING_Iv: 1290 if (readImmediate(insn, insn->immediateSize)) 1291 return -1; 1292 break; 1293 case ENCODING_Ia: 1294 if (readImmediate(insn, insn->addressSize)) 1295 return -1; 1296 break; 1297 case ENCODING_RB: 1298 if (readOpcodeRegister(insn, 1)) 1299 return -1; 1300 break; 1301 case ENCODING_RW: 1302 if (readOpcodeRegister(insn, 2)) 1303 return -1; 1304 break; 1305 case ENCODING_RD: 1306 if (readOpcodeRegister(insn, 4)) 1307 return -1; 1308 break; 1309 case ENCODING_RO: 1310 if (readOpcodeRegister(insn, 8)) 1311 return -1; 1312 break; 1313 case ENCODING_Rv: 1314 if (readOpcodeRegister(insn, 0)) 1315 return -1; 1316 break; 1317 case ENCODING_I: 1318 if (readOpcodeModifier(insn)) 1319 return -1; 1320 case ENCODING_DUP: 1321 break; 1322 default: 1323 dbgprintf(insn, "Encountered an operand with an unknown encoding."); 
1324 return -1; 1325 } 1326 } 1327 1328 return 0; 1329} 1330 1331/* 1332 * decodeInstruction - Reads and interprets a full instruction provided by the 1333 * user. 1334 * 1335 * @param insn - A pointer to the instruction to be populated. Must be 1336 * pre-allocated. 1337 * @param reader - The function to be used to read the instruction's bytes. 1338 * @param readerArg - A generic argument to be passed to the reader to store 1339 * any internal state. 1340 * @param logger - If non-NULL, the function to be used to write log messages 1341 * and warnings. 1342 * @param loggerArg - A generic argument to be passed to the logger to store 1343 * any internal state. 1344 * @param startLoc - The address (in the reader's address space) of the first 1345 * byte in the instruction. 1346 * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to 1347 * decode the instruction in. 1348 * @return - 0 if the instruction's memory could be read; nonzero if 1349 * not. 1350 */ 1351int decodeInstruction(struct InternalInstruction* insn, 1352 byteReader_t reader, 1353 void* readerArg, 1354 dlog_t logger, 1355 void* loggerArg, 1356 uint64_t startLoc, 1357 DisassemblerMode mode) { 1358 memset(insn, 0, sizeof(struct InternalInstruction)); 1359 1360 insn->reader = reader; 1361 insn->readerArg = readerArg; 1362 insn->dlog = logger; 1363 insn->dlogArg = loggerArg; 1364 insn->startLocation = startLoc; 1365 insn->readerCursor = startLoc; 1366 insn->mode = mode; 1367 insn->numImmediatesConsumed = 0; 1368 1369 if (readPrefixes(insn) || 1370 readOpcode(insn) || 1371 getID(insn) || 1372 insn->instructionID == 0 || 1373 readOperands(insn)) 1374 return -1; 1375 1376 insn->length = insn->readerCursor - insn->startLocation; 1377 1378 dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", 1379 startLoc, insn->readerCursor, insn->length); 1380 1381 if (insn->length > 15) 1382 dbgprintf(insn, "Instruction exceeds 15-byte limit"); 1383 1384 return 0; 1385} 1386