X86DisassemblerDecoder.c revision 5117709a1d71fc34225decde0c7fe6a3ae29c063
1/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==* 2 * 3 * The LLVM Compiler Infrastructure 4 * 5 * This file is distributed under the University of Illinois Open Source 6 * License. See LICENSE.TXT for details. 7 * 8 *===----------------------------------------------------------------------===* 9 * 10 * This file is part of the X86 Disassembler. 11 * It contains the implementation of the instruction decoder. 12 * Documentation for the disassembler can be found in X86Disassembler.h. 13 * 14 *===----------------------------------------------------------------------===*/ 15 16#include <stdarg.h> /* for va_*() */ 17#include <stdio.h> /* for vsnprintf() */ 18#include <stdlib.h> /* for exit() */ 19#include <string.h> /* for memset() */ 20 21#include "X86DisassemblerDecoder.h" 22 23#include "X86GenDisassemblerTables.inc" 24 25#define TRUE 1 26#define FALSE 0 27 28typedef int8_t bool; 29 30#ifndef NDEBUG 31#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0) 32#else 33#define debug(s) do { } while (0) 34#endif 35 36 37/* 38 * contextForAttrs - Client for the instruction context table. Takes a set of 39 * attributes and returns the appropriate decode context. 40 * 41 * @param attrMask - Attributes, from the enumeration attributeBits. 42 * @return - The InstructionContext to use when looking up an 43 * an instruction with these attributes. 44 */ 45static InstructionContext contextForAttrs(uint8_t attrMask) { 46 return CONTEXTS_SYM[attrMask]; 47} 48 49/* 50 * modRMRequired - Reads the appropriate instruction table to determine whether 51 * the ModR/M byte is required to decode a particular instruction. 52 * 53 * @param type - The opcode type (i.e., how many bytes it has). 54 * @param insnContext - The context for the instruction, as returned by 55 * contextForAttrs. 56 * @param opcode - The last byte of the instruction's opcode, not counting 57 * ModR/M extensions and escapes. 58 * @return - TRUE if the ModR/M byte is required, FALSE otherwise. 59 */ 60static int modRMRequired(OpcodeType type, 61 InstructionContext insnContext, 62 uint8_t opcode) { 63 const struct ContextDecision* decision = 0; 64 65 switch (type) { 66 case ONEBYTE: 67 decision = &ONEBYTE_SYM; 68 break; 69 case TWOBYTE: 70 decision = &TWOBYTE_SYM; 71 break; 72 case THREEBYTE_38: 73 decision = &THREEBYTE38_SYM; 74 break; 75 case THREEBYTE_3A: 76 decision = &THREEBYTE3A_SYM; 77 break; 78 } 79 80 return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. 81 modrm_type != MODRM_ONEENTRY; 82 83 return 0; 84} 85 86/* 87 * decode - Reads the appropriate instruction table to obtain the unique ID of 88 * an instruction. 89 * 90 * @param type - See modRMRequired(). 91 * @param insnContext - See modRMRequired(). 92 * @param opcode - See modRMRequired(). 93 * @param modRM - The ModR/M byte if required, or any value if not. 94 * @return - The UID of the instruction, or 0 on failure. 95 */ 96static InstrUID decode(OpcodeType type, 97 InstructionContext insnContext, 98 uint8_t opcode, 99 uint8_t modRM) { 100 struct ModRMDecision* dec; 101 102 switch (type) { 103 default: 104 debug("Unknown opcode type"); 105 return 0; 106 case ONEBYTE: 107 dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 108 break; 109 case TWOBYTE: 110 dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 111 break; 112 case THREEBYTE_38: 113 dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 114 break; 115 case THREEBYTE_3A: 116 dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 117 break; 118 } 119 120 switch (dec->modrm_type) { 121 default: 122 debug("Corrupt table! Unknown modrm_type"); 123 return 0; 124 case MODRM_ONEENTRY: 125 return dec->instructionIDs[0]; 126 case MODRM_SPLITRM: 127 if (modFromModRM(modRM) == 0x3) 128 return dec->instructionIDs[1]; 129 else 130 return dec->instructionIDs[0]; 131 case MODRM_FULL: 132 return dec->instructionIDs[modRM]; 133 } 134} 135 136/* 137 * specifierForUID - Given a UID, returns the name and operand specification for 138 * that instruction. 139 * 140 * @param uid - The unique ID for the instruction. This should be returned by 141 * decode(); specifierForUID will not check bounds. 142 * @return - A pointer to the specification for that instruction. 143 */ 144static struct InstructionSpecifier* specifierForUID(InstrUID uid) { 145 return &INSTRUCTIONS_SYM[uid]; 146} 147 148/* 149 * consumeByte - Uses the reader function provided by the user to consume one 150 * byte from the instruction's memory and advance the cursor. 151 * 152 * @param insn - The instruction with the reader function to use. The cursor 153 * for this instruction is advanced. 154 * @param byte - A pointer to a pre-allocated memory buffer to be populated 155 * with the data read. 156 * @return - 0 if the read was successful; nonzero otherwise. 157 */ 158static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { 159 int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); 160 161 if (!ret) 162 ++(insn->readerCursor); 163 164 return ret; 165} 166 167/* 168 * lookAtByte - Like consumeByte, but does not advance the cursor. 169 * 170 * @param insn - See consumeByte(). 171 * @param byte - See consumeByte(). 172 * @return - See consumeByte(). 173 */ 174static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { 175 return insn->reader(insn->readerArg, byte, insn->readerCursor); 176} 177 178static void unconsumeByte(struct InternalInstruction* insn) { 179 insn->readerCursor--; 180} 181 182#define CONSUME_FUNC(name, type) \ 183 static int name(struct InternalInstruction* insn, type* ptr) { \ 184 type combined = 0; \ 185 unsigned offset; \ 186 for (offset = 0; offset < sizeof(type); ++offset) { \ 187 uint8_t byte; \ 188 int ret = insn->reader(insn->readerArg, \ 189 &byte, \ 190 insn->readerCursor + offset); \ 191 if (ret) \ 192 return ret; \ 193 combined = combined | ((type)byte << ((type)offset * 8)); \ 194 } \ 195 *ptr = combined; \ 196 insn->readerCursor += sizeof(type); \ 197 return 0; \ 198 } 199 200/* 201 * consume* - Use the reader function provided by the user to consume data 202 * values of various sizes from the instruction's memory and advance the 203 * cursor appropriately. These readers perform endian conversion. 204 * 205 * @param insn - See consumeByte(). 206 * @param ptr - A pointer to a pre-allocated memory of appropriate size to 207 * be populated with the data read. 208 * @return - See consumeByte(). 209 */ 210CONSUME_FUNC(consumeInt8, int8_t) 211CONSUME_FUNC(consumeInt16, int16_t) 212CONSUME_FUNC(consumeInt32, int32_t) 213CONSUME_FUNC(consumeUInt16, uint16_t) 214CONSUME_FUNC(consumeUInt32, uint32_t) 215CONSUME_FUNC(consumeUInt64, uint64_t) 216 217/* 218 * dbgprintf - Uses the logging function provided by the user to log a single 219 * message, typically without a carriage-return. 220 * 221 * @param insn - The instruction containing the logging function. 222 * @param format - See printf(). 223 * @param ... - See printf(). 224 */ 225static void dbgprintf(struct InternalInstruction* insn, 226 const char* format, 227 ...) { 228 char buffer[256]; 229 va_list ap; 230 231 if (!insn->dlog) 232 return; 233 234 va_start(ap, format); 235 (void)vsnprintf(buffer, sizeof(buffer), format, ap); 236 va_end(ap); 237 238 insn->dlog(insn->dlogArg, buffer); 239 240 return; 241} 242 243/* 244 * setPrefixPresent - Marks that a particular prefix is present at a particular 245 * location. 246 * 247 * @param insn - The instruction to be marked as having the prefix. 248 * @param prefix - The prefix that is present. 249 * @param location - The location where the prefix is located (in the address 250 * space of the instruction's reader). 251 */ 252static void setPrefixPresent(struct InternalInstruction* insn, 253 uint8_t prefix, 254 uint64_t location) 255{ 256 insn->prefixPresent[prefix] = 1; 257 insn->prefixLocations[prefix] = location; 258} 259 260/* 261 * isPrefixAtLocation - Queries an instruction to determine whether a prefix is 262 * present at a given location. 263 * 264 * @param insn - The instruction to be queried. 265 * @param prefix - The prefix. 266 * @param location - The location to query. 267 * @return - Whether the prefix is at that location. 268 */ 269static BOOL isPrefixAtLocation(struct InternalInstruction* insn, 270 uint8_t prefix, 271 uint64_t location) 272{ 273 if (insn->prefixPresent[prefix] == 1 && 274 insn->prefixLocations[prefix] == location) 275 return TRUE; 276 else 277 return FALSE; 278} 279 280/* 281 * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the 282 * instruction as having them. Also sets the instruction's default operand, 283 * address, and other relevant data sizes to report operands correctly. 284 * 285 * @param insn - The instruction whose prefixes are to be read. 286 * @return - 0 if the instruction could be read until the end of the prefix 287 * bytes, and no prefixes conflicted; nonzero otherwise. 288 */ 289static int readPrefixes(struct InternalInstruction* insn) { 290 BOOL isPrefix = TRUE; 291 BOOL prefixGroups[4] = { FALSE }; 292 uint64_t prefixLocation; 293 uint8_t byte; 294 295 BOOL hasAdSize = FALSE; 296 BOOL hasOpSize = FALSE; 297 298 dbgprintf(insn, "readPrefixes()"); 299 300 while (isPrefix) { 301 prefixLocation = insn->readerCursor; 302 303 if (consumeByte(insn, &byte)) 304 return -1; 305 306 switch (byte) { 307 case 0xf0: /* LOCK */ 308 case 0xf2: /* REPNE/REPNZ */ 309 case 0xf3: /* REP or REPE/REPZ */ 310 if (prefixGroups[0]) 311 dbgprintf(insn, "Redundant Group 1 prefix"); 312 prefixGroups[0] = TRUE; 313 setPrefixPresent(insn, byte, prefixLocation); 314 break; 315 case 0x2e: /* CS segment override -OR- Branch not taken */ 316 case 0x36: /* SS segment override -OR- Branch taken */ 317 case 0x3e: /* DS segment override */ 318 case 0x26: /* ES segment override */ 319 case 0x64: /* FS segment override */ 320 case 0x65: /* GS segment override */ 321 switch (byte) { 322 case 0x2e: 323 insn->segmentOverride = SEG_OVERRIDE_CS; 324 break; 325 case 0x36: 326 insn->segmentOverride = SEG_OVERRIDE_SS; 327 break; 328 case 0x3e: 329 insn->segmentOverride = SEG_OVERRIDE_DS; 330 break; 331 case 0x26: 332 insn->segmentOverride = SEG_OVERRIDE_ES; 333 break; 334 case 0x64: 335 insn->segmentOverride = SEG_OVERRIDE_FS; 336 break; 337 case 0x65: 338 insn->segmentOverride = SEG_OVERRIDE_GS; 339 break; 340 default: 341 debug("Unhandled override"); 342 return -1; 343 } 344 if (prefixGroups[1]) 345 dbgprintf(insn, "Redundant Group 2 prefix"); 346 prefixGroups[1] = TRUE; 347 setPrefixPresent(insn, byte, prefixLocation); 348 break; 349 case 0x66: /* Operand-size override */ 350 if (prefixGroups[2]) 351 dbgprintf(insn, "Redundant Group 3 prefix"); 352 prefixGroups[2] = TRUE; 353 hasOpSize = TRUE; 354 setPrefixPresent(insn, byte, prefixLocation); 355 break; 356 case 0x67: /* Address-size override */ 357 if (prefixGroups[3]) 358 dbgprintf(insn, "Redundant Group 4 prefix"); 359 prefixGroups[3] = TRUE; 360 hasAdSize = TRUE; 361 setPrefixPresent(insn, byte, prefixLocation); 362 break; 363 default: /* Not a prefix byte */ 364 isPrefix = FALSE; 365 break; 366 } 367 368 if (isPrefix) 369 dbgprintf(insn, "Found prefix 0x%hhx", byte); 370 } 371 372 if (insn->mode == MODE_64BIT) { 373 if ((byte & 0xf0) == 0x40) { 374 uint8_t opcodeByte; 375 376 if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { 377 dbgprintf(insn, "Redundant REX prefix"); 378 return -1; 379 } 380 381 insn->rexPrefix = byte; 382 insn->necessaryPrefixLocation = insn->readerCursor - 2; 383 384 dbgprintf(insn, "Found REX prefix 0x%hhx", byte); 385 } else { 386 unconsumeByte(insn); 387 insn->necessaryPrefixLocation = insn->readerCursor - 1; 388 } 389 } else { 390 unconsumeByte(insn); 391 } 392 393 if (insn->mode == MODE_16BIT) { 394 insn->registerSize = (hasOpSize ? 4 : 2); 395 insn->addressSize = (hasAdSize ? 4 : 2); 396 insn->displacementSize = (hasAdSize ? 4 : 2); 397 insn->immediateSize = (hasOpSize ? 4 : 2); 398 } else if (insn->mode == MODE_32BIT) { 399 insn->registerSize = (hasOpSize ? 2 : 4); 400 insn->addressSize = (hasAdSize ? 2 : 4); 401 insn->displacementSize = (hasAdSize ? 2 : 4); 402 insn->immediateSize = (hasOpSize ? 2 : 4); 403 } else if (insn->mode == MODE_64BIT) { 404 if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { 405 insn->registerSize = 8; 406 insn->addressSize = (hasAdSize ? 4 : 8); 407 insn->displacementSize = 4; 408 insn->immediateSize = 4; 409 } else if (insn->rexPrefix) { 410 insn->registerSize = (hasOpSize ? 2 : 4); 411 insn->addressSize = (hasAdSize ? 4 : 8); 412 insn->displacementSize = (hasOpSize ? 2 : 4); 413 insn->immediateSize = (hasOpSize ? 2 : 4); 414 } else { 415 insn->registerSize = (hasOpSize ? 2 : 4); 416 insn->addressSize = (hasAdSize ? 4 : 8); 417 insn->displacementSize = (hasOpSize ? 2 : 4); 418 insn->immediateSize = (hasOpSize ? 2 : 4); 419 } 420 } 421 422 return 0; 423} 424 425/* 426 * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of 427 * extended or escape opcodes). 428 * 429 * @param insn - The instruction whose opcode is to be read. 430 * @return - 0 if the opcode could be read successfully; nonzero otherwise. 431 */ 432static int readOpcode(struct InternalInstruction* insn) { 433 /* Determine the length of the primary opcode */ 434 435 uint8_t current; 436 437 dbgprintf(insn, "readOpcode()"); 438 439 insn->opcodeType = ONEBYTE; 440 if (consumeByte(insn, ¤t)) 441 return -1; 442 443 if (current == 0x0f) { 444 dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current); 445 446 insn->twoByteEscape = current; 447 448 if (consumeByte(insn, ¤t)) 449 return -1; 450 451 if (current == 0x38) { 452 dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); 453 454 insn->threeByteEscape = current; 455 456 if (consumeByte(insn, ¤t)) 457 return -1; 458 459 insn->opcodeType = THREEBYTE_38; 460 } else if (current == 0x3a) { 461 dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); 462 463 insn->threeByteEscape = current; 464 465 if (consumeByte(insn, ¤t)) 466 return -1; 467 468 insn->opcodeType = THREEBYTE_3A; 469 } else { 470 dbgprintf(insn, "Didn't find a three-byte escape prefix"); 471 472 insn->opcodeType = TWOBYTE; 473 } 474 } 475 476 /* 477 * At this point we have consumed the full opcode. 478 * Anything we consume from here on must be unconsumed. 479 */ 480 481 insn->opcode = current; 482 483 return 0; 484} 485 486static int readModRM(struct InternalInstruction* insn); 487 488/* 489 * getIDWithAttrMask - Determines the ID of an instruction, consuming 490 * the ModR/M byte as appropriate for extended and escape opcodes, 491 * and using a supplied attribute mask. 492 * 493 * @param instructionID - A pointer whose target is filled in with the ID of the 494 * instruction. 495 * @param insn - The instruction whose ID is to be determined. 496 * @param attrMask - The attribute mask to search. 497 * @return - 0 if the ModR/M could be read when needed or was not 498 * needed; nonzero otherwise. 499 */ 500static int getIDWithAttrMask(uint16_t* instructionID, 501 struct InternalInstruction* insn, 502 uint8_t attrMask) { 503 BOOL hasModRMExtension; 504 505 uint8_t instructionClass; 506 507 instructionClass = contextForAttrs(attrMask); 508 509 hasModRMExtension = modRMRequired(insn->opcodeType, 510 instructionClass, 511 insn->opcode); 512 513 if (hasModRMExtension) { 514 readModRM(insn); 515 516 *instructionID = decode(insn->opcodeType, 517 instructionClass, 518 insn->opcode, 519 insn->modRM); 520 } else { 521 *instructionID = decode(insn->opcodeType, 522 instructionClass, 523 insn->opcode, 524 0); 525 } 526 527 return 0; 528} 529 530/* 531 * is16BitEquivalent - Determines whether two instruction names refer to 532 * equivalent instructions but one is 16-bit whereas the other is not. 533 * 534 * @param orig - The instruction that is not 16-bit 535 * @param equiv - The instruction that is 16-bit 536 */ 537static BOOL is16BitEquvalent(const char* orig, const char* equiv) { 538 off_t i; 539 540 for (i = 0;; i++) { 541 if (orig[i] == '\0' && equiv[i] == '\0') 542 return TRUE; 543 if (orig[i] == '\0' || equiv[i] == '\0') 544 return FALSE; 545 if (orig[i] != equiv[i]) { 546 if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') 547 continue; 548 if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') 549 continue; 550 if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') 551 continue; 552 return FALSE; 553 } 554 } 555} 556 557/* 558 * is64BitEquivalent - Determines whether two instruction names refer to 559 * equivalent instructions but one is 64-bit whereas the other is not. 560 * 561 * @param orig - The instruction that is not 64-bit 562 * @param equiv - The instruction that is 64-bit 563 */ 564static BOOL is64BitEquivalent(const char* orig, const char* equiv) { 565 off_t i; 566 567 for (i = 0;; i++) { 568 if (orig[i] == '\0' && equiv[i] == '\0') 569 return TRUE; 570 if (orig[i] == '\0' || equiv[i] == '\0') 571 return FALSE; 572 if (orig[i] != equiv[i]) { 573 if ((orig[i] == 'W' || orig[i] == 'L') && equiv[i] == 'Q') 574 continue; 575 if ((orig[i] == '1' || orig[i] == '3') && equiv[i] == '6') 576 continue; 577 if ((orig[i] == '6' || orig[i] == '2') && equiv[i] == '4') 578 continue; 579 return FALSE; 580 } 581 } 582} 583 584 585/* 586 * getID - Determines the ID of an instruction, consuming the ModR/M byte as 587 * appropriate for extended and escape opcodes. Determines the attributes and 588 * context for the instruction before doing so. 589 * 590 * @param insn - The instruction whose ID is to be determined. 591 * @return - 0 if the ModR/M could be read when needed or was not needed; 592 * nonzero otherwise. 593 */ 594static int getID(struct InternalInstruction* insn) { 595 uint8_t attrMask; 596 uint16_t instructionID; 597 598 dbgprintf(insn, "getID()"); 599 600 attrMask = ATTR_NONE; 601 602 if (insn->mode == MODE_64BIT) 603 attrMask |= ATTR_64BIT; 604 605 if (insn->rexPrefix & 0x08) 606 attrMask |= ATTR_REXW; 607 608 if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) 609 attrMask |= ATTR_OPSIZE; 610 else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) 611 attrMask |= ATTR_XS; 612 else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) 613 attrMask |= ATTR_XD; 614 615 if (getIDWithAttrMask(&instructionID, insn, attrMask)) 616 return -1; 617 618 /* The following clauses compensate for limitations of the tables. */ 619 620 if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) { 621 /* 622 * Although for SSE instructions it is usually necessary to treat REX.W+F2 623 * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is 624 * an occasional instruction where F2 is incidental and REX.W is the more 625 * significant. If the decoded instruction is 32-bit and adding REX.W 626 * instead of F2 changes a 32 to a 64, we adopt the new encoding. 627 */ 628 629 struct InstructionSpecifier* spec; 630 uint16_t instructionIDWithREXw; 631 struct InstructionSpecifier* specWithREXw; 632 633 spec = specifierForUID(instructionID); 634 635 if (getIDWithAttrMask(&instructionIDWithREXw, 636 insn, 637 attrMask & (~ATTR_XD))) { 638 /* 639 * Decoding with REX.w would yield nothing; give up and return original 640 * decode. 641 */ 642 643 insn->instructionID = instructionID; 644 insn->spec = spec; 645 return 0; 646 } 647 648 specWithREXw = specifierForUID(instructionIDWithREXw); 649 650 if (is64BitEquivalent(spec->name, specWithREXw->name)) { 651 insn->instructionID = instructionIDWithREXw; 652 insn->spec = specWithREXw; 653 } else { 654 insn->instructionID = instructionID; 655 insn->spec = spec; 656 } 657 return 0; 658 } 659 660 if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) { 661 /* 662 * The instruction tables make no distinction between instructions that 663 * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a 664 * particular spot (i.e., many MMX operations). In general we're 665 * conservative, but in the specific case where OpSize is present but not 666 * in the right place we check if there's a 16-bit operation. 667 */ 668 669 struct InstructionSpecifier* spec; 670 uint16_t instructionIDWithOpsize; 671 struct InstructionSpecifier* specWithOpsize; 672 673 spec = specifierForUID(instructionID); 674 675 if (getIDWithAttrMask(&instructionIDWithOpsize, 676 insn, 677 attrMask | ATTR_OPSIZE)) { 678 /* 679 * ModRM required with OpSize but not present; give up and return version 680 * without OpSize set 681 */ 682 683 insn->instructionID = instructionID; 684 insn->spec = spec; 685 return 0; 686 } 687 688 specWithOpsize = specifierForUID(instructionIDWithOpsize); 689 690 if (is16BitEquvalent(spec->name, specWithOpsize->name)) { 691 insn->instructionID = instructionIDWithOpsize; 692 insn->spec = specWithOpsize; 693 } else { 694 insn->instructionID = instructionID; 695 insn->spec = spec; 696 } 697 return 0; 698 } 699 700 insn->instructionID = instructionID; 701 insn->spec = specifierForUID(insn->instructionID); 702 703 return 0; 704} 705 706/* 707 * readSIB - Consumes the SIB byte to determine addressing information for an 708 * instruction. 709 * 710 * @param insn - The instruction whose SIB byte is to be read. 711 * @return - 0 if the SIB byte was successfully read; nonzero otherwise. 712 */ 713static int readSIB(struct InternalInstruction* insn) { 714 SIBIndex sibIndexBase = 0; 715 SIBBase sibBaseBase = 0; 716 uint8_t index, base; 717 718 dbgprintf(insn, "readSIB()"); 719 720 if (insn->consumedSIB) 721 return 0; 722 723 insn->consumedSIB = TRUE; 724 725 switch (insn->addressSize) { 726 case 2: 727 dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); 728 return -1; 729 break; 730 case 4: 731 sibIndexBase = SIB_INDEX_EAX; 732 sibBaseBase = SIB_BASE_EAX; 733 break; 734 case 8: 735 sibIndexBase = SIB_INDEX_RAX; 736 sibBaseBase = SIB_BASE_RAX; 737 break; 738 } 739 740 if (consumeByte(insn, &insn->sib)) 741 return -1; 742 743 index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); 744 745 switch (index) { 746 case 0x4: 747 insn->sibIndex = SIB_INDEX_NONE; 748 break; 749 default: 750 insn->sibIndex = (EABase)(sibIndexBase + index); 751 if (insn->sibIndex == SIB_INDEX_sib || 752 insn->sibIndex == SIB_INDEX_sib64) 753 insn->sibIndex = SIB_INDEX_NONE; 754 break; 755 } 756 757 switch (scaleFromSIB(insn->sib)) { 758 case 0: 759 insn->sibScale = 1; 760 break; 761 case 1: 762 insn->sibScale = 2; 763 break; 764 case 2: 765 insn->sibScale = 4; 766 break; 767 case 3: 768 insn->sibScale = 8; 769 break; 770 } 771 772 base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); 773 774 switch (base) { 775 case 0x5: 776 switch (modFromModRM(insn->modRM)) { 777 case 0x0: 778 insn->eaDisplacement = EA_DISP_32; 779 insn->sibBase = SIB_BASE_NONE; 780 break; 781 case 0x1: 782 insn->eaDisplacement = EA_DISP_8; 783 insn->sibBase = (insn->addressSize == 4 ? 784 SIB_BASE_EBP : SIB_BASE_RBP); 785 break; 786 case 0x2: 787 insn->eaDisplacement = EA_DISP_32; 788 insn->sibBase = (insn->addressSize == 4 ? 789 SIB_BASE_EBP : SIB_BASE_RBP); 790 break; 791 case 0x3: 792 debug("Cannot have Mod = 0b11 and a SIB byte"); 793 return -1; 794 } 795 break; 796 default: 797 insn->sibBase = (EABase)(sibBaseBase + base); 798 break; 799 } 800 801 return 0; 802} 803 804/* 805 * readDisplacement - Consumes the displacement of an instruction. 806 * 807 * @param insn - The instruction whose displacement is to be read. 808 * @return - 0 if the displacement byte was successfully read; nonzero 809 * otherwise. 810 */ 811static int readDisplacement(struct InternalInstruction* insn) { 812 int8_t d8; 813 int16_t d16; 814 int32_t d32; 815 816 dbgprintf(insn, "readDisplacement()"); 817 818 if (insn->consumedDisplacement) 819 return 0; 820 821 insn->consumedDisplacement = TRUE; 822 823 switch (insn->eaDisplacement) { 824 case EA_DISP_NONE: 825 insn->consumedDisplacement = FALSE; 826 break; 827 case EA_DISP_8: 828 if (consumeInt8(insn, &d8)) 829 return -1; 830 insn->displacement = d8; 831 break; 832 case EA_DISP_16: 833 if (consumeInt16(insn, &d16)) 834 return -1; 835 insn->displacement = d16; 836 break; 837 case EA_DISP_32: 838 if (consumeInt32(insn, &d32)) 839 return -1; 840 insn->displacement = d32; 841 break; 842 } 843 844 insn->consumedDisplacement = TRUE; 845 return 0; 846} 847 848/* 849 * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and 850 * displacement) for an instruction and interprets it. 851 * 852 * @param insn - The instruction whose addressing information is to be read. 853 * @return - 0 if the information was successfully read; nonzero otherwise. 854 */ 855static int readModRM(struct InternalInstruction* insn) { 856 uint8_t mod, rm, reg; 857 858 dbgprintf(insn, "readModRM()"); 859 860 if (insn->consumedModRM) 861 return 0; 862 863 consumeByte(insn, &insn->modRM); 864 insn->consumedModRM = TRUE; 865 866 mod = modFromModRM(insn->modRM); 867 rm = rmFromModRM(insn->modRM); 868 reg = regFromModRM(insn->modRM); 869 870 /* 871 * This goes by insn->registerSize to pick the correct register, which messes 872 * up if we're using (say) XMM or 8-bit register operands. That gets fixed in 873 * fixupReg(). 874 */ 875 switch (insn->registerSize) { 876 case 2: 877 insn->regBase = MODRM_REG_AX; 878 insn->eaRegBase = EA_REG_AX; 879 break; 880 case 4: 881 insn->regBase = MODRM_REG_EAX; 882 insn->eaRegBase = EA_REG_EAX; 883 break; 884 case 8: 885 insn->regBase = MODRM_REG_RAX; 886 insn->eaRegBase = EA_REG_RAX; 887 break; 888 } 889 890 reg |= rFromREX(insn->rexPrefix) << 3; 891 rm |= bFromREX(insn->rexPrefix) << 3; 892 893 insn->reg = (Reg)(insn->regBase + reg); 894 895 switch (insn->addressSize) { 896 case 2: 897 insn->eaBaseBase = EA_BASE_BX_SI; 898 899 switch (mod) { 900 case 0x0: 901 if (rm == 0x6) { 902 insn->eaBase = EA_BASE_NONE; 903 insn->eaDisplacement = EA_DISP_16; 904 if (readDisplacement(insn)) 905 return -1; 906 } else { 907 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 908 insn->eaDisplacement = EA_DISP_NONE; 909 } 910 break; 911 case 0x1: 912 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 913 insn->eaDisplacement = EA_DISP_8; 914 if (readDisplacement(insn)) 915 return -1; 916 break; 917 case 0x2: 918 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 919 insn->eaDisplacement = EA_DISP_16; 920 if (readDisplacement(insn)) 921 return -1; 922 break; 923 case 0x3: 924 insn->eaBase = (EABase)(insn->eaRegBase + rm); 925 if (readDisplacement(insn)) 926 return -1; 927 break; 928 } 929 break; 930 case 4: 931 case 8: 932 insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); 933 934 switch (mod) { 935 case 0x0: 936 insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ 937 switch (rm) { 938 case 0x4: 939 case 0xc: /* in case REXW.b is set */ 940 insn->eaBase = (insn->addressSize == 4 ? 941 EA_BASE_sib : EA_BASE_sib64); 942 readSIB(insn); 943 if (readDisplacement(insn)) 944 return -1; 945 break; 946 case 0x5: 947 insn->eaBase = EA_BASE_NONE; 948 insn->eaDisplacement = EA_DISP_32; 949 if (readDisplacement(insn)) 950 return -1; 951 break; 952 default: 953 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 954 break; 955 } 956 break; 957 case 0x1: 958 case 0x2: 959 insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32); 960 switch (rm) { 961 case 0x4: 962 case 0xc: /* in case REXW.b is set */ 963 insn->eaBase = EA_BASE_sib; 964 readSIB(insn); 965 if (readDisplacement(insn)) 966 return -1; 967 break; 968 default: 969 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 970 if (readDisplacement(insn)) 971 return -1; 972 break; 973 } 974 break; 975 case 0x3: 976 insn->eaDisplacement = EA_DISP_NONE; 977 insn->eaBase = (EABase)(insn->eaRegBase + rm); 978 break; 979 } 980 break; 981 } /* switch (insn->addressSize) */ 982 983 return 0; 984} 985 986#define GENERIC_FIXUP_FUNC(name, base, prefix) \ 987 static uint8_t name(struct InternalInstruction *insn, \ 988 OperandType type, \ 989 uint8_t index, \ 990 uint8_t *valid) { \ 991 *valid = 1; \ 992 switch (type) { \ 993 default: \ 994 debug("Unhandled register type"); \ 995 *valid = 0; \ 996 return 0; \ 997 case TYPE_Rv: \ 998 return base + index; \ 999 case TYPE_R8: \ 1000 if (insn->rexPrefix && \ 1001 index >= 4 && index <= 7) { \ 1002 return prefix##_SPL + (index - 4); \ 1003 } else { \ 1004 return prefix##_AL + index; \ 1005 } \ 1006 case TYPE_R16: \ 1007 return prefix##_AX + index; \ 1008 case TYPE_R32: \ 1009 return prefix##_EAX + index; \ 1010 case TYPE_R64: \ 1011 return prefix##_RAX + index; \ 1012 case TYPE_XMM128: \ 1013 case TYPE_XMM64: \ 1014 case TYPE_XMM32: \ 1015 case TYPE_XMM: \ 1016 return prefix##_XMM0 + index; \ 1017 case TYPE_MM64: \ 1018 case TYPE_MM32: \ 1019 case TYPE_MM: \ 1020 if (index > 7) \ 1021 *valid = 0; \ 1022 return prefix##_MM0 + index; \ 1023 case TYPE_SEGMENTREG: \ 1024 if (index > 5) \ 1025 *valid = 0; \ 1026 return prefix##_ES + index; \ 1027 case TYPE_DEBUGREG: \ 1028 if (index > 7) \ 1029 *valid = 0; \ 1030 return prefix##_DR0 + index; \ 1031 case TYPE_CONTROLREG: \ 1032 if (index > 8) \ 1033 *valid = 0; \ 1034 return prefix##_CR0 + index; \ 1035 } \ 1036 } 1037 1038/* 1039 * fixup*Value - Consults an operand type to determine the meaning of the 1040 * reg or R/M field. If the operand is an XMM operand, for example, an 1041 * operand would be XMM0 instead of AX, which readModRM() would otherwise 1042 * misinterpret it as. 1043 * 1044 * @param insn - The instruction containing the operand. 1045 * @param type - The operand type. 1046 * @param index - The existing value of the field as reported by readModRM(). 1047 * @param valid - The address of a uint8_t. The target is set to 1 if the 1048 * field is valid for the register class; 0 if not. 1049 * @return - The proper value. 1050 */ 1051GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG) 1052GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) 1053 1054/* 1055 * fixupReg - Consults an operand specifier to determine which of the 1056 * fixup*Value functions to use in correcting readModRM()'ss interpretation. 1057 * 1058 * @param insn - See fixup*Value(). 1059 * @param op - The operand specifier. 1060 * @return - 0 if fixup was successful; -1 if the register returned was 1061 * invalid for its class. 1062 */ 1063static int fixupReg(struct InternalInstruction *insn, 1064 struct OperandSpecifier *op) { 1065 uint8_t valid; 1066 1067 dbgprintf(insn, "fixupReg()"); 1068 1069 switch ((OperandEncoding)op->encoding) { 1070 default: 1071 debug("Expected a REG or R/M encoding in fixupReg"); 1072 return -1; 1073 case ENCODING_REG: 1074 insn->reg = (Reg)fixupRegValue(insn, 1075 (OperandType)op->type, 1076 insn->reg - insn->regBase, 1077 &valid); 1078 if (!valid) 1079 return -1; 1080 break; 1081 case ENCODING_RM: 1082 if (insn->eaBase >= insn->eaRegBase) { 1083 insn->eaBase = (EABase)fixupRMValue(insn, 1084 (OperandType)op->type, 1085 insn->eaBase - insn->eaRegBase, 1086 &valid); 1087 if (!valid) 1088 return -1; 1089 } 1090 break; 1091 } 1092 1093 return 0; 1094} 1095 1096/* 1097 * readOpcodeModifier - Reads an operand from the opcode field of an 1098 * instruction. Handles AddRegFrm instructions. 1099 * 1100 * @param insn - The instruction whose opcode field is to be read. 1101 * @param inModRM - Indicates that the opcode field is to be read from the 1102 * ModR/M extension; useful for escape opcodes 1103 * @return - 0 on success; nonzero otherwise. 1104 */ 1105static int readOpcodeModifier(struct InternalInstruction* insn) { 1106 dbgprintf(insn, "readOpcodeModifier()"); 1107 1108 if (insn->consumedOpcodeModifier) 1109 return 0; 1110 1111 insn->consumedOpcodeModifier = TRUE; 1112 1113 switch (insn->spec->modifierType) { 1114 default: 1115 debug("Unknown modifier type."); 1116 return -1; 1117 case MODIFIER_NONE: 1118 debug("No modifier but an operand expects one."); 1119 return -1; 1120 case MODIFIER_OPCODE: 1121 insn->opcodeModifier = insn->opcode - insn->spec->modifierBase; 1122 return 0; 1123 case MODIFIER_MODRM: 1124 insn->opcodeModifier = insn->modRM - insn->spec->modifierBase; 1125 return 0; 1126 } 1127} 1128 1129/* 1130 * readOpcodeRegister - Reads an operand from the opcode field of an 1131 * instruction and interprets it appropriately given the operand width. 1132 * Handles AddRegFrm instructions. 1133 * 1134 * @param insn - See readOpcodeModifier(). 1135 * @param size - The width (in bytes) of the register being specified. 1136 * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means 1137 * RAX. 1138 * @return - 0 on success; nonzero otherwise. 1139 */ 1140static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { 1141 dbgprintf(insn, "readOpcodeRegister()"); 1142 1143 if (readOpcodeModifier(insn)) 1144 return -1; 1145 1146 if (size == 0) 1147 size = insn->registerSize; 1148 1149 switch (size) { 1150 case 1: 1151 insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) 1152 | insn->opcodeModifier)); 1153 if (insn->rexPrefix && 1154 insn->opcodeRegister >= MODRM_REG_AL + 0x4 && 1155 insn->opcodeRegister < MODRM_REG_AL + 0x8) { 1156 insn->opcodeRegister = (Reg)(MODRM_REG_SPL 1157 + (insn->opcodeRegister - MODRM_REG_AL - 4)); 1158 } 1159 1160 break; 1161 case 2: 1162 insn->opcodeRegister = (Reg)(MODRM_REG_AX 1163 + ((bFromREX(insn->rexPrefix) << 3) 1164 | insn->opcodeModifier)); 1165 break; 1166 case 4: 1167 insn->opcodeRegister = (Reg)(MODRM_REG_EAX 1168 + ((bFromREX(insn->rexPrefix) << 3) 1169 | insn->opcodeModifier)); 1170 break; 1171 case 8: 1172 insn->opcodeRegister = (Reg)(MODRM_REG_RAX 1173 + ((bFromREX(insn->rexPrefix) << 3) 1174 | insn->opcodeModifier)); 1175 break; 1176 } 1177 1178 return 0; 1179} 1180 1181/* 1182 * readImmediate - Consumes an immediate operand from an instruction, given the 1183 * desired operand size. 1184 * 1185 * @param insn - The instruction whose operand is to be read. 1186 * @param size - The width (in bytes) of the operand. 1187 * @return - 0 if the immediate was successfully consumed; nonzero 1188 * otherwise. 1189 */ 1190static int readImmediate(struct InternalInstruction* insn, uint8_t size) { 1191 uint8_t imm8; 1192 uint16_t imm16; 1193 uint32_t imm32; 1194 uint64_t imm64; 1195 1196 dbgprintf(insn, "readImmediate()"); 1197 1198 if (insn->numImmediatesConsumed == 2) { 1199 debug("Already consumed two immediates"); 1200 return -1; 1201 } 1202 1203 if (size == 0) 1204 size = insn->immediateSize; 1205 else 1206 insn->immediateSize = size; 1207 1208 switch (size) { 1209 case 1: 1210 if (consumeByte(insn, &imm8)) 1211 return -1; 1212 insn->immediates[insn->numImmediatesConsumed] = imm8; 1213 break; 1214 case 2: 1215 if (consumeUInt16(insn, &imm16)) 1216 return -1; 1217 insn->immediates[insn->numImmediatesConsumed] = imm16; 1218 break; 1219 case 4: 1220 if (consumeUInt32(insn, &imm32)) 1221 return -1; 1222 insn->immediates[insn->numImmediatesConsumed] = imm32; 1223 break; 1224 case 8: 1225 if (consumeUInt64(insn, &imm64)) 1226 return -1; 1227 insn->immediates[insn->numImmediatesConsumed] = imm64; 1228 break; 1229 } 1230 1231 insn->numImmediatesConsumed++; 1232 1233 return 0; 1234} 1235 1236/* 1237 * readOperands - Consults the specifier for an instruction and consumes all 1238 * operands for that instruction, interpreting them as it goes. 1239 * 1240 * @param insn - The instruction whose operands are to be read and interpreted. 1241 * @return - 0 if all operands could be read; nonzero otherwise. 1242 */ 1243static int readOperands(struct InternalInstruction* insn) { 1244 int index; 1245 1246 dbgprintf(insn, "readOperands()"); 1247 1248 for (index = 0; index < X86_MAX_OPERANDS; ++index) { 1249 switch (insn->spec->operands[index].encoding) { 1250 case ENCODING_NONE: 1251 break; 1252 case ENCODING_REG: 1253 case ENCODING_RM: 1254 if (readModRM(insn)) 1255 return -1; 1256 if (fixupReg(insn, &insn->spec->operands[index])) 1257 return -1; 1258 break; 1259 case ENCODING_CB: 1260 case ENCODING_CW: 1261 case ENCODING_CD: 1262 case ENCODING_CP: 1263 case ENCODING_CO: 1264 case ENCODING_CT: 1265 dbgprintf(insn, "We currently don't hande code-offset encodings"); 1266 return -1; 1267 case ENCODING_IB: 1268 if (readImmediate(insn, 1)) 1269 return -1; 1270 if (insn->spec->operands[index].type == TYPE_IMM3 && 1271 insn->immediates[insn->numImmediatesConsumed - 1] > 7) 1272 return -1; 1273 break; 1274 case ENCODING_IW: 1275 if (readImmediate(insn, 2)) 1276 return -1; 1277 break; 1278 case ENCODING_ID: 1279 if (readImmediate(insn, 4)) 1280 return -1; 1281 break; 1282 case ENCODING_IO: 1283 if (readImmediate(insn, 8)) 1284 return -1; 1285 break; 1286 case ENCODING_Iv: 1287 if (readImmediate(insn, insn->immediateSize)) 1288 return -1; 1289 break; 1290 case ENCODING_Ia: 1291 if (readImmediate(insn, insn->addressSize)) 1292 return -1; 1293 break; 1294 case ENCODING_RB: 1295 if (readOpcodeRegister(insn, 1)) 1296 return -1; 1297 break; 1298 case ENCODING_RW: 1299 if (readOpcodeRegister(insn, 2)) 1300 return -1; 1301 break; 1302 case ENCODING_RD: 1303 if (readOpcodeRegister(insn, 4)) 1304 return -1; 1305 break; 1306 case ENCODING_RO: 1307 if (readOpcodeRegister(insn, 8)) 1308 return -1; 1309 break; 1310 case ENCODING_Rv: 1311 if (readOpcodeRegister(insn, 0)) 1312 return -1; 1313 break; 1314 case ENCODING_I: 1315 if (readOpcodeModifier(insn)) 1316 return -1; 1317 case ENCODING_DUP: 1318 break; 1319 default: 1320 dbgprintf(insn, "Encountered an operand with an unknown encoding."); 1321 return -1; 1322 } 1323 } 1324 1325 return 0; 1326} 1327 1328/* 1329 * decodeInstruction - Reads and interprets a full instruction provided by the 1330 * user. 1331 * 1332 * @param insn - A pointer to the instruction to be populated. Must be 1333 * pre-allocated. 1334 * @param reader - The function to be used to read the instruction's bytes. 1335 * @param readerArg - A generic argument to be passed to the reader to store 1336 * any internal state. 1337 * @param logger - If non-NULL, the function to be used to write log messages 1338 * and warnings. 1339 * @param loggerArg - A generic argument to be passed to the logger to store 1340 * any internal state. 1341 * @param startLoc - The address (in the reader's address space) of the first 1342 * byte in the instruction. 1343 * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to 1344 * decode the instruction in. 1345 * @return - 0 if the instruction's memory could be read; nonzero if 1346 * not. 1347 */ 1348int decodeInstruction(struct InternalInstruction* insn, 1349 byteReader_t reader, 1350 void* readerArg, 1351 dlog_t logger, 1352 void* loggerArg, 1353 uint64_t startLoc, 1354 DisassemblerMode mode) { 1355 memset(insn, 0, sizeof(struct InternalInstruction)); 1356 1357 insn->reader = reader; 1358 insn->readerArg = readerArg; 1359 insn->dlog = logger; 1360 insn->dlogArg = loggerArg; 1361 insn->startLocation = startLoc; 1362 insn->readerCursor = startLoc; 1363 insn->mode = mode; 1364 insn->numImmediatesConsumed = 0; 1365 1366 if (readPrefixes(insn) || 1367 readOpcode(insn) || 1368 getID(insn) || 1369 insn->instructionID == 0 || 1370 readOperands(insn)) 1371 return -1; 1372 1373 insn->length = insn->readerCursor - insn->startLocation; 1374 1375 dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu", 1376 startLoc, insn->readerCursor, insn->length); 1377 1378 if (insn->length > 15) 1379 dbgprintf(insn, "Instruction exceeds 15-byte limit"); 1380 1381 return 0; 1382} 1383