X86DisassemblerDecoder.c revision 0122c9051a0157908e3f4e1c604435339ac4761d
1/*===- X86DisassemblerDecoder.c - Disassembler decoder -------------*- C -*-==* 2 * 3 * The LLVM Compiler Infrastructure 4 * 5 * This file is distributed under the University of Illinois Open Source 6 * License. See LICENSE.TXT for details. 7 * 8 *===----------------------------------------------------------------------===* 9 * 10 * This file is part of the X86 Disassembler. 11 * It contains the implementation of the instruction decoder. 12 * Documentation for the disassembler can be found in X86Disassembler.h. 13 * 14 *===----------------------------------------------------------------------===*/ 15 16#include <assert.h> /* for assert() */ 17#include <stdarg.h> /* for va_*() */ 18#include <stdio.h> /* for vsnprintf() */ 19#include <stdlib.h> /* for exit() */ 20#include <string.h> /* for memset() */ 21 22#include "X86DisassemblerDecoder.h" 23 24#include "X86GenDisassemblerTables.inc" 25 26#define TRUE 1 27#define FALSE 0 28 29#ifdef __GNUC__ 30#define NORETURN __attribute__((noreturn)) 31#else 32#define NORETURN 33#endif 34 35#define unreachable(s) \ 36 do { \ 37 fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, s); \ 38 exit(-1); \ 39 } while (0); 40 41/* 42 * contextForAttrs - Client for the instruction context table. Takes a set of 43 * attributes and returns the appropriate decode context. 44 * 45 * @param attrMask - Attributes, from the enumeration attributeBits. 46 * @return - The InstructionContext to use when looking up an 47 * an instruction with these attributes. 48 */ 49static inline InstructionContext contextForAttrs(uint8_t attrMask) { 50 return CONTEXTS_SYM[attrMask]; 51} 52 53/* 54 * modRMRequired - Reads the appropriate instruction table to determine whether 55 * the ModR/M byte is required to decode a particular instruction. 56 * 57 * @param type - The opcode type (i.e., how many bytes it has). 58 * @param insnContext - The context for the instruction, as returned by 59 * contextForAttrs. 60 * @param opcode - The last byte of the instruction's opcode, not counting 61 * ModR/M extensions and escapes. 62 * @return - TRUE if the ModR/M byte is required, FALSE otherwise. 63 */ 64static inline int modRMRequired(OpcodeType type, 65 InstructionContext insnContext, 66 uint8_t opcode) { 67 const struct ContextDecision* decision; 68 69 switch (type) { 70 case ONEBYTE: 71 decision = &ONEBYTE_SYM; 72 break; 73 case TWOBYTE: 74 decision = &TWOBYTE_SYM; 75 break; 76 case THREEBYTE_38: 77 decision = &THREEBYTE38_SYM; 78 break; 79 case THREEBYTE_3A: 80 decision = &THREEBYTE3A_SYM; 81 break; 82 } 83 84 return decision->opcodeDecisions[insnContext].modRMDecisions[opcode]. 85 modrm_type != MODRM_ONEENTRY; 86 87 unreachable("Unknown opcode type"); 88 return 0; 89} 90 91/* 92 * decode - Reads the appropriate instruction table to obtain the unique ID of 93 * an instruction. 94 * 95 * @param type - See modRMRequired(). 96 * @param insnContext - See modRMRequired(). 97 * @param opcode - See modRMRequired(). 98 * @param modRM - The ModR/M byte if required, or any value if not. 99 */ 100static inline InstrUID decode(OpcodeType type, 101 InstructionContext insnContext, 102 uint8_t opcode, 103 uint8_t modRM) { 104 struct ModRMDecision* dec; 105 106 switch (type) { 107 default: 108 unreachable("Unknown opcode type"); 109 case ONEBYTE: 110 dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 111 break; 112 case TWOBYTE: 113 dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 114 break; 115 case THREEBYTE_38: 116 dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 117 break; 118 case THREEBYTE_3A: 119 dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode]; 120 break; 121 } 122 123 switch (dec->modrm_type) { 124 default: 125 unreachable("Corrupt table! Unknown modrm_type"); 126 case MODRM_ONEENTRY: 127 return dec->instructionIDs[0]; 128 case MODRM_SPLITRM: 129 if (modFromModRM(modRM) == 0x3) 130 return dec->instructionIDs[1]; 131 else 132 return dec->instructionIDs[0]; 133 case MODRM_FULL: 134 return dec->instructionIDs[modRM]; 135 } 136 137 return 0; 138} 139 140/* 141 * specifierForUID - Given a UID, returns the name and operand specification for 142 * that instruction. 143 * 144 * @param uid - The unique ID for the instruction. This should be returned by 145 * decode(); specifierForUID will not check bounds. 146 * @return - A pointer to the specification for that instruction. 147 */ 148static inline struct InstructionSpecifier* specifierForUID(InstrUID uid) { 149 return &INSTRUCTIONS_SYM[uid]; 150} 151 152/* 153 * consumeByte - Uses the reader function provided by the user to consume one 154 * byte from the instruction's memory and advance the cursor. 155 * 156 * @param insn - The instruction with the reader function to use. The cursor 157 * for this instruction is advanced. 158 * @param byte - A pointer to a pre-allocated memory buffer to be populated 159 * with the data read. 160 * @return - 0 if the read was successful; nonzero otherwise. 161 */ 162static inline int consumeByte(struct InternalInstruction* insn, uint8_t* byte) { 163 int ret = insn->reader(insn->readerArg, byte, insn->readerCursor); 164 165 if (!ret) 166 ++(insn->readerCursor); 167 168 return ret; 169} 170 171/* 172 * lookAtByte - Like consumeByte, but does not advance the cursor. 173 * 174 * @param insn - See consumeByte(). 175 * @param byte - See consumeByte(). 176 * @return - See consumeByte(). 177 */ 178static inline int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) { 179 return insn->reader(insn->readerArg, byte, insn->readerCursor); 180} 181 182static inline void unconsumeByte(struct InternalInstruction* insn) { 183 insn->readerCursor--; 184} 185 186#define CONSUME_FUNC(name, type) \ 187 static inline int name(struct InternalInstruction* insn, type* ptr) { \ 188 type combined = 0; \ 189 unsigned offset; \ 190 for (offset = 0; offset < sizeof(type); ++offset) { \ 191 uint8_t byte; \ 192 int ret = insn->reader(insn->readerArg, \ 193 &byte, \ 194 insn->readerCursor + offset); \ 195 if (ret) \ 196 return ret; \ 197 combined = combined | ((type)byte << ((type)offset * 8)); \ 198 } \ 199 *ptr = combined; \ 200 insn->readerCursor += sizeof(type); \ 201 return 0; \ 202 } 203 204/* 205 * consume* - Use the reader function provided by the user to consume data 206 * values of various sizes from the instruction's memory and advance the 207 * cursor appropriately. These readers perform endian conversion. 208 * 209 * @param insn - See consumeByte(). 210 * @param ptr - A pointer to a pre-allocated memory of appropriate size to 211 * be populated with the data read. 212 * @return - See consumeByte(). 213 */ 214CONSUME_FUNC(consumeInt8, int8_t) 215CONSUME_FUNC(consumeInt16, int16_t) 216CONSUME_FUNC(consumeInt32, int32_t) 217CONSUME_FUNC(consumeUInt16, uint16_t) 218CONSUME_FUNC(consumeUInt32, uint32_t) 219CONSUME_FUNC(consumeUInt64, uint64_t) 220 221/* 222 * dbgprintf - Uses the logging function provided by the user to log a single 223 * message, typically without a carriage-return. 224 * 225 * @param insn - The instruction containing the logging function. 226 * @param format - See printf(). 227 * @param ... - See printf(). 228 */ 229static inline void dbgprintf(struct InternalInstruction* insn, 230 const char* format, 231 ...) { 232 char buffer[256]; 233 va_list ap; 234 235 if (!insn->dlog) 236 return; 237 238 va_start(ap, format); 239 (void)vsnprintf(buffer, sizeof(buffer), format, ap); 240 va_end(ap); 241 242 insn->dlog(insn->dlogArg, buffer); 243 244 return; 245} 246 247/* 248 * setPrefixPresent - Marks that a particular prefix is present at a particular 249 * location. 250 * 251 * @param insn - The instruction to be marked as having the prefix. 252 * @param prefix - The prefix that is present. 253 * @param location - The location where the prefix is located (in the address 254 * space of the instruction's reader). 255 */ 256static inline void setPrefixPresent(struct InternalInstruction* insn, 257 uint8_t prefix, 258 uint64_t location) 259{ 260 insn->prefixPresent[prefix] = 1; 261 insn->prefixLocations[prefix] = location; 262} 263 264/* 265 * isPrefixAtLocation - Queries an instruction to determine whether a prefix is 266 * present at a given location. 267 * 268 * @param insn - The instruction to be queried. 269 * @param prefix - The prefix. 270 * @param location - The location to query. 271 * @return - Whether the prefix is at that location. 272 */ 273static inline BOOL isPrefixAtLocation(struct InternalInstruction* insn, 274 uint8_t prefix, 275 uint64_t location) 276{ 277 if (insn->prefixPresent[prefix] == 1 && 278 insn->prefixLocations[prefix] == location) 279 return TRUE; 280 else 281 return FALSE; 282} 283 284/* 285 * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the 286 * instruction as having them. Also sets the instruction's default operand, 287 * address, and other relevant data sizes to report operands correctly. 288 * 289 * @param insn - The instruction whose prefixes are to be read. 290 * @return - 0 if the instruction could be read until the end of the prefix 291 * bytes, and no prefixes conflicted; nonzero otherwise. 292 */ 293static int readPrefixes(struct InternalInstruction* insn) { 294 BOOL isPrefix = TRUE; 295 BOOL prefixGroups[4] = { FALSE }; 296 uint64_t prefixLocation; 297 uint8_t byte; 298 299 BOOL hasAdSize = FALSE; 300 BOOL hasOpSize = FALSE; 301 302 dbgprintf(insn, "readPrefixes()"); 303 304 while (isPrefix) { 305 prefixLocation = insn->readerCursor; 306 307 if (consumeByte(insn, &byte)) 308 return -1; 309 310 switch (byte) { 311 case 0xf0: /* LOCK */ 312 case 0xf2: /* REPNE/REPNZ */ 313 case 0xf3: /* REP or REPE/REPZ */ 314 if (prefixGroups[0]) 315 dbgprintf(insn, "Redundant Group 1 prefix"); 316 prefixGroups[0] = TRUE; 317 setPrefixPresent(insn, byte, prefixLocation); 318 break; 319 case 0x2e: /* CS segment override -OR- Branch not taken */ 320 case 0x36: /* SS segment override -OR- Branch taken */ 321 case 0x3e: /* DS segment override */ 322 case 0x26: /* ES segment override */ 323 case 0x64: /* FS segment override */ 324 case 0x65: /* GS segment override */ 325 switch (byte) { 326 case 0x2e: 327 insn->segmentOverride = SEG_OVERRIDE_CS; 328 break; 329 case 0x36: 330 insn->segmentOverride = SEG_OVERRIDE_SS; 331 break; 332 case 0x3e: 333 insn->segmentOverride = SEG_OVERRIDE_DS; 334 break; 335 case 0x26: 336 insn->segmentOverride = SEG_OVERRIDE_ES; 337 break; 338 case 0x64: 339 insn->segmentOverride = SEG_OVERRIDE_FS; 340 break; 341 case 0x65: 342 insn->segmentOverride = SEG_OVERRIDE_GS; 343 break; 344 default: 345 unreachable("Unhandled override"); 346 } 347 if (prefixGroups[1]) 348 dbgprintf(insn, "Redundant Group 2 prefix"); 349 prefixGroups[1] = TRUE; 350 setPrefixPresent(insn, byte, prefixLocation); 351 break; 352 case 0x66: /* Operand-size override */ 353 if (prefixGroups[2]) 354 dbgprintf(insn, "Redundant Group 3 prefix"); 355 prefixGroups[2] = TRUE; 356 hasOpSize = TRUE; 357 setPrefixPresent(insn, byte, prefixLocation); 358 break; 359 case 0x67: /* Address-size override */ 360 if (prefixGroups[3]) 361 dbgprintf(insn, "Redundant Group 4 prefix"); 362 prefixGroups[3] = TRUE; 363 hasAdSize = TRUE; 364 setPrefixPresent(insn, byte, prefixLocation); 365 break; 366 default: /* Not a prefix byte */ 367 isPrefix = FALSE; 368 break; 369 } 370 371 if (isPrefix) 372 dbgprintf(insn, "Found prefix 0x%hhx", byte); 373 } 374 375 if (insn->mode == MODE_64BIT) { 376 if ((byte & 0xf0) == 0x40) { 377 uint8_t opcodeByte; 378 379 if(lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { 380 dbgprintf(insn, "Redundant REX prefix"); 381 return -1; 382 } 383 384 insn->rexPrefix = byte; 385 insn->necessaryPrefixLocation = insn->readerCursor - 2; 386 387 dbgprintf(insn, "Found REX prefix 0x%hhx", byte); 388 } else { 389 unconsumeByte(insn); 390 insn->necessaryPrefixLocation = insn->readerCursor - 1; 391 } 392 } else { 393 unconsumeByte(insn); 394 } 395 396 if (insn->mode == MODE_16BIT) { 397 insn->registerSize = (hasOpSize ? 4 : 2); 398 insn->addressSize = (hasAdSize ? 4 : 2); 399 insn->displacementSize = (hasAdSize ? 4 : 2); 400 insn->immediateSize = (hasOpSize ? 4 : 2); 401 } else if (insn->mode == MODE_32BIT) { 402 insn->registerSize = (hasOpSize ? 2 : 4); 403 insn->addressSize = (hasAdSize ? 2 : 4); 404 insn->displacementSize = (hasAdSize ? 2 : 4); 405 insn->immediateSize = (hasAdSize ? 2 : 4); 406 } else if (insn->mode == MODE_64BIT) { 407 if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { 408 insn->registerSize = 8; 409 insn->addressSize = (hasAdSize ? 4 : 8); 410 insn->displacementSize = 4; 411 insn->immediateSize = 4; 412 } else if (insn->rexPrefix) { 413 insn->registerSize = (hasOpSize ? 2 : 4); 414 insn->addressSize = (hasAdSize ? 4 : 8); 415 insn->displacementSize = (hasOpSize ? 2 : 4); 416 insn->immediateSize = (hasOpSize ? 2 : 4); 417 } else { 418 insn->registerSize = (hasOpSize ? 2 : 4); 419 insn->addressSize = (hasAdSize ? 4 : 8); 420 insn->displacementSize = (hasOpSize ? 2 : 4); 421 insn->immediateSize = (hasOpSize ? 2 : 4); 422 } 423 } 424 425 return 0; 426} 427 428/* 429 * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of 430 * extended or escape opcodes). 431 * 432 * @param insn - The instruction whose opcode is to be read. 433 * @return - 0 if the opcode could be read successfully; nonzero otherwise. 434 */ 435static int readOpcode(struct InternalInstruction* insn) { 436 /* Determine the length of the primary opcode */ 437 438 uint8_t current; 439 440 dbgprintf(insn, "readOpcode()"); 441 442 insn->opcodeType = ONEBYTE; 443 if (consumeByte(insn, ¤t)) 444 return -1; 445 446 if (current == 0x0f) { 447 dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current); 448 449 insn->twoByteEscape = current; 450 451 if (consumeByte(insn, ¤t)) 452 return -1; 453 454 if (current == 0x38) { 455 dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); 456 457 insn->threeByteEscape = current; 458 459 if (consumeByte(insn, ¤t)) 460 return -1; 461 462 insn->opcodeType = THREEBYTE_38; 463 } else if (current == 0x3a) { 464 dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current); 465 466 insn->threeByteEscape = current; 467 468 if (consumeByte(insn, ¤t)) 469 return -1; 470 471 insn->opcodeType = THREEBYTE_3A; 472 } else { 473 dbgprintf(insn, "Didn't find a three-byte escape prefix"); 474 475 insn->opcodeType = TWOBYTE; 476 } 477 } 478 479 /* 480 * At this point we have consumed the full opcode. 481 * Anything we consume from here on must be unconsumed. 482 */ 483 484 insn->opcode = current; 485 486 return 0; 487} 488 489static int readModRM(struct InternalInstruction* insn); 490 491/* 492 * getIDWithAttrMask - Determines the ID of an instruction, consuming 493 * the ModR/M byte as appropriate for extended and escape opcodes, 494 * and using a supplied attribute mask. 495 * 496 * @param instructionID - A pointer whose target is filled in with the ID of the 497 * instruction. 498 * @param insn - The instruction whose ID is to be determined. 499 * @param attrMask - The attribute mask to search. 500 * @return - 0 if the ModR/M could be read when needed or was not 501 * needed; nonzero otherwise. 502 */ 503static int getIDWithAttrMask(uint16_t* instructionID, 504 struct InternalInstruction* insn, 505 uint8_t attrMask) { 506 BOOL hasModRMExtension; 507 508 uint8_t instructionClass; 509 510 instructionClass = contextForAttrs(attrMask); 511 512 hasModRMExtension = modRMRequired(insn->opcodeType, 513 instructionClass, 514 insn->opcode); 515 516 if (hasModRMExtension) { 517 readModRM(insn); 518 519 *instructionID = decode(insn->opcodeType, 520 instructionClass, 521 insn->opcode, 522 insn->modRM); 523 } else { 524 *instructionID = decode(insn->opcodeType, 525 instructionClass, 526 insn->opcode, 527 0); 528 } 529 530 return 0; 531} 532 533/* 534 * is16BitEquivalent - Determines whether two instruction names refer to 535 * equivalent instructions but one is 16-bit whereas the other is not. 536 * 537 * @param orig - The instruction that is not 16-bit 538 * @param equiv - The instruction that is 16-bit 539 */ 540static BOOL is16BitEquvalent(const char* orig, const char* equiv) { 541 off_t i; 542 543 for(i = 0;; i++) { 544 if(orig[i] == '\0' && equiv[i] == '\0') 545 return TRUE; 546 if(orig[i] == '\0' || equiv[i] == '\0') 547 return FALSE; 548 if(orig[i] != equiv[i]) { 549 if((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W') 550 continue; 551 if((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1') 552 continue; 553 if((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6') 554 continue; 555 return FALSE; 556 } 557 } 558} 559 560/* 561 * is64BitEquivalent - Determines whether two instruction names refer to 562 * equivalent instructions but one is 64-bit whereas the other is not. 563 * 564 * @param orig - The instruction that is not 64-bit 565 * @param equiv - The instruction that is 64-bit 566 */ 567static BOOL is64BitEquivalent(const char* orig, const char* equiv) { 568 off_t i; 569 570 for(i = 0;; i++) { 571 if(orig[i] == '\0' && equiv[i] == '\0') 572 return TRUE; 573 if(orig[i] == '\0' || equiv[i] == '\0') 574 return FALSE; 575 if(orig[i] != equiv[i]) { 576 if((orig[i] == 'W' || orig[i] == 'L') && equiv[i] == 'Q') 577 continue; 578 if((orig[i] == '1' || orig[i] == '3') && equiv[i] == '6') 579 continue; 580 if((orig[i] == '6' || orig[i] == '2') && equiv[i] == '4') 581 continue; 582 return FALSE; 583 } 584 } 585} 586 587 588/* 589 * getID - Determines the ID of an instruction, consuming the ModR/M byte as 590 * appropriate for extended and escape opcodes. Determines the attributes and 591 * context for the instruction before doing so. 592 * 593 * @param insn - The instruction whose ID is to be determined. 594 * @return - 0 if the ModR/M could be read when needed or was not needed; 595 * nonzero otherwise. 596 */ 597static int getID(struct InternalInstruction* insn) { 598 uint8_t attrMask; 599 uint16_t instructionID; 600 601 dbgprintf(insn, "getID()"); 602 603 attrMask = ATTR_NONE; 604 605 if (insn->mode == MODE_64BIT) 606 attrMask |= ATTR_64BIT; 607 608 if (insn->rexPrefix & 0x08) 609 attrMask |= ATTR_REXW; 610 611 if (isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) 612 attrMask |= ATTR_OPSIZE; 613 else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) 614 attrMask |= ATTR_XS; 615 else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) 616 attrMask |= ATTR_XD; 617 618 if(getIDWithAttrMask(&instructionID, insn, attrMask)) 619 return -1; 620 621 /* The following clauses compensate for limitations of the tables. */ 622 623 if ((attrMask & ATTR_XD) && (attrMask & ATTR_REXW)) { 624 /* 625 * Although for SSE instructions it is usually necessary to treat REX.W+F2 626 * as F2 for decode (in the absence of a 64BIT_REXW_XD category) there is 627 * an occasional instruction where F2 is incidental and REX.W is the more 628 * significant. If the decoded instruction is 32-bit and adding REX.W 629 * instead of F2 changes a 32 to a 64, we adopt the new encoding. 630 */ 631 632 struct InstructionSpecifier* spec; 633 uint16_t instructionIDWithREXw; 634 struct InstructionSpecifier* specWithREXw; 635 636 spec = specifierForUID(instructionID); 637 638 if (getIDWithAttrMask(&instructionIDWithREXw, 639 insn, 640 attrMask & (~ATTR_XD))) { 641 /* 642 * Decoding with REX.w would yield nothing; give up and return original 643 * decode. 644 */ 645 646 insn->instructionID = instructionID; 647 insn->spec = spec; 648 return 0; 649 } 650 651 specWithREXw = specifierForUID(instructionIDWithREXw); 652 653 if (is64BitEquivalent(spec->name, specWithREXw->name)) { 654 insn->instructionID = instructionIDWithREXw; 655 insn->spec = specWithREXw; 656 } else { 657 insn->instructionID = instructionID; 658 insn->spec = spec; 659 } 660 return 0; 661 } 662 663 if (insn->prefixPresent[0x66] && !(attrMask & ATTR_OPSIZE)) { 664 /* 665 * The instruction tables make no distinction between instructions that 666 * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a 667 * particular spot (i.e., many MMX operations). In general we're 668 * conservative, but in the specific case where OpSize is present but not 669 * in the right place we check if there's a 16-bit operation. 670 */ 671 672 struct InstructionSpecifier* spec; 673 uint16_t instructionIDWithOpsize; 674 struct InstructionSpecifier* specWithOpsize; 675 676 spec = specifierForUID(instructionID); 677 678 if (getIDWithAttrMask(&instructionIDWithOpsize, 679 insn, 680 attrMask | ATTR_OPSIZE)) { 681 /* 682 * ModRM required with OpSize but not present; give up and return version 683 * without OpSize set 684 */ 685 686 insn->instructionID = instructionID; 687 insn->spec = spec; 688 return 0; 689 } 690 691 specWithOpsize = specifierForUID(instructionIDWithOpsize); 692 693 if (is16BitEquvalent(spec->name, specWithOpsize->name)) { 694 insn->instructionID = instructionIDWithOpsize; 695 insn->spec = specWithOpsize; 696 } else { 697 insn->instructionID = instructionID; 698 insn->spec = spec; 699 } 700 return 0; 701 } 702 703 insn->instructionID = instructionID; 704 insn->spec = specifierForUID(insn->instructionID); 705 706 return 0; 707} 708 709/* 710 * readSIB - Consumes the SIB byte to determine addressing information for an 711 * instruction. 712 * 713 * @param insn - The instruction whose SIB byte is to be read. 714 * @return - 0 if the SIB byte was successfully read; nonzero otherwise. 715 */ 716static int readSIB(struct InternalInstruction* insn) { 717 SIBIndex sibIndexBase; 718 SIBBase sibBaseBase; 719 uint8_t index, base; 720 721 dbgprintf(insn, "readSIB()"); 722 723 if (insn->consumedSIB) 724 return 0; 725 726 insn->consumedSIB = TRUE; 727 728 switch (insn->addressSize) { 729 case 2: 730 dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); 731 return -1; 732 break; 733 case 4: 734 sibIndexBase = SIB_INDEX_EAX; 735 sibBaseBase = SIB_BASE_EAX; 736 break; 737 case 8: 738 sibIndexBase = SIB_INDEX_RAX; 739 sibBaseBase = SIB_BASE_RAX; 740 break; 741 } 742 743 if (consumeByte(insn, &insn->sib)) 744 return -1; 745 746 index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); 747 748 switch (index) { 749 case 0x4: 750 insn->sibIndex = SIB_INDEX_NONE; 751 break; 752 default: 753 insn->sibIndex = (EABase)(sibIndexBase + index); 754 if (insn->sibIndex == SIB_INDEX_sib || 755 insn->sibIndex == SIB_INDEX_sib64) 756 insn->sibIndex = SIB_INDEX_NONE; 757 break; 758 } 759 760 switch (scaleFromSIB(insn->sib)) { 761 case 0: 762 insn->sibScale = 1; 763 break; 764 case 1: 765 insn->sibScale = 2; 766 break; 767 case 2: 768 insn->sibScale = 4; 769 break; 770 case 3: 771 insn->sibScale = 8; 772 break; 773 } 774 775 base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3); 776 777 switch (base) { 778 case 0x5: 779 switch (modFromModRM(insn->modRM)) { 780 case 0x0: 781 insn->eaDisplacement = EA_DISP_32; 782 insn->sibBase = SIB_BASE_NONE; 783 break; 784 case 0x1: 785 insn->eaDisplacement = EA_DISP_8; 786 insn->sibBase = (insn->addressSize == 4 ? 787 SIB_BASE_EBP : SIB_BASE_RBP); 788 break; 789 case 0x2: 790 insn->eaDisplacement = EA_DISP_32; 791 insn->sibBase = (insn->addressSize == 4 ? 792 SIB_BASE_EBP : SIB_BASE_RBP); 793 break; 794 case 0x3: 795 unreachable("Cannot have Mod = 0b11 and a SIB byte"); 796 } 797 break; 798 default: 799 insn->sibBase = (EABase)(sibBaseBase + base); 800 break; 801 } 802 803 return 0; 804} 805 806/* 807 * readDisplacement - Consumes the displacement of an instruction. 808 * 809 * @param insn - The instruction whose displacement is to be read. 810 * @return - 0 if the displacement byte was successfully read; nonzero 811 * otherwise. 812 */ 813static int readDisplacement(struct InternalInstruction* insn) { 814 int8_t d8; 815 int16_t d16; 816 int32_t d32; 817 818 dbgprintf(insn, "readDisplacement()"); 819 820 if (insn->consumedDisplacement) 821 return 0; 822 823 insn->consumedDisplacement = TRUE; 824 825 switch (insn->eaDisplacement) { 826 case EA_DISP_NONE: 827 insn->consumedDisplacement = FALSE; 828 break; 829 case EA_DISP_8: 830 if (consumeInt8(insn, &d8)) 831 return -1; 832 insn->displacement = d8; 833 break; 834 case EA_DISP_16: 835 if (consumeInt16(insn, &d16)) 836 return -1; 837 insn->displacement = d16; 838 break; 839 case EA_DISP_32: 840 if (consumeInt32(insn, &d32)) 841 return -1; 842 insn->displacement = d32; 843 break; 844 } 845 846 insn->consumedDisplacement = TRUE; 847 return 0; 848} 849 850/* 851 * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and 852 * displacement) for an instruction and interprets it. 853 * 854 * @param insn - The instruction whose addressing information is to be read. 855 * @return - 0 if the information was successfully read; nonzero otherwise. 856 */ 857static int readModRM(struct InternalInstruction* insn) { 858 uint8_t mod, rm, reg; 859 860 dbgprintf(insn, "readModRM()"); 861 862 if (insn->consumedModRM) 863 return 0; 864 865 consumeByte(insn, &insn->modRM); 866 insn->consumedModRM = TRUE; 867 868 mod = modFromModRM(insn->modRM); 869 rm = rmFromModRM(insn->modRM); 870 reg = regFromModRM(insn->modRM); 871 872 /* 873 * This goes by insn->registerSize to pick the correct register, which messes 874 * up if we're using (say) XMM or 8-bit register operands. That gets fixed in 875 * fixupReg(). 876 */ 877 switch (insn->registerSize) { 878 case 2: 879 insn->regBase = REG_AX; 880 insn->eaRegBase = EA_REG_AX; 881 break; 882 case 4: 883 insn->regBase = REG_EAX; 884 insn->eaRegBase = EA_REG_EAX; 885 break; 886 case 8: 887 insn->regBase = REG_RAX; 888 insn->eaRegBase = EA_REG_RAX; 889 break; 890 } 891 892 reg |= rFromREX(insn->rexPrefix) << 3; 893 rm |= bFromREX(insn->rexPrefix) << 3; 894 895 insn->reg = (Reg)(insn->regBase + reg); 896 897 switch (insn->addressSize) { 898 case 2: 899 insn->eaBaseBase = EA_BASE_BX_SI; 900 901 switch (mod) { 902 case 0x0: 903 if (rm == 0x6) { 904 insn->eaBase = EA_BASE_NONE; 905 insn->eaDisplacement = EA_DISP_16; 906 if(readDisplacement(insn)) 907 return -1; 908 } else { 909 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 910 insn->eaDisplacement = EA_DISP_NONE; 911 } 912 break; 913 case 0x1: 914 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 915 insn->eaDisplacement = EA_DISP_8; 916 if(readDisplacement(insn)) 917 return -1; 918 break; 919 case 0x2: 920 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 921 insn->eaDisplacement = EA_DISP_16; 922 if(readDisplacement(insn)) 923 return -1; 924 break; 925 case 0x3: 926 insn->eaBase = (EABase)(insn->eaRegBase + rm); 927 if(readDisplacement(insn)) 928 return -1; 929 break; 930 } 931 break; 932 case 4: 933 case 8: 934 insn->eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX); 935 936 switch (mod) { 937 case 0x0: 938 insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */ 939 switch (rm) { 940 case 0x4: 941 case 0xc: /* in case REXW.b is set */ 942 insn->eaBase = (insn->addressSize == 4 ? 943 EA_BASE_sib : EA_BASE_sib64); 944 readSIB(insn); 945 if(readDisplacement(insn)) 946 return -1; 947 break; 948 case 0x5: 949 insn->eaBase = EA_BASE_NONE; 950 insn->eaDisplacement = EA_DISP_32; 951 if(readDisplacement(insn)) 952 return -1; 953 break; 954 default: 955 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 956 break; 957 } 958 break; 959 case 0x1: 960 case 0x2: 961 insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32); 962 switch (rm) { 963 case 0x4: 964 case 0xc: /* in case REXW.b is set */ 965 insn->eaBase = EA_BASE_sib; 966 readSIB(insn); 967 if(readDisplacement(insn)) 968 return -1; 969 break; 970 default: 971 insn->eaBase = (EABase)(insn->eaBaseBase + rm); 972 if(readDisplacement(insn)) 973 return -1; 974 break; 975 } 976 break; 977 case 0x3: 978 insn->eaDisplacement = EA_DISP_NONE; 979 insn->eaBase = (EABase)(insn->eaRegBase + rm); 980 break; 981 } 982 break; 983 } /* switch (insn->addressSize) */ 984 985 return 0; 986} 987 988#define GENERIC_FIXUP_FUNC(name, base, prefix) \ 989 static uint8_t name(struct InternalInstruction *insn, \ 990 OperandType type, \ 991 uint8_t index, \ 992 uint8_t *valid) { \ 993 *valid = 1; \ 994 switch (type) { \ 995 default: \ 996 unreachable("Unhandled register type"); \ 997 case TYPE_Rv: \ 998 return base + index; \ 999 case TYPE_R8: \ 1000 if(insn->rexPrefix && \ 1001 index >= 4 && index <= 7) { \ 1002 return prefix##_SPL + (index - 4); \ 1003 } else { \ 1004 return prefix##_AL + index; \ 1005 } \ 1006 case TYPE_R16: \ 1007 return prefix##_AX + index; \ 1008 case TYPE_R32: \ 1009 return prefix##_EAX + index; \ 1010 case TYPE_R64: \ 1011 return prefix##_RAX + index; \ 1012 case TYPE_XMM128: \ 1013 case TYPE_XMM64: \ 1014 case TYPE_XMM32: \ 1015 case TYPE_XMM: \ 1016 return prefix##_XMM0 + index; \ 1017 case TYPE_MM64: \ 1018 case TYPE_MM32: \ 1019 case TYPE_MM: \ 1020 if(index > 7) \ 1021 *valid = 0; \ 1022 return prefix##_MM0 + index; \ 1023 case TYPE_SEGMENTREG: \ 1024 if(index > 5) \ 1025 *valid = 0; \ 1026 return prefix##_ES + index; \ 1027 case TYPE_DEBUGREG: \ 1028 if(index > 7) \ 1029 *valid = 0; \ 1030 return prefix##_DR0 + index; \ 1031 case TYPE_CR32: \ 1032 if(index > 7) \ 1033 *valid = 0; \ 1034 return prefix##_ECR0 + index; \ 1035 case TYPE_CR64: \ 1036 if(index > 8) \ 1037 *valid = 0; \ 1038 return prefix##_RCR0 + index; \ 1039 } \ 1040 } 1041 1042/* 1043 * fixup*Value - Consults an operand type to determine the meaning of the 1044 * reg or R/M field. If the operand is an XMM operand, for example, an 1045 * operand would be XMM0 instead of AX, which readModRM() would otherwise 1046 * misinterpret it as. 1047 * 1048 * @param insn - The instruction containing the operand. 1049 * @param type - The operand type. 1050 * @param index - The existing value of the field as reported by readModRM(). 1051 * @param valid - The address of a uint8_t. The target is set to 1 if the 1052 * field is valid for the register class; 0 if not. 1053 */ 1054GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, REG) 1055GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG) 1056 1057/* 1058 * fixupReg - Consults an operand specifier to determine which of the 1059 * fixup*Value functions to use in correcting readModRM()'ss interpretation. 1060 * 1061 * @param insn - See fixup*Value(). 1062 * @param op - The operand specifier. 1063 * @return - 0 if fixup was successful; -1 if the register returned was 1064 * invalid for its class. 1065 */ 1066static int fixupReg(struct InternalInstruction *insn, 1067 struct OperandSpecifier *op) { 1068 uint8_t valid; 1069 1070 dbgprintf(insn, "fixupReg()"); 1071 1072 switch ((OperandEncoding)op->encoding) { 1073 default: 1074 unreachable("Expected a REG or R/M encoding in fixupReg"); 1075 case ENCODING_REG: 1076 insn->reg = (Reg)fixupRegValue(insn, 1077 (OperandType)op->type, 1078 insn->reg - insn->regBase, 1079 &valid); 1080 if (!valid) 1081 return -1; 1082 break; 1083 case ENCODING_RM: 1084 if (insn->eaBase >= insn->eaRegBase) { 1085 insn->eaBase = (EABase)fixupRMValue(insn, 1086 (OperandType)op->type, 1087 insn->eaBase - insn->eaRegBase, 1088 &valid); 1089 if (!valid) 1090 return -1; 1091 } 1092 break; 1093 } 1094 1095 return 0; 1096} 1097 1098/* 1099 * readOpcodeModifier - Reads an operand from the opcode field of an 1100 * instruction. Handles AddRegFrm instructions. 1101 * 1102 * @param insn - The instruction whose opcode field is to be read. 1103 * @param inModRM - Indicates that the opcode field is to be read from the 1104 * ModR/M extension; useful for escape opcodes 1105 */ 1106static void readOpcodeModifier(struct InternalInstruction* insn) { 1107 dbgprintf(insn, "readOpcodeModifier()"); 1108 1109 if (insn->consumedOpcodeModifier) 1110 return; 1111 1112 insn->consumedOpcodeModifier = TRUE; 1113 1114 switch(insn->spec->modifierType) { 1115 default: 1116 unreachable("Unknown modifier type."); 1117 case MODIFIER_NONE: 1118 unreachable("No modifier but an operand expects one."); 1119 case MODIFIER_OPCODE: 1120 insn->opcodeModifier = insn->opcode - insn->spec->modifierBase; 1121 break; 1122 case MODIFIER_MODRM: 1123 insn->opcodeModifier = insn->modRM - insn->spec->modifierBase; 1124 break; 1125 } 1126} 1127 1128/* 1129 * readOpcodeRegister - Reads an operand from the opcode field of an 1130 * instruction and interprets it appropriately given the operand width. 1131 * Handles AddRegFrm instructions. 1132 * 1133 * @param insn - See readOpcodeModifier(). 1134 * @param size - The width (in bytes) of the register being specified. 1135 * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means 1136 * RAX. 1137 */ 1138static void readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) { 1139 dbgprintf(insn, "readOpcodeRegister()"); 1140 1141 readOpcodeModifier(insn); 1142 1143 if (size == 0) 1144 size = insn->registerSize; 1145 1146 switch (size) { 1147 case 1: 1148 insn->opcodeRegister = (Reg)(REG_AL + ((bFromREX(insn->rexPrefix) << 3) 1149 | insn->opcodeModifier)); 1150 if(insn->rexPrefix && 1151 insn->opcodeRegister >= REG_AL + 0x4 && 1152 insn->opcodeRegister < REG_AL + 0x8) { 1153 insn->opcodeRegister = (Reg)(REG_SPL + (insn->opcodeRegister - REG_AL - 4)); 1154 } 1155 1156 break; 1157 case 2: 1158 insn->opcodeRegister = (Reg)(REG_AX + ((bFromREX(insn->rexPrefix) << 3) 1159 | insn->opcodeModifier)); 1160 break; 1161 case 4: 1162 insn->opcodeRegister = (Reg)(REG_EAX + ((bFromREX(insn->rexPrefix) << 3) 1163 | insn->opcodeModifier)); 1164 break; 1165 case 8: 1166 insn->opcodeRegister = (Reg)(REG_RAX + ((bFromREX(insn->rexPrefix) << 3) 1167 |insn->opcodeModifier)); 1168 break; 1169 } 1170} 1171 1172/* 1173 * readImmediate - Consumes an immediate operand from an instruction, given the 1174 * desired operand size. 1175 * 1176 * @param insn - The instruction whose operand is to be read. 1177 * @param size - The width (in bytes) of the operand. 1178 * @return - 0 if the immediate was successfully consumed; nonzero 1179 * otherwise. 1180 */ 1181static int readImmediate(struct InternalInstruction* insn, uint8_t size) { 1182 uint8_t imm8; 1183 uint16_t imm16; 1184 uint32_t imm32; 1185 uint64_t imm64; 1186 1187 dbgprintf(insn, "readImmediate()"); 1188 1189 if (insn->numImmediatesConsumed == 2) 1190 unreachable("Already consumed two immediates"); 1191 1192 if (size == 0) 1193 size = insn->immediateSize; 1194 else 1195 insn->immediateSize = size; 1196 1197 switch (size) { 1198 case 1: 1199 if (consumeByte(insn, &imm8)) 1200 return -1; 1201 insn->immediates[insn->numImmediatesConsumed] = imm8; 1202 break; 1203 case 2: 1204 if (consumeUInt16(insn, &imm16)) 1205 return -1; 1206 insn->immediates[insn->numImmediatesConsumed] = imm16; 1207 break; 1208 case 4: 1209 if (consumeUInt32(insn, &imm32)) 1210 return -1; 1211 insn->immediates[insn->numImmediatesConsumed] = imm32; 1212 break; 1213 case 8: 1214 if (consumeUInt64(insn, &imm64)) 1215 return -1; 1216 insn->immediates[insn->numImmediatesConsumed] = imm64; 1217 break; 1218 } 1219 1220 insn->numImmediatesConsumed++; 1221 1222 return 0; 1223} 1224 1225/* 1226 * readOperands - Consults the specifier for an instruction and consumes all 1227 * operands for that instruction, interpreting them as it goes. 1228 * 1229 * @param insn - The instruction whose operands are to be read and interpreted. 1230 * @return - 0 if all operands could be read; nonzero otherwise. 1231 */ 1232static int readOperands(struct InternalInstruction* insn) { 1233 int index; 1234 1235 dbgprintf(insn, "readOperands()"); 1236 1237 for (index = 0; index < X86_MAX_OPERANDS; ++index) { 1238 switch (insn->spec->operands[index].encoding) { 1239 case ENCODING_NONE: 1240 break; 1241 case ENCODING_REG: 1242 case ENCODING_RM: 1243 if (readModRM(insn)) 1244 return -1; 1245 if (fixupReg(insn, &insn->spec->operands[index])) 1246 return -1; 1247 break; 1248 case ENCODING_CB: 1249 case ENCODING_CW: 1250 case ENCODING_CD: 1251 case ENCODING_CP: 1252 case ENCODING_CO: 1253 case ENCODING_CT: 1254 dbgprintf(insn, "We currently don't hande code-offset encodings"); 1255 return -1; 1256 case ENCODING_IB: 1257 if (readImmediate(insn, 1)) 1258 return -1; 1259 break; 1260 case ENCODING_IW: 1261 if (readImmediate(insn, 2)) 1262 return -1; 1263 break; 1264 case ENCODING_ID: 1265 if (readImmediate(insn, 4)) 1266 return -1; 1267 break; 1268 case ENCODING_IO: 1269 if (readImmediate(insn, 8)) 1270 return -1; 1271 break; 1272 case ENCODING_Iv: 1273 readImmediate(insn, insn->immediateSize); 1274 break; 1275 case ENCODING_Ia: 1276 readImmediate(insn, insn->addressSize); 1277 break; 1278 case ENCODING_RB: 1279 readOpcodeRegister(insn, 1); 1280 break; 1281 case ENCODING_RW: 1282 readOpcodeRegister(insn, 2); 1283 break; 1284 case ENCODING_RD: 1285 readOpcodeRegister(insn, 4); 1286 break; 1287 case ENCODING_RO: 1288 readOpcodeRegister(insn, 8); 1289 break; 1290 case ENCODING_Rv: 1291 readOpcodeRegister(insn, 0); 1292 break; 1293 case ENCODING_I: 1294 readOpcodeModifier(insn); 1295 break; 1296 case ENCODING_DUP: 1297 break; 1298 default: 1299 dbgprintf(insn, "Encountered an operand with an unknown encoding."); 1300 return -1; 1301 } 1302 } 1303 1304 return 0; 1305} 1306 1307/* 1308 * decodeInstruction - Reads and interprets a full instruction provided by the 1309 * user. 1310 * 1311 * @param insn - A pointer to the instruction to be populated. Must be 1312 * pre-allocated. 1313 * @param reader - The function to be used to read the instruction's bytes. 1314 * @param readerArg - A generic argument to be passed to the reader to store 1315 * any internal state. 1316 * @param logger - If non-NULL, the function to be used to write log messages 1317 * and warnings. 1318 * @param loggerArg - A generic argument to be passed to the logger to store 1319 * any internal state. 1320 * @param startLoc - The address (in the reader's address space) of the first 1321 * byte in the instruction. 1322 * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to 1323 * decode the instruction in. 1324 * @return - 0 if the instruction's memory could be read; nonzero if 1325 * not. 1326 */ 1327int decodeInstruction(struct InternalInstruction* insn, 1328 byteReader_t reader, 1329 void* readerArg, 1330 dlog_t logger, 1331 void* loggerArg, 1332 uint64_t startLoc, 1333 DisassemblerMode mode) { 1334 memset(insn, 0, sizeof(struct InternalInstruction)); 1335 1336 insn->reader = reader; 1337 insn->readerArg = readerArg; 1338 insn->dlog = logger; 1339 insn->dlogArg = loggerArg; 1340 insn->startLocation = startLoc; 1341 insn->readerCursor = startLoc; 1342 insn->mode = mode; 1343 insn->numImmediatesConsumed = 0; 1344 1345 if (readPrefixes(insn) || 1346 readOpcode(insn) || 1347 getID(insn) || 1348 insn->instructionID == 0 || 1349 readOperands(insn)) 1350 return -1; 1351 1352 insn->length = insn->readerCursor - insn->startLocation; 1353 1354 dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %llu", 1355 startLoc, insn->readerCursor, insn->length); 1356 1357 if (insn->length > 15) 1358 dbgprintf(insn, "Instruction exceeds 15-byte limit"); 1359 1360 return 0; 1361} 1362