1
2/*--------------------------------------------------------------------*/
3/*--- begin                                     guest_amd64_toIR.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2011 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36/* Translates AMD64 code to IR. */
37
38/* TODO:
39
40   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
41   to ensure a 64-bit value is being written.
42
43   x87 FP Limitations:
44
45   * all arithmetic done at 64 bits
46
47   * no FP exceptions, except for handling stack over/underflow
48
49   * FP rounding mode observed only for float->int conversions and
50     int->float conversions which could lose accuracy, and for
51     float-to-float rounding.  For all other operations,
52     round-to-nearest is used, regardless.
53
54   * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
55     simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
56     even when it isn't.
57
58   * some of the FCOM cases could do with testing -- not convinced
59     that the args are the right way round.
60
61   * FSAVE does not re-initialise the FPU; it should do
62
63   * FINIT not only initialises the FPU environment, it also zeroes
64     all the FP registers.  It should leave the registers unchanged.
65
   * RDTSC returns zero, always.

   * SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
     per Intel docs this bit has no meaning anyway.  Since PUSHF is the
     only way to observe eflags[1], a proper fix would be to make that
     bit be set by PUSHF.

   * This module uses global variables and so is not MT-safe (if that
     should ever become relevant).
75*/
76
77/* Notes re address size overrides (0x67).
78
79   According to the AMD documentation (24594 Rev 3.09, Sept 2003,
80   "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
81   and System Instructions"), Section 1.2.3 ("Address-Size Override
82   Prefix"):
83
84   0x67 applies to all explicit memory references, causing the top
85   32 bits of the effective address to become zero.
86
87   0x67 has no effect on stack references (push/pop); these always
88   use a 64-bit address.
89
90   0x67 changes the interpretation of instructions which implicitly
91   reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
92   instead.  These are:
93
94      cmp{s,sb,sw,sd,sq}
95      in{s,sb,sw,sd}
96      jcxz, jecxz, jrcxz
97      lod{s,sb,sw,sd,sq}
98      loop{,e,bz,be,z}
99      mov{s,sb,sw,sd,sq}
100      out{s,sb,sw,sd}
101      rep{,e,ne,nz}
102      sca{s,sb,sw,sd,sq}
103      sto{s,sb,sw,sd,sq}
104      xlat{,b} */
105
106/* "Special" instructions.
107
108   This instruction decoder can decode three special instructions
109   which mean nothing natively (are no-ops as far as regs/mem are
110   concerned) but have meaning for supporting Valgrind.  A special
111   instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
112   48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
113   $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
114   Following that, one of the following 3 are allowed (standard
115   interpretation in parentheses):
116
117      4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
118      4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
119      4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
120
121   Any other bytes following the 16-byte preamble are illegal and
122   constitute a failure in instruction decoding.  This all assumes
123   that the preamble will never occur except in specific code
124   fragments designed for Valgrind to catch.
125
126   No prefixes may precede a "Special" instruction.
127*/
128
129/* casLE (implementation of lock-prefixed insns) and rep-prefixed
130   insns: the side-exit back to the start of the insn is done with
131   Ijk_Boring.  This is quite wrong, it should be done with
132   Ijk_NoRedir, since otherwise the side exit, which is intended to
133   restart the instruction for whatever reason, could go somewhere
134   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
135   no-redir jumps performance critical, at least for rep-prefixed
136   instructions, since all iterations thereof would involve such a
137   jump.  It's not such a big deal with casLE since the side exit is
138   only taken if the CAS fails, that is, the location is contended,
139   which is relatively unlikely.
140
141   Note also, the test for CAS success vs failure is done using
142   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
143   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
144   shouldn't definedness-check these comparisons.  See
145   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
146   background/rationale.
147*/
148
149/* LOCK prefixed instructions.  These are translated using IR-level
150   CAS statements (IRCAS) and are believed to preserve atomicity, even
151   from the point of view of some other process racing against a
152   simulated one (presumably they communicate via a shared memory
153   segment).
154
155   Handlers which are aware of LOCK prefixes are:
156      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
157      dis_cmpxchg_G_E  (cmpxchg)
158      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
159      dis_Grp3         (not, neg)
160      dis_Grp4         (inc, dec)
161      dis_Grp5         (inc, dec)
162      dis_Grp8_Imm     (bts, btc, btr)
163      dis_bt_G_E       (bts, btc, btr)
164      dis_xadd_G_E     (xadd)
165*/
166
167
168#include "libvex_basictypes.h"
169#include "libvex_ir.h"
170#include "libvex.h"
171#include "libvex_guest_amd64.h"
172
173#include "main_util.h"
174#include "main_globals.h"
175#include "guest_generic_bb_to_IR.h"
176#include "guest_generic_x87.h"
177#include "guest_amd64_defs.h"
178
179
180/*------------------------------------------------------------*/
181/*--- Globals                                              ---*/
182/*------------------------------------------------------------*/
183
184/* These are set at the start of the translation of an insn, right
185   down in disInstr_AMD64, so that we don't have to pass them around
186   endlessly.  They are all constant during the translation of any
187   given insn. */
188
189/* These are set at the start of the translation of a BB, so
190   that we don't have to pass them around endlessly. */
191
192/* We need to know this to do sub-register accesses correctly. */
193static Bool host_is_bigendian;
194
195/* Pointer to the guest code area (points to start of BB, not to the
196   insn being processed). */
197static UChar* guest_code;
198
199/* The guest address corresponding to guest_code[0]. */
200static Addr64 guest_RIP_bbstart;
201
202/* The guest address for the instruction currently being
203   translated. */
204static Addr64 guest_RIP_curr_instr;
205
206/* The IRSB* into which we're generating code. */
207static IRSB* irsb;
208
209/* For ensuring that %rip-relative addressing is done right.  A read
210   of %rip generates the address of the next instruction.  It may be
211   that we don't conveniently know that inside disAMode().  For sanity
212   checking, if the next insn %rip is needed, we make a guess at what
213   it is, record that guess here, and set the accompanying Bool to
214   indicate that -- after this insn's decode is finished -- that guess
215   needs to be checked.  */
216
217/* At the start of each insn decode, is set to (0, False).
218   After the decode, if _mustcheck is now True, _assumed is
219   checked. */
220
221static Addr64 guest_RIP_next_assumed;
222static Bool   guest_RIP_next_mustcheck;
223
224
225/*------------------------------------------------------------*/
226/*--- Helpers for constructing IR.                         ---*/
227/*------------------------------------------------------------*/
228
/* Generate a new temporary of the given type, in the IRSB currently
   under construction (the file-scope 'irsb'). */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}
235
/* Append statement 'st' to the IRSB currently under construction
   (the file-scope 'irsb'). */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}
241
/* Generate a statement "dst := e", that is, evaluate expression 'e'
   and write the result into temporary 'dst'. */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}
247
/* Build a unary-operator expression node. */
static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}
252
/* Build a binary-operator expression node. */
static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}
257
/* Build a ternary-operator expression node. */
static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}
262
/* Build an expression that reads temporary 'tmp'. */
static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}
267
/* Make an 8-bit constant expression; 'i' must fit in 8 bits. */
static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}
273
/* Make a 16-bit constant expression; 'i' must fit in 16 bits. */
static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}
279
/* Make a 32-bit constant expression; 'i' must fit in 32 bits. */
static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}
285
/* Make a 64-bit constant expression; any ULong value is valid. */
static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}
290
291static IRExpr* mkU ( IRType ty, ULong i )
292{
293   switch (ty) {
294      case Ity_I8:  return mkU8(i);
295      case Ity_I16: return mkU16(i);
296      case Ity_I32: return mkU32(i);
297      case Ity_I64: return mkU64(i);
298      default: vpanic("mkU(amd64)");
299   }
300}
301
/* Generate a little-endian store of 'data' at address 'addr'. */
static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}
306
/* Generate a little-endian load of type 'ty' from address 'addr'. */
static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}
311
/* Convert an 8-bit-sized IROp into the same operation at the width
   implied by 'ty'.  This relies on the IROp enumeration laying out
   the 8/16/32/64-bit variants of each listed operation consecutively,
   in that order; hence the small integer additions below. */
static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   /* Only ops whose wider variants are known to be adjacent in the
      enumeration may be used here. */
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_Not8 );
   switch (ty) {
      case Ity_I8:  return 0 +op8;   /* the 8-bit form itself */
      case Ity_I16: return 1 +op8;
      case Ity_I32: return 2 +op8;
      case Ity_I64: return 3 +op8;
      default: vpanic("mkSizedOp(amd64)");
   }
}
329
330static
331IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
332{
333   if (szSmall == 1 && szBig == 4) {
334      return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
335   }
336   if (szSmall == 1 && szBig == 2) {
337      return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
338   }
339   if (szSmall == 2 && szBig == 4) {
340      return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
341   }
342   if (szSmall == 1 && szBig == 8 && !signd) {
343      return unop(Iop_8Uto64, src);
344   }
345   if (szSmall == 1 && szBig == 8 && signd) {
346      return unop(Iop_8Sto64, src);
347   }
348   if (szSmall == 2 && szBig == 8 && !signd) {
349      return unop(Iop_16Uto64, src);
350   }
351   if (szSmall == 2 && szBig == 8 && signd) {
352      return unop(Iop_16Sto64, src);
353   }
354   vpanic("doScalarWidening(amd64)");
355}
356
357
358
359/*------------------------------------------------------------*/
360/*--- Debugging output                                     ---*/
361/*------------------------------------------------------------*/
362
/* Bomb out if we can't handle something.  'str' identifies the
   missing feature; this function never returns. */
__attribute__ ((noreturn))
static void unimplemented ( HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}
370
371#define DIP(format, args...)           \
372   if (vex_traceflags & VEX_TRACE_FE)  \
373      vex_printf(format, ## args)
374
375#define DIS(buf, format, args...)      \
376   if (vex_traceflags & VEX_TRACE_FE)  \
377      vex_sprintf(buf, format, ## args)
378
379
380/*------------------------------------------------------------*/
381/*--- Offsets of various parts of the amd64 guest state.   ---*/
382/*------------------------------------------------------------*/
383
384#define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
385#define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
386#define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
387#define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
388#define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
389#define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
390#define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
391#define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
392#define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
393#define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
394#define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
395#define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
396#define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
397#define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
398#define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
399#define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)
400
401#define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)
402
403#define OFFB_FS_ZERO   offsetof(VexGuestAMD64State,guest_FS_ZERO)
404#define OFFB_GS_0x60   offsetof(VexGuestAMD64State,guest_GS_0x60)
405
406#define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
407#define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
408#define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
409#define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)
410
411#define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
412#define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
413#define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
414#define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
415#define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
416#define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
417#define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
418#define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)
419//..
420//.. #define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
421//.. #define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
422//.. #define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
423//.. #define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
424//.. #define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
425//.. #define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
426//.. #define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
427//.. #define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
428
429#define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
430#define OFFB_XMM0      offsetof(VexGuestAMD64State,guest_XMM0)
431#define OFFB_XMM1      offsetof(VexGuestAMD64State,guest_XMM1)
432#define OFFB_XMM2      offsetof(VexGuestAMD64State,guest_XMM2)
433#define OFFB_XMM3      offsetof(VexGuestAMD64State,guest_XMM3)
434#define OFFB_XMM4      offsetof(VexGuestAMD64State,guest_XMM4)
435#define OFFB_XMM5      offsetof(VexGuestAMD64State,guest_XMM5)
436#define OFFB_XMM6      offsetof(VexGuestAMD64State,guest_XMM6)
437#define OFFB_XMM7      offsetof(VexGuestAMD64State,guest_XMM7)
438#define OFFB_XMM8      offsetof(VexGuestAMD64State,guest_XMM8)
439#define OFFB_XMM9      offsetof(VexGuestAMD64State,guest_XMM9)
440#define OFFB_XMM10     offsetof(VexGuestAMD64State,guest_XMM10)
441#define OFFB_XMM11     offsetof(VexGuestAMD64State,guest_XMM11)
442#define OFFB_XMM12     offsetof(VexGuestAMD64State,guest_XMM12)
443#define OFFB_XMM13     offsetof(VexGuestAMD64State,guest_XMM13)
444#define OFFB_XMM14     offsetof(VexGuestAMD64State,guest_XMM14)
445#define OFFB_XMM15     offsetof(VexGuestAMD64State,guest_XMM15)
446#define OFFB_XMM16     offsetof(VexGuestAMD64State,guest_XMM16)
447
448#define OFFB_EMWARN    offsetof(VexGuestAMD64State,guest_EMWARN)
449#define OFFB_TISTART   offsetof(VexGuestAMD64State,guest_TISTART)
450#define OFFB_TILEN     offsetof(VexGuestAMD64State,guest_TILEN)
451
452#define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)
453
454
455/*------------------------------------------------------------*/
456/*--- Helper bits and pieces for deconstructing the        ---*/
457/*--- amd64 insn stream.                                   ---*/
458/*------------------------------------------------------------*/
459
460/* This is the AMD64 register encoding -- integer regs. */
461#define R_RAX 0
462#define R_RCX 1
463#define R_RDX 2
464#define R_RBX 3
465#define R_RSP 4
466#define R_RBP 5
467#define R_RSI 6
468#define R_RDI 7
469#define R_R8  8
470#define R_R9  9
471#define R_R10 10
472#define R_R11 11
473#define R_R12 12
474#define R_R13 13
475#define R_R14 14
476#define R_R15 15
477
478//.. #define R_AL (0+R_EAX)
479//.. #define R_AH (4+R_EAX)
480
481/* This is the Intel register encoding -- segment regs. */
482#define R_ES 0
483#define R_CS 1
484#define R_SS 2
485#define R_DS 3
486#define R_FS 4
487#define R_GS 5
488
489
490/* Various simple conversions */
491
/* Sign-extend an 8-bit value to 64 bits: shift the byte up to the top
   of a signed Long, then arithmetically shift it back down. */
static ULong extend_s_8to64 ( UChar x )
{
   return (ULong)((((Long)x) << 56) >> 56);
}
496
/* Sign-extend a 16-bit value to 64 bits, by the same shift-up,
   arithmetic-shift-down trick as extend_s_8to64. */
static ULong extend_s_16to64 ( UShort x )
{
   return (ULong)((((Long)x) << 48) >> 48);
}
501
/* Sign-extend a 32-bit value to 64 bits, by the same shift-up,
   arithmetic-shift-down trick as extend_s_8to64. */
static ULong extend_s_32to64 ( UInt x )
{
   return (ULong)((((Long)x) << 32) >> 32);
}
506
507/* Figure out whether the mod and rm parts of a modRM byte refer to a
508   register or memory.  If so, the byte will have the form 11XXXYYY,
509   where YYY is the register number. */
inline
static Bool epartIsReg ( UChar mod_reg_rm )
{
   /* mod == 3 (both top bits set) means the e-part names a register,
      not memory. */
   return toBool(0xC0 == (mod_reg_rm & 0xC0));
}
515
516/* Extract the 'g' field from a modRM byte.  This only produces 3
517   bits, which is not a complete register number.  You should avoid
518   this function if at all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   /* Bits 5..3 of the modRM byte; the fourth (high) register bit must
      come separately from REX.R. */
   return (Int)( (mod_reg_rm >> 3) & 7 );
}
524
525/* Ditto the 'e' field of a modRM byte. */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   /* Bits 2..0 of the modRM byte; the fourth (high) register bit must
      come separately from REX.B. */
   return (Int)(mod_reg_rm & 0x7);
}
531
532/* Get a 8/16/32-bit unsigned value out of the insn stream. */
533
534static UChar getUChar ( Long delta )
535{
536   UChar v = guest_code[delta+0];
537   return v;
538}
539
540static UInt getUDisp16 ( Long delta )
541{
542   UInt v = guest_code[delta+1]; v <<= 8;
543   v |= guest_code[delta+0];
544   return v & 0xFFFF;
545}
546
547//.. static UInt getUDisp ( Int size, Long delta )
548//.. {
549//..    switch (size) {
550//..       case 4: return getUDisp32(delta);
551//..       case 2: return getUDisp16(delta);
552//..       case 1: return getUChar(delta);
553//..       default: vpanic("getUDisp(x86)");
554//..    }
555//..    return 0; /*notreached*/
556//.. }
557
558
/* Get a byte value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );   /* 1-byte fetch */
}
565
566/* Get a 16-bit value out of the insn stream and sign-extend to 64
567   bits. */
568static Long getSDisp16 ( Long delta )
569{
570   UInt v = guest_code[delta+1]; v <<= 8;
571   v |= guest_code[delta+0];
572   return extend_s_16to64( (UShort)v );
573}
574
575/* Get a 32-bit value out of the insn stream and sign-extend to 64
576   bits. */
577static Long getSDisp32 ( Long delta )
578{
579   UInt v = guest_code[delta+3]; v <<= 8;
580   v |= guest_code[delta+2]; v <<= 8;
581   v |= guest_code[delta+1]; v <<= 8;
582   v |= guest_code[delta+0];
583   return extend_s_32to64( v );
584}
585
586/* Get a 64-bit value out of the insn stream. */
587static Long getDisp64 ( Long delta )
588{
589   ULong v = 0;
590   v |= guest_code[delta+7]; v <<= 8;
591   v |= guest_code[delta+6]; v <<= 8;
592   v |= guest_code[delta+5]; v <<= 8;
593   v |= guest_code[delta+4]; v <<= 8;
594   v |= guest_code[delta+3]; v <<= 8;
595   v |= guest_code[delta+2]; v <<= 8;
596   v |= guest_code[delta+1]; v <<= 8;
597   v |= guest_code[delta+0];
598   return v;
599}
600
601/* Note: because AMD64 doesn't allow 64-bit literals, it is an error
602   if this is called with size==8.  Should not happen. */
603static Long getSDisp ( Int size, Long delta )
604{
605   switch (size) {
606      case 4: return getSDisp32(delta);
607      case 2: return getSDisp16(delta);
608      case 1: return getSDisp8(delta);
609      default: vpanic("getSDisp(amd64)");
610  }
611}
612
613static ULong mkSizeMask ( Int sz )
614{
615   switch (sz) {
616      case 1: return 0x00000000000000FFULL;
617      case 2: return 0x000000000000FFFFULL;
618      case 4: return 0x00000000FFFFFFFFULL;
619      case 8: return 0xFFFFFFFFFFFFFFFFULL;
620      default: vpanic("mkSzMask(amd64)");
621   }
622}
623
624static Int imin ( Int a, Int b )
625{
626   return (a < b) ? a : b;
627}
628
629static IRType szToITy ( Int n )
630{
631   switch (n) {
632      case 1: return Ity_I8;
633      case 2: return Ity_I16;
634      case 4: return Ity_I32;
635      case 8: return Ity_I64;
636      default: vex_printf("\nszToITy(%d)\n", n);
637               vpanic("szToITy(amd64)");
638   }
639}
640
641
642/*------------------------------------------------------------*/
643/*--- For dealing with prefixes.                           ---*/
644/*------------------------------------------------------------*/
645
646/* The idea is to pass around an int holding a bitmask summarising
647   info from the prefixes seen on the current instruction, including
648   info from the REX byte.  This info is used in various places, but
649   most especially when making sense of register fields in
650   instructions.
651
652   The top 16 bits of the prefix are 0x3141, just as a hacky way
653   to ensure it really is a valid prefix.
654
655   Things you can safely assume about a well-formed prefix:
656   * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
657   * if REX is not present then REXW,REXR,REXX,REXB will read
658     as zero.
659   * F2 and F3 will not both be 1.
660*/
661
662typedef UInt  Prefix;
663
#define PFX_ASO   (1<<0)     /* address-size override present (0x67) */
#define PFX_66    (1<<1)     /* operand-size override-to-16 present (0x66) */
#define PFX_REX   (1<<2)     /* REX byte present (0x40 to 0x4F) */
#define PFX_REXW  (1<<3)     /* REX W bit, if REX present, else 0 */
#define PFX_REXR  (1<<4)     /* REX R bit, if REX present, else 0 */
#define PFX_REXX  (1<<5)     /* REX X bit, if REX present, else 0 */
#define PFX_REXB  (1<<6)     /* REX B bit, if REX present, else 0 */
#define PFX_LOCK  (1<<7)     /* bus LOCK prefix present (0xF0) */
#define PFX_F2    (1<<8)     /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3    (1<<9)     /* REP/REPE/REPZ prefix present (0xF3) */
#define PFX_CS    (1<<10)    /* CS segment prefix present (0x2E) */
#define PFX_DS    (1<<11)    /* DS segment prefix present (0x3E) */
#define PFX_ES    (1<<12)    /* ES segment prefix present (0x26) */
#define PFX_FS    (1<<13)    /* FS segment prefix present (0x64) */
#define PFX_GS    (1<<14)    /* GS segment prefix present (0x65) */
#define PFX_SS    (1<<15)    /* SS segment prefix present (0x36) */

/* Magic value in the top 16 bits marking a well-formed Prefix. */
#define PFX_EMPTY 0x31410000
682
/* A Prefix is well-formed iff its top 16 bits are the PFX_EMPTY
   magic value. */
static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFFFF0000) == PFX_EMPTY);
}
686
/* True iff a REX byte (0x40..0x4F) was present. */
static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}
690
/* Extract the REX.W bit as 0 or 1 (0 when no REX byte was seen). */
static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
694/* Apparently unused.
695static Int getRexR ( Prefix pfx ) {
696   return (pfx & PFX_REXR) ? 1 : 0;
697}
698*/
/* Extract the REX.X bit as 0 or 1 (0 when no REX byte was seen). */
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
/* Extract the REX.B bit as 0 or 1 (0 when no REX byte was seen). */
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}
705
706/* Check a prefix doesn't have F2 or F3 set in it, since usually that
707   completely changes what instruction it really is. */
708static Bool haveF2orF3 ( Prefix pfx ) {
709   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
710}
711static Bool haveF2 ( Prefix pfx ) {
712   return toBool((pfx & PFX_F2) > 0);
713}
714static Bool haveF3 ( Prefix pfx ) {
715   return toBool((pfx & PFX_F3) > 0);
716}
717
718static Bool have66 ( Prefix pfx ) {
719   return toBool((pfx & PFX_66) > 0);
720}
721static Bool haveASO ( Prefix pfx ) {
722   return toBool((pfx & PFX_ASO) > 0);
723}
724
725/* Return True iff pfx has 66 set and F2 and F3 clear */
726static Bool have66noF2noF3 ( Prefix pfx )
727{
728  return
729     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
730}
731
732/* Return True iff pfx has F2 set and 66 and F3 clear */
733static Bool haveF2no66noF3 ( Prefix pfx )
734{
735  return
736     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
737}
738
739/* Return True iff pfx has F3 set and 66 and F2 clear */
740static Bool haveF3no66noF2 ( Prefix pfx )
741{
742  return
743     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
744}
745
746/* Return True iff pfx has F3 set and F2 clear */
747static Bool haveF3noF2 ( Prefix pfx )
748{
749  return
750     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
751}
752
753/* Return True iff pfx has F2 set and F3 clear */
754static Bool haveF2noF3 ( Prefix pfx )
755{
756  return
757     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
758}
759
760/* Return True iff pfx has 66, F2 and F3 clear */
761static Bool haveNo66noF2noF3 ( Prefix pfx )
762{
763  return
764     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
765}
766
767/* Return True iff pfx has any of 66, F2 and F3 set */
768static Bool have66orF2orF3 ( Prefix pfx )
769{
770  return toBool( ! haveNo66noF2noF3(pfx) );
771}
772
773/* Return True iff pfx has 66 or F2 set */
774static Bool have66orF2 ( Prefix pfx )
775{
776   return toBool((pfx & (PFX_66|PFX_F2)) > 0);
777}
778
779/* Clear all the segment-override bits in a prefix. */
780static Prefix clearSegBits ( Prefix p )
781{
782   return
783      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
784}
785
786
787/*------------------------------------------------------------*/
788/*--- For dealing with integer registers                   ---*/
789/*------------------------------------------------------------*/
790
791/* This is somewhat complex.  The rules are:
792
793   For 64, 32 and 16 bit register references, the e or g fields in the
794   modrm bytes supply the low 3 bits of the register number.  The
795   fourth (most-significant) bit of the register number is supplied by
796   the REX byte, if it is present; else that bit is taken to be zero.
797
798   The REX.R bit supplies the high bit corresponding to the g register
799   field, and the REX.B bit supplies the high bit corresponding to the
800   e register field (when the mod part of modrm indicates that modrm's
801   e component refers to a register and not to memory).
802
803   The REX.X bit supplies a high register bit for certain registers
804   in SIB address modes, and is generally rarely used.
805
806   For 8 bit register references, the presence of the REX byte itself
807   has significance.  If there is no REX present, then the 3-bit
808   number extracted from the modrm e or g field is treated as an index
809   into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
810   old x86 encoding scheme.
811
812   But if there is a REX present, the register reference is
813   interpreted in the same way as for 64/32/16-bit references: a high
814   bit is extracted from REX, giving a 4-bit number, and the denoted
815   register is the lowest 8 bits of the 16 integer registers denoted
816   by the number.  In particular, values 3 through 7 of this sequence
817   do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
818   %rsp %rbp %rsi %rdi.
819
820   The REX.W bit has no bearing at all on register numbers.  Instead
821   its presence indicates that the operand size is to be overridden
822   from its default value (32 bits) to 64 bits instead.  This is in
823   the same fashion that an 0x66 prefix indicates the operand size is
824   to be overridden from 32 bits down to 16 bits.  When both REX.W and
825   0x66 are present there is a conflict, and REX.W takes precedence.
826
827   Rather than try to handle this complexity using a single huge
828   function, several smaller ones are provided.  The aim is to make it
829   as difficult as possible to screw up register decoding in a subtle
830   and hard-to-track-down way.
831
832   Because these routines fish around in the host's memory (that is,
833   in the guest state area) for sub-parts of guest registers, their
834   correctness depends on the host's endianness.  So far these
835   routines only work for little-endian hosts.  Those for which
836   endianness is important have assertions to ensure sanity.
837*/
838
839
840/* About the simplest question you can ask: where do the 64-bit
841   integer registers live (in the guest state) ? */
842
843static Int integerGuestReg64Offset ( UInt reg )
844{
845   switch (reg) {
846      case R_RAX: return OFFB_RAX;
847      case R_RCX: return OFFB_RCX;
848      case R_RDX: return OFFB_RDX;
849      case R_RBX: return OFFB_RBX;
850      case R_RSP: return OFFB_RSP;
851      case R_RBP: return OFFB_RBP;
852      case R_RSI: return OFFB_RSI;
853      case R_RDI: return OFFB_RDI;
854      case R_R8:  return OFFB_R8;
855      case R_R9:  return OFFB_R9;
856      case R_R10: return OFFB_R10;
857      case R_R11: return OFFB_R11;
858      case R_R12: return OFFB_R12;
859      case R_R13: return OFFB_R13;
860      case R_R14: return OFFB_R14;
861      case R_R15: return OFFB_R15;
862      default: vpanic("integerGuestReg64Offset(amd64)");
863   }
864}
865
866
867/* Produce the name of an integer register, for printing purposes.
868   reg is a number in the range 0 .. 15 that has been generated from a
869   3-bit reg-field number and a REX extension bit.  irregular denotes
870   the case where sz==1 and no REX byte is present. */
871
static
HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   static HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   /* Legacy x86-style 8-bit names, used only for sz==1 with no REX
      byte present (the "irregular" case). */
   static HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   if (sz == 1) {
      /* Only 8 legacy names exist in the irregular case. */
      if (irregular)
         vassert(reg < 8);
   } else {
      /* 'irregular' is meaningful only for sz==1. */
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}
910
911/* Using the same argument conventions as nameIReg, produce the
912   guest state offset of an integer register. */
913
914static
915Int offsetIReg ( Int sz, UInt reg, Bool irregular )
916{
917   vassert(reg < 16);
918   if (sz == 1) {
919      if (irregular)
920         vassert(reg < 8);
921   } else {
922      vassert(irregular == False);
923   }
924
925   /* Deal with irregular case -- sz==1 and no REX present */
926   if (sz == 1 && irregular) {
927      switch (reg) {
928         case R_RSP: return 1+ OFFB_RAX;
929         case R_RBP: return 1+ OFFB_RCX;
930         case R_RSI: return 1+ OFFB_RDX;
931         case R_RDI: return 1+ OFFB_RBX;
932         default:    break; /* use the normal case */
933      }
934   }
935
936   /* Normal case */
937   return integerGuestReg64Offset(reg);
938}
939
940
941/* Read the %CL register :: Ity_I8, for shift/rotate operations. */
942
static IRExpr* getIRegCL ( void )
{
   /* %cl is the low byte of %rcx, which sits at the register's base
      offset only on a little-endian host. */
   vassert(!host_is_bigendian);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}
948
949
950/* Write to the %AH register. */
951
static void putIRegAH ( IRExpr* e )
{
   /* %ah is byte 1 of %rax on a little-endian host. */
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}
958
959
960/* Read/write various widths of %RAX, as it has various
961   special-purpose uses. */
962
963static HChar* nameIRegRAX ( Int sz )
964{
965   switch (sz) {
966      case 1: return "%al";
967      case 2: return "%ax";
968      case 4: return "%eax";
969      case 8: return "%rax";
970      default: vpanic("nameIRegRAX(amd64)");
971   }
972}
973
974static IRExpr* getIRegRAX ( Int sz )
975{
976   vassert(!host_is_bigendian);
977   switch (sz) {
978      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
979      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
980      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
981      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
982      default: vpanic("getIRegRAX(amd64)");
983   }
984}
985
986static void putIRegRAX ( Int sz, IRExpr* e )
987{
988   IRType ty = typeOfIRExpr(irsb->tyenv, e);
989   vassert(!host_is_bigendian);
990   switch (sz) {
991      case 8: vassert(ty == Ity_I64);
992              stmt( IRStmt_Put( OFFB_RAX, e ));
993              break;
994      case 4: vassert(ty == Ity_I32);
995              stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
996              break;
997      case 2: vassert(ty == Ity_I16);
998              stmt( IRStmt_Put( OFFB_RAX, e ));
999              break;
1000      case 1: vassert(ty == Ity_I8);
1001              stmt( IRStmt_Put( OFFB_RAX, e ));
1002              break;
1003      default: vpanic("putIRegRAX(amd64)");
1004   }
1005}
1006
1007
1008/* Read/write various widths of %RDX, as it has various
1009   special-purpose uses. */
1010
1011static HChar* nameIRegRDX ( Int sz )
1012{
1013   switch (sz) {
1014      case 1: return "%dl";
1015      case 2: return "%dx";
1016      case 4: return "%edx";
1017      case 8: return "%rdx";
1018      default: vpanic("nameIRegRDX(amd64)");
1019   }
1020}
1021
1022static IRExpr* getIRegRDX ( Int sz )
1023{
1024   vassert(!host_is_bigendian);
1025   switch (sz) {
1026      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
1027      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
1028      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
1029      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
1030      default: vpanic("getIRegRDX(amd64)");
1031   }
1032}
1033
1034static void putIRegRDX ( Int sz, IRExpr* e )
1035{
1036   vassert(!host_is_bigendian);
1037   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
1038   switch (sz) {
1039      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
1040              break;
1041      case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
1042              break;
1043      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
1044              break;
1045      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
1046              break;
1047      default: vpanic("putIRegRDX(amd64)");
1048   }
1049}
1050
1051
1052/* Simplistic functions to deal with the integer registers as a
1053   straightforward bank of 16 64-bit regs. */
1054
static IRExpr* getIReg64 ( UInt regno )
{
   /* Read all 64 bits of integer register 'regno'. */
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}
1060
static void putIReg64 ( UInt regno, IRExpr* e )
{
   /* Write all 64 bits of integer register 'regno'. */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}
1066
static HChar* nameIReg64 ( UInt regno )
{
   /* Printable name of the full 64-bit register 'regno'. */
   return nameIReg( 8, regno, False );
}
1071
1072
1073/* Simplistic functions to deal with the lower halves of integer
1074   registers as a straightforward bank of 16 32-bit regs. */
1075
static IRExpr* getIReg32 ( UInt regno )
{
   /* Read the low 32 bits of register 'regno', as a 64-bit read
      narrowed to 32 (the file-wide convention for 4-byte reads). */
   vassert(!host_is_bigendian);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}
1083
static void putIReg32 ( UInt regno, IRExpr* e )
{
   /* Write the low 32 bits of register 'regno', zero-extending to
      the full 64 bits as amd64 requires. */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}
1090
static HChar* nameIReg32 ( UInt regno )
{
   /* Printable name of the 32-bit view of register 'regno'. */
   return nameIReg( 4, regno, False );
}
1095
1096
1097/* Simplistic functions to deal with the lower quarters of integer
1098   registers as a straightforward bank of 16 16-bit regs. */
1099
static IRExpr* getIReg16 ( UInt regno )
{
   /* Read the low 16 bits of register 'regno' (little-endian only,
      since the low half sits at the register's base offset). */
   vassert(!host_is_bigendian);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}
1106
static void putIReg16 ( UInt regno, IRExpr* e )
{
   /* Write the low 16 bits of register 'regno'.  NOTE(review): this
      widens with Iop_16Uto64 and so zeroes bits 16..63, unlike a real
      16-bit amd64 write which preserves them -- presumably callers
      account for this; confirm against call sites. */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}
1113
static HChar* nameIReg16 ( UInt regno )
{
   /* Printable name of the 16-bit view of register 'regno'. */
   return nameIReg( 2, regno, False );
}
1118
1119
1120/* Sometimes what we know is a 3-bit register number, a REX byte, and
1121   which field of the REX byte is to be used to extend to a 4-bit
1122   number.  These functions cater for that situation.
1123*/
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   /* Read the 64-bit reg numbered by lo3bits extended with REX.X. */
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}
1130
static HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   /* Name of the 64-bit reg numbered by lo3bits extended with REX.X. */
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}
1137
1138static HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
1139{
1140   vassert(lo3bits < 8);
1141   vassert(IS_VALID_PFX(pfx));
1142   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1143   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
1144                        toBool(sz==1 && !haveREX(pfx)) );
1145}
1146
1147static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
1148{
1149   vassert(lo3bits < 8);
1150   vassert(IS_VALID_PFX(pfx));
1151   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1152   if (sz == 4) {
1153      sz = 8;
1154      return unop(Iop_64to32,
1155                  IRExpr_Get(
1156                     offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
1157                                     toBool(sz==1 && !haveREX(pfx)) ),
1158                     szToITy(sz)
1159                 )
1160             );
1161   } else {
1162      return IRExpr_Get(
1163                offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
1164                                toBool(sz==1 && !haveREX(pfx)) ),
1165                szToITy(sz)
1166             );
1167   }
1168}
1169
1170static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
1171{
1172   vassert(lo3bits < 8);
1173   vassert(IS_VALID_PFX(pfx));
1174   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1175   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
1176   stmt( IRStmt_Put(
1177            offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
1178                            toBool(sz==1 && !haveREX(pfx)) ),
1179            sz==4 ? unop(Iop_32Uto64,e) : e
1180   ));
1181}
1182
1183
1184/* Functions for getting register numbers from modrm bytes and REX
1185   when we don't have to consider the complexities of integer subreg
1186   accesses.
1187*/
1188/* Extract the g reg field from a modRM byte, and augment it using the
1189   REX.R bit from the supplied REX byte.  The R bit usually is
1190   associated with the g register field.
1191*/
1192static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
1193{
1194   Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
1195   reg += (pfx & PFX_REXR) ? 8 : 0;
1196   return reg;
1197}
1198
1199/* Extract the e reg field from a modRM byte, and augment it using the
1200   REX.B bit from the supplied REX byte.  The B bit usually is
1201   associated with the e register field (when modrm indicates e is a
1202   register, that is).
1203*/
1204static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
1205{
1206   Int rm;
1207   vassert(epartIsReg(mod_reg_rm));
1208   rm = (Int)(mod_reg_rm & 0x7);
1209   rm += (pfx & PFX_REXB) ? 8 : 0;
1210   return rm;
1211}
1212
1213
1214/* General functions for dealing with integer register access. */
1215
1216/* Produce the guest state offset for a reference to the 'g' register
1217   field in a modrm byte, taking into account REX (or its absence),
1218   and the size of the access.
1219*/
1220static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
1221{
1222   UInt reg;
1223   vassert(!host_is_bigendian);
1224   vassert(IS_VALID_PFX(pfx));
1225   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1226   reg = gregOfRexRM( pfx, mod_reg_rm );
1227   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
1228}
1229
1230static
1231IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
1232{
1233   if (sz == 4) {
1234      sz = 8;
1235      return unop(Iop_64to32,
1236                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
1237                              szToITy(sz) ));
1238   } else {
1239      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
1240                         szToITy(sz) );
1241   }
1242}
1243
1244static
1245void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
1246{
1247   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
1248   if (sz == 4) {
1249      e = unop(Iop_32Uto64,e);
1250   }
1251   stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
1252}
1253
static
HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   /* Printable name of the modrm 'g' register at width sz. */
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
1260
1261
1262/* Produce the guest state offset for a reference to the 'e' register
1263   field in a modrm byte, taking into account REX (or its absence),
1264   and the size of the access.  eregOfRexRM will assert if mod_reg_rm
1265   denotes a memory access rather than a register access.
1266*/
1267static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
1268{
1269   UInt reg;
1270   vassert(!host_is_bigendian);
1271   vassert(IS_VALID_PFX(pfx));
1272   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
1273   reg = eregOfRexRM( pfx, mod_reg_rm );
1274   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
1275}
1276
1277static
1278IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
1279{
1280   if (sz == 4) {
1281      sz = 8;
1282      return unop(Iop_64to32,
1283                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
1284                              szToITy(sz) ));
1285   } else {
1286      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
1287                         szToITy(sz) );
1288   }
1289}
1290
1291static
1292void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
1293{
1294   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
1295   if (sz == 4) {
1296      e = unop(Iop_32Uto64,e);
1297   }
1298   stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
1299}
1300
static
HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   /* Printable name of the modrm 'e' register at width sz. */
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}
1307
1308
1309/*------------------------------------------------------------*/
1310/*--- For dealing with XMM registers                       ---*/
1311/*------------------------------------------------------------*/
1312
1313//.. static Int segmentGuestRegOffset ( UInt sreg )
1314//.. {
1315//..    switch (sreg) {
1316//..       case R_ES: return OFFB_ES;
1317//..       case R_CS: return OFFB_CS;
1318//..       case R_SS: return OFFB_SS;
1319//..       case R_DS: return OFFB_DS;
1320//..       case R_FS: return OFFB_FS;
1321//..       case R_GS: return OFFB_GS;
1322//..       default: vpanic("segmentGuestRegOffset(x86)");
1323//..    }
1324//.. }
1325
static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Map an XMM register number (0..15) to its byte offset in the
      guest state.  Panics on any other value. */
   switch (xmmreg) {
      case 0:  return OFFB_XMM0;
      case 1:  return OFFB_XMM1;
      case 2:  return OFFB_XMM2;
      case 3:  return OFFB_XMM3;
      case 4:  return OFFB_XMM4;
      case 5:  return OFFB_XMM5;
      case 6:  return OFFB_XMM6;
      case 7:  return OFFB_XMM7;
      case 8:  return OFFB_XMM8;
      case 9:  return OFFB_XMM9;
      case 10: return OFFB_XMM10;
      case 11: return OFFB_XMM11;
      case 12: return OFFB_XMM12;
      case 13: return OFFB_XMM13;
      case 14: return OFFB_XMM14;
      case 15: return OFFB_XMM15;
      default: vpanic("xmmGuestRegOffset(amd64)");
   }
}
1348
1349/* Lanes of vector registers are always numbered from zero being the
1350   least significant lane (rightmost in the register).  */
1351
1352static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
1353{
1354   /* Correct for little-endian host only. */
1355   vassert(!host_is_bigendian);
1356   vassert(laneno >= 0 && laneno < 8);
1357   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
1358}
1359
1360static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
1361{
1362   /* Correct for little-endian host only. */
1363   vassert(!host_is_bigendian);
1364   vassert(laneno >= 0 && laneno < 4);
1365   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
1366}
1367
1368static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
1369{
1370   /* Correct for little-endian host only. */
1371   vassert(!host_is_bigendian);
1372   vassert(laneno >= 0 && laneno < 2);
1373   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
1374}
1375
1376//.. static IRExpr* getSReg ( UInt sreg )
1377//.. {
1378//..    return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
1379//.. }
1380//..
1381//.. static void putSReg ( UInt sreg, IRExpr* e )
1382//.. {
1383//..    vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
1384//..    stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
1385//.. }
1386
static IRExpr* getXMMReg ( UInt xmmreg )
{
   /* Read all 128 bits of the given XMM register. */
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}
1391
static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   /* Read 64-bit lane 'laneno' of an XMM register, as an integer. */
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}
1396
static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   /* Read 64-bit lane 'laneno' of an XMM register, as an F64. */
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}
1401
static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   /* Read 32-bit lane 'laneno' of an XMM register, as an integer. */
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}
1406
static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   /* Read 32-bit lane 'laneno' of an XMM register, as an F32. */
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}
1411
static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
  /* Read 16-bit lane 'laneno' of an XMM register, as an integer. */
  return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}
1416
static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   /* Write all 128 bits of the given XMM register. */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}
1422
static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   /* Write 64-bit lane 'laneno' of an XMM register (integer form);
      the other lane is left untouched. */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}
1428
static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   /* Write 64-bit lane 'laneno' of an XMM register (F64 form). */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}
1434
static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   /* Write 32-bit lane 'laneno' of an XMM register (F32 form). */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
1440
static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   /* Write 32-bit lane 'laneno' of an XMM register (integer form). */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}
1446
static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   /* Write 16-bit lane 'laneno' of an XMM register (integer form). */
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
}
1452
static IRExpr* mkV128 ( UShort mask )
{
   /* Make a V128 constant from a 16-bit byte-mask encoding. */
   return IRExpr_Const(IRConst_V128(mask));
}
1457
1458static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
1459{
1460   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
1461   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
1462   return unop(Iop_64to1,
1463               binop(Iop_And64,
1464                     unop(Iop_1Uto64,x),
1465                     unop(Iop_1Uto64,y)));
1466}
1467
1468/* Generate a compare-and-swap operation, operating on memory at
1469   'addr'.  The expected value is 'expVal' and the new value is
1470   'newVal'.  If the operation fails, then transfer control (with a
1471   no-redir jump (XXX no -- see comment at top of this file)) to
1472   'restart_point', which is presumably the address of the guest
1473   instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);   /* receives the value found in memory */
   IRTemp expTmp = newTemp(tyE);   /* expected value, forced into a temp */
   /* Expected and new values must have the same integral type. */
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   /* Little-endian single-element CAS: store newVal at addr iff the
      value there equals expTmp; prior memory contents go to oldTmp. */
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   /* If memory did not hold the expected value, the CAS failed;
      branch back to restart_point to retry the guest instruction. */
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point )
         ));
}
1496
1497
1498/*------------------------------------------------------------*/
1499/*--- Helpers for %rflags.                                 ---*/
1500/*------------------------------------------------------------*/
1501
1502/* -------------- Evaluating the flags-thunk. -------------- */
1503
1504/* Build IR to calculate all the eflags from stored
1505   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
1506   Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_all ( void )
{
   /* Args, in order: CC_OP (bit 0 of mcx_mask), DEP1, DEP2, NDEP. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
1526
/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_I1. */
static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
{
   /* Args, in order: the requested cond (bit 0 of mcx_mask), then
      CC_OP, DEP1, DEP2, NDEP. */
   IRExpr** args
      = mkIRExprVec_5( mkU64(cond),
                       IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_condition", &amd64g_calculate_condition,
           args
        );
   /* Exclude the requested condition, OP and NDEP from definedness
      checking.  We're only interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   return unop(Iop_64to1, call);
}
1550
1551/* Build IR to calculate just the carry flag from stored
1552   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_c ( void )
{
   /* Args, in order: CC_OP (bit 0 of mcx_mask), DEP1, DEP2, NDEP. */
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}
1572
1573
1574/* -------------- Building the flags-thunk. -------------- */
1575
1576/* The machinery in this section builds the flag-thunk following a
1577   flag-setting operation.  Hence the various setFlags_* functions.
1578*/
1579
1580static Bool isAddSub ( IROp op8 )
1581{
1582   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
1583}
1584
1585static Bool isLogic ( IROp op8 )
1586{
1587   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
1588}
1589
1590/* U-widen 8/16/32/64 bit int expr to 64. */
1591static IRExpr* widenUto64 ( IRExpr* e )
1592{
1593   switch (typeOfIRExpr(irsb->tyenv,e)) {
1594      case Ity_I64: return e;
1595      case Ity_I32: return unop(Iop_32Uto64, e);
1596      case Ity_I16: return unop(Iop_16Uto64, e);
1597      case Ity_I8:  return unop(Iop_8Uto64, e);
1598      default: vpanic("widenUto64");
1599   }
1600}
1601
/* S-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenSto64 ( IRExpr* e )
{
   switch (typeOfIRExpr(irsb->tyenv,e)) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Sto64, e);
      case Ity_I16: return unop(Iop_16Sto64, e);
      case Ity_I8:  return unop(Iop_8Sto64, e);
      default: vpanic("widenSto64");
   }
}
1613
1614/* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
1615   of these combinations make sense. */
1616static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
1617{
1618   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
1619   if (src_ty == dst_ty)
1620      return e;
1621   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
1622      return unop(Iop_32to16, e);
1623   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
1624      return unop(Iop_32to8, e);
1625   if (src_ty == Ity_I64 && dst_ty == Ity_I32)
1626      return unop(Iop_64to32, e);
1627   if (src_ty == Ity_I64 && dst_ty == Ity_I16)
1628      return unop(Iop_64to16, e);
1629   if (src_ty == Ity_I64 && dst_ty == Ity_I8)
1630      return unop(Iop_64to8, e);
1631
1632   vex_printf("\nsrc, dst tys are: ");
1633   ppIRType(src_ty);
1634   vex_printf(", ");
1635   ppIRType(dst_ty);
1636   vex_printf("\n");
1637   vpanic("narrowTo(amd64)");
1638}
1639
1640
1641/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
1642   auto-sized up to the real op. */
1643
1644static
1645void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
1646{
1647   Int ccOp = 0;
1648   switch (ty) {
1649      case Ity_I8:  ccOp = 0; break;
1650      case Ity_I16: ccOp = 1; break;
1651      case Ity_I32: ccOp = 2; break;
1652      case Ity_I64: ccOp = 3; break;
1653      default: vassert(0);
1654   }
1655   switch (op8) {
1656      case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
1657      case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
1658      default:       ppIROp(op8);
1659                     vpanic("setFlags_DEP1_DEP2(amd64)");
1660   }
1661   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
1662   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
1663   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
1664}
1665
1666
1667/* Set the OP and DEP1 fields only, and write zero to DEP2. */
1668
1669static
1670void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
1671{
1672   Int ccOp = 0;
1673   switch (ty) {
1674      case Ity_I8:  ccOp = 0; break;
1675      case Ity_I16: ccOp = 1; break;
1676      case Ity_I32: ccOp = 2; break;
1677      case Ity_I64: ccOp = 3; break;
1678      default: vassert(0);
1679   }
1680   switch (op8) {
1681      case Iop_Or8:
1682      case Iop_And8:
1683      case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
1684      default:       ppIROp(op8);
1685                     vpanic("setFlags_DEP1(amd64)");
1686   }
1687   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
1688   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
1689   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
1690}
1691
1692
1693/* For shift operations, we put in the result and the undershifted
1694   result.  Except if the shift amount is zero, the thunk is left
1695   unchanged. */
1696
static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   /* Record a shift in the flags thunk, but only when 'guard' (the
      shift amount, or a proxy for it) is nonzero -- a zero-count
      shift must leave the flags untouched. */
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* DEP1 contains the result, DEP2 contains the undershifted value.
      Each Mux0X keeps the old thunk field when guard is zero and
      installs the new value otherwise. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                   mkU64(ccOp))) );
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                   widenUto64(mkexpr(res)))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                   widenUto64(mkexpr(resUS)))) );
}
1738
1739
1740/* For the inc/dec case, we store in DEP1 the result value and in NDEP
1741   the former value of the carry flag, which unfortunately we have to
1742   compute. */
1743
static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   /* Record an inc/dec in the flags thunk: DEP1 holds the result and
      NDEP the old carry flag, since inc/dec do not modify C. */
   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;

   switch (ty) {
      case Ity_I8:  ccOp += 0; break;
      case Ity_I16: ccOp += 1; break;
      case Ity_I32: ccOp += 2; break;
      case Ity_I64: ccOp += 3; break;
      default: vassert(0);
   }

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
}
1763
1764
1765/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
1766   two arguments. */
1767
1768static
1769void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
1770{
1771   switch (ty) {
1772      case Ity_I8:
1773         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
1774         break;
1775      case Ity_I16:
1776         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
1777         break;
1778      case Ity_I32:
1779         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
1780         break;
1781      case Ity_I64:
1782         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
1783         break;
1784      default:
1785         vpanic("setFlags_MUL(amd64)");
1786   }
1787   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
1788   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
1789}
1790
1791
1792/* -------------- Condition codes. -------------- */
1793
1794/* Condition codes, using the AMD encoding.  */
1795
static HChar* name_AMD64Condcode ( AMD64Condcode cond )
{
   /* Printable name for a condition code, preferring the more common
      mnemonic where the encoding has synonyms. */
   switch (cond) {
      case AMD64CondO:      return "o";
      case AMD64CondNO:     return "no";
      case AMD64CondB:      return "b";
      case AMD64CondNB:     return "ae"; /*"nb";*/
      case AMD64CondZ:      return "e"; /*"z";*/
      case AMD64CondNZ:     return "ne"; /*"nz";*/
      case AMD64CondBE:     return "be";
      case AMD64CondNBE:    return "a"; /*"nbe";*/
      case AMD64CondS:      return "s";
      case AMD64CondNS:     return "ns";
      case AMD64CondP:      return "p";
      case AMD64CondNP:     return "np";
      case AMD64CondL:      return "l";
      case AMD64CondNL:     return "ge"; /*"nl";*/
      case AMD64CondLE:     return "le";
      case AMD64CondNLE:    return "g"; /*"nle";*/
      case AMD64CondAlways: return "ALWAYS";
      default: vpanic("name_AMD64Condcode");
   }
}
1819
1820static
1821AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
1822                                          /*OUT*/Bool*   needInvert )
1823{
1824   vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
1825   if (cond & 1) {
1826      *needInvert = True;
1827      return cond-1;
1828   } else {
1829      *needInvert = False;
1830      return cond;
1831   }
1832}
1833
1834
1835/* -------------- Helpers for ADD/SUB with carry. -------------- */
1836
1837/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
1838   appropriately.
1839
1840   Optionally, generate a store for the 'tres' value.  This can either
1841   be a normal store, or it can be a cas-with-possible-failure style
1842   store:
1843
1844   if taddr is IRTemp_INVALID, then no store is generated.
1845
1846   if taddr is not IRTemp_INVALID, then a store (using taddr as
1847   the address) is generated:
1848
1849     if texpVal is IRTemp_INVALID then a normal store is
1850     generated, and restart_point must be zero (it is irrelevant).
1851
1852     if texpVal is not IRTemp_INVALID then a cas-style store is
1853     generated.  texpVal is the expected value, restart_point
1854     is the restart point if the store fails, and texpVal must
1855     have the same type as tres.
1856
1857*/
1858static void helper_ADC ( Int sz,
1859                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1860                         /* info about optional store: */
1861                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1862{
1863   UInt    thunkOp;
1864   IRType  ty    = szToITy(sz);
1865   IRTemp  oldc  = newTemp(Ity_I64);
1866   IRTemp  oldcn = newTemp(ty);
1867   IROp    plus  = mkSizedOp(ty, Iop_Add8);
1868   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1869
1870   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1871
1872   switch (sz) {
1873      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
1874      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
1875      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
1876      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
1877      default: vassert(0);
1878   }
1879
1880   /* oldc = old carry flag, 0 or 1 */
1881   assign( oldc,  binop(Iop_And64,
1882                        mk_amd64g_calculate_rflags_c(),
1883                        mkU64(1)) );
1884
1885   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1886
1887   assign( tres, binop(plus,
1888                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
1889                       mkexpr(oldcn)) );
1890
1891   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1892      start of this function. */
1893   if (taddr != IRTemp_INVALID) {
1894      if (texpVal == IRTemp_INVALID) {
1895         vassert(restart_point == 0);
1896         storeLE( mkexpr(taddr), mkexpr(tres) );
1897      } else {
1898         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1899         /* .. and hence 'texpVal' has the same type as 'tres'. */
1900         casLE( mkexpr(taddr),
1901                mkexpr(texpVal), mkexpr(tres), restart_point );
1902      }
1903   }
1904
1905   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
1906   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
1907   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
1908                                                         mkexpr(oldcn)) )) );
1909   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1910}
1911
1912
1913/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
1914   appropriately.  As with helper_ADC, possibly generate a store of
1915   the result -- see comments on helper_ADC for details.
1916*/
1917static void helper_SBB ( Int sz,
1918                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1919                         /* info about optional store: */
1920                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1921{
1922   UInt    thunkOp;
1923   IRType  ty    = szToITy(sz);
1924   IRTemp  oldc  = newTemp(Ity_I64);
1925   IRTemp  oldcn = newTemp(ty);
1926   IROp    minus = mkSizedOp(ty, Iop_Sub8);
1927   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1928
1929   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1930
1931   switch (sz) {
1932      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
1933      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
1934      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
1935      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
1936      default: vassert(0);
1937   }
1938
1939   /* oldc = old carry flag, 0 or 1 */
1940   assign( oldc, binop(Iop_And64,
1941                       mk_amd64g_calculate_rflags_c(),
1942                       mkU64(1)) );
1943
1944   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1945
1946   assign( tres, binop(minus,
1947                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
1948                       mkexpr(oldcn)) );
1949
1950   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1951      start of this function. */
1952   if (taddr != IRTemp_INVALID) {
1953      if (texpVal == IRTemp_INVALID) {
1954         vassert(restart_point == 0);
1955         storeLE( mkexpr(taddr), mkexpr(tres) );
1956      } else {
1957         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1958         /* .. and hence 'texpVal' has the same type as 'tres'. */
1959         casLE( mkexpr(taddr),
1960                mkexpr(texpVal), mkexpr(tres), restart_point );
1961      }
1962   }
1963
1964   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
1965   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
1966   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
1967                                                         mkexpr(oldcn)) )) );
1968   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1969}
1970
1971
1972/* -------------- Helpers for disassembly printing. -------------- */
1973
1974static HChar* nameGrp1 ( Int opc_aux )
1975{
1976   static HChar* grp1_names[8]
1977     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
1978   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
1979   return grp1_names[opc_aux];
1980}
1981
1982static HChar* nameGrp2 ( Int opc_aux )
1983{
1984   static HChar* grp2_names[8]
1985     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
1986   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
1987   return grp2_names[opc_aux];
1988}
1989
1990static HChar* nameGrp4 ( Int opc_aux )
1991{
1992   static HChar* grp4_names[8]
1993     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
1994   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
1995   return grp4_names[opc_aux];
1996}
1997
1998static HChar* nameGrp5 ( Int opc_aux )
1999{
2000   static HChar* grp5_names[8]
2001     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
2002   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
2003   return grp5_names[opc_aux];
2004}
2005
2006static HChar* nameGrp8 ( Int opc_aux )
2007{
2008   static HChar* grp8_names[8]
2009      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
2010   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
2011   return grp8_names[opc_aux];
2012}
2013
2014//.. static HChar* nameSReg ( UInt sreg )
2015//.. {
2016//..    switch (sreg) {
2017//..       case R_ES: return "%es";
2018//..       case R_CS: return "%cs";
2019//..       case R_SS: return "%ss";
2020//..       case R_DS: return "%ds";
2021//..       case R_FS: return "%fs";
2022//..       case R_GS: return "%gs";
2023//..       default: vpanic("nameSReg(x86)");
2024//..    }
2025//.. }
2026
2027static HChar* nameMMXReg ( Int mmxreg )
2028{
2029   static HChar* mmx_names[8]
2030     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
2031   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
2032   return mmx_names[mmxreg];
2033}
2034
2035static HChar* nameXMMReg ( Int xmmreg )
2036{
2037   static HChar* xmm_names[16]
2038     = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
2039         "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
2040         "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
2041         "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
2042   if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
2043   return xmm_names[xmmreg];
2044}
2045
2046static HChar* nameMMXGran ( Int gran )
2047{
2048   switch (gran) {
2049      case 0: return "b";
2050      case 1: return "w";
2051      case 2: return "d";
2052      case 3: return "q";
2053      default: vpanic("nameMMXGran(amd64,guest)");
2054   }
2055}
2056
2057static HChar nameISize ( Int size )
2058{
2059   switch (size) {
2060      case 8: return 'q';
2061      case 4: return 'l';
2062      case 2: return 'w';
2063      case 1: return 'b';
2064      default: vpanic("nameISize(amd64)");
2065   }
2066}
2067
2068
2069/*------------------------------------------------------------*/
2070/*--- JMP helpers                                          ---*/
2071/*------------------------------------------------------------*/
2072
2073static void jmp_lit( IRJumpKind kind, Addr64 d64 )
2074{
2075   irsb->next     = mkU64(d64);
2076   irsb->jumpkind = kind;
2077}
2078
2079static void jmp_treg( IRJumpKind kind, IRTemp t )
2080{
2081   irsb->next     = mkexpr(t);
2082   irsb->jumpkind = kind;
2083}
2084
2085static
2086void jcc_01 ( AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
2087{
2088   Bool          invert;
2089   AMD64Condcode condPos;
2090   condPos = positiveIse_AMD64Condcode ( cond, &invert );
2091   if (invert) {
2092      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
2093                         Ijk_Boring,
2094                         IRConst_U64(d64_false) ) );
2095      irsb->next     = mkU64(d64_true);
2096      irsb->jumpkind = Ijk_Boring;
2097   } else {
2098      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
2099                         Ijk_Boring,
2100                         IRConst_U64(d64_true) ) );
2101      irsb->next     = mkU64(d64_false);
2102      irsb->jumpkind = Ijk_Boring;
2103   }
2104}
2105
2106/* Let new_rsp be the %rsp value after a call/return.  Let nia be the
2107   guest address of the next instruction to be executed.
2108
2109   This function generates an AbiHint to say that -128(%rsp)
2110   .. -1(%rsp) should now be regarded as uninitialised.
2111*/
2112static
2113void make_redzone_AbiHint ( VexAbiInfo* vbi,
2114                            IRTemp new_rsp, IRTemp nia, HChar* who )
2115{
2116   Int szB = vbi->guest_stack_redzone_size;
2117   vassert(szB >= 0);
2118
2119   /* A bit of a kludge.  Currently the only AbI we've guested AMD64
2120      for is ELF.  So just check it's the expected 128 value
2121      (paranoia). */
2122   vassert(szB == 128);
2123
2124   if (0) vex_printf("AbiHint: %s\n", who);
2125   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
2126   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
2127   if (szB > 0)
2128      stmt( IRStmt_AbiHint(
2129               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
2130               szB,
2131               mkexpr(nia)
2132            ));
2133}
2134
2135
2136/*------------------------------------------------------------*/
2137/*--- Disassembling addressing modes                       ---*/
2138/*------------------------------------------------------------*/
2139
2140static
2141HChar* segRegTxt ( Prefix pfx )
2142{
2143   if (pfx & PFX_CS) return "%cs:";
2144   if (pfx & PFX_DS) return "%ds:";
2145   if (pfx & PFX_ES) return "%es:";
2146   if (pfx & PFX_FS) return "%fs:";
2147   if (pfx & PFX_GS) return "%gs:";
2148   if (pfx & PFX_SS) return "%ss:";
2149   return ""; /* no override */
2150}
2151
2152
2153/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
2154   linear address by adding any required segment override as indicated
2155   by sorb, and also dealing with any address size override
2156   present. */
2157static
2158IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
2159                              Prefix pfx, IRExpr* virtual )
2160{
2161   /* --- segment overrides --- */
2162   if (pfx & PFX_FS) {
2163      if (vbi->guest_amd64_assume_fs_is_zero) {
2164         /* Note that this is a linux-kernel specific hack that relies
2165            on the assumption that %fs is always zero. */
2166         /* return virtual + guest_FS_ZERO. */
2167         virtual = binop(Iop_Add64, virtual,
2168                                    IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
2169      } else {
2170         unimplemented("amd64 %fs segment override");
2171      }
2172   }
2173
2174   if (pfx & PFX_GS) {
2175      if (vbi->guest_amd64_assume_gs_is_0x60) {
2176         /* Note that this is a darwin-kernel specific hack that relies
2177            on the assumption that %gs is always 0x60. */
2178         /* return virtual + guest_GS_0x60. */
2179         virtual = binop(Iop_Add64, virtual,
2180                                    IRExpr_Get(OFFB_GS_0x60, Ity_I64));
2181      } else {
2182         unimplemented("amd64 %gs segment override");
2183      }
2184   }
2185
2186   /* cs, ds, es and ss are simply ignored in 64-bit mode. */
2187
2188   /* --- address size override --- */
2189   if (haveASO(pfx))
2190      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));
2191
2192   return virtual;
2193}
2194
2195//.. {
2196//..    Int    sreg;
2197//..    IRType hWordTy;
2198//..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
2199//..
2200//..    if (sorb == 0)
2201//..       /* the common case - no override */
2202//..       return virtual;
2203//..
2204//..    switch (sorb) {
2205//..       case 0x3E: sreg = R_DS; break;
2206//..       case 0x26: sreg = R_ES; break;
2207//..       case 0x64: sreg = R_FS; break;
2208//..       case 0x65: sreg = R_GS; break;
2209//..       default: vpanic("handleAddrOverrides(x86,guest)");
2210//..    }
2211//..
2212//..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
2213//..
2214//..    seg_selector = newTemp(Ity_I32);
2215//..    ldt_ptr      = newTemp(hWordTy);
2216//..    gdt_ptr      = newTemp(hWordTy);
2217//..    r64          = newTemp(Ity_I64);
2218//..
2219//..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
2220//..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
2221//..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
2222//..
2223//..    /*
2224//..    Call this to do the translation and limit checks:
2225//..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
2226//..                                  UInt seg_selector, UInt virtual_addr )
2227//..    */
2228//..    assign(
2229//..       r64,
2230//..       mkIRExprCCall(
2231//..          Ity_I64,
2232//..          0/*regparms*/,
2233//..          "x86g_use_seg_selector",
2234//..          &x86g_use_seg_selector,
2235//..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
2236//..                         mkexpr(seg_selector), virtual)
2237//..       )
2238//..    );
2239//..
2240//..    /* If the high 32 of the result are non-zero, there was a
2241//..       failure in address translation.  In which case, make a
2242//..       quick exit.
2243//..    */
2244//..    stmt(
2245//..       IRStmt_Exit(
2246//..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
2247//..          Ijk_MapFail,
2248//..          IRConst_U32( guest_eip_curr_instr )
2249//..       )
2250//..    );
2251//..
2252//..    /* otherwise, here's the translated result. */
2253//..    return unop(Iop_64to32, mkexpr(r64));
2254//.. }
2255
2256
2257/* Generate IR to calculate an address indicated by a ModRM and
2258   following SIB bytes.  The expression, and the number of bytes in
2259   the address mode, are returned (the latter in *len).  Note that
2260   this fn should not be called if the R/M part of the address denotes
2261   a register instead of memory.  If print_codegen is true, text of
2262   the addressing mode is placed in buf.
2263
2264   The computed address is stored in a new tempreg, and the
2265   identity of the tempreg is returned.
2266
2267   extra_bytes holds the number of bytes after the amode, as supplied
2268   by the caller.  This is needed to make sense of %rip-relative
2269   addresses.  Note that the value that *len is set to is only the
2270   length of the amode itself and does not include the value supplied
2271   in extra_bytes.
2272 */
2273
2274static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
2275{
2276   IRTemp tmp = newTemp(Ity_I64);
2277   assign( tmp, addr64 );
2278   return tmp;
2279}
2280
static
IRTemp disAMode ( /*OUT*/Int* len,
                  VexAbiInfo* vbi, Prefix pfx, Long delta,
                  /*OUT*/HChar* buf, Int extra_bytes )
{
   UChar mod_reg_rm = getUChar(delta);
   delta++;

   /* Start with an empty disassembly-text buffer. */
   buf[0] = (UChar)0;
   vassert(extra_bytes >= 0 && extra_bytes < 10);

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = toUChar(mod_reg_rm & 7);
           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           *len = 1;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
         }

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long d   = getSDisp8(delta);
           if (d == 0) {
              /* Suppress a "0(...)" displacement in the text. */
              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           } else {
              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           }
           *len = 2;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long  d  = getSDisp32(delta);
           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           *len = 5;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         vpanic("disAMode(amd64): not an addr!");

      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
         correctly at the start of handling each instruction. */
      case 0x05:
         { Long d = getSDisp32(delta);
           *len = 5;
           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
           /* We need to know the next instruction's start address.
              Try and figure out what it is, record the guess, and ask
              the top-level driver logic (bbToIR_AMD64) to check we
              guessed right, after the instruction is completely
              decoded. */
           guest_RIP_next_mustcheck = True;
           guest_RIP_next_assumed = guest_RIP_bbstart
                                    + delta+4 + extra_bytes;
           return disAMode_copy2tmp(
                     handleAddrOverrides(vbi, pfx,
                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
                                         mkU64(d))));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %rsp cannot act as an index value.
               If index_r indicates %rsp, zero is used for the index.
            -- when mod is zero and base indicates RBP or R13, base is
               instead a 32-bit sign-extended literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte
               | %index == %RSP && !(%base == %RBP || %base == %R13)
               = %base
               | %index != %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte + (%index << scale)
               | %index != %RSP && !(%base == %RBP || %base == %R13)
               = %base + (%index << scale)
         */
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
         /* index is only "really" %rsp when REX.X is clear. */
         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
         delta++;

         if ((!index_is_SP) && (!base_is_BPor13)) {
            if (scale == 0) {
               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 2;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        getIRegRexB(8,pfx,base_r),
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                              mkU8(scale)))));
         }

         if ((!index_is_SP) && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
                      nameIReg64rexX(pfx,index_r), 1<<scale);
            *len = 6;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                                         mkU8(scale)),
                        mkU64(d))));
         }

         if (index_is_SP && (!base_is_BPor13)) {
            /* No index at all; the amode is just (%base). */
            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
            *len = 2;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
         }

         if (index_is_SP && base_is_BPor13) {
            /* Neither index nor base; just an absolute d32. */
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld", segRegTxt(pfx), d);
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, mkU64(d)));
         }

         vassert(0);
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d8 + %base
            | %index != %RSP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp8(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 3;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 3;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d32 + %base
            | %index != %RSP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp32(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 6;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      default:
         vpanic("disAMode(amd64)");
         return 0; /*notreached*/
   }
}
2548
2549
2550/* Figure out the number of (insn-stream) bytes constituting the amode
2551   beginning at delta.  Is useful for getting hold of literals beyond
2552   the end of the amode before it has been disassembled.  */
2553
2554static UInt lengthAMode ( Prefix pfx, Long delta )
2555{
2556   UChar mod_reg_rm = getUChar(delta);
2557   delta++;
2558
2559   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
2560      jump table seems a bit excessive.
2561   */
2562   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
2563   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
2564                                               /* is now XX0XXYYY */
2565   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
2566   switch (mod_reg_rm) {
2567
2568      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
2569         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
2570      */
2571      case 0x00: case 0x01: case 0x02: case 0x03:
2572      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
2573         return 1;
2574
2575      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
2576         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
2577      */
2578      case 0x08: case 0x09: case 0x0A: case 0x0B:
2579      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
2580         return 2;
2581
2582      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
2583         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
2584      */
2585      case 0x10: case 0x11: case 0x12: case 0x13:
2586      /* ! 14 */ case 0x15: case 0x16: case 0x17:
2587         return 5;
2588
2589      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
2590      /* REX.B==1: a register, %r8  .. %r16.  This shouldn't happen. */
2591      /* Not an address, but still handled. */
2592      case 0x18: case 0x19: case 0x1A: case 0x1B:
2593      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
2594         return 1;
2595
2596      /* RIP + disp32. */
2597      case 0x05:
2598         return 5;
2599
2600      case 0x04: {
2601         /* SIB, with no displacement. */
2602         UChar sib     = getUChar(delta);
2603         UChar base_r  = toUChar(sib & 7);
2604         /* correct since #(R13) == 8 + #(RBP) */
2605         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
2606
2607         if (base_is_BPor13) {
2608            return 6;
2609         } else {
2610            return 2;
2611         }
2612      }
2613
2614      /* SIB, with 8-bit displacement. */
2615      case 0x0C:
2616         return 3;
2617
2618      /* SIB, with 32-bit displacement. */
2619      case 0x14:
2620         return 6;
2621
2622      default:
2623         vpanic("lengthAMode(amd64)");
2624         return 0; /*notreached*/
2625   }
2626}
2627
2628
2629/*------------------------------------------------------------*/
2630/*--- Disassembling common idioms                          ---*/
2631/*------------------------------------------------------------*/
2632
2633/* Handle binary integer instructions of the form
2634      op E, G  meaning
2635      op reg-or-mem, reg
2636   Is passed the a ptr to the modRM byte, the actual operation, and the
2637   data size.  Returns the address advanced completely over this
2638   instruction.
2639
2640   E(src) is reg-or-mem
2641   G(dst) is reg.
2642
2643   If E is reg, -->    GET %G,  tmp
2644                       OP %E,   tmp
2645                       PUT tmp, %G
2646
2647   If E is mem and OP is not reversible,
2648                -->    (getAddr E) -> tmpa
2649                       LD (tmpa), tmpa
2650                       GET %G, tmp2
2651                       OP tmpa, tmp2
2652                       PUT tmp2, %G
2653
2654   If E is mem and OP is reversible
2655                -->    (getAddr E) -> tmpa
2656                       LD (tmpa), tmpa
2657                       OP %G, tmpa
2658                       PUT tmpa, %G
2659*/
static
ULong dis_op2_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);      /* result of the operation */
   IRTemp  src  = newTemp(ty);      /* the E operand (reg or loaded mem) */
   IRTemp  dst0 = newTemp(ty);      /* original value of G */
   UChar   rm   = getUChar(delta0); /* the modRM byte */
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency. */
      /* Zero the register first; the dst0/src assignments below then
         read back the just-written zero, so the flags thunk is built
         from 0-op-0 without referencing the old (possibly undefined)
         register value. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
	 putIRegG(size,pfx,rm, mkU(ty,0));
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes dst1 = dst0 + src + CF and also
            emits the rflags thunk; no store for the register case. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: dst1 = dst0 - src - CF, likewise via helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         /* Plain binary op.  Add/sub flag thunks need both operands;
            logical ops need only the result. */
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False for flag-only forms (CMP/TEST style). */
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;   /* only the modRM byte was consumed */
   } else {
      /* E refers to memory */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         /* Destination is G (a register), so no memory store and
            hence no LOCK handling is needed here. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;   /* modRM + addressing-mode bytes consumed */
   }
}
2754
2755
2756
2757/* Handle binary integer instructions of the form
2758      op G, E  meaning
2759      op reg, reg-or-mem
   Is passed a pointer to the modRM byte, the actual operation, and the
2761   data size.  Returns the address advanced completely over this
2762   instruction.
2763
2764   G(src) is reg.
2765   E(dst) is reg-or-mem
2766
2767   If E is reg, -->    GET %E,  tmp
2768                       OP %G,   tmp
2769                       PUT tmp, %E
2770
2771   If E is mem, -->    (getAddr E) -> tmpa
2772                       LD (tmpa), tmpv
2773                       OP %G, tmpv
2774                       ST tmpv, (tmpa)
2775*/
static
ULong dis_op2_G_E ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);      /* result of the operation */
   IRTemp  src  = newTemp(ty);      /* the G operand (register) */
   IRTemp  dst0 = newTemp(ty);      /* original value of E */
   UChar   rm   = getUChar(delta0); /* the modRM byte */
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg. */
      /* Zero the register first; dst0/src below then read back the
         just-written zero, so the flags thunk is built from 0-op-0. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         /* ADC: helper computes dst1 = dst0 + src + CF and also
            emits the rflags thunk. */
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         /* SBB: dst1 = dst0 - src - CF, likewise via helper. */
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         /* keep==False for flag-only forms (CMP/TEST style). */
         if (keep)
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;   /* only the modRM byte was consumed */
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store: expected-value compare-and-swap makes
               the read-modify-write atomic for the LOCK prefix. */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (pfx & PFX_LOCK) {
               if (0) vex_printf("locked case\n" );
               /* Atomic read-modify-write via compare-and-swap. */
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;   /* modRM + addressing-mode bytes consumed */
   }
}
2891
2892
2893/* Handle move instructions of the form
2894      mov E, G  meaning
2895      mov reg-or-mem, reg
   Is passed a pointer to the modRM byte, and the data size.  Returns
2897   the address advanced completely over this instruction.
2898
2899   E(src) is reg-or-mem
2900   G(dst) is reg.
2901
2902   If E is reg, -->    GET %E,  tmpv
2903                       PUT tmpv, %G
2904
2905   If E is mem  -->    (getAddr E) -> tmpa
2906                       LD (tmpa), tmpb
2907                       PUT tmpb, %G
2908*/
2909static
2910ULong dis_mov_E_G ( VexAbiInfo* vbi,
2911                    Prefix      pfx,
2912                    Int         size,
2913                    Long        delta0 )
2914{
2915   Int len;
2916   UChar rm = getUChar(delta0);
2917   HChar dis_buf[50];
2918
2919   if (epartIsReg(rm)) {
2920      putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
2921      DIP("mov%c %s,%s\n", nameISize(size),
2922                           nameIRegE(size,pfx,rm),
2923                           nameIRegG(size,pfx,rm));
2924      return 1+delta0;
2925   }
2926
2927   /* E refers to memory */
2928   {
2929      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
2930      putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
2931      DIP("mov%c %s,%s\n", nameISize(size),
2932                           dis_buf,
2933                           nameIRegG(size,pfx,rm));
2934      return delta0+len;
2935   }
2936}
2937
2938
2939/* Handle move instructions of the form
2940      mov G, E  meaning
2941      mov reg, reg-or-mem
   Is passed a pointer to the modRM byte, and the data size.  Returns
2943   the address advanced completely over this instruction.
2944
2945   G(src) is reg.
2946   E(dst) is reg-or-mem
2947
2948   If E is reg, -->    GET %G,  tmp
2949                       PUT tmp, %E
2950
2951   If E is mem, -->    (getAddr E) -> tmpa
2952                       GET %G, tmpv
2953                       ST tmpv, (tmpa)
2954*/
2955static
2956ULong dis_mov_G_E ( VexAbiInfo* vbi,
2957                    Prefix      pfx,
2958                    Int         size,
2959                    Long        delta0 )
2960{
2961   Int len;
2962   UChar rm = getUChar(delta0);
2963   HChar dis_buf[50];
2964
2965   if (epartIsReg(rm)) {
2966      putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
2967      DIP("mov%c %s,%s\n", nameISize(size),
2968                           nameIRegG(size,pfx,rm),
2969                           nameIRegE(size,pfx,rm));
2970      return 1+delta0;
2971   }
2972
2973   /* E refers to memory */
2974   {
2975      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
2976      storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
2977      DIP("mov%c %s,%s\n", nameISize(size),
2978                           nameIRegG(size,pfx,rm),
2979                           dis_buf);
2980      return len+delta0;
2981   }
2982}
2983
2984
/* op $immediate, AL/AX/EAX/RAX. */
static
ULong dis_op_imm_A ( Int    size,
                     Bool   carrying,
                     IROp   op8,
                     Bool   keep,
                     Long   delta,
                     HChar* t_amd64opc )
{
   /* The immediate occupies at most 4 bytes; for size==8 a
      sign-extended imm32 is used, per the amd64 encoding rules. */
   Int    size4 = imin(size,4);
   IRType ty    = szToITy(size);
   IRTemp dst0  = newTemp(ty);   /* original value of AL/AX/EAX/RAX */
   IRTemp src   = newTemp(ty);   /* the immediate, truncated to size */
   IRTemp dst1  = newTemp(ty);   /* result of the operation */
   Long  lit    = getSDisp(size4,delta);
   assign(dst0, getIRegRAX(size));
   assign(src,  mkU(ty,lit & mkSizeMask(size)));

   if (isAddSub(op8) && !carrying) {
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1_DEP2(op8, dst0, src, ty);
   }
   else
   if (isLogic(op8)) {
      /* Logical ops never carry; flags depend only on the result. */
      vassert(!carrying);
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1(op8, dst1, ty);
   }
   else
   if (op8 == Iop_Add8 && carrying) {
      /* ADC: helper computes dst1 = dst0 + src + CF and emits flags. */
      helper_ADC( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
   if (op8 == Iop_Sub8 && carrying) {
      /* SBB: dst1 = dst0 - src - CF, likewise via helper. */
      helper_SBB( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
      vpanic("dis_op_imm_A(amd64,guest)");

   /* keep==False for flag-only forms (CMP/TEST style). */
   if (keep)
      putIRegRAX(size, mkexpr(dst1));

   DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
                           lit, nameIRegRAX(size));
   return delta+size4;
}
3033
3034
3035/* Sign- and Zero-extending moves. */
3036static
3037ULong dis_movx_E_G ( VexAbiInfo* vbi,
3038                     Prefix pfx,
3039                     Long delta, Int szs, Int szd, Bool sign_extend )
3040{
3041   UChar rm = getUChar(delta);
3042   if (epartIsReg(rm)) {
3043      putIRegG(szd, pfx, rm,
3044                    doScalarWidening(
3045                       szs,szd,sign_extend,
3046                       getIRegE(szs,pfx,rm)));
3047      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
3048                               nameISize(szs),
3049                               nameISize(szd),
3050                               nameIRegE(szs,pfx,rm),
3051                               nameIRegG(szd,pfx,rm));
3052      return 1+delta;
3053   }
3054
3055   /* E refers to memory */
3056   {
3057      Int    len;
3058      HChar  dis_buf[50];
3059      IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
3060      putIRegG(szd, pfx, rm,
3061                    doScalarWidening(
3062                       szs,szd,sign_extend,
3063                       loadLE(szToITy(szs),mkexpr(addr))));
3064      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
3065                               nameISize(szs),
3066                               nameISize(szd),
3067                               dis_buf,
3068                               nameIRegG(szd,pfx,rm));
3069      return len+delta;
3070   }
3071}
3072
3073
/* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* In every case the quotient lands in the low half of the
      destination pair and the remainder in the high half, mirroring
      the hardware DIV/IDIV semantics. */
   /* special-case the 64-bit case */
   if (sz == 8) {
      /* 128-by-64 divide: RDX:RAX / t -> quotient in RAX,
         remainder in RDX. */
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      /* 8/16/32-bit cases all go via a 64-by-32 divide; narrower
         operands are widened (sign- or zero-, per signed_divide)
         up to the required widths first. */
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         /* EDX:EAX / t -> quotient in EAX, remainder in EDX. */
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         /* DX:AX / t -> quotient in AX, remainder in DX. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         /* AX / t -> quotient in AL, remainder in AH. */
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
3139
/* Group 1 extended opcodes: op $imm, E where the /r field of the
   modRM byte selects the operation:
      /0=ADD /1=OR /2=ADC /3=SBB /4=AND /5=SUB /6=XOR /7=CMP.
   am_sz is the length of the addressing-mode bytes when E is a
   register, d_sz the length of the immediate, d64 its value. */
static
ULong dis_Grp1 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);   /* result of the operation */
   IRTemp  src  = newTemp(ty);   /* the immediate, truncated to sz */
   IRTemp  dst0 = newTemp(ty);   /* original value of E */
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC -- handled via helper_ADC below
      case 3: break;  // SBB -- handled via helper_SBB below
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      /* /7 is CMP, which only sets flags and writes nothing back. */
      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store: makes the read-modify-write atomic
               for the LOCK prefix. */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         /* /7 is CMP: flags only, no store. */
         if (gregLO3ofRM(modrm) < 7) {
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
3245
3246
3247/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
3248   expression. */
3249
/* Group 2 extended opcodes: rotates and shifts, where the /r field
   of the modRM byte selects the operation:
      /0=ROL /1=ROR /2=RCL /3=RCR /4=SHL /5=SHR /6=SAL(==SHL) /7=SAR. */
static
ULong dis_Grp2 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);   /* value to be shifted/rotated */
   IRTemp dst1  = newTemp(ty);   /* the result */
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   /* Classify the operation: shift (/4../7), plain rotate (/0,/1),
      or rotate-through-carry (/2,/3). */
   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      /* RCL/RCR rotate through CF, so the old flags are an input. */
      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      /* First call: positive sz selects the value result. */
      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      /* Second call: negative sz selects the rflags result. */
      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      /* COPY thunk: DEP1 holds the complete new rflags verbatim. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);  /* operand widened to 64 bits */
      IRTemp res64     = newTemp(Ity_I64);  /* shifted by shift_amt */
      IRTemp res64ss   = newTemp(Ity_I64);  /* shifted by shift_amt-1, for CF */
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;  /* SAL is an alias for SHL */
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      /* ccOp indexes the size variant (B/W/L/Q) of the ROL/ROR
         flag-thunk operation. */
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
      /* Each Mux0X keeps the existing thunk field when rot_amt64 is
         zero (a zero-count rotate leaves flags untouched). */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                      mkU64(ccOp))) );
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                      widenUto64(mkexpr(dst1)))) );
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                      mkU64(0))) );
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I64),
                                      mkexpr(oldFlags))) );
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
3517
3518
/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only), with an
   immediate bit-number operand.  On entry, delta points at the modrm
   byte; src_val holds the d8 immediate.  Returns the updated delta;
   *decode_OK is set False if the encoding cannot be handled. */
static
ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);    /* original value, zero-widened to 64 bits */
   IRTemp t2m    = newTemp(Ity_I64);    /* modified value (unused for BT) */
   IRTemp t_addr = IRTemp_INVALID;      /* guest address, when E is memory */
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation: the mask is later
      OR-ed (BTS), AND-ed (BTR) or XOR-ed (BTC) into the value.  BT
      modifies nothing, so its mask is irrelevant. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do however not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      /* The trailing '1' tells disAMode one immediate byte follows
         the addressing mode. */
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
     default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT.  A LOCK prefix turns the
      read-modify-write into a compare-and-swap on the original
      value, making it atomic. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
	putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (pfx & PFX_LOCK) {
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
3632
3633
3634/* Signed/unsigned widening multiply.  Generate IR to multiply the
3635   value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
3636   RDX:RAX/EDX:EAX/DX:AX/AX.
3637*/
3638static void codegen_mulL_A_D ( Int sz, Bool syned,
3639                               IRTemp tmp, HChar* tmp_txt )
3640{
3641   IRType ty = szToITy(sz);
3642   IRTemp t1 = newTemp(ty);
3643
3644   assign( t1, getIRegRAX(sz) );
3645
3646   switch (ty) {
3647      case Ity_I64: {
3648         IRTemp res128  = newTemp(Ity_I128);
3649         IRTemp resHi   = newTemp(Ity_I64);
3650         IRTemp resLo   = newTemp(Ity_I64);
3651         IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
3652         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
3653         setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
3654         assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
3655         assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
3656         assign( resLo, unop(Iop_128to64,mkexpr(res128)));
3657         putIReg64(R_RDX, mkexpr(resHi));
3658         putIReg64(R_RAX, mkexpr(resLo));
3659         break;
3660      }
3661      case Ity_I32: {
3662         IRTemp res64   = newTemp(Ity_I64);
3663         IRTemp resHi   = newTemp(Ity_I32);
3664         IRTemp resLo   = newTemp(Ity_I32);
3665         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
3666         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
3667         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
3668         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
3669         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
3670         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
3671         putIRegRDX(4, mkexpr(resHi));
3672         putIRegRAX(4, mkexpr(resLo));
3673         break;
3674      }
3675      case Ity_I16: {
3676         IRTemp res32   = newTemp(Ity_I32);
3677         IRTemp resHi   = newTemp(Ity_I16);
3678         IRTemp resLo   = newTemp(Ity_I16);
3679         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
3680         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
3681         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
3682         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
3683         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
3684         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
3685         putIRegRDX(2, mkexpr(resHi));
3686         putIRegRAX(2, mkexpr(resLo));
3687         break;
3688      }
3689      case Ity_I8: {
3690         IRTemp res16   = newTemp(Ity_I16);
3691         IRTemp resHi   = newTemp(Ity_I8);
3692         IRTemp resLo   = newTemp(Ity_I8);
3693         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
3694         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
3695         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
3696         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
3697         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
3698         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
3699         putIRegRAX(2, mkexpr(res16));
3700         break;
3701      }
3702      default:
3703         ppIRType(ty);
3704         vpanic("codegen_mulL_A_D(amd64)");
3705   }
3706   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
3707}
3708
3709
/* Group 3 extended opcodes: TEST/NOT/NEG/MUL/IMUL/DIV/IDIV on an E
   (reg-or-mem) operand, selected by the reg field of the modrm byte.
   delta points at the modrm byte; returns the updated delta.
   *decode_OK is set False on an undecodable /1 encoding. */
static
ULong dis_Grp3 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            /* AND register with the (size-limited) immediate; only
               the flags are kept, the result is discarded. */
            delta++;
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            /* /1 is not a defined Grp3 encoding. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            /* Expressed as 0 - src so the standard SUB flag thunk
               can be reused. */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
                           bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      t1   = newTemp(ty);
      delta += len;
      /* Load the memory operand once; TEST/NOT/NEG/MUL/... all work
         off this value. */
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            /* /1 is not a defined Grp3 encoding. */
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            /* LOCK prefix: make the read-modify-write atomic via CAS
               against the originally-loaded value. */
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
}
3872
3873
3874/* Group 4 extended opcodes. */
3875static
3876ULong dis_Grp4 ( VexAbiInfo* vbi,
3877                 Prefix pfx, Long delta, Bool* decode_OK )
3878{
3879   Int   alen;
3880   UChar modrm;
3881   HChar dis_buf[50];
3882   IRType ty = Ity_I8;
3883   IRTemp t1 = newTemp(ty);
3884   IRTemp t2 = newTemp(ty);
3885
3886   *decode_OK = True;
3887
3888   modrm = getUChar(delta);
3889   if (epartIsReg(modrm)) {
3890      assign(t1, getIRegE(1, pfx, modrm));
3891      switch (gregLO3ofRM(modrm)) {
3892         case 0: /* INC */
3893            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
3894            putIRegE(1, pfx, modrm, mkexpr(t2));
3895            setFlags_INC_DEC( True, t2, ty );
3896            break;
3897         case 1: /* DEC */
3898            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
3899            putIRegE(1, pfx, modrm, mkexpr(t2));
3900            setFlags_INC_DEC( False, t2, ty );
3901            break;
3902         default:
3903            *decode_OK = False;
3904            return delta;
3905      }
3906      delta++;
3907      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
3908                      nameIRegE(1, pfx, modrm));
3909   } else {
3910      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
3911      assign( t1, loadLE(ty, mkexpr(addr)) );
3912      switch (gregLO3ofRM(modrm)) {
3913         case 0: /* INC */
3914            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
3915            if (pfx & PFX_LOCK) {
3916               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
3917                      guest_RIP_curr_instr );
3918            } else {
3919               storeLE( mkexpr(addr), mkexpr(t2) );
3920            }
3921            setFlags_INC_DEC( True, t2, ty );
3922            break;
3923         case 1: /* DEC */
3924            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
3925            if (pfx & PFX_LOCK) {
3926               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
3927                      guest_RIP_curr_instr );
3928            } else {
3929               storeLE( mkexpr(addr), mkexpr(t2) );
3930            }
3931            setFlags_INC_DEC( False, t2, ty );
3932            break;
3933         default:
3934            *decode_OK = False;
3935            return delta;
3936      }
3937      delta += alen;
3938      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
3939   }
3940   return delta;
3941}
3942
3943
/* Group 5 extended opcodes: INC (/0), DEC (/1), CALL Ev (/2),
   JMP Ev (/4) and PUSH Ev (/6, memory form only here).  delta points
   at the modrm byte; returns the updated delta.  Indirect transfers
   mark the block as finished via dres->whatNext = Dis_StopHere.
   *decode_OK is set False for unhandled /r values or sizes. */
static
ULong dis_Grp5 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 DisResult* dres, Bool* decode_OK )
{
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp  t2 = IRTemp_INVALID;
   IRTemp  t3 = IRTemp_INVALID;
   Bool    showSz = True;    /* print a size suffix in the DIP text? */

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            /* Push the return address (the insn following this one)
               and jump to the register value. */
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(Ijk_Call,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(Ijk_Boring,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      /* Only INC/DEC want the operand pre-loaded at type ty; the
         call/jmp/push cases load 64 bits themselves below. */
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            /* LOCK prefix: atomic read-modify-write via CAS. */
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(Ijk_Call,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(Ijk_Boring,t3);
            dres->whatNext = Dis_StopHere;
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (!(sz == 8 || sz == 2)) goto unhandled;
            if (sz == 8) {
               t3 = newTemp(Ity_I64);
               assign(t3, loadLE(Ity_I64,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
	    } else {
               goto unhandled; /* awaiting test case */
	    }
         default:
         unhandled:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
4096
4097
4098/*------------------------------------------------------------*/
4099/*--- Disassembling string ops (including REP prefixes)    ---*/
4100/*------------------------------------------------------------*/
4101
4102/* Code shared by all the string ops */
4103static
4104void dis_string_op_increment ( Int sz, IRTemp t_inc )
4105{
4106   UChar logSz;
4107   if (sz == 8 || sz == 4 || sz == 2) {
4108      logSz = 1;
4109      if (sz == 4) logSz = 2;
4110      if (sz == 8) logSz = 3;
4111      assign( t_inc,
4112              binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
4113                               mkU8(logSz) ) );
4114   } else {
4115      assign( t_inc,
4116              IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
4117   }
4118}
4119
4120static
4121void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
4122                    Int sz, HChar* name, Prefix pfx )
4123{
4124   IRTemp t_inc = newTemp(Ity_I64);
4125   /* Really we ought to inspect the override prefixes, but we don't.
4126      The following assertion catches any resulting sillyness. */
4127   vassert(pfx == clearSegBits(pfx));
4128   dis_string_op_increment(sz, t_inc);
4129   dis_OP( sz, t_inc, pfx );
4130   DIP("%s%c\n", name, nameISize(sz));
4131}
4132
4133static
4134void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
4135{
4136   IRType ty = szToITy(sz);
4137   IRTemp td = newTemp(Ity_I64);   /* RDI */
4138   IRTemp ts = newTemp(Ity_I64);   /* RSI */
4139   IRExpr *incd, *incs;
4140
4141   if (haveASO(pfx)) {
4142      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4143      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4144   } else {
4145      assign( td, getIReg64(R_RDI) );
4146      assign( ts, getIReg64(R_RSI) );
4147   }
4148
4149   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
4150
4151   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4152   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4153   if (haveASO(pfx)) {
4154      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4155      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4156   }
4157   putIReg64( R_RDI, incd );
4158   putIReg64( R_RSI, incs );
4159}
4160
4161static
4162void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
4163{
4164   IRType ty = szToITy(sz);
4165   IRTemp ts = newTemp(Ity_I64);   /* RSI */
4166   IRExpr *incs;
4167
4168   if (haveASO(pfx))
4169      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4170   else
4171      assign( ts, getIReg64(R_RSI) );
4172
4173   putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
4174
4175   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4176   if (haveASO(pfx))
4177      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4178   putIReg64( R_RSI, incs );
4179}
4180
4181static
4182void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
4183{
4184   IRType ty = szToITy(sz);
4185   IRTemp ta = newTemp(ty);        /* rAX */
4186   IRTemp td = newTemp(Ity_I64);   /* RDI */
4187   IRExpr *incd;
4188
4189   assign( ta, getIRegRAX(sz) );
4190
4191   if (haveASO(pfx))
4192      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4193   else
4194      assign( td, getIReg64(R_RDI) );
4195
4196   storeLE( mkexpr(td), mkexpr(ta) );
4197
4198   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4199   if (haveASO(pfx))
4200      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4201   putIReg64( R_RDI, incd );
4202}
4203
4204static
4205void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
4206{
4207   IRType ty  = szToITy(sz);
4208   IRTemp tdv = newTemp(ty);      /* (RDI) */
4209   IRTemp tsv = newTemp(ty);      /* (RSI) */
4210   IRTemp td  = newTemp(Ity_I64); /*  RDI  */
4211   IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
4212   IRExpr *incd, *incs;
4213
4214   if (haveASO(pfx)) {
4215      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4216      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4217   } else {
4218      assign( td, getIReg64(R_RDI) );
4219      assign( ts, getIReg64(R_RSI) );
4220   }
4221
4222   assign( tdv, loadLE(ty,mkexpr(td)) );
4223
4224   assign( tsv, loadLE(ty,mkexpr(ts)) );
4225
4226   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
4227
4228   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4229   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4230   if (haveASO(pfx)) {
4231      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4232      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4233   }
4234   putIReg64( R_RDI, incd );
4235   putIReg64( R_RSI, incs );
4236}
4237
4238static
4239void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
4240{
4241   IRType ty  = szToITy(sz);
4242   IRTemp ta  = newTemp(ty);       /*  rAX  */
4243   IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
4244   IRTemp tdv = newTemp(ty);       /* (RDI) */
4245   IRExpr *incd;
4246
4247   assign( ta, getIRegRAX(sz) );
4248
4249   if (haveASO(pfx))
4250      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4251   else
4252      assign( td, getIReg64(R_RDI) );
4253
4254   assign( tdv, loadLE(ty,mkexpr(td)) );
4255
4256   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
4257
4258   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4259   if (haveASO(pfx))
4260      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4261   putIReg64( R_RDI, incd );
4262}
4263
4264
4265/* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
4266   the insn is the last one in the basic block, and so emit a jump to
4267   the next insn, rather than just falling through. */
4268static
4269void dis_REP_op ( AMD64Condcode cond,
4270                  void (*dis_OP)(Int, IRTemp, Prefix),
4271                  Int sz, Addr64 rip, Addr64 rip_next, HChar* name,
4272                  Prefix pfx )
4273{
4274   IRTemp t_inc = newTemp(Ity_I64);
4275   IRTemp tc;
4276   IRExpr* cmp;
4277
4278   /* Really we ought to inspect the override prefixes, but we don't.
4279      The following assertion catches any resulting sillyness. */
4280   vassert(pfx == clearSegBits(pfx));
4281
4282   if (haveASO(pfx)) {
4283      tc = newTemp(Ity_I32);  /*  ECX  */
4284      assign( tc, getIReg32(R_RCX) );
4285      cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
4286   } else {
4287      tc = newTemp(Ity_I64);  /*  RCX  */
4288      assign( tc, getIReg64(R_RCX) );
4289      cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
4290   }
4291
4292   stmt( IRStmt_Exit( cmp, Ijk_Boring, IRConst_U64(rip_next) ) );
4293
4294   if (haveASO(pfx))
4295      putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
4296  else
4297      putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
4298
4299   dis_string_op_increment(sz, t_inc);
4300   dis_OP (sz, t_inc, pfx);
4301
4302   if (cond == AMD64CondAlways) {
4303      jmp_lit(Ijk_Boring,rip);
4304   } else {
4305      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
4306                         Ijk_Boring,
4307                         IRConst_U64(rip) ) );
4308      jmp_lit(Ijk_Boring,rip_next);
4309   }
4310   DIP("%s%c\n", name, nameISize(sz));
4311}
4312
4313
4314/*------------------------------------------------------------*/
4315/*--- Arithmetic, etc.                                     ---*/
4316/*------------------------------------------------------------*/
4317
4318/* IMUL E, G.  Supplied eip points to the modR/M byte. */
4319static
4320ULong dis_mul_E_G ( VexAbiInfo* vbi,
4321                    Prefix      pfx,
4322                    Int         size,
4323                    Long        delta0 )
4324{
4325   Int    alen;
4326   HChar  dis_buf[50];
4327   UChar  rm = getUChar(delta0);
4328   IRType ty = szToITy(size);
4329   IRTemp te = newTemp(ty);
4330   IRTemp tg = newTemp(ty);
4331   IRTemp resLo = newTemp(ty);
4332
4333   assign( tg, getIRegG(size, pfx, rm) );
4334   if (epartIsReg(rm)) {
4335      assign( te, getIRegE(size, pfx, rm) );
4336   } else {
4337      IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
4338      assign( te, loadLE(ty,mkexpr(addr)) );
4339   }
4340
4341   setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
4342
4343   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
4344
4345   putIRegG(size, pfx, rm, mkexpr(resLo) );
4346
4347   if (epartIsReg(rm)) {
4348      DIP("imul%c %s, %s\n", nameISize(size),
4349                             nameIRegE(size,pfx,rm),
4350                             nameIRegG(size,pfx,rm));
4351      return 1+delta0;
4352   } else {
4353      DIP("imul%c %s, %s\n", nameISize(size),
4354                             dis_buf,
4355                             nameIRegG(size,pfx,rm));
4356      return alen+delta0;
4357   }
4358}
4359
4360
4361/* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
4362static
4363ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
4364                       Prefix      pfx,
4365                       Int         size,
4366                       Long        delta,
4367                       Int         litsize )
4368{
4369   Long   d64;
4370   Int    alen;
4371   HChar  dis_buf[50];
4372   UChar  rm = getUChar(delta);
4373   IRType ty = szToITy(size);
4374   IRTemp te = newTemp(ty);
4375   IRTemp tl = newTemp(ty);
4376   IRTemp resLo = newTemp(ty);
4377
4378   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);
4379
4380   if (epartIsReg(rm)) {
4381      assign(te, getIRegE(size, pfx, rm));
4382      delta++;
4383   } else {
4384      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
4385                                     imin(4,litsize) );
4386      assign(te, loadLE(ty, mkexpr(addr)));
4387      delta += alen;
4388   }
4389   d64 = getSDisp(imin(4,litsize),delta);
4390   delta += imin(4,litsize);
4391
4392   d64 &= mkSizeMask(size);
4393   assign(tl, mkU(ty,d64));
4394
4395   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
4396
4397   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );
4398
4399   putIRegG(size, pfx, rm, mkexpr(resLo));
4400
4401   DIP("imul%c $%lld, %s, %s\n",
4402       nameISize(size), d64,
4403       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
4404       nameIRegG(size,pfx,rm) );
4405   return delta;
4406}
4407
4408
4409/* Generate an IR sequence to do a popcount operation on the supplied
4410   IRTemp, and return a new IRTemp holding the result.  'ty' may be
4411   Ity_I16, Ity_I32 or Ity_I64 only. */
4412static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
4413{
4414   Int i;
4415   if (ty == Ity_I16) {
4416      IRTemp old = IRTemp_INVALID;
4417      IRTemp nyu = IRTemp_INVALID;
4418      IRTemp mask[4], shift[4];
4419      for (i = 0; i < 4; i++) {
4420         mask[i]  = newTemp(ty);
4421         shift[i] = 1 << i;
4422      }
4423      assign(mask[0], mkU16(0x5555));
4424      assign(mask[1], mkU16(0x3333));
4425      assign(mask[2], mkU16(0x0F0F));
4426      assign(mask[3], mkU16(0x00FF));
4427      old = src;
4428      for (i = 0; i < 4; i++) {
4429         nyu = newTemp(ty);
4430         assign(nyu,
4431                binop(Iop_Add16,
4432                      binop(Iop_And16,
4433                            mkexpr(old),
4434                            mkexpr(mask[i])),
4435                      binop(Iop_And16,
4436                            binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
4437                            mkexpr(mask[i]))));
4438         old = nyu;
4439      }
4440      return nyu;
4441   }
4442   if (ty == Ity_I32) {
4443      IRTemp old = IRTemp_INVALID;
4444      IRTemp nyu = IRTemp_INVALID;
4445      IRTemp mask[5], shift[5];
4446      for (i = 0; i < 5; i++) {
4447         mask[i]  = newTemp(ty);
4448         shift[i] = 1 << i;
4449      }
4450      assign(mask[0], mkU32(0x55555555));
4451      assign(mask[1], mkU32(0x33333333));
4452      assign(mask[2], mkU32(0x0F0F0F0F));
4453      assign(mask[3], mkU32(0x00FF00FF));
4454      assign(mask[4], mkU32(0x0000FFFF));
4455      old = src;
4456      for (i = 0; i < 5; i++) {
4457         nyu = newTemp(ty);
4458         assign(nyu,
4459                binop(Iop_Add32,
4460                      binop(Iop_And32,
4461                            mkexpr(old),
4462                            mkexpr(mask[i])),
4463                      binop(Iop_And32,
4464                            binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
4465                            mkexpr(mask[i]))));
4466         old = nyu;
4467      }
4468      return nyu;
4469   }
4470   if (ty == Ity_I64) {
4471      IRTemp old = IRTemp_INVALID;
4472      IRTemp nyu = IRTemp_INVALID;
4473      IRTemp mask[6], shift[6];
4474      for (i = 0; i < 6; i++) {
4475         mask[i]  = newTemp(ty);
4476         shift[i] = 1 << i;
4477      }
4478      assign(mask[0], mkU64(0x5555555555555555ULL));
4479      assign(mask[1], mkU64(0x3333333333333333ULL));
4480      assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
4481      assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
4482      assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
4483      assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
4484      old = src;
4485      for (i = 0; i < 6; i++) {
4486         nyu = newTemp(ty);
4487         assign(nyu,
4488                binop(Iop_Add64,
4489                      binop(Iop_And64,
4490                            mkexpr(old),
4491                            mkexpr(mask[i])),
4492                      binop(Iop_And64,
4493                            binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
4494                            mkexpr(mask[i]))));
4495         old = nyu;
4496      }
4497      return nyu;
4498   }
4499   /*NOTREACHED*/
4500   vassert(0);
4501}
4502
4503
4504/* Generate an IR sequence to do a count-leading-zeroes operation on
4505   the supplied IRTemp, and return a new IRTemp holding the result.
4506   'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
4507   the argument is zero, return the number of bits in the word (the
4508   natural semantics). */
4509static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
4510{
4511   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
4512
4513   IRTemp src64 = newTemp(Ity_I64);
4514   assign(src64, widenUto64( mkexpr(src) ));
4515
4516   IRTemp src64x = newTemp(Ity_I64);
4517   assign(src64x,
4518          binop(Iop_Shl64, mkexpr(src64),
4519                           mkU8(64 - 8 * sizeofIRType(ty))));
4520
4521   // Clz64 has undefined semantics when its input is zero, so
4522   // special-case around that.
4523   IRTemp res64 = newTemp(Ity_I64);
4524   assign(res64,
4525          IRExpr_Mux0X(
4526             unop(Iop_1Uto8,
4527                  binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0))),
4528             unop(Iop_Clz64, mkexpr(src64x)),
4529             mkU64(8 * sizeofIRType(ty))
4530   ));
4531
4532   IRTemp res = newTemp(ty);
4533   assign(res, narrowTo(ty, mkexpr(res64)));
4534   return res;
4535}
4536
4537
4538/*------------------------------------------------------------*/
4539/*---                                                      ---*/
4540/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
4541/*---                                                      ---*/
4542/*------------------------------------------------------------*/
4543
4544/* --- Helper functions for dealing with the register stack. --- */
4545
4546/* --- Set the emulation-warning pseudo-register. --- */
4547
4548static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
4549{
4550   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
4551   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
4552}
4553
4554/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
4555
4556static IRExpr* mkQNaN64 ( void )
4557{
4558  /* QNaN is 0 2047 1 0(51times)
4559     == 0b 11111111111b 1 0(51times)
4560     == 0x7FF8 0000 0000 0000
4561   */
4562   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
4563}
4564
4565/* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
4566
4567static IRExpr* get_ftop ( void )
4568{
4569   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
4570}
4571
4572static void put_ftop ( IRExpr* e )
4573{
4574   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
4575   stmt( IRStmt_Put( OFFB_FTOP, e ) );
4576}
4577
4578/* --------- Get/put the C3210 bits. --------- */
4579
4580static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
4581{
4582   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
4583}
4584
4585static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
4586{
4587   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
4588   stmt( IRStmt_Put( OFFB_FC3210, e ) );
4589}
4590
4591/* --------- Get/put the FPU rounding mode. --------- */
4592static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
4593{
4594   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
4595}
4596
4597static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
4598{
4599   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
4600   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
4601}
4602
4603
4604/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
4605/* Produces a value in 0 .. 3, which is encoded as per the type
4606   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
4607   per IRRoundingMode, we merely need to get it and mask it for
4608   safety.
4609*/
4610static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
4611{
4612   return binop( Iop_And32, get_fpround(), mkU32(3) );
4613}
4614
4615static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
4616{
4617   return mkU32(Irrm_NEAREST);
4618}
4619
4620
4621/* --------- Get/set FP register tag bytes. --------- */
4622
4623/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
4624
4625static void put_ST_TAG ( Int i, IRExpr* value )
4626{
4627   IRRegArray* descr;
4628   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
4629   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
4630   stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
4631}
4632
4633/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
4634   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
4635
4636static IRExpr* get_ST_TAG ( Int i )
4637{
4638   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
4639   return IRExpr_GetI( descr, get_ftop(), i );
4640}
4641
4642
4643/* --------- Get/set FP registers. --------- */
4644
4645/* Given i, and some expression e, emit 'ST(i) = e' and set the
4646   register's tag to indicate the register is full.  The previous
4647   state of the register is not checked. */
4648
4649static void put_ST_UNCHECKED ( Int i, IRExpr* value )
4650{
4651   IRRegArray* descr;
4652   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
4653   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
4654   stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
4655   /* Mark the register as in-use. */
4656   put_ST_TAG(i, mkU8(1));
4657}
4658
4659/* Given i, and some expression e, emit
4660      ST(i) = is_full(i) ? NaN : e
4661   and set the tag accordingly.
4662*/
4663
4664static void put_ST ( Int i, IRExpr* value )
4665{
4666   put_ST_UNCHECKED( i,
4667                     IRExpr_Mux0X( get_ST_TAG(i),
4668                                   /* 0 means empty */
4669                                   value,
4670                                   /* non-0 means full */
4671                                   mkQNaN64()
4672                   )
4673   );
4674}
4675
4676
4677/* Given i, generate an expression yielding 'ST(i)'. */
4678
4679static IRExpr* get_ST_UNCHECKED ( Int i )
4680{
4681   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
4682   return IRExpr_GetI( descr, get_ftop(), i );
4683}
4684
4685
4686/* Given i, generate an expression yielding
4687  is_full(i) ? ST(i) : NaN
4688*/
4689
4690static IRExpr* get_ST ( Int i )
4691{
4692   return
4693      IRExpr_Mux0X( get_ST_TAG(i),
4694                    /* 0 means empty */
4695                    mkQNaN64(),
4696                    /* non-0 means full */
4697                    get_ST_UNCHECKED(i));
4698}
4699
4700
4701/* Adjust FTOP downwards by one register. */
4702
4703static void fp_push ( void )
4704{
4705   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
4706}
4707
4708/* Adjust FTOP upwards by one register, and mark the vacated register
4709   as empty.  */
4710
4711static void fp_pop ( void )
4712{
4713   put_ST_TAG(0, mkU8(0));
4714   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
4715}
4716
4717/* Clear the C2 bit of the FPU status register, for
4718   sin/cos/tan/sincos. */
4719
4720static void clear_C2 ( void )
4721{
4722   put_C3210( binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)) );
4723}
4724
4725/* Invent a plausible-looking FPU status word value:
4726      ((ftop & 7) << 11) | (c3210 & 0x4700)
4727 */
4728static IRExpr* get_FPU_sw ( void )
4729{
4730   return
4731      unop(Iop_32to16,
4732           binop(Iop_Or32,
4733                 binop(Iop_Shl32,
4734                       binop(Iop_And32, get_ftop(), mkU32(7)),
4735                             mkU8(11)),
4736                       binop(Iop_And32, unop(Iop_64to32, get_C3210()),
4737                                        mkU32(0x4700))
4738      ));
4739}
4740
4741
4742/* ------------------------------------------------------- */
4743/* Given all that stack-mangling junk, we can now go ahead
4744   and describe FP instructions.
4745*/
4746
4747/* ST(0) = ST(0) `op` mem64/32(addr)
4748   Need to check ST(0)'s tag on read, but not on write.
4749*/
4750static
4751void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
4752                         IROp op, Bool dbl )
4753{
4754   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
4755   if (dbl) {
4756      put_ST_UNCHECKED(0,
4757         triop( op,
4758                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4759                get_ST(0),
4760                loadLE(Ity_F64,mkexpr(addr))
4761         ));
4762   } else {
4763      put_ST_UNCHECKED(0,
4764         triop( op,
4765                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4766                get_ST(0),
4767                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
4768         ));
4769   }
4770}
4771
4772
4773/* ST(0) = mem64/32(addr) `op` ST(0)
4774   Need to check ST(0)'s tag on read, but not on write.
4775*/
4776static
4777void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
4778                            IROp op, Bool dbl )
4779{
4780   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
4781   if (dbl) {
4782      put_ST_UNCHECKED(0,
4783         triop( op,
4784                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4785                loadLE(Ity_F64,mkexpr(addr)),
4786                get_ST(0)
4787         ));
4788   } else {
4789      put_ST_UNCHECKED(0,
4790         triop( op,
4791                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4792                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
4793                get_ST(0)
4794         ));
4795   }
4796}
4797
4798
4799/* ST(dst) = ST(dst) `op` ST(src).
4800   Check dst and src tags when reading but not on write.
4801*/
4802static
4803void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
4804                      Bool pop_after )
4805{
4806   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
4807   put_ST_UNCHECKED(
4808      st_dst,
4809      triop( op,
4810             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4811             get_ST(st_dst),
4812             get_ST(st_src) )
4813   );
4814   if (pop_after)
4815      fp_pop();
4816}
4817
4818/* ST(dst) = ST(src) `op` ST(dst).
4819   Check dst and src tags when reading but not on write.
4820*/
4821static
4822void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
4823                         Bool pop_after )
4824{
4825   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
4826   put_ST_UNCHECKED(
4827      st_dst,
4828      triop( op,
4829             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4830             get_ST(st_src),
4831             get_ST(st_dst) )
4832   );
4833   if (pop_after)
4834      fp_pop();
4835}
4836
4837/* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
4838static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
4839{
4840   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
4841   /* This is a bit of a hack (and isn't really right).  It sets
4842      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
4843      documentation implies A and S are unchanged.
4844   */
4845   /* It's also fishy in that it is used both for COMIP and
4846      UCOMIP, and they aren't the same (although similar). */
4847   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
4848   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
4849   stmt( IRStmt_Put(
4850            OFFB_CC_DEP1,
4851            binop( Iop_And64,
4852                   unop( Iop_32Uto64,
4853                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
4854                   mkU64(0x45)
4855        )));
4856   if (pop_after)
4857      fp_pop();
4858}
4859
4860
4861/* returns
4862   32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
4863*/
4864static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
4865{
4866   IRTemp t32 = newTemp(Ity_I32);
4867   assign( t32, e32 );
4868   return
4869      IRExpr_Mux0X(
4870         unop(Iop_1Uto8,
4871              binop(Iop_CmpLT64U,
4872                    unop(Iop_32Uto64,
4873                         binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
4874                    mkU64(65536))),
4875         mkU16( 0x8000 ),
4876         unop(Iop_32to16, mkexpr(t32)));
4877}
4878
4879
4880static
4881ULong dis_FPU ( /*OUT*/Bool* decode_ok,
4882                VexAbiInfo* vbi, Prefix pfx, Long delta )
4883{
4884   Int    len;
4885   UInt   r_src, r_dst;
4886   HChar  dis_buf[50];
4887   IRTemp t1, t2;
4888
4889   /* On entry, delta points at the second byte of the insn (the modrm
4890      byte).*/
4891   UChar first_opcode = getUChar(delta-1);
4892   UChar modrm        = getUChar(delta+0);
4893
4894   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
4895
4896   if (first_opcode == 0xD8) {
4897      if (modrm < 0xC0) {
4898
4899         /* bits 5,4,3 are an opcode extension, and the modRM also
4900           specifies an address. */
4901         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
4902         delta += len;
4903
4904         switch (gregLO3ofRM(modrm)) {
4905
4906            case 0: /* FADD single-real */
4907               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
4908               break;
4909
4910            case 1: /* FMUL single-real */
4911               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
4912               break;
4913
4914//..             case 2: /* FCOM single-real */
4915//..                DIP("fcoms %s\n", dis_buf);
4916//..                /* This forces C1 to zero, which isn't right. */
4917//..                put_C3210(
4918//..                    binop( Iop_And32,
4919//..                           binop(Iop_Shl32,
4920//..                                 binop(Iop_CmpF64,
4921//..                                       get_ST(0),
4922//..                                       unop(Iop_F32toF64,
4923//..                                            loadLE(Ity_F32,mkexpr(addr)))),
4924//..                                 mkU8(8)),
4925//..                           mkU32(0x4500)
4926//..                    ));
4927//..                break;
4928//..
4929//..             case 3: /* FCOMP single-real */
4930//..                DIP("fcomps %s\n", dis_buf);
4931//..                /* This forces C1 to zero, which isn't right. */
4932//..                put_C3210(
4933//..                    binop( Iop_And32,
4934//..                           binop(Iop_Shl32,
4935//..                                 binop(Iop_CmpF64,
4936//..                                       get_ST(0),
4937//..                                       unop(Iop_F32toF64,
4938//..                                            loadLE(Ity_F32,mkexpr(addr)))),
4939//..                                 mkU8(8)),
4940//..                           mkU32(0x4500)
4941//..                    ));
4942//..                fp_pop();
4943//..                break;
4944
4945            case 4: /* FSUB single-real */
4946               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
4947               break;
4948
4949            case 5: /* FSUBR single-real */
4950               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
4951               break;
4952
4953            case 6: /* FDIV single-real */
4954               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
4955               break;
4956
4957            case 7: /* FDIVR single-real */
4958               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
4959               break;
4960
4961            default:
4962               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
4963               vex_printf("first_opcode == 0xD8\n");
4964               goto decode_fail;
4965         }
4966      } else {
4967         delta++;
4968         switch (modrm) {
4969
4970            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
4971               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
4972               break;
4973
4974            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
4975               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
4976               break;
4977
4978            /* Dunno if this is right */
4979            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
4980               r_dst = (UInt)modrm - 0xD0;
4981               DIP("fcom %%st(0),%%st(%d)\n", r_dst);
4982               /* This forces C1 to zero, which isn't right. */
4983               put_C3210(
4984                   unop(Iop_32Uto64,
4985                   binop( Iop_And32,
4986                          binop(Iop_Shl32,
4987                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
4988                                mkU8(8)),
4989                          mkU32(0x4500)
4990                   )));
4991               break;
4992
4993            /* Dunno if this is right */
4994            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
4995               r_dst = (UInt)modrm - 0xD8;
4996               DIP("fcomp %%st(0),%%st(%d)\n", r_dst);
4997               /* This forces C1 to zero, which isn't right. */
4998               put_C3210(
4999                   unop(Iop_32Uto64,
5000                   binop( Iop_And32,
5001                          binop(Iop_Shl32,
5002                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5003                                mkU8(8)),
5004                          mkU32(0x4500)
5005                   )));
5006               fp_pop();
5007               break;
5008
5009            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
5010               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
5011               break;
5012
5013            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
5014               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
5015               break;
5016
5017            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
5018               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
5019               break;
5020
5021            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
5022               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
5023               break;
5024
5025            default:
5026               goto decode_fail;
5027         }
5028      }
5029   }
5030
5031   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
5032   else
5033   if (first_opcode == 0xD9) {
5034      if (modrm < 0xC0) {
5035
5036         /* bits 5,4,3 are an opcode extension, and the modRM also
5037            specifies an address. */
5038         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5039         delta += len;
5040
5041         switch (gregLO3ofRM(modrm)) {
5042
5043            case 0: /* FLD single-real */
5044               DIP("flds %s\n", dis_buf);
5045               fp_push();
5046               put_ST(0, unop(Iop_F32toF64,
5047                              loadLE(Ity_F32, mkexpr(addr))));
5048               break;
5049
5050            case 2: /* FST single-real */
5051               DIP("fsts %s\n", dis_buf);
5052               storeLE(mkexpr(addr),
5053                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
5054               break;
5055
5056            case 3: /* FSTP single-real */
5057               DIP("fstps %s\n", dis_buf);
5058               storeLE(mkexpr(addr),
5059                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
5060               fp_pop();
5061               break;
5062
5063            case 4: { /* FLDENV m28 */
5064               /* Uses dirty helper:
                     VexEmWarn amd64g_do_FLDENV ( VexGuestAMD64State*, HWord ) */
5066               IRTemp    ew = newTemp(Ity_I32);
5067               IRTemp   w64 = newTemp(Ity_I64);
5068               IRDirty*   d = unsafeIRDirty_0_N (
5069                                 0/*regparms*/,
5070                                 "amd64g_dirtyhelper_FLDENV",
5071                                 &amd64g_dirtyhelper_FLDENV,
5072                                 mkIRExprVec_1( mkexpr(addr) )
5073                              );
5074               d->needsBBP = True;
5075               d->tmp      = w64;
5076               /* declare we're reading memory */
5077               d->mFx   = Ifx_Read;
5078               d->mAddr = mkexpr(addr);
5079               d->mSize = 28;
5080
5081               /* declare we're writing guest state */
5082               d->nFxState = 4;
5083
5084               d->fxState[0].fx     = Ifx_Write;
5085               d->fxState[0].offset = OFFB_FTOP;
5086               d->fxState[0].size   = sizeof(UInt);
5087
5088               d->fxState[1].fx     = Ifx_Write;
5089               d->fxState[1].offset = OFFB_FPTAGS;
5090               d->fxState[1].size   = 8 * sizeof(UChar);
5091
5092               d->fxState[2].fx     = Ifx_Write;
5093               d->fxState[2].offset = OFFB_FPROUND;
5094               d->fxState[2].size   = sizeof(ULong);
5095
5096               d->fxState[3].fx     = Ifx_Write;
5097               d->fxState[3].offset = OFFB_FC3210;
5098               d->fxState[3].size   = sizeof(ULong);
5099
5100               stmt( IRStmt_Dirty(d) );
5101
5102               /* ew contains any emulation warning we may need to
5103                  issue.  If needed, side-exit to the next insn,
5104                  reporting the warning, so that Valgrind's dispatcher
5105                  sees the warning. */
5106	       assign(ew, unop(Iop_64to32,mkexpr(w64)) );
5107               put_emwarn( mkexpr(ew) );
5108               stmt(
5109                  IRStmt_Exit(
5110                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
5111                     Ijk_EmWarn,
5112                     IRConst_U64( guest_RIP_bbstart+delta )
5113                  )
5114               );
5115
5116               DIP("fldenv %s\n", dis_buf);
5117               break;
5118            }
5119
5120            case 5: {/* FLDCW */
5121               /* The only thing we observe in the control word is the
5122                  rounding mode.  Therefore, pass the 16-bit value
5123                  (x87 native-format control word) to a clean helper,
5124                  getting back a 64-bit value, the lower half of which
5125                  is the FPROUND value to store, and the upper half of
5126                  which is the emulation-warning token which may be
5127                  generated.
5128               */
5129               /* ULong amd64h_check_fldcw ( ULong ); */
5130               IRTemp t64 = newTemp(Ity_I64);
5131               IRTemp ew = newTemp(Ity_I32);
5132               DIP("fldcw %s\n", dis_buf);
5133               assign( t64, mkIRExprCCall(
5134                               Ity_I64, 0/*regparms*/,
5135                               "amd64g_check_fldcw",
5136                               &amd64g_check_fldcw,
5137                               mkIRExprVec_1(
5138                                  unop( Iop_16Uto64,
5139                                        loadLE(Ity_I16, mkexpr(addr)))
5140                               )
5141                            )
5142                     );
5143
5144               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
5145               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
5146               put_emwarn( mkexpr(ew) );
5147               /* Finally, if an emulation warning was reported,
5148                  side-exit to the next insn, reporting the warning,
5149                  so that Valgrind's dispatcher sees the warning. */
5150               stmt(
5151                  IRStmt_Exit(
5152                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
5153                     Ijk_EmWarn,
5154                     IRConst_U64( guest_RIP_bbstart+delta )
5155                  )
5156               );
5157               break;
5158            }
5159
5160            case 6: { /* FNSTENV m28 */
5161               /* Uses dirty helper:
5162                     void amd64g_do_FSTENV ( VexGuestAMD64State*, HWord ) */
5163               IRDirty* d = unsafeIRDirty_0_N (
5164                               0/*regparms*/,
5165                               "amd64g_dirtyhelper_FSTENV",
5166                               &amd64g_dirtyhelper_FSTENV,
5167                               mkIRExprVec_1( mkexpr(addr) )
5168                            );
5169               d->needsBBP = True;
5170               /* declare we're writing memory */
5171               d->mFx   = Ifx_Write;
5172               d->mAddr = mkexpr(addr);
5173               d->mSize = 28;
5174
5175               /* declare we're reading guest state */
5176               d->nFxState = 4;
5177
5178               d->fxState[0].fx     = Ifx_Read;
5179               d->fxState[0].offset = OFFB_FTOP;
5180               d->fxState[0].size   = sizeof(UInt);
5181
5182               d->fxState[1].fx     = Ifx_Read;
5183               d->fxState[1].offset = OFFB_FPTAGS;
5184               d->fxState[1].size   = 8 * sizeof(UChar);
5185
5186               d->fxState[2].fx     = Ifx_Read;
5187               d->fxState[2].offset = OFFB_FPROUND;
5188               d->fxState[2].size   = sizeof(ULong);
5189
5190               d->fxState[3].fx     = Ifx_Read;
5191               d->fxState[3].offset = OFFB_FC3210;
5192               d->fxState[3].size   = sizeof(ULong);
5193
5194               stmt( IRStmt_Dirty(d) );
5195
5196               DIP("fnstenv %s\n", dis_buf);
5197               break;
5198            }
5199
5200            case 7: /* FNSTCW */
5201               /* Fake up a native x87 FPU control word.  The only
5202                  thing it depends on is FPROUND[1:0], so call a clean
5203                  helper to cook it up. */
5204               /* ULong amd64g_create_fpucw ( ULong fpround ) */
5205               DIP("fnstcw %s\n", dis_buf);
5206               storeLE(
5207                  mkexpr(addr),
5208                  unop( Iop_64to16,
5209                        mkIRExprCCall(
5210                           Ity_I64, 0/*regp*/,
5211                           "amd64g_create_fpucw", &amd64g_create_fpucw,
5212                           mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
5213                        )
5214                  )
5215               );
5216               break;
5217
5218            default:
5219               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
5220               vex_printf("first_opcode == 0xD9\n");
5221               goto decode_fail;
5222         }
5223
5224      } else {
5225         delta++;
5226         switch (modrm) {
5227
5228            case 0xC0 ... 0xC7: /* FLD %st(?) */
5229               r_src = (UInt)modrm - 0xC0;
5230               DIP("fld %%st(%u)\n", r_src);
5231               t1 = newTemp(Ity_F64);
5232               assign(t1, get_ST(r_src));
5233               fp_push();
5234               put_ST(0, mkexpr(t1));
5235               break;
5236
5237            case 0xC8 ... 0xCF: /* FXCH %st(?) */
5238               r_src = (UInt)modrm - 0xC8;
5239               DIP("fxch %%st(%u)\n", r_src);
5240               t1 = newTemp(Ity_F64);
5241               t2 = newTemp(Ity_F64);
5242               assign(t1, get_ST(0));
5243               assign(t2, get_ST(r_src));
5244               put_ST_UNCHECKED(0, mkexpr(t2));
5245               put_ST_UNCHECKED(r_src, mkexpr(t1));
5246               break;
5247
5248            case 0xE0: /* FCHS */
5249               DIP("fchs\n");
5250               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
5251               break;
5252
5253            case 0xE1: /* FABS */
5254               DIP("fabs\n");
5255               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
5256               break;
5257
5258            case 0xE5: { /* FXAM */
5259               /* This is an interesting one.  It examines %st(0),
5260                  regardless of whether the tag says it's empty or not.
5261                  Here, just pass both the tag (in our format) and the
5262                  value (as a double, actually a ULong) to a helper
5263                  function. */
5264               IRExpr** args
5265                  = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
5266                                   unop(Iop_ReinterpF64asI64,
5267                                        get_ST_UNCHECKED(0)) );
5268               put_C3210(mkIRExprCCall(
5269                            Ity_I64,
5270                            0/*regparm*/,
5271                            "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
5272                            args
5273                        ));
5274               DIP("fxam\n");
5275               break;
5276            }
5277
5278            case 0xE8: /* FLD1 */
5279               DIP("fld1\n");
5280               fp_push();
5281               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
5282               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
5283               break;
5284
5285            case 0xE9: /* FLDL2T */
5286               DIP("fldl2t\n");
5287               fp_push();
5288               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
5289               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
5290               break;
5291
5292            case 0xEA: /* FLDL2E */
5293               DIP("fldl2e\n");
5294               fp_push();
5295               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
5296               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
5297               break;
5298
5299            case 0xEB: /* FLDPI */
5300               DIP("fldpi\n");
5301               fp_push();
5302               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
5303               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
5304               break;
5305
5306            case 0xEC: /* FLDLG2 */
5307               DIP("fldlg2\n");
5308               fp_push();
5309               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
5310               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
5311               break;
5312
5313            case 0xED: /* FLDLN2 */
5314               DIP("fldln2\n");
5315               fp_push();
5316               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
5317               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
5318               break;
5319
5320            case 0xEE: /* FLDZ */
5321               DIP("fldz\n");
5322               fp_push();
5323               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
5324               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
5325               break;
5326
5327            case 0xF0: /* F2XM1 */
5328               DIP("f2xm1\n");
5329               put_ST_UNCHECKED(0,
5330                  binop(Iop_2xm1F64,
5331                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5332                        get_ST(0)));
5333               break;
5334
5335            case 0xF1: /* FYL2X */
5336               DIP("fyl2x\n");
5337               put_ST_UNCHECKED(1,
5338                  triop(Iop_Yl2xF64,
5339                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5340                        get_ST(1),
5341                        get_ST(0)));
5342               fp_pop();
5343               break;
5344
5345            case 0xF2: /* FPTAN */
5346               DIP("ftan\n");
5347               put_ST_UNCHECKED(0,
5348                  binop(Iop_TanF64,
5349                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5350                        get_ST(0)));
5351               fp_push();
5352               put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
5353               clear_C2(); /* HACK */
5354               break;
5355
5356            case 0xF3: /* FPATAN */
5357               DIP("fpatan\n");
5358               put_ST_UNCHECKED(1,
5359                  triop(Iop_AtanF64,
5360                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5361                        get_ST(1),
5362                        get_ST(0)));
5363               fp_pop();
5364               break;
5365
            case 0xF4: { /* FXTRACT */
               /* Decompose ST(0): afterwards ST(1) holds the exponent
                  (as an F64) and ST(0) holds the significand.  Both
                  parts are produced by the same clean helper, selected
                  by its second argument (0 = significand, 1 = exponent),
                  which operates on the raw 64-bit image of the input. */
               IRTemp argF = newTemp(Ity_F64);
               IRTemp sigF = newTemp(Ity_F64);
               IRTemp expF = newTemp(Ity_F64);
               IRTemp argI = newTemp(Ity_I64);
               IRTemp sigI = newTemp(Ity_I64);
               IRTemp expI = newTemp(Ity_I64);
               DIP("fxtract\n");
               assign( argF, get_ST(0) );
               /* Reinterpret the F64 bits as I64 so the helper can pick
                  the fields apart without FP arithmetic. */
               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
               assign( sigI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(0)/*sig*/ ))
               );
               assign( expI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(1)/*exp*/ ))
               );
               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
               /* exponent */
               put_ST_UNCHECKED(0, mkexpr(expF) );
               fp_push();
               /* significand */
               put_ST(0, mkexpr(sigF) );
               break;
            }
5401
5402            case 0xF5: { /* FPREM1 -- IEEE compliant */
5403               IRTemp a1 = newTemp(Ity_F64);
5404               IRTemp a2 = newTemp(Ity_F64);
5405               DIP("fprem1\n");
5406               /* Do FPREM1 twice, once to get the remainder, and once
5407                  to get the C3210 flag values. */
5408               assign( a1, get_ST(0) );
5409               assign( a2, get_ST(1) );
5410               put_ST_UNCHECKED(0,
5411                  triop(Iop_PRem1F64,
5412                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5413                        mkexpr(a1),
5414                        mkexpr(a2)));
5415               put_C3210(
5416                  unop(Iop_32Uto64,
5417                  triop(Iop_PRem1C3210F64,
5418                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5419                        mkexpr(a1),
5420                        mkexpr(a2)) ));
5421               break;
5422            }
5423
5424            case 0xF7: /* FINCSTP */
5425               DIP("fincstp\n");
5426               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
5427               break;
5428
5429            case 0xF8: { /* FPREM -- not IEEE compliant */
5430               IRTemp a1 = newTemp(Ity_F64);
5431               IRTemp a2 = newTemp(Ity_F64);
5432               DIP("fprem\n");
5433               /* Do FPREM twice, once to get the remainder, and once
5434                  to get the C3210 flag values. */
5435               assign( a1, get_ST(0) );
5436               assign( a2, get_ST(1) );
5437               put_ST_UNCHECKED(0,
5438                  triop(Iop_PRemF64,
5439                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5440                        mkexpr(a1),
5441                        mkexpr(a2)));
5442               put_C3210(
5443                  unop(Iop_32Uto64,
5444                  triop(Iop_PRemC3210F64,
5445                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5446                        mkexpr(a1),
5447                        mkexpr(a2)) ));
5448               break;
5449            }
5450
5451            case 0xF9: /* FYL2XP1 */
5452               DIP("fyl2xp1\n");
5453               put_ST_UNCHECKED(1,
5454                  triop(Iop_Yl2xp1F64,
5455                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5456                        get_ST(1),
5457                        get_ST(0)));
5458               fp_pop();
5459               break;
5460
5461            case 0xFA: /* FSQRT */
5462               DIP("fsqrt\n");
5463               put_ST_UNCHECKED(0,
5464                  binop(Iop_SqrtF64,
5465                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5466                        get_ST(0)));
5467               break;
5468
5469            case 0xFB: { /* FSINCOS */
5470               IRTemp a1 = newTemp(Ity_F64);
5471               assign( a1, get_ST(0) );
5472               DIP("fsincos\n");
5473               put_ST_UNCHECKED(0,
5474                  binop(Iop_SinF64,
5475                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5476                        mkexpr(a1)));
5477               fp_push();
5478               put_ST(0,
5479                  binop(Iop_CosF64,
5480                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5481                        mkexpr(a1)));
5482               clear_C2(); /* HACK */
5483               break;
5484            }
5485
5486            case 0xFC: /* FRNDINT */
5487               DIP("frndint\n");
5488               put_ST_UNCHECKED(0,
5489                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
5490               break;
5491
5492            case 0xFD: /* FSCALE */
5493               DIP("fscale\n");
5494               put_ST_UNCHECKED(0,
5495                  triop(Iop_ScaleF64,
5496                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5497                        get_ST(0),
5498                        get_ST(1)));
5499               break;
5500
5501            case 0xFE: /* FSIN */
5502               DIP("fsin\n");
5503               put_ST_UNCHECKED(0,
5504                  binop(Iop_SinF64,
5505                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5506                        get_ST(0)));
5507               clear_C2(); /* HACK */
5508               break;
5509
5510            case 0xFF: /* FCOS */
5511               DIP("fcos\n");
5512               put_ST_UNCHECKED(0,
5513                  binop(Iop_CosF64,
5514                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5515                        get_ST(0)));
5516               clear_C2(); /* HACK */
5517               break;
5518
5519            default:
5520               goto decode_fail;
5521         }
5522      }
5523   }
5524
5525   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
5526   else
5527   if (first_opcode == 0xDA) {
5528
5529      if (modrm < 0xC0) {
5530
5531         /* bits 5,4,3 are an opcode extension, and the modRM also
5532            specifies an address. */
5533         IROp   fop;
5534         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5535         delta += len;
5536         switch (gregLO3ofRM(modrm)) {
5537
5538            case 0: /* FIADD m32int */ /* ST(0) += m32int */
5539               DIP("fiaddl %s\n", dis_buf);
5540               fop = Iop_AddF64;
5541               goto do_fop_m32;
5542
5543            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
5544               DIP("fimull %s\n", dis_buf);
5545               fop = Iop_MulF64;
5546               goto do_fop_m32;
5547
5548            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
5549               DIP("fisubl %s\n", dis_buf);
5550               fop = Iop_SubF64;
5551               goto do_fop_m32;
5552
5553            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
5554               DIP("fisubrl %s\n", dis_buf);
5555               fop = Iop_SubF64;
5556               goto do_foprev_m32;
5557
5558            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
5559               DIP("fisubl %s\n", dis_buf);
5560               fop = Iop_DivF64;
5561               goto do_fop_m32;
5562
5563            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
5564               DIP("fidivrl %s\n", dis_buf);
5565               fop = Iop_DivF64;
5566               goto do_foprev_m32;
5567
5568            do_fop_m32:
5569               put_ST_UNCHECKED(0,
5570                  triop(fop,
5571                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5572                        get_ST(0),
5573                        unop(Iop_I32StoF64,
5574                             loadLE(Ity_I32, mkexpr(addr)))));
5575               break;
5576
5577            do_foprev_m32:
5578               put_ST_UNCHECKED(0,
5579                  triop(fop,
5580                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5581                        unop(Iop_I32StoF64,
5582                             loadLE(Ity_I32, mkexpr(addr))),
5583                        get_ST(0)));
5584               break;
5585
5586            default:
5587               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
5588               vex_printf("first_opcode == 0xDA\n");
5589               goto decode_fail;
5590         }
5591
5592      } else {
5593
5594         delta++;
5595         switch (modrm) {
5596
5597            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
5598               r_src = (UInt)modrm - 0xC0;
5599               DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
5600               put_ST_UNCHECKED(0,
5601                                IRExpr_Mux0X(
5602                                    unop(Iop_1Uto8,
5603                                         mk_amd64g_calculate_condition(AMD64CondB)),
5604                                    get_ST(0), get_ST(r_src)) );
5605               break;
5606
5607            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
5608               r_src = (UInt)modrm - 0xC8;
5609               DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
5610               put_ST_UNCHECKED(0,
5611                                IRExpr_Mux0X(
5612                                    unop(Iop_1Uto8,
5613                                         mk_amd64g_calculate_condition(AMD64CondZ)),
5614                                    get_ST(0), get_ST(r_src)) );
5615               break;
5616
5617            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
5618               r_src = (UInt)modrm - 0xD0;
5619               DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
5620               put_ST_UNCHECKED(0,
5621                                IRExpr_Mux0X(
5622                                    unop(Iop_1Uto8,
5623                                         mk_amd64g_calculate_condition(AMD64CondBE)),
5624                                    get_ST(0), get_ST(r_src)) );
5625               break;
5626
5627            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
5628               r_src = (UInt)modrm - 0xD8;
5629               DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
5630               put_ST_UNCHECKED(0,
5631                                IRExpr_Mux0X(
5632                                    unop(Iop_1Uto8,
5633                                         mk_amd64g_calculate_condition(AMD64CondP)),
5634                                    get_ST(0), get_ST(r_src)) );
5635               break;
5636
            case 0xE9: /* FUCOMPP %st(0),%st(1) */
               DIP("fucompp %%st(0),%%st(1)\n");
               /* This forces C1 to zero, which isn't right. */
               /* The Iop_CmpF64 result, shifted left 8 and masked with
                  0x4500, appears to place the comparison outcome into
                  the x87 C3/C2/C0 bit positions -- same idiom as the
                  other FCOM-family cases in this file. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               /* Pop twice: FUCOMPP discards both operands. */
               fp_pop();
               fp_pop();
               break;
5651
5652            default:
5653               goto decode_fail;
5654         }
5655
5656      }
5657   }
5658
5659   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
5660   else
5661   if (first_opcode == 0xDB) {
5662      if (modrm < 0xC0) {
5663
5664         /* bits 5,4,3 are an opcode extension, and the modRM also
5665            specifies an address. */
5666         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5667         delta += len;
5668
5669         switch (gregLO3ofRM(modrm)) {
5670
5671            case 0: /* FILD m32int */
5672               DIP("fildl %s\n", dis_buf);
5673               fp_push();
5674               put_ST(0, unop(Iop_I32StoF64,
5675                              loadLE(Ity_I32, mkexpr(addr))));
5676               break;
5677
5678            case 1: /* FISTTPL m32 (SSE3) */
5679               DIP("fisttpl %s\n", dis_buf);
5680               storeLE( mkexpr(addr),
5681                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
5682               fp_pop();
5683               break;
5684
5685            case 2: /* FIST m32 */
5686               DIP("fistl %s\n", dis_buf);
5687               storeLE( mkexpr(addr),
5688                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
5689               break;
5690
5691            case 3: /* FISTP m32 */
5692               DIP("fistpl %s\n", dis_buf);
5693               storeLE( mkexpr(addr),
5694                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
5695               fp_pop();
5696               break;
5697
5698            case 5: { /* FLD extended-real */
5699               /* Uses dirty helper:
5700                     ULong amd64g_loadF80le ( ULong )
5701                  addr holds the address.  First, do a dirty call to
5702                  get hold of the data. */
5703               IRTemp   val  = newTemp(Ity_I64);
5704               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
5705
5706               IRDirty* d = unsafeIRDirty_1_N (
5707                               val,
5708                               0/*regparms*/,
5709                               "amd64g_dirtyhelper_loadF80le",
5710                               &amd64g_dirtyhelper_loadF80le,
5711                               args
5712                            );
5713               /* declare that we're reading memory */
5714               d->mFx   = Ifx_Read;
5715               d->mAddr = mkexpr(addr);
5716               d->mSize = 10;
5717
5718               /* execute the dirty call, dumping the result in val. */
5719               stmt( IRStmt_Dirty(d) );
5720               fp_push();
5721               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
5722
5723               DIP("fldt %s\n", dis_buf);
5724               break;
5725            }
5726
5727            case 7: { /* FSTP extended-real */
5728               /* Uses dirty helper:
5729                     void amd64g_storeF80le ( ULong addr, ULong data )
5730               */
5731               IRExpr** args
5732                  = mkIRExprVec_2( mkexpr(addr),
5733                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );
5734
5735               IRDirty* d = unsafeIRDirty_0_N (
5736                               0/*regparms*/,
5737                               "amd64g_dirtyhelper_storeF80le",
5738                               &amd64g_dirtyhelper_storeF80le,
5739                               args
5740                            );
5741               /* declare we're writing memory */
5742               d->mFx   = Ifx_Write;
5743               d->mAddr = mkexpr(addr);
5744               d->mSize = 10;
5745
5746               /* execute the dirty call. */
5747               stmt( IRStmt_Dirty(d) );
5748               fp_pop();
5749
5750               DIP("fstpt\n %s", dis_buf);
5751               break;
5752            }
5753
5754            default:
5755               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
5756               vex_printf("first_opcode == 0xDB\n");
5757               goto decode_fail;
5758         }
5759
5760      } else {
5761
5762         delta++;
5763         switch (modrm) {
5764
5765            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
5766               r_src = (UInt)modrm - 0xC0;
5767               DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
5768               put_ST_UNCHECKED(0,
5769                                IRExpr_Mux0X(
5770                                    unop(Iop_1Uto8,
5771                                         mk_amd64g_calculate_condition(AMD64CondNB)),
5772                                    get_ST(0), get_ST(r_src)) );
5773               break;
5774
5775            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
5776               r_src = (UInt)modrm - 0xC8;
5777               DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
5778               put_ST_UNCHECKED(
5779                  0,
5780                  IRExpr_Mux0X(
5781                     unop(Iop_1Uto8,
5782                          mk_amd64g_calculate_condition(AMD64CondNZ)),
5783                     get_ST(0),
5784                     get_ST(r_src)
5785                  )
5786               );
5787               break;
5788
5789            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
5790               r_src = (UInt)modrm - 0xD0;
5791               DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
5792               put_ST_UNCHECKED(
5793                  0,
5794                  IRExpr_Mux0X(
5795                     unop(Iop_1Uto8,
5796                          mk_amd64g_calculate_condition(AMD64CondNBE)),
5797                     get_ST(0),
5798                     get_ST(r_src)
5799                  )
5800               );
5801               break;
5802
5803            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
5804               r_src = (UInt)modrm - 0xD8;
5805               DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
5806               put_ST_UNCHECKED(
5807                  0,
5808                  IRExpr_Mux0X(
5809                     unop(Iop_1Uto8,
5810                          mk_amd64g_calculate_condition(AMD64CondNP)),
5811                     get_ST(0),
5812                     get_ST(r_src)
5813                  )
5814               );
5815               break;
5816
5817            case 0xE2:
5818               DIP("fnclex\n");
5819               break;
5820
5821            case 0xE3: {
5822               /* Uses dirty helper:
5823                     void amd64g_do_FINIT ( VexGuestAMD64State* ) */
5824               IRDirty* d  = unsafeIRDirty_0_N (
5825                                0/*regparms*/,
5826                                "amd64g_dirtyhelper_FINIT",
5827                                &amd64g_dirtyhelper_FINIT,
5828                                mkIRExprVec_0()
5829                             );
5830               d->needsBBP = True;
5831
5832               /* declare we're writing guest state */
5833               d->nFxState = 5;
5834
5835               d->fxState[0].fx     = Ifx_Write;
5836               d->fxState[0].offset = OFFB_FTOP;
5837               d->fxState[0].size   = sizeof(UInt);
5838
5839               d->fxState[1].fx     = Ifx_Write;
5840               d->fxState[1].offset = OFFB_FPREGS;
5841               d->fxState[1].size   = 8 * sizeof(ULong);
5842
5843               d->fxState[2].fx     = Ifx_Write;
5844               d->fxState[2].offset = OFFB_FPTAGS;
5845               d->fxState[2].size   = 8 * sizeof(UChar);
5846
5847               d->fxState[3].fx     = Ifx_Write;
5848               d->fxState[3].offset = OFFB_FPROUND;
5849               d->fxState[3].size   = sizeof(ULong);
5850
5851               d->fxState[4].fx     = Ifx_Write;
5852               d->fxState[4].offset = OFFB_FC3210;
5853               d->fxState[4].size   = sizeof(ULong);
5854
5855               stmt( IRStmt_Dirty(d) );
5856
5857               DIP("fninit\n");
5858               break;
5859            }
5860
5861            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
5862               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
5863               break;
5864
5865            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
5866               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
5867               break;
5868
5869            default:
5870               goto decode_fail;
5871         }
5872      }
5873   }
5874
5875   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
5876   else
5877   if (first_opcode == 0xDC) {
5878      if (modrm < 0xC0) {
5879
5880         /* bits 5,4,3 are an opcode extension, and the modRM also
5881            specifies an address. */
5882         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5883         delta += len;
5884
5885         switch (gregLO3ofRM(modrm)) {
5886
5887            case 0: /* FADD double-real */
5888               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
5889               break;
5890
5891            case 1: /* FMUL double-real */
5892               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
5893               break;
5894
5895//..             case 2: /* FCOM double-real */
5896//..                DIP("fcoml %s\n", dis_buf);
5897//..                /* This forces C1 to zero, which isn't right. */
5898//..                put_C3210(
5899//..                    binop( Iop_And32,
5900//..                           binop(Iop_Shl32,
5901//..                                 binop(Iop_CmpF64,
5902//..                                       get_ST(0),
5903//..                                       loadLE(Ity_F64,mkexpr(addr))),
5904//..                                 mkU8(8)),
5905//..                           mkU32(0x4500)
5906//..                    ));
5907//..                break;
5908
5909            case 3: /* FCOMP double-real */
5910               DIP("fcompl %s\n", dis_buf);
5911               /* This forces C1 to zero, which isn't right. */
5912               put_C3210(
5913                   unop(Iop_32Uto64,
5914                   binop( Iop_And32,
5915                          binop(Iop_Shl32,
5916                                binop(Iop_CmpF64,
5917                                      get_ST(0),
5918                                      loadLE(Ity_F64,mkexpr(addr))),
5919                                mkU8(8)),
5920                          mkU32(0x4500)
5921                   )));
5922               fp_pop();
5923               break;
5924
5925            case 4: /* FSUB double-real */
5926               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
5927               break;
5928
5929            case 5: /* FSUBR double-real */
5930               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
5931               break;
5932
5933            case 6: /* FDIV double-real */
5934               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
5935               break;
5936
5937            case 7: /* FDIVR double-real */
5938               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
5939               break;
5940
5941            default:
5942               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
5943               vex_printf("first_opcode == 0xDC\n");
5944               goto decode_fail;
5945         }
5946
5947      } else {
5948
5949         delta++;
5950         switch (modrm) {
5951
5952            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
5953               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
5954               break;
5955
5956            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
5957               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
5958               break;
5959
5960            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
5961               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
5962               break;
5963
5964            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
5965               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
5966               break;
5967
5968            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
5969               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
5970               break;
5971
5972            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
5973               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
5974               break;
5975
5976            default:
5977               goto decode_fail;
5978         }
5979
5980      }
5981   }
5982
5983   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
5984   else
5985   if (first_opcode == 0xDD) {
5986
5987      if (modrm < 0xC0) {
5988
5989         /* bits 5,4,3 are an opcode extension, and the modRM also
5990            specifies an address. */
5991         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5992         delta += len;
5993
5994         switch (gregLO3ofRM(modrm)) {
5995
5996            case 0: /* FLD double-real */
5997               DIP("fldl %s\n", dis_buf);
5998               fp_push();
5999               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
6000               break;
6001
6002            case 1: /* FISTTPQ m64 (SSE3) */
6003               DIP("fistppll %s\n", dis_buf);
6004               storeLE( mkexpr(addr),
6005                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
6006               fp_pop();
6007               break;
6008
6009            case 2: /* FST double-real */
6010               DIP("fstl %s\n", dis_buf);
6011               storeLE(mkexpr(addr), get_ST(0));
6012               break;
6013
6014            case 3: /* FSTP double-real */
6015               DIP("fstpl %s\n", dis_buf);
6016               storeLE(mkexpr(addr), get_ST(0));
6017               fp_pop();
6018               break;
6019
6020//..             case 4: { /* FRSTOR m108 */
6021//..                /* Uses dirty helper:
6022//..                      VexEmWarn x86g_do_FRSTOR ( VexGuestX86State*, Addr32 ) */
6023//..                IRTemp   ew = newTemp(Ity_I32);
6024//..                IRDirty* d  = unsafeIRDirty_0_N (
6025//..                                 0/*regparms*/,
6026//..                                 "x86g_dirtyhelper_FRSTOR",
6027//..                                 &x86g_dirtyhelper_FRSTOR,
6028//..                                 mkIRExprVec_1( mkexpr(addr) )
6029//..                              );
6030//..                d->needsBBP = True;
6031//..                d->tmp      = ew;
6032//..                /* declare we're reading memory */
6033//..                d->mFx   = Ifx_Read;
6034//..                d->mAddr = mkexpr(addr);
6035//..                d->mSize = 108;
6036//..
6037//..                /* declare we're writing guest state */
6038//..                d->nFxState = 5;
6039//..
6040//..                d->fxState[0].fx     = Ifx_Write;
6041//..                d->fxState[0].offset = OFFB_FTOP;
6042//..                d->fxState[0].size   = sizeof(UInt);
6043//..
6044//..                d->fxState[1].fx     = Ifx_Write;
6045//..                d->fxState[1].offset = OFFB_FPREGS;
6046//..                d->fxState[1].size   = 8 * sizeof(ULong);
6047//..
6048//..                d->fxState[2].fx     = Ifx_Write;
6049//..                d->fxState[2].offset = OFFB_FPTAGS;
6050//..                d->fxState[2].size   = 8 * sizeof(UChar);
6051//..
6052//..                d->fxState[3].fx     = Ifx_Write;
6053//..                d->fxState[3].offset = OFFB_FPROUND;
6054//..                d->fxState[3].size   = sizeof(UInt);
6055//..
6056//..                d->fxState[4].fx     = Ifx_Write;
6057//..                d->fxState[4].offset = OFFB_FC3210;
6058//..                d->fxState[4].size   = sizeof(UInt);
6059//..
6060//..                stmt( IRStmt_Dirty(d) );
6061//..
6062//..                /* ew contains any emulation warning we may need to
6063//..                   issue.  If needed, side-exit to the next insn,
6064//..                   reporting the warning, so that Valgrind's dispatcher
6065//..                   sees the warning. */
6066//..                put_emwarn( mkexpr(ew) );
6067//..                stmt(
6068//..                   IRStmt_Exit(
6069//..                      binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
6070//..                      Ijk_EmWarn,
6071//..                      IRConst_U32( ((Addr32)guest_eip_bbstart)+delta)
6072//..                   )
6073//..                );
6074//..
6075//..                DIP("frstor %s\n", dis_buf);
6076//..                break;
6077//..             }
6078//..
6079//..             case 6: { /* FNSAVE m108 */
6080//..                /* Uses dirty helper:
6081//..                      void x86g_do_FSAVE ( VexGuestX86State*, UInt ) */
6082//..                IRDirty* d = unsafeIRDirty_0_N (
6083//..                                0/*regparms*/,
6084//..                                "x86g_dirtyhelper_FSAVE",
6085//..                                &x86g_dirtyhelper_FSAVE,
6086//..                                mkIRExprVec_1( mkexpr(addr) )
6087//..                             );
6088//..                d->needsBBP = True;
6089//..                /* declare we're writing memory */
6090//..                d->mFx   = Ifx_Write;
6091//..                d->mAddr = mkexpr(addr);
6092//..                d->mSize = 108;
6093//..
6094//..                /* declare we're reading guest state */
6095//..                d->nFxState = 5;
6096//..
6097//..                d->fxState[0].fx     = Ifx_Read;
6098//..                d->fxState[0].offset = OFFB_FTOP;
6099//..                d->fxState[0].size   = sizeof(UInt);
6100//..
6101//..                d->fxState[1].fx     = Ifx_Read;
6102//..                d->fxState[1].offset = OFFB_FPREGS;
6103//..                d->fxState[1].size   = 8 * sizeof(ULong);
6104//..
6105//..                d->fxState[2].fx     = Ifx_Read;
6106//..                d->fxState[2].offset = OFFB_FPTAGS;
6107//..                d->fxState[2].size   = 8 * sizeof(UChar);
6108//..
6109//..                d->fxState[3].fx     = Ifx_Read;
6110//..                d->fxState[3].offset = OFFB_FPROUND;
6111//..                d->fxState[3].size   = sizeof(UInt);
6112//..
6113//..                d->fxState[4].fx     = Ifx_Read;
6114//..                d->fxState[4].offset = OFFB_FC3210;
6115//..                d->fxState[4].size   = sizeof(UInt);
6116//..
6117//..                stmt( IRStmt_Dirty(d) );
6118//..
6119//..                DIP("fnsave %s\n", dis_buf);
6120//..                break;
6121//..             }
6122
6123            case 7: { /* FNSTSW m16 */
6124               IRExpr* sw = get_FPU_sw();
6125               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
6126               storeLE( mkexpr(addr), sw );
6127               DIP("fnstsw %s\n", dis_buf);
6128               break;
6129            }
6130
6131            default:
6132               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6133               vex_printf("first_opcode == 0xDD\n");
6134               goto decode_fail;
6135         }
6136      } else {
6137         delta++;
6138         switch (modrm) {
6139
6140            case 0xC0 ... 0xC7: /* FFREE %st(?) */
6141               r_dst = (UInt)modrm - 0xC0;
6142               DIP("ffree %%st(%u)\n", r_dst);
6143               put_ST_TAG ( r_dst, mkU8(0) );
6144               break;
6145
6146            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
6147               r_dst = (UInt)modrm - 0xD0;
6148               DIP("fst %%st(0),%%st(%u)\n", r_dst);
6149               /* P4 manual says: "If the destination operand is a
6150                  non-empty register, the invalid-operation exception
6151                  is not generated.  Hence put_ST_UNCHECKED. */
6152               put_ST_UNCHECKED(r_dst, get_ST(0));
6153               break;
6154
6155            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
6156               r_dst = (UInt)modrm - 0xD8;
6157               DIP("fstp %%st(0),%%st(%u)\n", r_dst);
6158               /* P4 manual says: "If the destination operand is a
6159                  non-empty register, the invalid-operation exception
6160                  is not generated.  Hence put_ST_UNCHECKED. */
6161               put_ST_UNCHECKED(r_dst, get_ST(0));
6162               fp_pop();
6163               break;
6164
6165            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
6166               r_dst = (UInt)modrm - 0xE0;
6167               DIP("fucom %%st(0),%%st(%u)\n", r_dst);
6168               /* This forces C1 to zero, which isn't right. */
6169               put_C3210(
6170                   unop(Iop_32Uto64,
6171                   binop( Iop_And32,
6172                          binop(Iop_Shl32,
6173                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
6174                                mkU8(8)),
6175                          mkU32(0x4500)
6176                   )));
6177               break;
6178
6179            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
6180               r_dst = (UInt)modrm - 0xE8;
6181               DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
6182               /* This forces C1 to zero, which isn't right. */
6183               put_C3210(
6184                   unop(Iop_32Uto64,
6185                   binop( Iop_And32,
6186                          binop(Iop_Shl32,
6187                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
6188                                mkU8(8)),
6189                          mkU32(0x4500)
6190                   )));
6191               fp_pop();
6192               break;
6193
6194            default:
6195               goto decode_fail;
6196         }
6197      }
6198   }
6199
6200   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
6201   else
6202   if (first_opcode == 0xDE) {
6203
6204      if (modrm < 0xC0) {
6205
6206         /* bits 5,4,3 are an opcode extension, and the modRM also
6207            specifies an address. */
6208         IROp   fop;
6209         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6210         delta += len;
6211
6212         switch (gregLO3ofRM(modrm)) {
6213
6214            case 0: /* FIADD m16int */ /* ST(0) += m16int */
6215               DIP("fiaddw %s\n", dis_buf);
6216               fop = Iop_AddF64;
6217               goto do_fop_m16;
6218
6219            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
6220               DIP("fimulw %s\n", dis_buf);
6221               fop = Iop_MulF64;
6222               goto do_fop_m16;
6223
6224            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
6225               DIP("fisubw %s\n", dis_buf);
6226               fop = Iop_SubF64;
6227               goto do_fop_m16;
6228
6229            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
6230               DIP("fisubrw %s\n", dis_buf);
6231               fop = Iop_SubF64;
6232               goto do_foprev_m16;
6233
6234            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
6235               DIP("fisubw %s\n", dis_buf);
6236               fop = Iop_DivF64;
6237               goto do_fop_m16;
6238
6239            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
6240               DIP("fidivrw %s\n", dis_buf);
6241               fop = Iop_DivF64;
6242               goto do_foprev_m16;
6243
6244            do_fop_m16:
6245               put_ST_UNCHECKED(0,
6246                  triop(fop,
6247                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6248                        get_ST(0),
6249                        unop(Iop_I32StoF64,
6250                             unop(Iop_16Sto32,
6251                                  loadLE(Ity_I16, mkexpr(addr))))));
6252               break;
6253
6254            do_foprev_m16:
6255               put_ST_UNCHECKED(0,
6256                  triop(fop,
6257                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6258                        unop(Iop_I32StoF64,
6259                             unop(Iop_16Sto32,
6260                                  loadLE(Ity_I16, mkexpr(addr)))),
6261                        get_ST(0)));
6262               break;
6263
6264            default:
6265               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6266               vex_printf("first_opcode == 0xDE\n");
6267               goto decode_fail;
6268         }
6269
6270      } else {
6271
6272         delta++;
6273         switch (modrm) {
6274
6275            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
6276               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
6277               break;
6278
6279            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
6280               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
6281               break;
6282
6283            case 0xD9: /* FCOMPP %st(0),%st(1) */
6284               DIP("fcompp %%st(0),%%st(1)\n");
6285               /* This forces C1 to zero, which isn't right. */
6286               put_C3210(
6287                   unop(Iop_32Uto64,
6288                   binop( Iop_And32,
6289                          binop(Iop_Shl32,
6290                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
6291                                mkU8(8)),
6292                          mkU32(0x4500)
6293                   )));
6294               fp_pop();
6295               fp_pop();
6296               break;
6297
6298            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
6299               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
6300               break;
6301
6302            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
6303               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
6304               break;
6305
6306            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
6307               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
6308               break;
6309
6310            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
6311               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
6312               break;
6313
6314            default:
6315               goto decode_fail;
6316         }
6317
6318      }
6319   }
6320
6321   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
6322   else
6323   if (first_opcode == 0xDF) {
6324
6325      if (modrm < 0xC0) {
6326
6327         /* bits 5,4,3 are an opcode extension, and the modRM also
6328            specifies an address. */
6329         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6330         delta += len;
6331
6332         switch (gregLO3ofRM(modrm)) {
6333
6334            case 0: /* FILD m16int */
6335               DIP("fildw %s\n", dis_buf);
6336               fp_push();
6337               put_ST(0, unop(Iop_I32StoF64,
6338                              unop(Iop_16Sto32,
6339                                   loadLE(Ity_I16, mkexpr(addr)))));
6340               break;
6341
6342            case 1: /* FISTTPS m16 (SSE3) */
6343               DIP("fisttps %s\n", dis_buf);
6344               storeLE( mkexpr(addr),
6345                        x87ishly_qnarrow_32_to_16(
6346                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
6347               fp_pop();
6348               break;
6349
6350            case 2: /* FIST m16 */
6351               DIP("fists %s\n", dis_buf);
6352               storeLE( mkexpr(addr),
6353                        x87ishly_qnarrow_32_to_16(
6354                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
6355               break;
6356
6357            case 3: /* FISTP m16 */
6358               DIP("fistps %s\n", dis_buf);
6359               storeLE( mkexpr(addr),
6360                        x87ishly_qnarrow_32_to_16(
6361                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
6362               fp_pop();
6363               break;
6364
6365            case 5: /* FILD m64 */
6366               DIP("fildll %s\n", dis_buf);
6367               fp_push();
6368               put_ST(0, binop(Iop_I64StoF64,
6369                               get_roundingmode(),
6370                               loadLE(Ity_I64, mkexpr(addr))));
6371               break;
6372
6373            case 7: /* FISTP m64 */
6374               DIP("fistpll %s\n", dis_buf);
6375               storeLE( mkexpr(addr),
6376                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
6377               fp_pop();
6378               break;
6379
6380            default:
6381               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6382               vex_printf("first_opcode == 0xDF\n");
6383               goto decode_fail;
6384         }
6385
6386      } else {
6387
6388         delta++;
6389         switch (modrm) {
6390
6391            case 0xC0: /* FFREEP %st(0) */
6392               DIP("ffreep %%st(%d)\n", 0);
6393               put_ST_TAG ( 0, mkU8(0) );
6394               fp_pop();
6395               break;
6396
6397            case 0xE0: /* FNSTSW %ax */
6398               DIP("fnstsw %%ax\n");
6399               /* Invent a plausible-looking FPU status word value and
6400                  dump it in %AX:
6401                     ((ftop & 7) << 11) | (c3210 & 0x4700)
6402               */
6403               putIRegRAX(
6404                  2,
6405                  unop(Iop_32to16,
6406                       binop(Iop_Or32,
6407                             binop(Iop_Shl32,
6408                                   binop(Iop_And32, get_ftop(), mkU32(7)),
6409                                   mkU8(11)),
6410                             binop(Iop_And32,
6411                                   unop(Iop_64to32, get_C3210()),
6412                                   mkU32(0x4700))
6413               )));
6414               break;
6415
6416            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
6417               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
6418               break;
6419
6420            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
6421               /* not really right since COMIP != UCOMIP */
6422               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
6423               break;
6424
6425            default:
6426               goto decode_fail;
6427         }
6428      }
6429
6430   }
6431
6432   else
6433      goto decode_fail;
6434
6435   *decode_ok = True;
6436   return delta;
6437
6438  decode_fail:
6439   *decode_ok = False;
6440   return delta;
6441}
6442
6443
6444/*------------------------------------------------------------*/
6445/*---                                                      ---*/
6446/*--- MMX INSTRUCTIONS                                     ---*/
6447/*---                                                      ---*/
6448/*------------------------------------------------------------*/
6449
6450/* Effect of MMX insns on x87 FPU state (table 11-2 of
6451   IA32 arch manual, volume 3):
6452
6453   Read from, or write to MMX register (viz, any insn except EMMS):
6454   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
6455   * FP stack pointer set to zero
6456
6457   EMMS:
6458   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
6459   * FP stack pointer set to zero
6460*/
6461
6462static void do_MMX_preamble ( void )
6463{
6464   Int         i;
6465   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
6466   IRExpr*     zero  = mkU32(0);
6467   IRExpr*     tag1  = mkU8(1);
6468   put_ftop(zero);
6469   for (i = 0; i < 8; i++)
6470      stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
6471}
6472
6473static void do_EMMS_preamble ( void )
6474{
6475   Int         i;
6476   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
6477   IRExpr*     zero  = mkU32(0);
6478   IRExpr*     tag0  = mkU8(0);
6479   put_ftop(zero);
6480   for (i = 0; i < 8; i++)
6481      stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
6482}
6483
6484
6485static IRExpr* getMMXReg ( UInt archreg )
6486{
6487   vassert(archreg < 8);
6488   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
6489}
6490
6491
6492static void putMMXReg ( UInt archreg, IRExpr* e )
6493{
6494   vassert(archreg < 8);
6495   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
6496   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
6497}
6498
6499
/* Helper for non-shift MMX insns.  Note this is incomplete in the
   sense that it does not first call do_MMX_preamble() -- that is the
   responsibility of its caller.

   Decodes one "G = G op E" MMX instruction: G is the MMX register
   named by the greg field of the modRM byte; E is either another MMX
   register or a 64-bit memory operand.  Returns the updated
   instruction-stream offset (delta).  opc selects the operation:
   most cases map onto a single 64-bit SIMD IROp, but two (0xF5,
   0xF6) have no IROp and are instead routed through clean helper
   calls. */

static
ULong dis_MMXop_regmem_to_reg ( VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,
                                HChar*      name,
                                Bool        show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;
   IRExpr* argE  = NULL;
   IRTemp  res   = newTemp(Ity_I64);

   Bool    invG  = False;        /* complement G before use (0xDF) */
   IROp    op    = Iop_INVALID;  /* direct SIMD op, if one exists */
   void*   hAddr = NULL;         /* else: address of clean helper ... */
   HChar*  hName = NULL;         /* ... and its name, for the IR call */
   Bool    eLeft = False;        /* E goes in the left arg position */

   /* Record a clean-helper implementation for ops with no IROp. */
#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      /* The pack/unpack family take E as the LEFT operand. */
      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;  /* ~G & E */
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (Int)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   /* Fetch the G operand, complementing it first if required. */
   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   /* Fetch the E operand: register, or 64-bit load. */
   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   /* Place the operands per eLeft (see the switch above). */
   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   /* Compute the result: either a direct IR binop, or a call to the
      clean helper recorded via XXX. */
   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   /* Result always lands in the G register. */
   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}
6650
6651
6652/* Vector by scalar shift of G by the amount specified at the bottom
6653   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
6654
6655static ULong dis_MMX_shiftG_byE ( VexAbiInfo* vbi,
6656                                  Prefix pfx, Long delta,
6657                                  HChar* opname, IROp op )
6658{
6659   HChar   dis_buf[50];
6660   Int     alen, size;
6661   IRTemp  addr;
6662   Bool    shl, shr, sar;
6663   UChar   rm   = getUChar(delta);
6664   IRTemp  g0   = newTemp(Ity_I64);
6665   IRTemp  g1   = newTemp(Ity_I64);
6666   IRTemp  amt  = newTemp(Ity_I64);
6667   IRTemp  amt8 = newTemp(Ity_I8);
6668
6669   if (epartIsReg(rm)) {
6670      assign( amt, getMMXReg(eregLO3ofRM(rm)) );
6671      DIP("%s %s,%s\n", opname,
6672                        nameMMXReg(eregLO3ofRM(rm)),
6673                        nameMMXReg(gregLO3ofRM(rm)) );
6674      delta++;
6675   } else {
6676      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
6677      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
6678      DIP("%s %s,%s\n", opname,
6679                        dis_buf,
6680                        nameMMXReg(gregLO3ofRM(rm)) );
6681      delta += alen;
6682   }
6683   assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
6684   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
6685
6686   shl = shr = sar = False;
6687   size = 0;
6688   switch (op) {
6689      case Iop_ShlN16x4: shl = True; size = 32; break;
6690      case Iop_ShlN32x2: shl = True; size = 32; break;
6691      case Iop_Shl64:    shl = True; size = 64; break;
6692      case Iop_ShrN16x4: shr = True; size = 16; break;
6693      case Iop_ShrN32x2: shr = True; size = 32; break;
6694      case Iop_Shr64:    shr = True; size = 64; break;
6695      case Iop_SarN16x4: sar = True; size = 16; break;
6696      case Iop_SarN32x2: sar = True; size = 32; break;
6697      default: vassert(0);
6698   }
6699
6700   if (shl || shr) {
6701     assign(
6702        g1,
6703        IRExpr_Mux0X(
6704           unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
6705           mkU64(0),
6706           binop(op, mkexpr(g0), mkexpr(amt8))
6707        )
6708     );
6709   } else
6710   if (sar) {
6711     assign(
6712        g1,
6713        IRExpr_Mux0X(
6714           unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
6715           binop(op, mkexpr(g0), mkU8(size-1)),
6716           binop(op, mkexpr(g0), mkexpr(amt8))
6717        )
6718     );
6719   } else {
6720      vassert(0);
6721   }
6722
6723   putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
6724   return delta;
6725}
6726
6727
6728/* Vector by scalar shift of E by an immediate byte.  This is a
6729   straight copy of dis_SSE_shiftE_imm. */
6730
6731static
6732ULong dis_MMX_shiftE_imm ( Long delta, HChar* opname, IROp op )
6733{
6734   Bool    shl, shr, sar;
6735   UChar   rm   = getUChar(delta);
6736   IRTemp  e0   = newTemp(Ity_I64);
6737   IRTemp  e1   = newTemp(Ity_I64);
6738   UChar   amt, size;
6739   vassert(epartIsReg(rm));
6740   vassert(gregLO3ofRM(rm) == 2
6741           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
6742   amt = getUChar(delta+1);
6743   delta += 2;
6744   DIP("%s $%d,%s\n", opname,
6745                      (Int)amt,
6746                      nameMMXReg(eregLO3ofRM(rm)) );
6747
6748   assign( e0, getMMXReg(eregLO3ofRM(rm)) );
6749
6750   shl = shr = sar = False;
6751   size = 0;
6752   switch (op) {
6753      case Iop_ShlN16x4: shl = True; size = 16; break;
6754      case Iop_ShlN32x2: shl = True; size = 32; break;
6755      case Iop_Shl64:    shl = True; size = 64; break;
6756      case Iop_SarN16x4: sar = True; size = 16; break;
6757      case Iop_SarN32x2: sar = True; size = 32; break;
6758      case Iop_ShrN16x4: shr = True; size = 16; break;
6759      case Iop_ShrN32x2: shr = True; size = 32; break;
6760      case Iop_Shr64:    shr = True; size = 64; break;
6761      default: vassert(0);
6762   }
6763
6764   if (shl || shr) {
6765     assign( e1, amt >= size
6766                    ? mkU64(0)
6767                    : binop(op, mkexpr(e0), mkU8(amt))
6768     );
6769   } else
6770   if (sar) {
6771     assign( e1, amt >= size
6772                    ? binop(op, mkexpr(e0), mkU8(size-1))
6773                    : binop(op, mkexpr(e0), mkU8(amt))
6774     );
6775   } else {
6776      vassert(0);
6777   }
6778
6779   putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
6780   return delta;
6781}
6782
6783
6784/* Completely handle all MMX instructions except emms. */
6785
6786static
6787ULong dis_MMX ( Bool* decode_ok,
6788                VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
6789{
6790   Int   len;
6791   UChar modrm;
6792   HChar dis_buf[50];
6793   UChar opc = getUChar(delta);
6794   delta++;
6795
6796   /* dis_MMX handles all insns except emms. */
6797   do_MMX_preamble();
6798
6799   switch (opc) {
6800
6801      case 0x6E:
6802         if (sz == 4) {
6803            /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
6804            modrm = getUChar(delta);
6805            if (epartIsReg(modrm)) {
6806               delta++;
6807               putMMXReg(
6808                  gregLO3ofRM(modrm),
6809                  binop( Iop_32HLto64,
6810                         mkU32(0),
6811                         getIReg32(eregOfRexRM(pfx,modrm)) ) );
6812               DIP("movd %s, %s\n",
6813                   nameIReg32(eregOfRexRM(pfx,modrm)),
6814                   nameMMXReg(gregLO3ofRM(modrm)));
6815            } else {
6816               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6817               delta += len;
6818               putMMXReg(
6819                  gregLO3ofRM(modrm),
6820                  binop( Iop_32HLto64,
6821                         mkU32(0),
6822                         loadLE(Ity_I32, mkexpr(addr)) ) );
6823               DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
6824            }
6825         }
6826         else
6827         if (sz == 8) {
6828            /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
6829            modrm = getUChar(delta);
6830            if (epartIsReg(modrm)) {
6831               delta++;
6832               putMMXReg( gregLO3ofRM(modrm),
6833                          getIReg64(eregOfRexRM(pfx,modrm)) );
6834               DIP("movd %s, %s\n",
6835                   nameIReg64(eregOfRexRM(pfx,modrm)),
6836                   nameMMXReg(gregLO3ofRM(modrm)));
6837            } else {
6838               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6839               delta += len;
6840               putMMXReg( gregLO3ofRM(modrm),
6841                          loadLE(Ity_I64, mkexpr(addr)) );
6842               DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
6843            }
6844         }
6845         else {
6846            goto mmx_decode_failure;
6847         }
6848         break;
6849
6850      case 0x7E:
6851         if (sz == 4) {
6852            /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
6853            modrm = getUChar(delta);
6854            if (epartIsReg(modrm)) {
6855               delta++;
6856               putIReg32( eregOfRexRM(pfx,modrm),
6857                          unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
6858               DIP("movd %s, %s\n",
6859                   nameMMXReg(gregLO3ofRM(modrm)),
6860                   nameIReg32(eregOfRexRM(pfx,modrm)));
6861            } else {
6862               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6863               delta += len;
6864               storeLE( mkexpr(addr),
6865                        unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
6866               DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
6867            }
6868         }
6869         else
6870         if (sz == 8) {
6871            /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
6872            modrm = getUChar(delta);
6873            if (epartIsReg(modrm)) {
6874               delta++;
6875               putIReg64( eregOfRexRM(pfx,modrm),
6876                          getMMXReg(gregLO3ofRM(modrm)) );
6877               DIP("movd %s, %s\n",
6878                   nameMMXReg(gregLO3ofRM(modrm)),
6879                   nameIReg64(eregOfRexRM(pfx,modrm)));
6880            } else {
6881               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6882               delta += len;
6883               storeLE( mkexpr(addr),
6884                       getMMXReg(gregLO3ofRM(modrm)) );
6885               DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
6886            }
6887         } else {
6888            goto mmx_decode_failure;
6889         }
6890         break;
6891
6892      case 0x6F:
6893         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
6894         if (sz != 4
6895             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
6896            goto mmx_decode_failure;
6897         modrm = getUChar(delta);
6898         if (epartIsReg(modrm)) {
6899            delta++;
6900            putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
6901            DIP("movq %s, %s\n",
6902                nameMMXReg(eregLO3ofRM(modrm)),
6903                nameMMXReg(gregLO3ofRM(modrm)));
6904         } else {
6905            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6906            delta += len;
6907            putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
6908            DIP("movq %s, %s\n",
6909                dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
6910         }
6911         break;
6912
6913      case 0x7F:
6914         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
6915         if (sz != 4
6916             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
6917            goto mmx_decode_failure;
6918         modrm = getUChar(delta);
6919         if (epartIsReg(modrm)) {
6920            /* Fall through.  The assembler doesn't appear to generate
6921               these. */
6922            goto mmx_decode_failure;
6923         } else {
6924            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6925            delta += len;
6926            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
6927            DIP("mov(nt)q %s, %s\n",
6928                nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
6929         }
6930         break;
6931
6932      case 0xFC:
6933      case 0xFD:
6934      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
6935         if (sz != 4)
6936            goto mmx_decode_failure;
6937         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
6938         break;
6939
6940      case 0xEC:
6941      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
6942         if (sz != 4
6943             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
6944            goto mmx_decode_failure;
6945         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
6946         break;
6947
6948      case 0xDC:
6949      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
6950         if (sz != 4)
6951            goto mmx_decode_failure;
6952         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
6953         break;
6954
6955      case 0xF8:
6956      case 0xF9:
6957      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
6958         if (sz != 4)
6959            goto mmx_decode_failure;
6960         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
6961         break;
6962
6963      case 0xE8:
6964      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
6965         if (sz != 4)
6966            goto mmx_decode_failure;
6967         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
6968         break;
6969
6970      case 0xD8:
6971      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
6972         if (sz != 4)
6973            goto mmx_decode_failure;
6974         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
6975         break;
6976
6977      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
6978         if (sz != 4)
6979            goto mmx_decode_failure;
6980         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
6981         break;
6982
6983      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
6984         if (sz != 4)
6985            goto mmx_decode_failure;
6986         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
6987         break;
6988
6989      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
6990         vassert(sz == 4);
6991         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
6992         break;
6993
6994      case 0x74:
6995      case 0x75:
6996      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
6997         if (sz != 4)
6998            goto mmx_decode_failure;
6999         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
7000         break;
7001
7002      case 0x64:
7003      case 0x65:
7004      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
7005         if (sz != 4)
7006            goto mmx_decode_failure;
7007         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
7008         break;
7009
7010      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
7011         if (sz != 4)
7012            goto mmx_decode_failure;
7013         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
7014         break;
7015
7016      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
7017         if (sz != 4)
7018            goto mmx_decode_failure;
7019         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
7020         break;
7021
7022      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
7023         if (sz != 4)
7024            goto mmx_decode_failure;
7025         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
7026         break;
7027
7028      case 0x68:
7029      case 0x69:
7030      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
7031         if (sz != 4
7032             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7033            goto mmx_decode_failure;
7034         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
7035         break;
7036
7037      case 0x60:
7038      case 0x61:
7039      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
7040         if (sz != 4
7041             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7042            goto mmx_decode_failure;
7043         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
7044         break;
7045
7046      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
7047         if (sz != 4)
7048            goto mmx_decode_failure;
7049         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
7050         break;
7051
7052      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
7053         if (sz != 4)
7054            goto mmx_decode_failure;
7055         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
7056         break;
7057
7058      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
7059         if (sz != 4)
7060            goto mmx_decode_failure;
7061         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
7062         break;
7063
7064      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
7065         if (sz != 4)
7066            goto mmx_decode_failure;
7067         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
7068         break;
7069
7070#     define SHIFT_BY_REG(_name,_op)                                     \
7071                delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
7072                break;
7073
7074      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
7075      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
7076      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
7077      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
7078
7079      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
7080      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
7081      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
7082      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
7083
7084      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
7085      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
7086      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
7087
7088#     undef SHIFT_BY_REG
7089
7090      case 0x71:
7091      case 0x72:
7092      case 0x73: {
7093         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
7094         UChar byte2, subopc;
7095         if (sz != 4)
7096            goto mmx_decode_failure;
7097         byte2  = getUChar(delta);      /* amode / sub-opcode */
7098         subopc = toUChar( (byte2 >> 3) & 7 );
7099
7100#        define SHIFT_BY_IMM(_name,_op)                        \
7101            do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
7102            } while (0)
7103
7104              if (subopc == 2 /*SRL*/ && opc == 0x71)
7105                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
7106         else if (subopc == 2 /*SRL*/ && opc == 0x72)
7107                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
7108         else if (subopc == 2 /*SRL*/ && opc == 0x73)
7109                 SHIFT_BY_IMM("psrlq", Iop_Shr64);
7110
7111         else if (subopc == 4 /*SAR*/ && opc == 0x71)
7112                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
7113         else if (subopc == 4 /*SAR*/ && opc == 0x72)
7114                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);
7115
7116         else if (subopc == 6 /*SHL*/ && opc == 0x71)
7117                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
7118         else if (subopc == 6 /*SHL*/ && opc == 0x72)
7119                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
7120         else if (subopc == 6 /*SHL*/ && opc == 0x73)
7121                 SHIFT_BY_IMM("psllq", Iop_Shl64);
7122
7123         else goto mmx_decode_failure;
7124
7125#        undef SHIFT_BY_IMM
7126         break;
7127      }
7128
7129      case 0xF7: {
7130         IRTemp addr    = newTemp(Ity_I64);
7131         IRTemp regD    = newTemp(Ity_I64);
7132         IRTemp regM    = newTemp(Ity_I64);
7133         IRTemp mask    = newTemp(Ity_I64);
7134         IRTemp olddata = newTemp(Ity_I64);
7135         IRTemp newdata = newTemp(Ity_I64);
7136
7137         modrm = getUChar(delta);
7138         if (sz != 4 || (!epartIsReg(modrm)))
7139            goto mmx_decode_failure;
7140         delta++;
7141
7142         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
7143         assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
7144         assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
7145         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
7146         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
7147         assign( newdata,
7148                 binop(Iop_Or64,
7149                       binop(Iop_And64,
7150                             mkexpr(regD),
7151                             mkexpr(mask) ),
7152                       binop(Iop_And64,
7153                             mkexpr(olddata),
7154                             unop(Iop_Not64, mkexpr(mask)))) );
7155         storeLE( mkexpr(addr), mkexpr(newdata) );
7156         DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
7157                                 nameMMXReg( gregLO3ofRM(modrm) ) );
7158         break;
7159      }
7160
7161      /* --- MMX decode failure --- */
7162      default:
7163      mmx_decode_failure:
7164         *decode_ok = False;
7165         return delta; /* ignored */
7166
7167   }
7168
7169   *decode_ok = True;
7170   return delta;
7171}
7172
7173
7174/*------------------------------------------------------------*/
7175/*--- More misc arithmetic and other obscure insns.        ---*/
7176/*------------------------------------------------------------*/
7177
7178/* Generate base << amt with vacated places filled with stuff
7179   from xtra.  amt guaranteed in 0 .. 63. */
7180static
7181IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
7182{
7183   /* if   amt == 0
7184      then base
7185      else (base << amt) | (xtra >>u (64-amt))
7186   */
7187   return
7188      IRExpr_Mux0X(
7189         mkexpr(amt),
7190         mkexpr(base),
7191         binop(Iop_Or64,
7192               binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
7193               binop(Iop_Shr64, mkexpr(xtra),
7194                                binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
7195         )
7196      );
7197}
7198
7199/* Generate base >>u amt with vacated places filled with stuff
7200   from xtra.  amt guaranteed in 0 .. 63. */
7201static
7202IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
7203{
7204   /* if   amt == 0
7205      then base
7206      else (base >>u amt) | (xtra << (64-amt))
7207   */
7208   return
7209      IRExpr_Mux0X(
7210         mkexpr(amt),
7211         mkexpr(base),
7212         binop(Iop_Or64,
7213               binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
7214               binop(Iop_Shl64, mkexpr(xtra),
7215                                binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
7216         )
7217      );
7218}
7219
/* Double length left and right shifts.  Apparently only required in
   v-size (no b- variant). */
static
ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,
                        Bool amt_is_literal,
                        HChar* shift_amt_txt,
                        Bool left_shift )
{
   /* Disassemble SHLD (left_shift==True) or SHRD (left_shift==False)
      Gv,Ev.  shift_amt :: Ity_I8 is the amount to shift; shift_amt_txt
      is used for printing it.  amt_is_literal says the amount is an
      imm8 following the amode, in which case delta is stepped over it
      before returning.  delta on entry points at the modrm byte.
      Returns the updated delta. */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);       /* G operand: supplies shifted-in bits */
   IRTemp esrc   = newTemp(ty);       /* E operand: the value being shifted */
   IRTemp addr   = IRTemp_INVALID;    /* amode, if E is in memory */
   IRTemp tmpSH  = newTemp(Ity_I8);   /* masked shift amount */
   IRTemp tmpSS  = newTemp(Ity_I8);   /* masked (amount-1): "subshift", for flags */
   IRTemp tmp64  = IRTemp_INVALID;
   IRTemp res64  = IRTemp_INVALID;
   IRTemp rss64  = IRTemp_INVALID;
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   Int    mask   = sz==8 ? 63 : 31;   /* hw masks the shift count like this */

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS), the shifted value (res64) and the subshifted
      value (rss64).  The subshift (shift by one less than the real
      amount) is needed by the flags thunk so it can recover the last
      bit shifted out (the new carry). */

   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         /* Shift [esrc'gsrc] left, then pull the result down from the
            top half. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         /* Shift [gsrc'esrc] right; the result lands in the bottom
            half. */
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      /* 64-bit case: no room to widen, so use the helpers that fill
         vacated bit positions from the other operand. */
      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   /* Narrow result and subshift back to the operand size. */
   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   /* Step over the trailing imm8 shift amount, if any. */
   if (amt_is_literal) delta++;
   return delta;
}
7388
7389
/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   required. */

/* Which member of the BT family: plain bit test only (BtOpNone), or
   bit test combined with set (BTS), reset (BTR) or complement (BTC)
   of the selected bit. */
typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
7394
7395static HChar* nameBtOp ( BtOp op )
7396{
7397   switch (op) {
7398      case BtOpNone:  return "";
7399      case BtOpSet:   return "s";
7400      case BtOpReset: return "r";
7401      case BtOpComp:  return "c";
7402      default: vpanic("nameBtOp(amd64)");
7403   }
7404}
7405
7406
static
ULong dis_bt_G_E ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op )
{
   /* Disassemble BT/BTS/BTR/BTC Gv,Ev at the given operand size
      (2, 4 or 8).  The G register holds the bit number; E is the
      value (register or memory) being tested and possibly modified.
      The selected bit goes to the carry flag; O,S,Z,A,P are forced
      to zero.  delta on entry points at the modrm byte; returns the
      updated delta. */
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
     t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);    /* byte read from memory */
   t_new     = newTemp(Ity_I8);    /* updated byte (BTS/BTR/BTC only) */
   t_bitno0  = newTemp(Ity_I64);   /* raw (sign-widened) bit number */
   t_bitno1  = newTemp(Ity_I64);   /* bit number, masked if E is a reg */
   t_bitno2  = newTemp(Ity_I8);    /* bit offset within the byte */
   t_addr1   = newTemp(Ity_I64);   /* address of the byte holding the bit */
   modrm     = getUChar(delta);

   /* Sign-widen the bit number: in the memory form the offset is
      signed and may address bytes below the base address. */
   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP before at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
         fast-case painting code.  Ideally we should drop more than
         128, to reduce the chances of breaking buggy programs that
         have live data below -128(%RSP).  Memcheck fast-cases moves
         of 288 bytes due to the need to handle ppc64-linux quickly,
         so let's use 288.  Of course the real fix is to get rid of
         this kludge entirely.  */
      t_rsp = newTemp(Ity_I64);
      t_addr0 = newTemp(Ity_I64);

      vassert(vbi->guest_stack_redzone_size == 128);
      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
      putIReg64(R_RSP, mkexpr(t_rsp));

      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );

      /* Make t_addr0 point at it. */
      assign( t_addr0, mkexpr(t_rsp) );

      /* Mask out upper bits of the shift amount, since we're doing a
         reg. */
      assign( t_bitno1, binop(Iop_And64,
                              mkexpr(t_bitno0),
                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );

   } else {
      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      assign( t_bitno1, mkexpr(t_bitno0) );
   }

   /* At this point: t_addr0 is the address being operated on.  If it
      was a reg, we will have pushed it onto the client's stack.
      t_bitno1 is the bit number, suitably masked in the case of a
      reg.  */

   /* Now the main sequence. */
   /* Arithmetic shift, so a negative bit number moves the address
      downwards (memory form only; the reg form's bit number was
      masked to non-negative above). */
   assign( t_addr1,
           binop(Iop_Add64,
                 mkexpr(t_addr0),
                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );

   /* t_addr1 now holds effective address */

   assign( t_bitno2,
           unop(Iop_64to8,
                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );

   /* t_bitno2 contains offset of bit within byte */

   if (op != BtOpNone) {
      t_mask = newTemp(Ity_I8);
      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
   }

   /* t_mask is now a suitable byte mask */

   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );

   if (op != BtOpNone) {
      switch (op) {
         case BtOpSet:
            assign( t_new,
                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpComp:
            assign( t_new,
                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
            break;
         case BtOpReset:
            assign( t_new,
                    binop(Iop_And8, mkexpr(t_fetched),
                                    unop(Iop_Not8, mkexpr(t_mask))) );
            break;
         default:
            vpanic("dis_bt_G_E(amd64)");
      }
      /* With a LOCK prefix on the memory form, write back atomically
         via compare-and-swap; otherwise a plain store suffices. */
      if ((pfx & PFX_LOCK) && !epartIsReg(modrm)) {
         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
                                 mkexpr(t_new)/*new*/,
                                 guest_RIP_curr_instr );
      } else {
         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
      }
   }

   /* Side effect done; now get selected bit into Carry flag */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64,
                        unop(Iop_8Uto64, mkexpr(t_fetched)),
                        mkexpr(t_bitno2)),
                  mkU64(1)))
       );
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   /* Move reg operand from stack back to reg */
   if (epartIsReg(modrm)) {
      /* t_rsp still points at it. */
      /* only write the reg if actually modifying it; doing otherwise
         zeroes the top half erroneously when doing btl due to
         standard zero-extend rule */
      if (op != BtOpNone)
         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
   }

   DIP("bt%s%c %s, %s\n",
       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );

   return delta;
}
7561
7562
7563
7564/* Handle BSF/BSR.  Only v-size seems necessary. */
7565static
7566ULong dis_bs_E_G ( VexAbiInfo* vbi,
7567                   Prefix pfx, Int sz, Long delta, Bool fwds )
7568{
7569   Bool   isReg;
7570   UChar  modrm;
7571   HChar  dis_buf[50];
7572
7573   IRType ty    = szToITy(sz);
7574   IRTemp src   = newTemp(ty);
7575   IRTemp dst   = newTemp(ty);
7576   IRTemp src64 = newTemp(Ity_I64);
7577   IRTemp dst64 = newTemp(Ity_I64);
7578   IRTemp src8  = newTemp(Ity_I8);
7579
7580   vassert(sz == 8 || sz == 4 || sz == 2);
7581
7582   modrm = getUChar(delta);
7583   isReg = epartIsReg(modrm);
7584   if (isReg) {
7585      delta++;
7586      assign( src, getIRegE(sz, pfx, modrm) );
7587   } else {
7588      Int    len;
7589      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7590      delta += len;
7591      assign( src, loadLE(ty, mkexpr(addr)) );
7592   }
7593
7594   DIP("bs%c%c %s, %s\n",
7595       fwds ? 'f' : 'r', nameISize(sz),
7596       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
7597       nameIRegG(sz, pfx, modrm));
7598
7599   /* First, widen src to 64 bits if it is not already. */
7600   assign( src64, widenUto64(mkexpr(src)) );
7601
7602   /* Generate an 8-bit expression which is zero iff the
7603      original is zero, and nonzero otherwise */
7604   assign( src8,
7605           unop(Iop_1Uto8,
7606                binop(Iop_CmpNE64,
7607                      mkexpr(src64), mkU64(0))) );
7608
7609   /* Flags: Z is 1 iff source value is zero.  All others
7610      are undefined -- we force them to zero. */
7611   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
7612   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
7613   stmt( IRStmt_Put(
7614            OFFB_CC_DEP1,
7615            IRExpr_Mux0X( mkexpr(src8),
7616                          /* src==0 */
7617                          mkU64(AMD64G_CC_MASK_Z),
7618                          /* src!=0 */
7619                          mkU64(0)
7620                        )
7621       ));
7622   /* Set NDEP even though it isn't used.  This makes redundant-PUT
7623      elimination of previous stores to this field work better. */
7624   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
7625
7626   /* Result: iff source value is zero, we can't use
7627      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
7628      But anyway, amd64 semantics say the result is undefined in
7629      such situations.  Hence handle the zero case specially. */
7630
7631   /* Bleh.  What we compute:
7632
7633          bsf64:  if src == 0 then {dst is unchanged}
7634                              else Ctz64(src)
7635
7636          bsr64:  if src == 0 then {dst is unchanged}
7637                              else 63 - Clz64(src)
7638
7639          bsf32:  if src == 0 then {dst is unchanged}
7640                              else Ctz64(32Uto64(src))
7641
7642          bsr32:  if src == 0 then {dst is unchanged}
7643                              else 63 - Clz64(32Uto64(src))
7644
7645          bsf16:  if src == 0 then {dst is unchanged}
7646                              else Ctz64(32Uto64(16Uto32(src)))
7647
7648          bsr16:  if src == 0 then {dst is unchanged}
7649                              else 63 - Clz64(32Uto64(16Uto32(src)))
7650   */
7651
7652   /* The main computation, guarding against zero. */
7653   assign( dst64,
7654           IRExpr_Mux0X(
7655              mkexpr(src8),
7656              /* src == 0 -- leave dst unchanged */
7657              widenUto64( getIRegG( sz, pfx, modrm ) ),
7658              /* src != 0 */
7659              fwds ? unop(Iop_Ctz64, mkexpr(src64))
7660                   : binop(Iop_Sub64,
7661                           mkU64(63),
7662                           unop(Iop_Clz64, mkexpr(src64)))
7663           )
7664         );
7665
7666   if (sz == 2)
7667      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
7668   else
7669   if (sz == 4)
7670      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
7671   else
7672      assign( dst, mkexpr(dst64) );
7673
7674   /* dump result back */
7675   putIRegG( sz, pfx, modrm, mkexpr(dst) );
7676
7677   return delta;
7678}
7679
7680
7681/* swap rAX with the reg specified by reg and REX.B */
7682static
7683void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
7684{
7685   IRType ty = szToITy(sz);
7686   IRTemp t1 = newTemp(ty);
7687   IRTemp t2 = newTemp(ty);
7688   vassert(sz == 2 || sz == 4 || sz == 8);
7689   vassert(regLo3 < 8);
7690   if (sz == 8) {
7691      assign( t1, getIReg64(R_RAX) );
7692      assign( t2, getIRegRexB(8, pfx, regLo3) );
7693      putIReg64( R_RAX, mkexpr(t2) );
7694      putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
7695   } else if (sz == 4) {
7696      assign( t1, getIReg32(R_RAX) );
7697      assign( t2, getIRegRexB(4, pfx, regLo3) );
7698      putIReg32( R_RAX, mkexpr(t2) );
7699      putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
7700   } else {
7701      assign( t1, getIReg16(R_RAX) );
7702      assign( t2, getIRegRexB(2, pfx, regLo3) );
7703      putIReg16( R_RAX, mkexpr(t2) );
7704      putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
7705   }
7706   DIP("xchg%c %s, %s\n",
7707       nameISize(sz), nameIRegRAX(sz),
7708                      nameIRegRexB(sz,pfx, regLo3));
7709}
7710
7711
7712static
7713void codegen_SAHF ( void )
7714{
7715   /* Set the flags to:
7716      (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
7717                                    -- retain the old O flag
7718      | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
7719                |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
7720   */
7721   ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
7722                       |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
7723   IRTemp oldflags   = newTemp(Ity_I64);
7724   assign( oldflags, mk_amd64g_calculate_rflags_all() );
7725   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
7726   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
7727   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
7728   stmt( IRStmt_Put( OFFB_CC_DEP1,
7729         binop(Iop_Or64,
7730               binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
7731               binop(Iop_And64,
7732                     binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
7733                     mkU64(mask_SZACP))
7734              )
7735   ));
7736}
7737
7738
7739static
7740void codegen_LAHF ( void  )
7741{
7742   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
7743   IRExpr* rax_with_hole;
7744   IRExpr* new_byte;
7745   IRExpr* new_rax;
7746   ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
7747                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
7748
7749   IRTemp  flags = newTemp(Ity_I64);
7750   assign( flags, mk_amd64g_calculate_rflags_all() );
7751
7752   rax_with_hole
7753      = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
7754   new_byte
7755      = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
7756                        mkU64(1<<1));
7757   new_rax
7758      = binop(Iop_Or64, rax_with_hole,
7759                        binop(Iop_Shl64, new_byte, mkU8(8)));
7760   putIReg64(R_RAX, new_rax);
7761}
7762
7763
/* CMPXCHG E,G: compare rAX with E; if equal, E := G, else rAX := E.
   Flags are set as for CMP rAX,E in all cases.  Returns the updated
   delta; *ok is unconditionally set True (the vassert catches the
   impossible fourth case). */
static
ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
                        VexAbiInfo*  vbi,
                        Prefix       pfx,
                        Int          size,
                        Long         delta0 )
{
   HChar dis_buf[50];
   Int   len;

   IRType ty    = szToITy(size);
   IRTemp acc   = newTemp(ty);   /* rAX, the expected value */
   IRTemp src   = newTemp(ty);   /* G, the replacement value */
   IRTemp dest  = newTemp(ty);   /* E, the (old) destination value */
   IRTemp dest2 = newTemp(ty);   /* value written back to E */
   IRTemp acc2  = newTemp(ty);   /* value written back to rAX */
   IRTemp cond8 = newTemp(Ity_I8);  /* 1 iff acc == dest (ZF) */
   IRTemp addr  = IRTemp_INVALID;
   UChar  rm    = getUChar(delta0);

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix, generate sequence based
               on Mux0X

      reg-mem, not locked: ignore any lock prefix, generate sequence
                           based on Mux0X

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( dest, getIRegE(size, pfx, rm) );
      delta0++;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* Flags reflect CMP acc,dest; must happen before Z is read. */
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      /* On success E gets src; on failure E keeps dest. */
      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
      /* On success rAX is unchanged; on failure rAX gets dest. */
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      putIRegE(size, pfx, rm, mkexpr(dest2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm),
                               nameIRegE(size,pfx,rm) );
   }
   else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
      /* case 2 */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dest, loadLE(ty, mkexpr(addr)) );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      /* Non-atomic store: the unlocked form needs no CAS. */
      storeLE( mkexpr(addr), mkexpr(dest2) );
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
      /* case 3 */
      /* src is new value.  acc is expected value.  dest is old value.
         Compute success from the output of the IRCAS, and steer the
         new value for RAX accordingly: in case of success, RAX is
         unchanged. */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      delta0 += len;
      assign( src, getIRegG(size, pfx, rm) );
      assign( acc, getIRegRAX(size) );
      /* The CAS both reads the old value (into dest) and, if it
         equals acc, atomically writes src. */
      stmt( IRStmt_CAS(
         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
                  NULL, mkexpr(acc), NULL, mkexpr(src) )
      ));
      /* Flags computed from the value the CAS observed. */
      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
      putIRegRAX(size, mkexpr(acc2));
      DIP("cmpxchg%c %s,%s\n", nameISize(size),
                               nameIRegG(size,pfx,rm), dis_buf);
   }
   else vassert(0);

   *ok = True;
   return delta0;
}
7853
7854
7855/* Handle conditional move instructions of the form
7856      cmovcc E(reg-or-mem), G(reg)
7857
7858   E(src) is reg-or-mem
7859   G(dst) is reg.
7860
7861   If E is reg, -->    GET %E, tmps
7862                       GET %G, tmpd
7863                       CMOVcc tmps, tmpd
7864                       PUT tmpd, %G
7865
7866   If E is mem  -->    (getAddr E) -> tmpa
7867                       LD (tmpa), tmps
7868                       GET %G, tmpd
7869                       CMOVcc tmps, tmpd
7870                       PUT tmpd, %G
7871*/
7872static
7873ULong dis_cmov_E_G ( VexAbiInfo* vbi,
7874                     Prefix        pfx,
7875                     Int           sz,
7876                     AMD64Condcode cond,
7877                     Long          delta0 )
7878{
7879   UChar rm  = getUChar(delta0);
7880   HChar dis_buf[50];
7881   Int   len;
7882
7883   IRType ty   = szToITy(sz);
7884   IRTemp tmps = newTemp(ty);
7885   IRTemp tmpd = newTemp(ty);
7886
7887   if (epartIsReg(rm)) {
7888      assign( tmps, getIRegE(sz, pfx, rm) );
7889      assign( tmpd, getIRegG(sz, pfx, rm) );
7890
7891      putIRegG( sz, pfx, rm,
7892                IRExpr_Mux0X( unop(Iop_1Uto8,
7893                                   mk_amd64g_calculate_condition(cond)),
7894                              mkexpr(tmpd),
7895                              mkexpr(tmps) )
7896              );
7897      DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
7898                            nameIRegE(sz,pfx,rm),
7899                            nameIRegG(sz,pfx,rm));
7900      return 1+delta0;
7901   }
7902
7903   /* E refers to memory */
7904   {
7905      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
7906      assign( tmps, loadLE(ty, mkexpr(addr)) );
7907      assign( tmpd, getIRegG(sz, pfx, rm) );
7908
7909      putIRegG( sz, pfx, rm,
7910                IRExpr_Mux0X( unop(Iop_1Uto8,
7911                                   mk_amd64g_calculate_condition(cond)),
7912                              mkexpr(tmpd),
7913                              mkexpr(tmps) )
7914              );
7915
7916      DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
7917                            dis_buf,
7918                            nameIRegG(sz,pfx,rm));
7919      return len+delta0;
7920   }
7921}
7922
7923
/* XADD G,E: exchange-and-add.  E receives E+G, G receives the old E.
   Flags are set as for ADD.  *decode_ok is set True on all reachable
   paths; returns the updated delta. */
static
ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
                     VexAbiInfo* vbi,
                     Prefix pfx, Int sz, Long delta0 )
{
   Int   len;
   UChar rm = getUChar(delta0);
   HChar dis_buf[50];

   IRType ty    = szToITy(sz);
   IRTemp tmpd  = newTemp(ty);   /* old value of E (the destination) */
   IRTemp tmpt0 = newTemp(ty);   /* value of G */
   IRTemp tmpt1 = newTemp(ty);   /* sum: new value for E */

   /* There are 3 cases to consider:

      reg-reg: ignore any lock prefix,
               generate 'naive' (non-atomic) sequence

      reg-mem, not locked: ignore any lock prefix, generate 'naive'
                           (non-atomic) sequence

      reg-mem, locked: use IRCAS
   */

   if (epartIsReg(rm)) {
      /* case 1 */
      assign( tmpd, getIRegE(sz, pfx, rm) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      /* G must be written before E: when G and E are the same
         register, E's new value (the sum) must win. */
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm),
          				 nameIRegE(sz,pfx,rm));
      *decode_ok = True;
      return 1+delta0;
   }
   else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
      /* case 2 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      /* Plain (non-atomic) read-modify-write of memory. */
      storeLE( mkexpr(addr), mkexpr(tmpt1) );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
      /* case 3 */
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
      assign( tmpt0, getIRegG(sz, pfx, rm) );
      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
                           mkexpr(tmpd), mkexpr(tmpt0)) );
      /* casLE restarts the instruction if memory changed between the
         load above and the store here, giving atomicity. */
      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
      putIRegG(sz, pfx, rm, mkexpr(tmpd));
      DIP("xadd%c %s, %s\n",
          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
      *decode_ok = True;
      return len+delta0;
   }
   /*UNREACHED*/
   vassert(0);
}
7998
7999//.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
8000//..
8001//.. static
8002//.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
8003//.. {
8004//..    Int    len;
8005//..    IRTemp addr;
8006//..    UChar  rm  = getUChar(delta0);
8007//..    HChar  dis_buf[50];
8008//..
8009//..    if (epartIsReg(rm)) {
8010//..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
8011//..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
8012//..       return 1+delta0;
8013//..    } else {
8014//..       addr = disAMode ( &len, sorb, delta0, dis_buf );
8015//..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
8016//..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
8017//..       return len+delta0;
8018//..    }
8019//.. }
8020//..
8021//.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
8022//..    dst is ireg and sz==4, zero out top half of it.  */
8023//..
8024//.. static
8025//.. UInt dis_mov_Sw_Ew ( UChar sorb,
8026//..                      Int   sz,
8027//..                      UInt  delta0 )
8028//.. {
8029//..    Int    len;
8030//..    IRTemp addr;
8031//..    UChar  rm  = getUChar(delta0);
8032//..    HChar  dis_buf[50];
8033//..
8034//..    vassert(sz == 2 || sz == 4);
8035//..
8036//..    if (epartIsReg(rm)) {
8037//..       if (sz == 4)
8038//..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
8039//..       else
8040//..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
8041//..
8042//..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
8043//..       return 1+delta0;
8044//..    } else {
8045//..       addr = disAMode ( &len, sorb, delta0, dis_buf );
8046//..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
8047//..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
8048//..       return len+delta0;
8049//..    }
8050//.. }
8051//..
8052//..
8053//.. static
8054//.. void dis_push_segreg ( UInt sreg, Int sz )
8055//.. {
8056//..     IRTemp t1 = newTemp(Ity_I16);
8057//..     IRTemp ta = newTemp(Ity_I32);
8058//..     vassert(sz == 2 || sz == 4);
8059//..
8060//..     assign( t1, getSReg(sreg) );
8061//..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
8062//..     putIReg(4, R_ESP, mkexpr(ta));
8063//..     storeLE( mkexpr(ta), mkexpr(t1) );
8064//..
8065//..     DIP("pushw %s\n", nameSReg(sreg));
8066//.. }
8067//..
8068//.. static
8069//.. void dis_pop_segreg ( UInt sreg, Int sz )
8070//.. {
8071//..     IRTemp t1 = newTemp(Ity_I16);
8072//..     IRTemp ta = newTemp(Ity_I32);
8073//..     vassert(sz == 2 || sz == 4);
8074//..
8075//..     assign( ta, getIReg(4, R_ESP) );
8076//..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
8077//..
8078//..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
8079//..     putSReg( sreg, mkexpr(t1) );
8080//..     DIP("pop %s\n", nameSReg(sreg));
8081//.. }
8082
8083static
8084void dis_ret ( VexAbiInfo* vbi, ULong d64 )
8085{
8086   IRTemp t1 = newTemp(Ity_I64);
8087   IRTemp t2 = newTemp(Ity_I64);
8088   IRTemp t3 = newTemp(Ity_I64);
8089   assign(t1, getIReg64(R_RSP));
8090   assign(t2, loadLE(Ity_I64,mkexpr(t1)));
8091   assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
8092   putIReg64(R_RSP, mkexpr(t3));
8093   make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
8094   jmp_treg(Ijk_Ret,t2);
8095}
8096
8097
8098/*------------------------------------------------------------*/
8099/*--- SSE/SSE2/SSE3 helpers                                ---*/
8100/*------------------------------------------------------------*/
8101
8102/* Worker function; do not call directly.
8103   Handles full width G = G `op` E   and   G = (not G) `op` E.
8104*/
8105
8106static ULong dis_SSE_E_to_G_all_wrk (
8107                VexAbiInfo* vbi,
8108                Prefix pfx, Long delta,
8109                HChar* opname, IROp op,
8110                Bool   invertG
8111             )
8112{
8113   HChar   dis_buf[50];
8114   Int     alen;
8115   IRTemp  addr;
8116   UChar   rm = getUChar(delta);
8117   IRExpr* gpart
8118      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
8119                : getXMMReg(gregOfRexRM(pfx,rm));
8120   if (epartIsReg(rm)) {
8121      putXMMReg( gregOfRexRM(pfx,rm),
8122                 binop(op, gpart,
8123                           getXMMReg(eregOfRexRM(pfx,rm))) );
8124      DIP("%s %s,%s\n", opname,
8125                        nameXMMReg(eregOfRexRM(pfx,rm)),
8126                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8127      return delta+1;
8128   } else {
8129      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8130      putXMMReg( gregOfRexRM(pfx,rm),
8131                 binop(op, gpart,
8132                           loadLE(Ity_V128, mkexpr(addr))) );
8133      DIP("%s %s,%s\n", opname,
8134                        dis_buf,
8135                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8136      return delta+alen;
8137   }
8138}
8139
8140
8141/* All lanes SSE binary operation, G = G `op` E. */
8142
8143static
8144ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
8145                           Prefix pfx, Long delta,
8146                           HChar* opname, IROp op )
8147{
8148   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
8149}
8150
8151/* All lanes SSE binary operation, G = (not G) `op` E. */
8152
8153static
8154ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
8155                                Prefix pfx, Long delta,
8156                                HChar* opname, IROp op )
8157{
8158   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
8159}
8160
8161
/* Lowest 32-bit lane only SSE binary operation, G = G `op` E.
   'op' itself is a full-width V128 op which only modifies lane 0
   (e.g. Iop_Add32F0x4); the remaining lanes of G pass through. */

static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
                                   Prefix pfx, Long delta,
                                   HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart,
                           getXMMReg(eregOfRexRM(pfx,rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* We can only do a 32-bit memory read, so the upper 3/4 of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( epart, unop( Iop_32UtoV128,
                           loadLE(Ity_I32, mkexpr(addr))) );
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart, mkexpr(epart)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
8196
8197
/* Lower 64-bit lane only SSE binary operation, G = G `op` E.
   'op' is a full-width V128 op which only modifies lane 0
   (e.g. Iop_Add64F0x2); the upper lane of G passes through. */

static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
                                   Prefix pfx, Long delta,
                                   HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
   if (epartIsReg(rm)) {
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart,
                           getXMMReg(eregOfRexRM(pfx,rm))) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* We can only do a 64-bit memory read, so the upper half of the
         E operand needs to be made simply of zeroes. */
      IRTemp epart = newTemp(Ity_V128);
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( epart, unop( Iop_64UtoV128,
                           loadLE(Ity_I64, mkexpr(addr))) );
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(op, gpart, mkexpr(epart)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
8232
8233
8234/* All lanes unary SSE operation, G = op(E). */
8235
8236static ULong dis_SSE_E_to_G_unary_all (
8237                VexAbiInfo* vbi,
8238                Prefix pfx, Long delta,
8239                HChar* opname, IROp op
8240             )
8241{
8242   HChar   dis_buf[50];
8243   Int     alen;
8244   IRTemp  addr;
8245   UChar   rm = getUChar(delta);
8246   if (epartIsReg(rm)) {
8247      putXMMReg( gregOfRexRM(pfx,rm),
8248                 unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
8249      DIP("%s %s,%s\n", opname,
8250                        nameXMMReg(eregOfRexRM(pfx,rm)),
8251                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8252      return delta+1;
8253   } else {
8254      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8255      putXMMReg( gregOfRexRM(pfx,rm),
8256                 unop(op, loadLE(Ity_V128, mkexpr(addr))) );
8257      DIP("%s %s,%s\n", opname,
8258                        dis_buf,
8259                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8260      return delta+alen;
8261   }
8262}
8263
8264
/* Lowest 32-bit lane only unary SSE operation, G = op(E).
   The op is applied to G with its low lane replaced by E's low lane,
   so G's upper lanes pass through unchanged. */

static ULong dis_SSE_E_to_G_unary_lo32 (
                VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                HChar* opname, IROp op
             )
{
   /* First we need to get the old G value and patch the low 32 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* G as it currently stands */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with lane 0 from E */

   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );

   if (epartIsReg(rm)) {
      assign( oldG1,
              binop( Iop_SetV128lo32,
                     mkexpr(oldG0),
                     getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* Memory form: only a 32-bit load is performed. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( oldG1,
              binop( Iop_SetV128lo32,
                     mkexpr(oldG0),
                     loadLE(Ity_I32, mkexpr(addr)) ));
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
8307
8308
/* Lowest 64-bit lane only unary SSE operation, G = op(E).
   The op is applied to G with its low lane replaced by E's low lane,
   so G's upper lane passes through unchanged. */

static ULong dis_SSE_E_to_G_unary_lo64 (
                VexAbiInfo* vbi,
                Prefix pfx, Long delta,
                HChar* opname, IROp op
             )
{
   /* First we need to get the old G value and patch the low 64 bits
      of the E operand into it.  Then apply op and write back to G. */
   HChar   dis_buf[50];
   Int     alen;
   IRTemp  addr;
   UChar   rm = getUChar(delta);
   IRTemp  oldG0 = newTemp(Ity_V128);   /* G as it currently stands */
   IRTemp  oldG1 = newTemp(Ity_V128);   /* G with lane 0 from E */

   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );

   if (epartIsReg(rm)) {
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+1;
   } else {
      /* Memory form: only a 64-bit load is performed. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( oldG1,
              binop( Iop_SetV128lo64,
                     mkexpr(oldG0),
                     loadLE(Ity_I64, mkexpr(addr)) ));
      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      return delta+alen;
   }
}
8351
8352
8353/* SSE integer binary operation:
8354      G = G `op` E   (eLeft == False)
8355      G = E `op` G   (eLeft == True)
8356*/
8357static ULong dis_SSEint_E_to_G(
8358                VexAbiInfo* vbi,
8359                Prefix pfx, Long delta,
8360                HChar* opname, IROp op,
8361                Bool   eLeft
8362             )
8363{
8364   HChar   dis_buf[50];
8365   Int     alen;
8366   IRTemp  addr;
8367   UChar   rm = getUChar(delta);
8368   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
8369   IRExpr* epart = NULL;
8370   if (epartIsReg(rm)) {
8371      epart = getXMMReg(eregOfRexRM(pfx,rm));
8372      DIP("%s %s,%s\n", opname,
8373                        nameXMMReg(eregOfRexRM(pfx,rm)),
8374                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8375      delta += 1;
8376   } else {
8377      addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8378      epart = loadLE(Ity_V128, mkexpr(addr));
8379      DIP("%s %s,%s\n", opname,
8380                        dis_buf,
8381                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8382      delta += alen;
8383   }
8384   putXMMReg( gregOfRexRM(pfx,rm),
8385              eLeft ? binop(op, epart, gpart)
8386	            : binop(op, gpart, epart) );
8387   return delta;
8388}
8389
8390
8391/* Helper for doing SSE FP comparisons. */
8392
8393static void findSSECmpOp ( Bool* needNot, IROp* op,
8394                           Int imm8, Bool all_lanes, Int sz )
8395{
8396   imm8 &= 7;
8397   *needNot = False;
8398   *op      = Iop_INVALID;
8399   if (imm8 >= 4) {
8400      *needNot = True;
8401      imm8 -= 4;
8402   }
8403
8404   if (sz == 4 && all_lanes) {
8405      switch (imm8) {
8406         case 0: *op = Iop_CmpEQ32Fx4; return;
8407         case 1: *op = Iop_CmpLT32Fx4; return;
8408         case 2: *op = Iop_CmpLE32Fx4; return;
8409         case 3: *op = Iop_CmpUN32Fx4; return;
8410         default: break;
8411      }
8412   }
8413   if (sz == 4 && !all_lanes) {
8414      switch (imm8) {
8415         case 0: *op = Iop_CmpEQ32F0x4; return;
8416         case 1: *op = Iop_CmpLT32F0x4; return;
8417         case 2: *op = Iop_CmpLE32F0x4; return;
8418         case 3: *op = Iop_CmpUN32F0x4; return;
8419         default: break;
8420      }
8421   }
8422   if (sz == 8 && all_lanes) {
8423      switch (imm8) {
8424         case 0: *op = Iop_CmpEQ64Fx2; return;
8425         case 1: *op = Iop_CmpLT64Fx2; return;
8426         case 2: *op = Iop_CmpLE64Fx2; return;
8427         case 3: *op = Iop_CmpUN64Fx2; return;
8428         default: break;
8429      }
8430   }
8431   if (sz == 8 && !all_lanes) {
8432      switch (imm8) {
8433         case 0: *op = Iop_CmpEQ64F0x2; return;
8434         case 1: *op = Iop_CmpLT64F0x2; return;
8435         case 2: *op = Iop_CmpLE64F0x2; return;
8436         case 3: *op = Iop_CmpUN64F0x2; return;
8437         default: break;
8438      }
8439   }
8440   vpanic("findSSECmpOp(amd64,guest)");
8441}
8442
/* Handles SSE 32F/64F comparisons (CMPPS/CMPSS/CMPPD/CMPSD).
   The imm8 predicate follows the ModRM byte (and any displacement).
   Predicates >= 4 require the comparison result to be inverted;
   for the scalar (!all_lanes) forms only the low lane is inverted. */

static ULong dis_SSEcmp_E_to_G ( VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 HChar* opname, Bool all_lanes, Int sz )
{
   HChar   dis_buf[50];
   Int     alen, imm8;
   IRTemp  addr;
   Bool    needNot = False;
   IROp    op      = Iop_INVALID;
   IRTemp  plain   = newTemp(Ity_V128);  /* un-negated comparison result */
   UChar   rm      = getUChar(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
                               getXMMReg(eregOfRexRM(pfx,rm))) );
      delta += 2;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            nameXMMReg(eregOfRexRM(pfx,rm)),
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   } else {
      /* The trailing '1' tells disAMode an extra (imm8) byte follows
         the address mode, so RIP-relative addressing is computed
         correctly. */
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
      /* For the scalar forms, widen the 32/64-bit load to V128 with
         zeroes; only the low lane of the result is meaningful. */
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRexRM(pfx,rm)),
                   all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
                 : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                 : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
	      )
      );
      delta += alen+1;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            dis_buf,
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   }

   if (needNot && all_lanes) {
      putXMMReg( gregOfRexRM(pfx,rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (needNot && !all_lanes) {
      /* Invert only the low lane: the mkV128 mask expands each set
         nibble-bit to 8 result bits, i.e. 0x000F -> low 32 bits,
         0x00FF -> low 64 bits. */
      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   }

   return delta;
}
8504
8505
8506/* Vector by scalar shift of G by the amount specified at the bottom
8507   of E. */
8508
8509static ULong dis_SSE_shiftG_byE ( VexAbiInfo* vbi,
8510                                  Prefix pfx, Long delta,
8511                                  HChar* opname, IROp op )
8512{
8513   HChar   dis_buf[50];
8514   Int     alen, size;
8515   IRTemp  addr;
8516   Bool    shl, shr, sar;
8517   UChar   rm   = getUChar(delta);
8518   IRTemp  g0   = newTemp(Ity_V128);
8519   IRTemp  g1   = newTemp(Ity_V128);
8520   IRTemp  amt  = newTemp(Ity_I32);
8521   IRTemp  amt8 = newTemp(Ity_I8);
8522   if (epartIsReg(rm)) {
8523      assign( amt, getXMMRegLane32(eregOfRexRM(pfx,rm), 0) );
8524      DIP("%s %s,%s\n", opname,
8525                        nameXMMReg(eregOfRexRM(pfx,rm)),
8526                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8527      delta++;
8528   } else {
8529      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8530      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
8531      DIP("%s %s,%s\n", opname,
8532                        dis_buf,
8533                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8534      delta += alen;
8535   }
8536   assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
8537   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
8538
8539   shl = shr = sar = False;
8540   size = 0;
8541   switch (op) {
8542      case Iop_ShlN16x8: shl = True; size = 32; break;
8543      case Iop_ShlN32x4: shl = True; size = 32; break;
8544      case Iop_ShlN64x2: shl = True; size = 64; break;
8545      case Iop_SarN16x8: sar = True; size = 16; break;
8546      case Iop_SarN32x4: sar = True; size = 32; break;
8547      case Iop_ShrN16x8: shr = True; size = 16; break;
8548      case Iop_ShrN32x4: shr = True; size = 32; break;
8549      case Iop_ShrN64x2: shr = True; size = 64; break;
8550      default: vassert(0);
8551   }
8552
8553   if (shl || shr) {
8554     assign(
8555        g1,
8556        IRExpr_Mux0X(
8557           unop(Iop_1Uto8,
8558                binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))),
8559           mkV128(0x0000),
8560           binop(op, mkexpr(g0), mkexpr(amt8))
8561        )
8562     );
8563   } else
8564   if (sar) {
8565     assign(
8566        g1,
8567        IRExpr_Mux0X(
8568           unop(Iop_1Uto8,
8569                binop(Iop_CmpLT64U, unop(Iop_32Uto64,mkexpr(amt)), mkU64(size))),
8570           binop(op, mkexpr(g0), mkU8(size-1)),
8571           binop(op, mkexpr(g0), mkexpr(amt8))
8572        )
8573     );
8574   } else {
8575      vassert(0);
8576   }
8577
8578   putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
8579   return delta;
8580}
8581
8582
/* Vector by scalar shift of E by an immediate byte (the Grp14-style
   PSLLW/D/Q, PSRLW/D/Q, PSRAW/D $imm,%xmm forms).  Since the count
   is a compile-time constant, the >=lane-width clamping is resolved
   statically: logical shifts by >= 'size' give zero, arithmetic
   shifts are clamped to size-1. */

static
ULong dis_SSE_shiftE_imm ( Prefix pfx,
                           Long delta, HChar* opname, IROp op )
{
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  e0   = newTemp(Ity_V128);   /* E before shifting */
   IRTemp  e1   = newTemp(Ity_V128);   /* E after shifting */
   UChar   amt, size;
   /* These encodings only exist in register form, with the /reg
      field (2, 4 or 6) selecting shr/sar/shl respectively. */
   vassert(epartIsReg(rm));
   vassert(gregLO3ofRM(rm) == 2
           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   amt = getUChar(delta+1);
   delta += 2;
   DIP("%s $%d,%s\n", opname,
                      (Int)amt,
                      nameXMMReg(eregOfRexRM(pfx,rm)) );
   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );

   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
      case Iop_ShlN32x4: shl = True; size = 32; break;
      case Iop_ShlN64x2: shl = True; size = 64; break;
      case Iop_SarN16x8: sar = True; size = 16; break;
      case Iop_SarN32x4: sar = True; size = 32; break;
      case Iop_ShrN16x8: shr = True; size = 16; break;
      case Iop_ShrN32x4: shr = True; size = 32; break;
      case Iop_ShrN64x2: shr = True; size = 64; break;
      default: vassert(0);
   }

   if (shl || shr) {
     /* Logical: count >= lane size yields all zeroes. */
     assign( e1, amt >= size
                    ? mkV128(0x0000)
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else
   if (sar) {
     /* Arithmetic: count >= lane size behaves as count == size-1. */
     assign( e1, amt >= size
                    ? binop(op, mkexpr(e0), mkU8(size-1))
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else {
      vassert(0);
   }

   putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   return delta;
}
8636
8637
8638/* Get the current SSE rounding mode. */
8639
8640static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
8641{
8642   return
8643      unop( Iop_64to32,
8644            binop( Iop_And64,
8645                   IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
8646                   mkU64(3) ));
8647}
8648
8649static void put_sse_roundingmode ( IRExpr* sseround )
8650{
8651   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
8652   stmt( IRStmt_Put( OFFB_SSEROUND,
8653                     unop(Iop_32Uto64,sseround) ) );
8654}
8655
8656/* Break a 128-bit value up into four 32-bit ints. */
8657
8658static void breakup128to32s ( IRTemp t128,
8659                              /*OUTs*/
8660                              IRTemp* t3, IRTemp* t2,
8661                              IRTemp* t1, IRTemp* t0 )
8662{
8663   IRTemp hi64 = newTemp(Ity_I64);
8664   IRTemp lo64 = newTemp(Ity_I64);
8665   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
8666   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
8667
8668   vassert(t0 && *t0 == IRTemp_INVALID);
8669   vassert(t1 && *t1 == IRTemp_INVALID);
8670   vassert(t2 && *t2 == IRTemp_INVALID);
8671   vassert(t3 && *t3 == IRTemp_INVALID);
8672
8673   *t0 = newTemp(Ity_I32);
8674   *t1 = newTemp(Ity_I32);
8675   *t2 = newTemp(Ity_I32);
8676   *t3 = newTemp(Ity_I32);
8677   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
8678   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
8679   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
8680   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
8681}
8682
8683/* Construct a 128-bit value from four 32-bit ints. */
8684
8685static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
8686                              IRTemp t1, IRTemp t0 )
8687{
8688   return
8689      binop( Iop_64HLtoV128,
8690             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
8691             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
8692   );
8693}
8694
8695/* Break a 64-bit value up into four 16-bit ints. */
8696
8697static void breakup64to16s ( IRTemp t64,
8698                             /*OUTs*/
8699                             IRTemp* t3, IRTemp* t2,
8700                             IRTemp* t1, IRTemp* t0 )
8701{
8702   IRTemp hi32 = newTemp(Ity_I32);
8703   IRTemp lo32 = newTemp(Ity_I32);
8704   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
8705   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
8706
8707   vassert(t0 && *t0 == IRTemp_INVALID);
8708   vassert(t1 && *t1 == IRTemp_INVALID);
8709   vassert(t2 && *t2 == IRTemp_INVALID);
8710   vassert(t3 && *t3 == IRTemp_INVALID);
8711
8712   *t0 = newTemp(Ity_I16);
8713   *t1 = newTemp(Ity_I16);
8714   *t2 = newTemp(Ity_I16);
8715   *t3 = newTemp(Ity_I16);
8716   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
8717   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
8718   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
8719   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
8720}
8721
8722/* Construct a 64-bit value from four 16-bit ints. */
8723
8724static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
8725                             IRTemp t1, IRTemp t0 )
8726{
8727   return
8728      binop( Iop_32HLto64,
8729             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
8730             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
8731   );
8732}
8733
8734
8735/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
8736   values (aa,bb), computes, for each of the 4 16-bit lanes:
8737
8738   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
8739*/
8740static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
8741{
8742   IRTemp aa      = newTemp(Ity_I64);
8743   IRTemp bb      = newTemp(Ity_I64);
8744   IRTemp aahi32s = newTemp(Ity_I64);
8745   IRTemp aalo32s = newTemp(Ity_I64);
8746   IRTemp bbhi32s = newTemp(Ity_I64);
8747   IRTemp bblo32s = newTemp(Ity_I64);
8748   IRTemp rHi     = newTemp(Ity_I64);
8749   IRTemp rLo     = newTemp(Ity_I64);
8750   IRTemp one32x2 = newTemp(Ity_I64);
8751   assign(aa, aax);
8752   assign(bb, bbx);
8753   assign( aahi32s,
8754           binop(Iop_SarN32x2,
8755                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
8756                 mkU8(16) ));
8757   assign( aalo32s,
8758           binop(Iop_SarN32x2,
8759                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
8760                 mkU8(16) ));
8761   assign( bbhi32s,
8762           binop(Iop_SarN32x2,
8763                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
8764                 mkU8(16) ));
8765   assign( bblo32s,
8766           binop(Iop_SarN32x2,
8767                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
8768                 mkU8(16) ));
8769   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
8770   assign(
8771      rHi,
8772      binop(
8773         Iop_ShrN32x2,
8774         binop(
8775            Iop_Add32x2,
8776            binop(
8777               Iop_ShrN32x2,
8778               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
8779               mkU8(14)
8780            ),
8781            mkexpr(one32x2)
8782         ),
8783         mkU8(1)
8784      )
8785   );
8786   assign(
8787      rLo,
8788      binop(
8789         Iop_ShrN32x2,
8790         binop(
8791            Iop_Add32x2,
8792            binop(
8793               Iop_ShrN32x2,
8794               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
8795               mkU8(14)
8796            ),
8797            mkexpr(one32x2)
8798         ),
8799         mkU8(1)
8800      )
8801   );
8802   return
8803      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
8804}
8805
8806/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
8807   values (aa,bb), computes, for each lane:
8808
8809          if aa_lane < 0 then - bb_lane
8810     else if aa_lane > 0 then bb_lane
8811     else 0
8812*/
8813static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
8814{
8815   IRTemp aa       = newTemp(Ity_I64);
8816   IRTemp bb       = newTemp(Ity_I64);
8817   IRTemp zero     = newTemp(Ity_I64);
8818   IRTemp bbNeg    = newTemp(Ity_I64);
8819   IRTemp negMask  = newTemp(Ity_I64);
8820   IRTemp posMask  = newTemp(Ity_I64);
8821   IROp   opSub    = Iop_INVALID;
8822   IROp   opCmpGTS = Iop_INVALID;
8823
8824   switch (laneszB) {
8825      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
8826      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
8827      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
8828      default: vassert(0);
8829   }
8830
8831   assign( aa,      aax );
8832   assign( bb,      bbx );
8833   assign( zero,    mkU64(0) );
8834   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
8835   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
8836   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
8837
8838   return
8839      binop(Iop_Or64,
8840            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
8841            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
8842
8843}
8844
8845/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
8846   value aa, computes, for each lane
8847
8848   if aa < 0 then -aa else aa
8849
8850   Note that the result is interpreted as unsigned, so that the
8851   absolute value of the most negative signed input can be
8852   represented.
8853*/
8854static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
8855{
8856   IRTemp aa      = newTemp(Ity_I64);
8857   IRTemp zero    = newTemp(Ity_I64);
8858   IRTemp aaNeg   = newTemp(Ity_I64);
8859   IRTemp negMask = newTemp(Ity_I64);
8860   IRTemp posMask = newTemp(Ity_I64);
8861   IROp   opSub   = Iop_INVALID;
8862   IROp   opSarN  = Iop_INVALID;
8863
8864   switch (laneszB) {
8865      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
8866      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
8867      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
8868      default: vassert(0);
8869   }
8870
8871   assign( aa,      aax );
8872   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
8873   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
8874   assign( zero,    mkU64(0) );
8875   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
8876   return
8877      binop(Iop_Or64,
8878            binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
8879            binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
8880}
8881
8882static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
8883                                        IRTemp lo64, Long byteShift )
8884{
8885   vassert(byteShift >= 1 && byteShift <= 7);
8886   return
8887      binop(Iop_Or64,
8888            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
8889            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
8890      );
8891}
8892
8893/* Generate a SIGSEGV followed by a restart of the current instruction
8894   if effective_addr is not 16-aligned.  This is required behaviour
8895   for some SSE3 instructions and all 128-bit SSSE3 instructions.
8896   This assumes that guest_RIP_curr_instr is set correctly! */
8897/* TODO(glider): we've replaced the 0xF mask with 0x0, effectively disabling
8898 * the check. Need to enable it once TSan stops generating unaligned
8899 * accesses in the wrappers.
8900 * See http://code.google.com/p/data-race-test/issues/detail?id=49 */
static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
{
   /* NOTE: because the mask below is 0x0 (not 0xF, see the TODO
      above), (addr & 0x0) is always 0, the CmpNE64 can never be true,
      and this exit never fires -- the alignment check is deliberately
      a no-op in this tree. */
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64,
               binop(Iop_And64,mkexpr(effective_addr),mkU64(0x0)),
               mkU64(0)),
         Ijk_SigSEGV,
         /* restart at the current insn, not the next one */
         IRConst_U64(guest_RIP_curr_instr)
      )
   );
}
8913
8914
8915/* Helper for deciding whether a given insn (starting at the opcode
8916   byte) may validly be used with a LOCK prefix.  The following insns
8917   may be used with LOCK when their destination operand is in memory.
8918   AFAICS this is exactly the same for both 32-bit and 64-bit mode.
8919
8920   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
8921   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
8922   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
8924   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
8925   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
8926   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
8927
8928   DEC        FE /1,  FF /1
8929   INC        FE /0,  FF /0
8930
8931   NEG        F6 /3,  F7 /3
8932   NOT        F6 /2,  F7 /2
8933
8934   XCHG       86, 87
8935
8936   BTC        0F BB,  0F BA /7
8937   BTR        0F B3,  0F BA /6
8938   BTS        0F AB,  0F BA /5
8939
8940   CMPXCHG    0F B0,  0F B1
8941   CMPXCHG8B  0F C7 /1
8942
8943   XADD       0F C0,  0F C1
8944
8945   ------------------------------
8946
8947   80 /0  =  addb $imm8,  rm8
8948   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
8949   82 /0  =  addb $imm8,  rm8
8950   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
8951
8952   00     =  addb r8,  rm8
8953   01     =  addl r32, rm32  and  addw r16, rm16
8954
   Same scheme for OR ADC SBB AND SUB XOR
8956
8957   FE /1  = dec rm8
8958   FF /1  = dec rm32  and  dec rm16
8959
8960   FE /0  = inc rm8
8961   FF /0  = inc rm32  and  inc rm16
8962
8963   F6 /3  = neg rm8
8964   F7 /3  = neg rm32  and  neg rm16
8965
8966   F6 /2  = not rm8
8967   F7 /2  = not rm32  and  not rm16
8968
8969   0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
8971
8972   Same for BTS, BTR
8973*/
8974static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
8975{
8976   switch (opc[0]) {
8977      case 0x00: case 0x01: case 0x08: case 0x09:
8978      case 0x10: case 0x11: case 0x18: case 0x19:
8979      case 0x20: case 0x21: case 0x28: case 0x29:
8980      case 0x30: case 0x31:
8981         if (!epartIsReg(opc[1]))
8982            return True;
8983         break;
8984
8985      case 0x80: case 0x81: case 0x82: case 0x83:
8986         if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
8987             && !epartIsReg(opc[1]))
8988            return True;
8989         break;
8990
8991      case 0xFE: case 0xFF:
8992         if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
8993             && !epartIsReg(opc[1]))
8994            return True;
8995         break;
8996
8997      case 0xF6: case 0xF7:
8998         if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
8999             && !epartIsReg(opc[1]))
9000            return True;
9001         break;
9002
9003      case 0x86: case 0x87:
9004         if (!epartIsReg(opc[1]))
9005            return True;
9006         break;
9007
9008      case 0x0F: {
9009         switch (opc[1]) {
9010            case 0xBB: case 0xB3: case 0xAB:
9011               if (!epartIsReg(opc[2]))
9012                  return True;
9013               break;
9014            case 0xBA:
9015               if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
9016                   && !epartIsReg(opc[2]))
9017                  return True;
9018               break;
9019            case 0xB0: case 0xB1:
9020               if (!epartIsReg(opc[2]))
9021                  return True;
9022               break;
9023            case 0xC7:
9024               if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
9025                  return True;
9026               break;
9027            case 0xC0: case 0xC1:
9028               if (!epartIsReg(opc[2]))
9029                  return True;
9030               break;
9031            default:
9032               break;
9033         } /* switch (opc[1]) */
9034         break;
9035      }
9036
9037      default:
9038         break;
9039   } /* switch (opc[0]) */
9040
9041   return False;
9042}
9043
9044
9045/*------------------------------------------------------------*/
9046/*--- Disassemble a single instruction                     ---*/
9047/*------------------------------------------------------------*/
9048
9049/* Disassemble a single instruction into IR.  The instruction is
9050   located in host memory at &guest_code[delta]. */
9051
9052static
9053DisResult disInstr_AMD64_WRK (
9054             /*OUT*/Bool* expect_CAS,
9055             Bool         put_IP,
9056             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
9057             Bool         resteerCisOk,
9058             void*        callback_opaque,
9059             Long         delta64,
9060             VexArchInfo* archinfo,
9061             VexAbiInfo*  vbi
9062          )
9063{
9064   IRType    ty;
9065   IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
9066   Int       alen;
9067   UChar     opc, modrm, abyte, pre;
9068   Long      d64;
9069   HChar     dis_buf[50];
9070   Int       am_sz, d_sz, n, n_prefixes;
9071   DisResult dres;
9072   UChar*    insn; /* used in SSE decoders */
9073
9074   /* The running delta */
9075   Long delta = delta64;
9076
9077   /* Holds eip at the start of the insn, so that we can print
9078      consistent error messages for unimplemented insns. */
9079   Long delta_start = delta;
9080
9081   /* sz denotes the nominal data-op size of the insn; we change it to
9082      2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
9083      conflict REX.W takes precedence. */
9084   Int sz = 4;
9085
9086   /* pfx holds the summary of prefixes. */
9087   Prefix pfx = PFX_EMPTY;
9088
9089   /* Set result defaults. */
9090   dres.whatNext   = Dis_Continue;
9091   dres.len        = 0;
9092   dres.continueAt = 0;
9093
9094   *expect_CAS = False;
9095
9096   vassert(guest_RIP_next_assumed == 0);
9097   vassert(guest_RIP_next_mustcheck == False);
9098
9099   addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
9100
9101   DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
9102
9103   /* We may be asked to update the guest RIP before going further. */
9104   if (put_IP)
9105      stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr)) );
9106
9107   /* Spot "Special" instructions (see comment at top of file). */
9108   {
9109      UChar* code = (UChar*)(guest_code + delta);
9110      /* Spot the 16-byte preamble:
9111         48C1C703   rolq $3,  %rdi
9112         48C1C70D   rolq $13, %rdi
9113         48C1C73D   rolq $61, %rdi
9114         48C1C733   rolq $51, %rdi
9115      */
9116      if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
9117                                               && code[ 3] == 0x03 &&
9118          code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
9119                                               && code[ 7] == 0x0D &&
9120          code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
9121                                               && code[11] == 0x3D &&
9122          code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
9123                                               && code[15] == 0x33) {
9124         /* Got a "Special" instruction preamble.  Which one is it? */
9125         if (code[16] == 0x48 && code[17] == 0x87
9126                              && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
9127            /* %RDX = client_request ( %RAX ) */
9128            DIP("%%rdx = client_request ( %%rax )\n");
9129            delta += 19;
9130            jmp_lit(Ijk_ClientReq, guest_RIP_bbstart+delta);
9131            dres.whatNext = Dis_StopHere;
9132            goto decode_success;
9133         }
9134         else
9135         if (code[16] == 0x48 && code[17] == 0x87
9136                              && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
9137            /* %RAX = guest_NRADDR */
9138            DIP("%%rax = guest_NRADDR\n");
9139            delta += 19;
9140            putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
9141            goto decode_success;
9142         }
9143         else
9144         if (code[16] == 0x48 && code[17] == 0x87
9145                              && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
9146            /* call-noredir *%RAX */
9147            DIP("call-noredir *%%rax\n");
9148            delta += 19;
9149            t1 = newTemp(Ity_I64);
9150            assign(t1, getIRegRAX(8));
9151            t2 = newTemp(Ity_I64);
9152            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
9153            putIReg64(R_RSP, mkexpr(t2));
9154            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
9155            jmp_treg(Ijk_NoRedir,t1);
9156            dres.whatNext = Dis_StopHere;
9157            goto decode_success;
9158         }
9159         /* We don't know what it is. */
9160         goto decode_failure;
9161         /*NOTREACHED*/
9162      }
9163   }
9164
9165   /* Eat prefixes, summarising the result in pfx and sz, and rejecting
9166      as many invalid combinations as possible. */
9167   n_prefixes = 0;
9168   while (True) {
9169      if (n_prefixes > 7) goto decode_failure;
9170      pre = getUChar(delta);
9171      switch (pre) {
9172         case 0x66: pfx |= PFX_66; break;
9173         case 0x67: pfx |= PFX_ASO; break;
9174         case 0xF2: pfx |= PFX_F2; break;
9175         case 0xF3: pfx |= PFX_F3; break;
9176         case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
9177         case 0x2E: pfx |= PFX_CS; break;
9178         case 0x3E: pfx |= PFX_DS; break;
9179         case 0x26: pfx |= PFX_ES; break;
9180         case 0x64: pfx |= PFX_FS; break;
9181         case 0x65: pfx |= PFX_GS; break;
9182         case 0x36: pfx |= PFX_SS; break;
9183         case 0x40 ... 0x4F:
9184            pfx |= PFX_REX;
9185            if (pre & (1<<3)) pfx |= PFX_REXW;
9186            if (pre & (1<<2)) pfx |= PFX_REXR;
9187            if (pre & (1<<1)) pfx |= PFX_REXX;
9188            if (pre & (1<<0)) pfx |= PFX_REXB;
9189            break;
9190         default:
9191            goto not_a_prefix;
9192      }
9193      n_prefixes++;
9194      delta++;
9195   }
9196
9197   not_a_prefix:
9198
9199   /* Dump invalid combinations */
9200   n = 0;
9201   if (pfx & PFX_F2) n++;
9202   if (pfx & PFX_F3) n++;
9203   if (n > 1)
9204      goto decode_failure; /* can't have both */
9205
9206   n = 0;
9207   if (pfx & PFX_CS) n++;
9208   if (pfx & PFX_DS) n++;
9209   if (pfx & PFX_ES) n++;
9210   if (pfx & PFX_FS) n++;
9211   if (pfx & PFX_GS) n++;
9212   if (pfx & PFX_SS) n++;
9213   if (n > 1)
9214      goto decode_failure; /* multiple seg overrides == illegal */
9215
9216   /* We have a %fs prefix.  Reject it if there's no evidence in 'vbi'
9217      that we should accept it. */
9218   if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_zero)
9219      goto decode_failure;
9220
9221   /* Ditto for %gs prefixes. */
9222   if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_0x60)
9223      goto decode_failure;
9224
9225   /* Set up sz. */
9226   sz = 4;
9227   if (pfx & PFX_66) sz = 2;
9228   if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
9229
9230   /* Now we should be looking at the primary opcode byte or the
9231      leading F2 or F3.  Check that any LOCK prefix is actually
9232      allowed. */
9233
9234   if (pfx & PFX_LOCK) {
9235      if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
9236         DIP("lock ");
9237      } else {
9238         *expect_CAS = False;
9239         goto decode_failure;
9240      }
9241   }
9242
9243
9244   /* ---------------------------------------------------- */
9245   /* --- The SSE/SSE2 decoder.                        --- */
9246   /* ---------------------------------------------------- */
9247
9248   /* What did I do to deserve SSE ?  Perhaps I was really bad in a
9249      previous life? */
9250
9251   /* Note, this doesn't handle SSE3 right now.  All amd64s support
9252      SSE2 as a minimum so there is no point distinguishing SSE1 vs
9253      SSE2. */
9254
9255   insn = (UChar*)&guest_code[delta];
9256
9257   /* FXSAVE is spuriously at the start here only because it is
9258      thusly placed in guest-x86/toIR.c. */
9259
9260   /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
9261      Note that the presence or absence of REX.W slightly affects the
9262      written format: whether the saved FPU IP and DP pointers are 64
9263      or 32 bits.  But the helper function we call simply writes zero
9264      bits in the relevant fields (which are 64 bits regardless of
9265      what REX.W is) and so it's good enough (iow, equally broken) in
9266      both cases. */
9267   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
9268       && insn[0] == 0x0F && insn[1] == 0xAE
9269       && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 0) {
9270       IRDirty* d;
9271      modrm = getUChar(delta+2);
9272      vassert(!epartIsReg(modrm));
9273
9274      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
9275      delta += 2+alen;
9276      gen_SEGV_if_not_16_aligned(addr);
9277
9278      DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
9279
9280      /* Uses dirty helper:
9281            void amd64g_do_FXSAVE ( VexGuestAMD64State*, ULong ) */
9282      d = unsafeIRDirty_0_N (
9283             0/*regparms*/,
9284             "amd64g_dirtyhelper_FXSAVE",
9285             &amd64g_dirtyhelper_FXSAVE,
9286             mkIRExprVec_1( mkexpr(addr) )
9287          );
9288      d->needsBBP = True;
9289
9290      /* declare we're writing memory */
9291      d->mFx   = Ifx_Write;
9292      d->mAddr = mkexpr(addr);
9293      d->mSize = 512;
9294
9295      /* declare we're reading guest state */
9296      d->nFxState = 7;
9297
9298      d->fxState[0].fx     = Ifx_Read;
9299      d->fxState[0].offset = OFFB_FTOP;
9300      d->fxState[0].size   = sizeof(UInt);
9301
9302      d->fxState[1].fx     = Ifx_Read;
9303      d->fxState[1].offset = OFFB_FPREGS;
9304      d->fxState[1].size   = 8 * sizeof(ULong);
9305
9306      d->fxState[2].fx     = Ifx_Read;
9307      d->fxState[2].offset = OFFB_FPTAGS;
9308      d->fxState[2].size   = 8 * sizeof(UChar);
9309
9310      d->fxState[3].fx     = Ifx_Read;
9311      d->fxState[3].offset = OFFB_FPROUND;
9312      d->fxState[3].size   = sizeof(ULong);
9313
9314      d->fxState[4].fx     = Ifx_Read;
9315      d->fxState[4].offset = OFFB_FC3210;
9316      d->fxState[4].size   = sizeof(ULong);
9317
9318      d->fxState[5].fx     = Ifx_Read;
9319      d->fxState[5].offset = OFFB_XMM0;
9320      d->fxState[5].size   = 16 * sizeof(U128);
9321
9322      d->fxState[6].fx     = Ifx_Read;
9323      d->fxState[6].offset = OFFB_SSEROUND;
9324      d->fxState[6].size   = sizeof(ULong);
9325
9326      /* Be paranoid ... this assertion tries to ensure the 16 %xmm
9327	 images are packed back-to-back.  If not, the value of
9328	 d->fxState[5].size is wrong. */
9329      vassert(16 == sizeof(U128));
9330      vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16));
9331
9332      stmt( IRStmt_Dirty(d) );
9333
9334      goto decode_success;
9335   }
9336
9337   /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
9338      As with FXSAVE above we ignore the value of REX.W since we're
9339      not bothering with the FPU DP and IP fields. */
9340   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
9341       && insn[0] == 0x0F && insn[1] == 0xAE
9342       && !epartIsReg(insn[2]) && gregOfRexRM(pfx,insn[2]) == 1) {
9343       IRDirty* d;
9344      modrm = getUChar(delta+2);
9345      vassert(!epartIsReg(modrm));
9346
9347      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
9348      delta += 2+alen;
9349      gen_SEGV_if_not_16_aligned(addr);
9350
9351      DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
9352
9353      /* Uses dirty helper:
9354            VexEmWarn amd64g_do_FXRSTOR ( VexGuestAMD64State*, ULong )
9355         NOTE:
9356            the VexEmWarn value is simply ignored
9357      */
9358      d = unsafeIRDirty_0_N (
9359             0/*regparms*/,
9360             "amd64g_dirtyhelper_FXRSTOR",
9361             &amd64g_dirtyhelper_FXRSTOR,
9362             mkIRExprVec_1( mkexpr(addr) )
9363          );
9364      d->needsBBP = True;
9365
9366      /* declare we're reading memory */
9367      d->mFx   = Ifx_Read;
9368      d->mAddr = mkexpr(addr);
9369      d->mSize = 512;
9370
9371      /* declare we're writing guest state */
9372      d->nFxState = 7;
9373
9374      d->fxState[0].fx     = Ifx_Write;
9375      d->fxState[0].offset = OFFB_FTOP;
9376      d->fxState[0].size   = sizeof(UInt);
9377
9378      d->fxState[1].fx     = Ifx_Write;
9379      d->fxState[1].offset = OFFB_FPREGS;
9380      d->fxState[1].size   = 8 * sizeof(ULong);
9381
9382      d->fxState[2].fx     = Ifx_Write;
9383      d->fxState[2].offset = OFFB_FPTAGS;
9384      d->fxState[2].size   = 8 * sizeof(UChar);
9385
9386      d->fxState[3].fx     = Ifx_Write;
9387      d->fxState[3].offset = OFFB_FPROUND;
9388      d->fxState[3].size   = sizeof(ULong);
9389
9390      d->fxState[4].fx     = Ifx_Write;
9391      d->fxState[4].offset = OFFB_FC3210;
9392      d->fxState[4].size   = sizeof(ULong);
9393
9394      d->fxState[5].fx     = Ifx_Write;
9395      d->fxState[5].offset = OFFB_XMM0;
9396      d->fxState[5].size   = 16 * sizeof(U128);
9397
9398      d->fxState[6].fx     = Ifx_Write;
9399      d->fxState[6].offset = OFFB_SSEROUND;
9400      d->fxState[6].size   = sizeof(ULong);
9401
9402      /* Be paranoid ... this assertion tries to ensure the 16 %xmm
9403	 images are packed back-to-back.  If not, the value of
9404	 d->fxState[5].size is wrong. */
9405      vassert(16 == sizeof(U128));
9406      vassert(OFFB_XMM15 == (OFFB_XMM0 + 15 * 16));
9407
9408      stmt( IRStmt_Dirty(d) );
9409
9410      goto decode_success;
9411   }
9412
9413   /* ------ SSE decoder main ------ */
9414
9415   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
9416   if (haveNo66noF2noF3(pfx) && sz == 4
9417       && insn[0] == 0x0F && insn[1] == 0x58) {
9418      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addps", Iop_Add32Fx4 );
9419      goto decode_success;
9420   }
9421
9422   /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
9423   if (haveF3no66noF2(pfx) && sz == 4
9424       && insn[0] == 0x0F && insn[1] == 0x58) {
9425      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "addss", Iop_Add32F0x4 );
9426      goto decode_success;
9427   }
9428
9429   /* 0F 55 = ANDNPS -- G = (not G) and E */
9430   if (haveNo66noF2noF3(pfx) && sz == 4
9431       && insn[0] == 0x0F && insn[1] == 0x55) {
9432      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnps", Iop_AndV128 );
9433      goto decode_success;
9434   }
9435
9436   /* 0F 54 = ANDPS -- G = G and E */
9437   if (haveNo66noF2noF3(pfx) && sz == 4
9438       && insn[0] == 0x0F && insn[1] == 0x54) {
9439      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andps", Iop_AndV128 );
9440      goto decode_success;
9441   }
9442
9443   /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
9444   if (haveNo66noF2noF3(pfx) && sz == 4
9445       && insn[0] == 0x0F && insn[1] == 0xC2) {
9446      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpps", True, 4 );
9447      goto decode_success;
9448   }
9449
9450   /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
9451   if (haveF3no66noF2(pfx) && sz == 4
9452       && insn[0] == 0x0F && insn[1] == 0xC2) {
9453      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpss", False, 4 );
9454      goto decode_success;
9455   }
9456
9457   /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
9458   /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
9459   if (haveNo66noF2noF3(pfx) && sz == 4
9460       && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
9461      IRTemp argL = newTemp(Ity_F32);
9462      IRTemp argR = newTemp(Ity_F32);
9463      modrm = getUChar(delta+2);
9464      if (epartIsReg(modrm)) {
9465         assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
9466                                         0/*lowest lane*/ ) );
9467         delta += 2+1;
9468         DIP("%scomiss %s,%s\n", insn[1]==0x2E ? "u" : "",
9469                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
9470                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
9471      } else {
9472         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
9473	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
9474         delta += 2+alen;
9475         DIP("%scomiss %s,%s\n", insn[1]==0x2E ? "u" : "",
9476                                 dis_buf,
9477                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
9478      }
9479      assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
9480                                      0/*lowest lane*/ ) );
9481
9482      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
9483      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
9484      stmt( IRStmt_Put(
9485               OFFB_CC_DEP1,
9486               binop( Iop_And64,
9487                      unop( Iop_32Uto64,
9488                            binop(Iop_CmpF64,
9489                                  unop(Iop_F32toF64,mkexpr(argL)),
9490                                  unop(Iop_F32toF64,mkexpr(argR)))),
9491                      mkU64(0x45)
9492          )));
9493
9494      goto decode_success;
9495   }
9496
   /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
      half xmm */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x2A) {
      IRTemp arg64 = newTemp(Ity_I64);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getUChar(delta+2);
      /* Source may be an MMX register, so enter MMX mode first. */
      do_MMX_preamble();
      if (epartIsReg(modrm)) {
         assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 2+1;
         DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpi2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* I32->F32 can lose accuracy, so the conversion (done via an
         exact I32->F64 step) honours the prevailing SSE rounding
         mode. */
      assign( rmode, get_sse_roundingmode() );

      /* Low 32 bits of the source -> lane 0 ... */
      putXMMRegLane32F(
         gregOfRexRM(pfx,modrm), 0,
         binop(Iop_F64toF32,
               mkexpr(rmode),
               unop(Iop_I32StoF64,
                    unop(Iop_64to32, mkexpr(arg64)) )) );

      /* ... high 32 bits -> lane 1.  Lanes 2 and 3 are untouched. */
      putXMMRegLane32F(
         gregOfRexRM(pfx,modrm), 1,
         binop(Iop_F64toF32,
               mkexpr(rmode),
               unop(Iop_I32StoF64,
                    unop(Iop_64HIto32, mkexpr(arg64)) )) );

      goto decode_success;
   }
9537
   /* F3 0F 2A = CVTSI2SS
      -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
      -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
   if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x2A) {

      /* Integer->F32 conversion can be inexact, so use the prevailing
         SSE rounding mode for the final narrowing step. */
      IRTemp rmode = newTemp(Ity_I32);
      assign( rmode, get_sse_roundingmode() );
      modrm = getUChar(delta+2);

      if (sz == 4) {
         IRTemp arg32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2ss %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         /* I32->F64 is exact; only F64->F32 needs rmode. */
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  unop(Iop_I32StoF64, mkexpr(arg32)) ) );
      } else {
         /* sz == 8 */
         IRTemp arg64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2ssq %s,%s\n", dis_buf,
                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         /* I64->F64 can itself be inexact, hence rmode is used for
            both conversion steps here. */
         putXMMRegLane32F(
            gregOfRexRM(pfx,modrm), 0,
            binop(Iop_F64toF32,
                  mkexpr(rmode),
                  binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
      }

      goto decode_success;
   }
9591
   /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
      I32 in mmx, according to prevailing SSE rounding mode */
   /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
      I32 in mmx, rounding towards zero */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp dst64  = newTemp(Ity_I64);
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f32lo  = newTemp(Ity_F32);
      IRTemp f32hi  = newTemp(Ity_F32);
      /* 0x2C is the truncating (CVTT) variant. */
      Bool   r2zero = toBool(insn[1] == 0x2C);

      /* Destination is an MMX register, so enter MMX mode. */
      do_MMX_preamble();
      modrm = getUChar(delta+2);

      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
         assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
                                              mkexpr(addr),
                                              mkU64(4) )));
         delta += 2+alen;
         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* CVTT always truncates; plain CVT uses the current SSE mode. */
      if (r2zero) {
         assign(rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* Convert both lanes (via exact F32->F64 widening) and repack
         them as hi:lo into a single 64-bit value. */
      assign(
         dst64,
         binop( Iop_32HLto64,
                binop( Iop_F64toI32S,
                       mkexpr(rmode),
                       unop( Iop_F32toF64, mkexpr(f32hi) ) ),
                binop( Iop_F64toI32S,
                       mkexpr(rmode),
                       unop( Iop_F32toF64, mkexpr(f32lo) ) )
              )
      );

      putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
      goto decode_success;
   }
9647
   /* F3 0F 2D = CVTSS2SI
      when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
                    according to prevailing SSE rounding mode
      when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
                    according to prevailing SSE rounding mode
   */
   /* F3 0F 2C = CVTTSS2SI
      when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
                    truncating towards zero
      when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
                    truncating towards zero
   */
   if (haveF3no66noF2(pfx)
       && insn[0] == 0x0F
       && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f32lo  = newTemp(Ity_F32);
      /* 0x2C is the truncating (CVTT) variant. */
      Bool   r2zero = toBool(insn[1] == 0x2C);
      vassert(sz == 4 || sz == 8);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      }

      /* CVTT always truncates; plain CVT uses the current SSE mode. */
      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* REX.W (sz==8) selects the 64-bit destination form.  The
         F32->F64 widening step is exact. */
      if (sz == 4) {
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI32S,
                           mkexpr(rmode),
                           unop(Iop_F32toF64, mkexpr(f32lo))) );
      } else {
         putIReg64( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI64S,
                           mkexpr(rmode),
                           unop(Iop_F32toF64, mkexpr(f32lo))) );
      }

      goto decode_success;
   }
9704
   /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
   /* The generic E-to-G helpers consume the rest of the insn and
      return the updated delta. */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divps", Iop_Div32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "divss", Iop_Div32F0x4 );
      goto decode_success;
   }
9718
   /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && haveNo66noF2noF3(pfx)
       && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 2) {

      IRTemp t64 = newTemp(Ity_I64);
      IRTemp ew = newTemp(Ity_I32);

      vassert(sz == 4);
      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;
      DIP("ldmxcsr %s\n", dis_buf);

      /* The only thing we observe in %mxcsr is the rounding mode.
         Therefore, pass the 32-bit value (SSE native-format control
         word) to a clean helper, getting back a 64-bit value, the
         lower half of which is the SSEROUND value to store, and the
         upper half of which is the emulation-warning token which may
         be generated.
      */
      /* ULong amd64g_check_ldmxcsr ( ULong ); */
      assign( t64, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_check_ldmxcsr",
                      &amd64g_check_ldmxcsr,
                      mkIRExprVec_1(
                         unop(Iop_32Uto64,
                              loadLE(Ity_I32, mkexpr(addr))
                         )
                      )
                   )
            );

      put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
      assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
      put_emwarn( mkexpr(ew) );
      /* Finally, if an emulation warning was reported, side-exit to
         the next insn, reporting the warning, so that Valgrind's
         dispatcher sees the warning. */
      stmt(
         IRStmt_Exit(
            binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
            Ijk_EmWarn,
            IRConst_U64(guest_RIP_bbstart+delta)
         )
      );
      goto decode_success;
   }
9767
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F F7 = MASKMOVQ -- 8x8 masked store */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xF7) {
      Bool ok = False;
      /* NOTE(review): only the 0x0F byte is skipped here (delta+1,
         not delta+2) -- presumably dis_MMX re-decodes the 0xF7
         opcode itself; confirm against dis_MMX. */
      delta = dis_MMX( &ok, vbi, pfx, sz, delta+1 );
      if (!ok)
         goto decode_failure;
      goto decode_success;
   }
9778
   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
   /* These four all defer to the generic SSE E-to-G helpers, which
      consume the insn and return the updated delta. */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxps", Iop_Max32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "maxss", Iop_Max32F0x4 );
      goto decode_success;
   }

   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minps", Iop_Min32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "minss", Iop_Min32F0x4 );
      goto decode_success;
   }
9806
   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( gregOfRexRM(pfx,modrm),
                    getXMMReg( eregOfRexRM(pfx,modrm) ));
         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Only the aligned (movaps) form faults on misalignment. */
         if (insn[1] == 0x28/*movaps*/)
            gen_SEGV_if_not_16_aligned( addr );
         putXMMReg( gregOfRexRM(pfx,modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("mov[ua]ps %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }
9831
   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && (insn[1] == 0x29 || insn[1] == 0x11)) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; awaiting test case */
         /* (the reg-reg form is not handled; decode falls through to
            the failure path at the end of the decoder) */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Only the aligned (movaps) form faults on misalignment. */
         if (insn[1] == 0x29/*movaps*/)
            gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                  dis_buf );
         delta += 2+alen;
         goto decode_success;
      }
   }
9851
9852   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
9853   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
9854   if (haveNo66noF2noF3(pfx)
9855       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
9856       && insn[0] == 0x0F && insn[1] == 0x16) {
9857      modrm = getUChar(delta+2);
9858      if (epartIsReg(modrm)) {
9859         delta += 2+1;
9860         putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
9861                          getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
9862         DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
9863                               nameXMMReg(gregOfRexRM(pfx,modrm)));
9864      } else {
9865         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
9866         delta += 2+alen;
9867         putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
9868                          loadLE(Ity_I64, mkexpr(addr)) );
9869         DIP("movhps %s,%s\n", dis_buf,
9870                               nameXMMReg( gregOfRexRM(pfx,modrm) ));
9871      }
9872      goto decode_success;
9873   }
9874
   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x17) {
      /* Only the store-to-memory form exists; the reg-reg encoding
         falls through to the failure path. */
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
                                   1/*upper lane*/ ) );
         DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
                               dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
9892
   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x12) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Register form is MOVHLPS: E's high 64 bits -> G's low. */
         delta += 2+1;
         putXMMRegLane64( gregOfRexRM(pfx,modrm),
                          0/*lower lane*/,
                          getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Memory form is MOVLPS: 64-bit load into G's low half. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movlps %s, %s\n",
             dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
      }
      goto decode_success;
   }
9916
   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x13) {
      /* Only the store-to-memory form exists; the reg-reg encoding
         falls through to the failure path. */
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
                                   0/*lower lane*/ ) );
         DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
                                dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
9934
   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
      to 4 lowest bits of ireg(G) */
   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x50) {
      /* sz == 8 is a kludge to handle insns with REX.W redundantly
         set to 1, which has been known to happen:

         4c 0f 50 d9             rex64X movmskps %xmm1,%r11d

         20071106: Intel docs say that REX.W isn't redundant: when
         present, a 64-bit register is written; when not present, only
         the 32-bit half is written.  However, testing on a Core2
         machine suggests the entire 64 bit register is written
         irrespective of the status of REX.W.  That could be because
         of the default rule that says "if the lower half of a 32-bit
         register is written, the upper half is zeroed".  By using
         putIReg32 here we inadvertently produce the same behaviour as
         the Core2, for the same reason -- putIReg32 implements said
         rule.

         AMD docs give no indication that REX.W is even valid for this
         insn. */
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         Int src;
         t0 = newTemp(Ity_I32);
         t1 = newTemp(Ity_I32);
         t2 = newTemp(Ity_I32);
         t3 = newTemp(Ity_I32);
         delta += 2+1;
         src = eregOfRexRM(pfx,modrm);
         /* Move each lane's sign bit (bit 31) to result bit i: shift
            lane i right by (31-i) and mask with (1 << i). */
         assign( t0, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
                            mkU32(1) ));
         assign( t1, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
                            mkU32(2) ));
         assign( t2, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
                            mkU32(4) ));
         assign( t3, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
                            mkU32(8) ));
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop(Iop_Or32,
                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
                         )
                 );
         DIP("movmskps %s,%s\n", nameXMMReg(src),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         goto decode_success;
      }
      /* else fall through */
   }
9990
9991   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
9992   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
9993   if ( ( (haveNo66noF2noF3(pfx) && sz == 4)
9994          || (have66noF2noF3(pfx) && sz == 2)
9995        )
9996        && insn[0] == 0x0F && insn[1] == 0x2B) {
9997      modrm = getUChar(delta+2);
9998      if (!epartIsReg(modrm)) {
9999         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
10000         gen_SEGV_if_not_16_aligned( addr );
10001         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
10002         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
10003                                 dis_buf,
10004                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
10005         delta += 2+alen;
10006         goto decode_success;
10007      }
10008      /* else fall through */
10009   }
10010
10011   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
10012   /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
10013      Intel manual does not say anything about the usual business of
10014      the FP reg tags getting trashed whenever an MMX insn happens.
10015      So we just leave them alone.
10016   */
10017   if (haveNo66noF2noF3(pfx) && sz == 4
10018       && insn[0] == 0x0F && insn[1] == 0xE7) {
10019      modrm = getUChar(delta+2);
10020      if (!epartIsReg(modrm)) {
10021         /* do_MMX_preamble(); Intel docs don't specify this */
10022         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
10023         storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
10024         DIP("movntq %s,%s\n", dis_buf,
10025                               nameMMXReg(gregLO3ofRM(modrm)));
10026         delta += 2+alen;
10027         goto decode_success;
10028      }
10029      /* else fall through */
10030   }
10031
   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
   if (haveF3no66noF2(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x10) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* reg-reg: only lane 0 is copied; G's other lanes survive. */
         putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
                          getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
         DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                              nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* mem form: zero the whole register, then fill lane 0. */
         putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
         putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
                          loadLE(Ity_I32, mkexpr(addr)) );
         DIP("movss %s,%s\n", dis_buf,
                              nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }
10055
   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
      or lo 1/4 xmm). */
   /* NOTE(review): unlike the F3 0F 10 load form above, this does not
      accept a redundant REX.W (sz == 8) -- confirm whether that is
      deliberate. */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x11) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through, we don't yet have a test case */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
         DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              dis_buf);
         delta += 2+alen;
         goto decode_success;
      }
   }
10073
   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulps", Iop_Mul32Fx4 );
      goto decode_success;
   }

   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "mulss", Iop_Mul32F0x4 );
      goto decode_success;
   }

   /* 0F 56 = ORPS -- G = G or E */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x56) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orps", Iop_OrV128 );
      goto decode_success;
   }
10094
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE0) {
      do_MMX_preamble();
      /* The generic MMX reg/mem-to-reg helper selects the operation
         from the opcode byte (insn[1]); the string is print-only. */
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pavgb", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xE3) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pavgw", False );
      goto decode_success;
   }
10114
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
      zero-extend of it in ireg(G). */
   if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC5) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         IRTemp sV = newTemp(Ity_I64);
         t5 = newTemp(Ity_I16);
         do_MMX_preamble();
         assign(sV, getMMXReg(eregLO3ofRM(modrm)));
         /* Split the 64-bit MMX value into its four 16-bit lanes. */
         breakup64to16s( sV, &t3, &t2, &t1, &t0 );
         /* insn[3] is the immediate byte; only its low 2 bits select
            the lane. */
         switch (insn[3] & 3) {
            case 0:  assign(t5, mkexpr(t0)); break;
            case 1:  assign(t5, mkexpr(t1)); break;
            case 2:  assign(t5, mkexpr(t2)); break;
            case 3:  assign(t5, mkexpr(t3)); break;
            default: vassert(0);
         }
         /* REX.W selects a 64-bit destination; either way the lane is
            zero-extended. */
         if (sz == 8)
            putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
         else
            putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
         DIP("pextrw $%d,%s,%s\n",
             (Int)insn[3], nameMMXReg(eregLO3ofRM(modrm)),
                           sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
                                 : nameIReg32(gregOfRexRM(pfx,modrm))
         );
         delta += 4;
         goto decode_success;
      }
      /* else fall through */
      /* note, for anyone filling in the mem case: this insn has one
         byte after the amode and therefore you must pass 1 as the
         last arg to disAMode */
   }
10151
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
      put it into the specified lane of mmx(G). */
   if (haveNo66noF2noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC4) {
      /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
         mmx reg.  t4 is the new lane value.  t5 is the original
         mmx value. t6 is the new mmx value. */
      Int lane;
      t4 = newTemp(Ity_I16);
      t5 = newTemp(Ity_I64);
      t6 = newTemp(Ity_I64);
      modrm = insn[2];
      do_MMX_preamble();

      assign(t5, getMMXReg(gregLO3ofRM(modrm)));
      breakup64to16s( t5, &t3, &t2, &t1, &t0 );

      if (epartIsReg(modrm)) {
         assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
         delta += 3+1;
         /* immediate byte follows the modrm byte, i.e. insn[3] */
         lane = insn[3+1-1];
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   nameIReg16(eregOfRexRM(pfx,modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         /* final arg 1 == one trailing imm byte after the amode */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 1 );
         delta += 3+alen;
         /* immediate byte follows the amode */
         lane = insn[3+alen-1];
         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Rebuild the 64-bit value with t4 replacing the chosen lane. */
      switch (lane & 3) {
         case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
         case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
         case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
         case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
         default: vassert(0);
      }
      putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
      goto decode_success;
   }
10198
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F EE = PMAXSW -- 16x4 signed max */
   /* These four all defer to the generic MMX reg/mem-to-reg helper,
      which dispatches on the opcode byte (insn[1]); the string is
      print-only. */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xEE) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pmaxsw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F DE = PMAXUB -- 8x8 unsigned max */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xDE) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pmaxub", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F EA = PMINSW -- 16x4 signed min */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xEA) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pminsw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F DA = PMINUB -- 8x8 unsigned min */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xDA) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "pminub", False );
      goto decode_success;
   }
10238
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
      mmx(E), turn them into a byte, and put zero-extend of it in
      ireg(G). */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD7) {
      modrm = insn[2];
      /* Only the reg-reg form exists; mem form falls through. */
      if (epartIsReg(modrm)) {
         do_MMX_preamble();
         t0 = newTemp(Ity_I64);
         t1 = newTemp(Ity_I64);
         assign(t0, getMMXReg(eregLO3ofRM(modrm)));
         /* Bit extraction is done in a clean helper. */
         assign(t1, mkIRExprCCall(
                       Ity_I64, 0/*regparms*/,
                       "amd64g_calculate_mmx_pmovmskb",
                       &amd64g_calculate_mmx_pmovmskb,
                       mkIRExprVec_1(mkexpr(t0))));
         putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t1)));
         DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         delta += 3;
         goto decode_success;
      }
      /* else fall through */
   }
10264
10265   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
10266   /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
10267   if (haveNo66noF2noF3(pfx) && sz == 4
10268       && insn[0] == 0x0F && insn[1] == 0xE4) {
10269      do_MMX_preamble();
10270      delta = dis_MMXop_regmem_to_reg (
10271                 vbi, pfx, delta+2, insn[1], "pmuluh", False );
10272      goto decode_success;
10273   }
10274
   /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
   /* 0F 18 /1 = PREFETCH0   -- with various different hints */
   /* 0F 18 /2 = PREFETCH1 */
   /* 0F 18 /3 = PREFETCH2 */
   if (insn[0] == 0x0F && insn[1] == 0x18
       && haveNo66noF2noF3(pfx)
       && !epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) >= 0 && gregLO3ofRM(insn[2]) <= 3) {
      HChar* hintstr = "??";

      modrm = getUChar(delta+2);
      vassert(!epartIsReg(modrm));

      /* The address mode is decoded (advancing delta) but, prefetch
         being only a hint, no memory-access IR is emitted for it. */
      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;

      /* The /r reg field selects which of the four hint variants. */
      switch (gregLO3ofRM(modrm)) {
         case 0: hintstr = "nta"; break;
         case 1: hintstr = "t0"; break;
         case 2: hintstr = "t1"; break;
         case 3: hintstr = "t2"; break;
         default: vassert(0);
      }

      DIP("prefetch%s %s\n", hintstr, dis_buf);
      goto decode_success;
   }
10302
   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xF6) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                 vbi, pfx, delta+2, insn[1], "psadbw", False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
   /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV = newTemp(Ity_I64);
      dV = newTemp(Ity_I64);
      do_MMX_preamble();
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         order = (Int)insn[3];
         delta += 2+2;
         DIP("pshufw $%d,%s,%s\n", order,
                                   nameMMXReg(eregLO3ofRM(modrm)),
                                   nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*extra byte after amode*/ );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         order = (Int)insn[2+alen];
         delta += 3+alen;
         DIP("pshufw $%d,%s,%s\n", order,
                                   dis_buf,
                                   nameMMXReg(gregLO3ofRM(modrm)));
      }
      breakup64to16s( sV, &s3, &s2, &s1, &s0 );
      /* Each 2-bit field of the immediate selects one 16-bit source
         lane for the corresponding destination lane. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      assign(dV,
	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
10352
   /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x53) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "rcpps", Iop_Recip32Fx4 );
      goto decode_success;
   }

   /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x53) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "rcpss", Iop_Recip32F0x4 );
      goto decode_success;
   }

   /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x52) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "rsqrtps", Iop_RSqrt32Fx4 );
      goto decode_success;
   }

   /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x52) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "rsqrtss", Iop_RSqrt32F0x4 );
      goto decode_success;
   }

   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
   /* Modelled as a full IR memory fence (Imbe_Fence). */
   if (haveNo66noF2noF3(pfx)
       && insn[0] == 0x0F && insn[1] == 0xAE
       && epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 7
       && sz == 4) {
      delta += 3;
      /* Insert a memory fence.  It's sometimes important that these
         are carried through to the generated code. */
      stmt( IRStmt_MBE(Imbe_Fence) );
      DIP("sfence\n");
      goto decode_success;
   }
10397
   /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xC6) {
      Int    select;
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         select = (Int)insn[3];
         delta += 2+2;
         DIP("shufps $%d,%s,%s\n", select,
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte at end of insn*/ );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         select = (Int)insn[2+alen];
         delta += 3+alen;
         DIP("shufps $%d,%s,%s\n", select,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

#     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
#     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))

      /* The two high lanes come from the source (SELS), the two low
         lanes from the destination (SELD), each picked by a 2-bit
         field of the immediate. */
      putXMMReg(
         gregOfRexRM(pfx,modrm),
         mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
                       SELD((select>>2)&3), SELD((select>>0)&3) )
      );

#     undef SELD
#     undef SELS

      goto decode_success;
   }
10445
   /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
                                        "sqrtps", Iop_Sqrt32Fx4 );
      goto decode_success;
   }

   /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x51) {
      delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta+2,
                                         "sqrtss", Iop_Sqrt32F0x4 );
      goto decode_success;
   }

   /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
   if (insn[0] == 0x0F && insn[1] == 0xAE
       && haveNo66noF2noF3(pfx)
       && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 3) {

      vassert(sz == 4);
      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;

      /* Fake up a native SSE mxcsr word.  The only thing it depends
         on is SSEROUND[1:0], so call a clean helper to cook it up.
      */
      /* ULong amd64h_create_mxcsr ( ULong sseround ) */
      DIP("stmxcsr %s\n", dis_buf);
      storeLE(
         mkexpr(addr),
         unop(Iop_64to32,
              mkIRExprCCall(
                 Ity_I64, 0/*regp*/,
                 "amd64g_create_mxcsr", &amd64g_create_mxcsr,
                 mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
	      )
	 )
      );
      goto decode_success;
   }

   /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subps", Iop_Sub32Fx4 );
      goto decode_success;
   }

   /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5C) {
      delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta+2, "subss", Iop_Sub32F0x4 );
      goto decode_success;
   }
10502
   /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
   /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
   /* These just appear to be special cases of SHUFPS */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      Bool hi = toBool(insn[1] == 0x15);
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
                                  dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* Interleave either the two high or the two low lanes of
         source and destination. */
      if (hi) {
         putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s3, d3, s2, d2 ) );
      } else {
         putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( s1, d1, s0, d0 ) );
      }

      goto decode_success;
   }

   /* 0F 57 = XORPS -- G = G xor E */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x57) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorps", Iop_XorV128 );
      goto decode_success;
   }
10550
10551   /* ---------------------------------------------------- */
10552   /* --- end of the SSE decoder.                      --- */
10553   /* ---------------------------------------------------- */
10554
10555   /* ---------------------------------------------------- */
10556   /* --- start of the SSE2 decoder.                   --- */
10557   /* ---------------------------------------------------- */
10558
10559   /* 66 0F 58 = ADDPD -- add 32Fx4 from R/M to R */
10560   if (have66noF2noF3(pfx)
10561       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
10562       && insn[0] == 0x0F && insn[1] == 0x58) {
10563      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "addpd", Iop_Add64Fx2 );
10564      goto decode_success;
10565   }
10566
10567   /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
10568   if (haveF2no66noF3(pfx)
10569       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
10570       && insn[0] == 0x0F && insn[1] == 0x58) {
10571      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "addsd", Iop_Add64F0x2 );
10572      goto decode_success;
10573   }
10574
10575   /* 66 0F 55 = ANDNPD -- G = (not G) and E */
10576   if (have66noF2noF3(pfx) && sz == 2
10577       && insn[0] == 0x0F && insn[1] == 0x55) {
10578      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "andnpd", Iop_AndV128 );
10579      goto decode_success;
10580   }
10581
10582   /* 66 0F 54 = ANDPD -- G = G and E */
10583   if (have66noF2noF3(pfx) && sz == 2
10584       && insn[0] == 0x0F && insn[1] == 0x54) {
10585      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "andpd", Iop_AndV128 );
10586      goto decode_success;
10587   }
10588
10589   /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
10590   if (have66noF2noF3(pfx) && sz == 2
10591       && insn[0] == 0x0F && insn[1] == 0xC2) {
10592      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmppd", True, 8 );
10593      goto decode_success;
10594   }
10595
10596   /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
10597   if (haveF2no66noF3(pfx) && sz == 4
10598       && insn[0] == 0x0F && insn[1] == 0xC2) {
10599      delta = dis_SSEcmp_E_to_G( vbi, pfx, delta+2, "cmpsd", False, 8 );
10600      goto decode_success;
10601   }
10602
   /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
   /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
      IRTemp argL = newTemp(Ity_F64);
      IRTemp argR = newTemp(Ity_F64);
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                         0/*lowest lane*/ ) );
         delta += 2+1;
         DIP("%scomisd %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("%scomisd %s,%s\n", insn[1]==0x2E ? "u" : "",
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }
      assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );

      /* Thunk the flags directly (OP_COPY): keep only bits 0, 2 and 6
         (mask 0x45) of the CmpF64 result in DEP1. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop( Iop_And64,
                      unop( Iop_32Uto64,
                            binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                      mkU64(0x45)
          )));

      goto decode_success;
   }
10640
   /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
      F64 in xmm(G) */
   if (haveF3no66noF2(pfx) && insn[0] == 0x0F && insn[1] == 0xE6) {
      IRTemp arg64 = newTemp(Ity_I64);
      if (sz != 4) goto decode_failure;

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( arg64, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0) );
         delta += 2+1;
         DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtdq2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* I32 -> F64 is exact, hence no rounding mode is supplied. */
      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 0,
         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
      );

      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 1,
         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
      );

      goto decode_success;
   }

   /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
      xmm(G) */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5B) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtdq2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      assign( rmode, get_sse_roundingmode() );
      breakup128to32s( argV, &t3, &t2, &t1, &t0 );

      /* I32 -> F64 exactly, then round F64 -> F32 using the current
         SSE rounding mode. */
#     define CVT(_t)  binop( Iop_F64toF32,                    \
                             mkexpr(rmode),                   \
                             unop(Iop_I32StoF64,mkexpr(_t)))

      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 3, CVT(t3) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 2, CVT(t2) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
10711
   /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
      lo half xmm(G), and zero upper half, rounding towards zero */
   /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
      lo half xmm(G), according to prevailing rounding mode, and zero
      upper half */
   if ( ( (haveF2no66noF3(pfx) && sz == 4)
          || (have66noF2noF3(pfx) && sz == 2)
        )
        && insn[0] == 0x0F && insn[1] == 0xE6) {
      IRTemp argV   = newTemp(Ity_V128);
      IRTemp rmode  = newTemp(Ity_I32);
      /* sz==2 means the 66 prefix was seen, i.e. the truncating
         (CVTT) variant. */
      Bool   r2zero = toBool(sz == 2);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvt%spd2dq %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvt%spd2dq %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      if (r2zero) {
         assign(rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      t0 = newTemp(Ity_F64);
      t1 = newTemp(Ity_F64);
      assign( t0, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128to64, mkexpr(argV))) );
      assign( t1, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128HIto64, mkexpr(argV))) );

#     define CVT(_t)  binop( Iop_F64toI32S,                   \
                             mkexpr(rmode),                   \
                             mkexpr(_t) )

      /* Results go in the low half of G; the high half is zeroed. */
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, mkU32(0) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, mkU32(0) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }
10767
10768   /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
10769      I32 in mmx, according to prevailing SSE rounding mode */
10770   /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
10771      I32 in mmx, rounding towards zero */
10772   if (have66noF2noF3(pfx) && sz == 2
10773       && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
10774      IRTemp dst64  = newTemp(Ity_I64);
10775      IRTemp rmode  = newTemp(Ity_I32);
10776      IRTemp f64lo  = newTemp(Ity_F64);
10777      IRTemp f64hi  = newTemp(Ity_F64);
10778      Bool   r2zero = toBool(insn[1] == 0x2C);
10779
10780      do_MMX_preamble();
10781      modrm = getUChar(delta+2);
10782
10783      if (epartIsReg(modrm)) {
10784         delta += 2+1;
10785         assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
10786         assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
10787         DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
10788                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
10789                                   nameMMXReg(gregLO3ofRM(modrm)));
10790      } else {
10791         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
10792         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
10793         assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
10794                                              mkexpr(addr),
10795                                              mkU64(8) )));
10796         delta += 2+alen;
10797         DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
10798                                   dis_buf,
10799                                   nameMMXReg(gregLO3ofRM(modrm)));
10800      }
10801
10802      if (r2zero) {
10803         assign(rmode, mkU32((UInt)Irrm_ZERO) );
10804      } else {
10805         assign( rmode, get_sse_roundingmode() );
10806      }
10807
10808      assign(
10809         dst64,
10810         binop( Iop_32HLto64,
10811                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
10812                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
10813              )
10814      );
10815
10816      putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
10817      goto decode_success;
10818   }
10819
   /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
      lo half xmm(G), rounding according to prevailing SSE rounding
      mode, and zero upper half */
   /* Note, this is practically identical to CVTPD2DQ.  It would have
      been nicer to merge them together, but the insn[] offsets differ
      by one. */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp argV  = newTemp(Ity_V128);
      IRTemp rmode = newTemp(Ity_I32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpd2ps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      assign( rmode, get_sse_roundingmode() );
      t0 = newTemp(Ity_F64);
      t1 = newTemp(Ity_F64);
      assign( t0, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128to64, mkexpr(argV))) );
      assign( t1, unop(Iop_ReinterpI64asF64,
                       unop(Iop_V128HIto64, mkexpr(argV))) );

#     define CVT(_t)  binop( Iop_F64toF32,                    \
                             mkexpr(rmode),                   \
                             mkexpr(_t) )

      /* Converted F32s go in the low half of G; high half zeroed. */
      putXMMRegLane32(  gregOfRexRM(pfx,modrm), 3, mkU32(0) );
      putXMMRegLane32(  gregOfRexRM(pfx,modrm), 2, mkU32(0) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32F( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }

   /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
      xmm(G) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x2A) {
      IRTemp arg64 = newTemp(Ity_I64);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Only switch to MMX mode if the source is a MMX register.
            This is inconsistent with all other instructions which
            convert between XMM and (M64 or MMX), which always switch
            to MMX mode even if 64-bit operand is M64 and not MMX.  At
            least, that's what the Intel docs seem to me to say.
            Fixes #210264. */
         do_MMX_preamble();
         assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 2+1;
         DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtpi2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* I32 -> F64 is exact, hence no rounding mode is supplied. */
      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 0,
         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
      );

      putXMMRegLane64F(
         gregOfRexRM(pfx,modrm), 1,
         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
      );

      goto decode_success;
   }
10906
   /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
      xmm(G), rounding towards zero */
   /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
      xmm(G), as per the prevailing rounding mode */
   if ( ( (have66noF2noF3(pfx) && sz == 2)
          || (haveF3no66noF2(pfx) && sz == 4)
        )
        && insn[0] == 0x0F && insn[1] == 0x5B) {
      IRTemp argV   = newTemp(Ity_V128);
      IRTemp rmode  = newTemp(Ity_I32);
      /* sz==4 means the F3 prefix was seen, i.e. the truncating
         (CVTT) variant. */
      Bool   r2zero = toBool(sz == 4);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( argV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("cvtps2dq %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      breakup128to32s( argV, &t3, &t2, &t1, &t0 );

      /* This is less than ideal.  If it turns out to be a performance
         bottleneck it can be improved. */
#     define CVT(_t)                             \
         binop( Iop_F64toI32S,                   \
                mkexpr(rmode),                   \
                unop( Iop_F32toF64,              \
                      unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

      putXMMRegLane32( gregOfRexRM(pfx,modrm), 3, CVT(t3) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 2, CVT(t2) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 1, CVT(t1) );
      putXMMRegLane32( gregOfRexRM(pfx,modrm), 0, CVT(t0) );

#     undef CVT

      goto decode_success;
   }

   /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
      F64 in xmm(G). */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp f32lo = newTemp(Ity_F32);
      IRTemp f32hi = newTemp(Ity_F32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         assign( f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0) );
         assign( f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1) );
         delta += 2+1;
         DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
	 assign( f32hi, loadLE(Ity_F32,
                               binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
         delta += 2+alen;
         DIP("cvtps2pd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)) );
      }

      /* F32 -> F64 is exact, hence no rounding mode is supplied. */
      putXMMRegLane64F( gregOfRexRM(pfx,modrm), 1,
                        unop(Iop_F32toF64, mkexpr(f32hi)) );
      putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                        unop(Iop_F32toF64, mkexpr(f32lo)) );

      goto decode_success;
   }
10990
   /* F2 0F 2D = CVTSD2SI
      when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
                    according to prevailing SSE rounding mode
      when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
                    according to prevailing SSE rounding mode
   */
   /* F2 0F 2C = CVTTSD2SI
      when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
                    truncating towards zero
      when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
                    truncating towards zero
   */
   if (haveF2no66noF3(pfx)
       && insn[0] == 0x0F
       && (insn[1] == 0x2D || insn[1] == 0x2C)) {
      IRTemp rmode  = newTemp(Ity_I32);
      IRTemp f64lo  = newTemp(Ity_F64);
      Bool   r2zero = toBool(insn[1] == 0x2C);
      vassert(sz == 4 || sz == 8);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
                                   dis_buf,
                                   nameIReg(sz, gregOfRexRM(pfx,modrm), False));
      }

      if (r2zero) {
         assign( rmode, mkU32((UInt)Irrm_ZERO) );
      } else {
         assign( rmode, get_sse_roundingmode() );
      }

      /* sz (i.e. REX.W) selects the destination width: I32 or I64. */
      if (sz == 4) {
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
      } else {
         putIReg64( gregOfRexRM(pfx,modrm),
                    binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
      }

      goto decode_success;
   }
11043
   /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
      low 1/4 xmm(G), according to prevailing SSE rounding mode */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp rmode = newTemp(Ity_I32);
      IRTemp f64lo = newTemp(Ity_F64);
      vassert(sz == 4);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;   /* 2 opcode bytes + modrm */
         assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvtsd2ss %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Narrowing F64->F32 can lose precision, so the current MXCSR
         rounding mode is observed.  Only lane 0 of G is written. */
      assign( rmode, get_sse_roundingmode() );
      putXMMRegLane32F(
         gregOfRexRM(pfx,modrm), 0,
         binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
      );

      goto decode_success;
   }
11074
   /* F2 0F 2A = CVTSI2SD
      when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
      when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
   */
   if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x2A) {
      modrm = getUChar(delta+2);

      if (sz == 4) {
         IRTemp arg32 = newTemp(Ity_I32);
         if (epartIsReg(modrm)) {
            assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;   /* 2 opcode bytes + modrm */
            DIP("cvtsi2sd %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2sd %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         /* I32->F64 is always exact, so Iop_I32StoF64 takes no
            rounding mode. */
         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                           unop(Iop_I32StoF64, mkexpr(arg32))
         );
      } else {
         /* sz == 8 */
         IRTemp arg64 = newTemp(Ity_I64);
         if (epartIsReg(modrm)) {
            assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
            delta += 2+1;
            DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
            delta += 2+alen;
            DIP("cvtsi2sdq %s,%s\n", dis_buf,
                                     nameXMMReg(gregOfRexRM(pfx,modrm)) );
         }
         /* I64->F64 can lose precision, so the MXCSR rounding mode
            is observed here. */
         putXMMRegLane64F(
            gregOfRexRM(pfx,modrm),
            0,
            binop( Iop_I64StoF64,
                   get_sse_roundingmode(),
                   mkexpr(arg64)
            )
         );

      }

      goto decode_success;
   }
11128
   /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
      low half xmm(G) */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5A) {
      IRTemp f32lo = newTemp(Ity_F32);

      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;   /* 2 opcode bytes + modrm */
         assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
         DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
         delta += 2+alen;
         DIP("cvtss2sd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Widening F32->F64 is exact; only lane 0 (as F64) of G is
         written, the upper half is left unchanged. */
      putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
                        unop( Iop_F32toF64, mkexpr(f32lo) ) );

      goto decode_success;
   }
11154
   /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5E) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "divpd", Iop_Div64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x5E) {
      vassert(sz == 4);
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "divsd", Iop_Div64F0x2 );
      goto decode_success;
   }

   /* 0F AE /5 = LFENCE -- flush pending operations to memory */
   /* 0F AE /6 = MFENCE -- flush pending operations to memory */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xAE
       && epartIsReg(insn[2])
       && (gregLO3ofRM(insn[2]) == 5 || gregLO3ofRM(insn[2]) == 6)) {
      delta += 3;
      /* Insert a memory fence.  It's sometimes important that these
         are carried through to the generated code.  Both variants
         map to the same Imbe_Fence IR event. */
      stmt( IRStmt_MBE(Imbe_Fence) );
      DIP("%sfence\n", gregLO3ofRM(insn[2])==5 ? "l" : "m");
      goto decode_success;
   }

   /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "maxpd", Iop_Max64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5F) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "maxsd", Iop_Max64F0x2 );
      goto decode_success;
   }

   /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "minpd", Iop_Min64Fx2 );
      goto decode_success;
   }

   /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x5D) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "minsd", Iop_Min64F0x2 );
      goto decode_success;
   }
11210
   /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
   /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
   /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F
       && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
      HChar* wot = insn[1]==0x28 ? "apd" :
                   insn[1]==0x10 ? "upd" : "dqa";
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( gregOfRexRM(pfx,modrm),
                    getXMMReg( eregOfRexRM(pfx,modrm) ));
         DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* Only the 'aligned' forms (movapd/movdqa) fault on a
            misaligned address; movupd does not. */
         if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
            gen_SEGV_if_not_16_aligned( addr );
         putXMMReg( gregOfRexRM(pfx,modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("mov%s %s,%s\n", wot, dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }
11239
   /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
   /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F
       && (insn[1] == 0x29 || insn[1] == 0x11)) {
      HChar* wot = insn[1]==0x29 ? "apd" : "upd";
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( eregOfRexRM(pfx,modrm),
                    getXMMReg( gregOfRexRM(pfx,modrm) ) );
         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRexRM(pfx,modrm)),
                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* movapd traps on misaligned addresses; movupd does not. */
         if (insn[1] == 0x29/*movapd*/)
            gen_SEGV_if_not_16_aligned( addr );
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRexRM(pfx,modrm)),
                              dis_buf );
         delta += 2+alen;
      }
      goto decode_success;
   }
11263
   /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4, zeroing high 3/4 of xmm. */
   /*              or from ireg64/m64 to xmm lo 1/2, zeroing high 1/2 of xmm. */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x6E) {
      vassert(sz == 2 || sz == 8);
      /* With the 66 prefix but no REX.W, operand size is 2; the
         actual transfer is nonetheless 32 bits, so normalise. */
      if (sz == 2) sz = 4;
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         if (sz == 4) {
            /* Iop_32UtoV128 zero-extends into the full 128 bits. */
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
            );
            DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            putXMMReg(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
            );
            DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         putXMMReg(
            gregOfRexRM(pfx,modrm),
            sz == 4
               ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
               :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
         );
         DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      goto decode_success;
   }
11301
   /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
   /*              or from xmm low 1/2 to ireg64 or m64. */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x7E) {
      /* 66 prefix without REX.W gives sz==2; the transfer is really
         32 bits, so normalise before the size check. */
      if (sz == 2) sz = 4;
      vassert(sz == 4 || sz == 8);
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         if (sz == 4) {
            putIReg32( eregOfRexRM(pfx,modrm),
                       getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
            DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                 nameIReg32(eregOfRexRM(pfx,modrm)));
         } else {
            putIReg64( eregOfRexRM(pfx,modrm),
                       getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
            DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                 nameIReg64(eregOfRexRM(pfx,modrm)));
         }
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         storeLE( mkexpr(addr),
                  sz == 4
                     ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
                     : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
         DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
                               nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
      }
      goto decode_success;
   }
11333
   /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x7F) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         delta += 2+1;
         putXMMReg( eregOfRexRM(pfx,modrm),
                    getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                nameXMMReg(eregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         /* The aligned form faults on misaligned addresses. */
         gen_SEGV_if_not_16_aligned( addr );
         delta += 2+alen;
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
      }
      goto decode_success;
   }
11353
   /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x6F) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         putXMMReg( gregOfRexRM(pfx,modrm),
                    getXMMReg( eregOfRexRM(pfx,modrm) ));
         DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         /* Unaligned form: no 16-byte alignment check. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         putXMMReg( gregOfRexRM(pfx,modrm),
                    loadLE(Ity_V128, mkexpr(addr)) );
         DIP("movdqu %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }
      goto decode_success;
   }
11374
   /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x7F) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* The reg-reg form is deliberately disabled (never seen in
            the wild yet); the code below it is unreachable until a
            test case shows up. */
         goto decode_failure; /* awaiting test case */
         delta += 2+1;
         putXMMReg( eregOfRexRM(pfx,modrm),
                    getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                                nameXMMReg(eregOfRexRM(pfx,modrm)));
      } else {
         /* Unaligned store: no alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
      }
      goto decode_success;
   }
11394
   /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Touching an MMX register requires the x87/MMX state to be
            set up first. */
         do_MMX_preamble();
         putMMXReg( gregLO3ofRM(modrm),
                    getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
         DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameMMXReg(gregLO3ofRM(modrm)));
         delta += 2+1;
         goto decode_success;
      } else {
         /* apparently no mem case for this insn */
         goto decode_failure;
      }
   }
11412
   /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
   /* These seems identical to MOVHPS.  This instruction encoding is
      completely crazy. */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x16) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; apparently reg-reg is not possible */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         /* Load 64 bits into the upper lane; lower lane unchanged. */
         putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movhpd %s,%s\n", dis_buf,
                               nameXMMReg( gregOfRexRM(pfx,modrm) ));
         goto decode_success;
      }
   }

   /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
   /* Again, this seems identical to MOVHPS. */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x17) {
      if (!epartIsReg(insn[2])) {
         delta += 2;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,insn[2]),
                                   1/*upper lane*/ ) );
         DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,insn[2]) ),
                               dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
11447
   /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
   /* Identical to MOVLPS ? */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x12) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through; apparently reg-reg is not possible */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         /* Load 64 bits into the lower lane; upper lane unchanged. */
         putXMMRegLane64( gregOfRexRM(pfx,modrm),
                          0/*lower lane*/,
                          loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movlpd %s, %s\n",
             dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
         goto decode_success;
      }
   }

   /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
   /* Identical to MOVLPS ? */
   if (have66noF2noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x13) {
      modrm = getUChar(delta+2);
      if (!epartIsReg(modrm)) {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         delta += 2+alen;
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,modrm),
                                   0/*lower lane*/ ) );
         DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
                                dis_buf);
         goto decode_success;
      }
      /* else fall through */
   }
11482
   /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
      2 lowest bits of ireg(G) */
   if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x50) {
      /* sz == 8 is a kludge to handle insns with REX.W redundantly
         set to 1, which has been known to happen:
         66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
         20071106: see further comments on MOVMSKPS implementation above.
      */
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         Int src;
         t0 = newTemp(Ity_I32);
         t1 = newTemp(Ity_I32);
         delta += 2+1;
         src = eregOfRexRM(pfx,modrm);
         /* Sign bit of the low F64 lives in 32-bit lane 1; shift it
            down to bit 0. */
         assign( t0, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
                            mkU32(1) ));
         /* Sign bit of the high F64 lives in 32-bit lane 3; shift it
            down to bit 1 (hence Shr 30, mask 2). */
         assign( t1, binop( Iop_And32,
                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
                            mkU32(2) ));
         putIReg32( gregOfRexRM(pfx,modrm),
                    binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
                  );
         DIP("movmskpd %s,%s\n", nameXMMReg(src),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         goto decode_success;
      }
      /* else fall through */
      goto decode_failure;
   }
11515
   /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF7) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         IRTemp regD    = newTemp(Ity_V128);
         IRTemp mask    = newTemp(Ity_V128);
         IRTemp olddata = newTemp(Ity_V128);
         IRTemp newdata = newTemp(Ity_V128);
                addr    = newTemp(Ity_I64);

         /* The implicit destination is [RDI], subject to any address
            size/segment overrides. */
         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
         assign( regD, getXMMReg( gregOfRexRM(pfx,modrm) ));

         /* Build a per-byte mask by replicating the top bit of each
            byte of E across that byte.
            Unfortunately can't do the obvious thing with SarN8x16
            here since that can't be re-emitted as SSE2 code - no such
            insn. */
         assign(
            mask,
            binop(Iop_64HLtoV128,
                  binop(Iop_SarN8x8,
                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
                        mkU8(7) ),
                  binop(Iop_SarN8x8,
                        getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
                        mkU8(7) ) ));
         /* Modelled as a read-modify-write of all 16 bytes: selected
            bytes come from G, unselected bytes keep their old memory
            value.  NOTE(review): this always performs a full 16-byte
            load and store, even when the mask is all zero. */
         assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
         assign( newdata,
                 binop(Iop_OrV128,
                       binop(Iop_AndV128,
                             mkexpr(regD),
                             mkexpr(mask) ),
                       binop(Iop_AndV128,
                             mkexpr(olddata),
                             unop(Iop_NotV128, mkexpr(mask)))) );
         storeLE( mkexpr(addr), mkexpr(newdata) );

         delta += 2+1;
         DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRexRM(pfx,modrm) ),
                                   nameXMMReg( gregOfRexRM(pfx,modrm) ) );
         goto decode_success;
      }
      /* else fall through */
   }
11560
11561   /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
11562   if (have66noF2noF3(pfx) && sz == 2
11563       && insn[0] == 0x0F && insn[1] == 0xE7) {
11564      modrm = getUChar(delta+2);
11565      if (!epartIsReg(modrm)) {
11566         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
11567         gen_SEGV_if_not_16_aligned( addr );
11568         storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
11569         DIP("movntdq %s,%s\n", dis_buf,
11570                                nameXMMReg(gregOfRexRM(pfx,modrm)));
11571         delta += 2+alen;
11572         goto decode_success;
11573      }
11574      /* else fall through */
11575      goto decode_failure;
11576   }
11577
11578   /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
11579   if (haveNo66noF2noF3(pfx) &&
11580       insn[0] == 0x0F && insn[1] == 0xC3) {
11581      vassert(sz == 4 || sz == 8);
11582      modrm = getUChar(delta+2);
11583      if (!epartIsReg(modrm)) {
11584         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
11585         storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
11586         DIP("movnti %s,%s\n", dis_buf,
11587                               nameIRegG(sz, pfx, modrm));
11588         delta += 2+alen;
11589         goto decode_success;
11590      }
11591      /* else fall through */
11592   }
11593
   /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
      or lo half xmm).  */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* fall through, awaiting test case */
         /* dst: lo half copied, hi half zeroed */
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
         DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
         delta += 2+alen;
         goto decode_success;
      }
   }
11612
   /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
      hi half). */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD6) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* Reading an MMX register requires the x87/MMX state to be
            set up first. */
         do_MMX_preamble();
         /* Iop_64UtoV128 zeroes the upper 64 bits of G. */
         putXMMReg( gregOfRexRM(pfx,modrm),
                    unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
         DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
         goto decode_success;
      } else {
         /* apparently no mem case for this insn */
         goto decode_failure;
      }
   }
11631
11632   /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
11633      G (lo half xmm).  Upper half of G is zeroed out. */
11634   /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
11635      G (lo half xmm).  If E is mem, upper half of G is zeroed out.
11636      If E is reg, upper half of G is unchanged. */
11637   if ( (haveF2no66noF3(pfx)
11638         && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
11639         && insn[0] == 0x0F && insn[1] == 0x10)
11640        ||
11641        (haveF3no66noF2(pfx)
11642         && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
11643         && insn[0] == 0x0F && insn[1] == 0x7E)
11644      ) {
11645      modrm = getUChar(delta+2);
11646      if (epartIsReg(modrm)) {
11647         putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
11648                          getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
11649         if (insn[1] == 0x7E/*MOVQ*/) {
11650            /* zero bits 127:64 */
11651            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
11652         }
11653         DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11654                              nameXMMReg(gregOfRexRM(pfx,modrm)));
11655         delta += 2+1;
11656      } else {
11657         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
11658         putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
11659         putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
11660                          loadLE(Ity_I64, mkexpr(addr)) );
11661         DIP("movsd %s,%s\n", dis_buf,
11662                              nameXMMReg(gregOfRexRM(pfx,modrm)));
11663         delta += 2+alen;
11664      }
11665      goto decode_success;
11666   }
11667
   /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
      or lo half xmm). */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x11) {
      modrm = getUChar(delta+2);
      if (epartIsReg(modrm)) {
         /* reg-reg: only the low lane of E is written; its upper
            half is unchanged. */
         putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
                          getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
         DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              nameXMMReg(eregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         storeLE( mkexpr(addr),
                  getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
         DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
                              dis_buf);
         delta += 2+alen;
      }
      goto decode_success;
   }
11690
   /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "mulpd", Iop_Mul64Fx2 );
      goto decode_success;
   }

   /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x59) {
      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "mulsd", Iop_Mul64F0x2 );
      goto decode_success;
   }

   /* 66 0F 56 = ORPD -- G = G or E (the comment previously said
      "and", but the operation is Iop_OrV128) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x56) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "orpd", Iop_OrV128 );
      goto decode_success;
   }
11713
   /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xC6) {
      Int    select;
      IRTemp sV = newTemp(Ity_V128);
      IRTemp dV = newTemp(Ity_V128);
      IRTemp s1 = newTemp(Ity_I64);
      IRTemp s0 = newTemp(Ity_I64);
      IRTemp d1 = newTemp(Ity_I64);
      IRTemp d0 = newTemp(Ity_I64);

      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         select = (Int)insn[3];   /* imm8 follows the modrm byte */
         delta += 2+2;            /* 2 opcode + modrm + imm8 */
         DIP("shufpd $%d,%s,%s\n", select,
                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Last arg 1 tells disAMode an imm8 follows the amode. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 1 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         select = (Int)insn[2+alen];   /* imm8 follows the amode */
         delta += 3+alen;
         DIP("shufpd $%d,%s,%s\n", select,
                                   dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Split both operands into their two 64-bit halves. */
      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );

#     define SELD(n) mkexpr((n)==0 ? d0 : d1)
#     define SELS(n) mkexpr((n)==0 ? s0 : s1)

      /* imm8 bit 0 selects the low result lane (from dst), bit 1 the
         high result lane (from src). */
      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
      );

#     undef SELD
#     undef SELS

      goto decode_success;
   }
11763
11764   /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
11765   if (have66noF2noF3(pfx) && sz == 2
11766       && insn[0] == 0x0F && insn[1] == 0x51) {
11767      delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta+2,
11768                                        "sqrtpd", Iop_Sqrt64Fx2 );
11769      goto decode_success;
11770   }
11771
11772   /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
11773   if (haveF2no66noF3(pfx) && insn[0] == 0x0F && insn[1] == 0x51) {
11774      vassert(sz == 4);
11775      delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta+2,
11776                                         "sqrtsd", Iop_Sqrt64F0x2 );
11777      goto decode_success;
11778   }
11779
11780   /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
11781   if (have66noF2noF3(pfx) && sz == 2
11782       && insn[0] == 0x0F && insn[1] == 0x5C) {
11783      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "subpd", Iop_Sub64Fx2 );
11784      goto decode_success;
11785   }
11786
11787   /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
11788   if (haveF2no66noF3(pfx)
11789       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
11790       && insn[0] == 0x0F && insn[1] == 0x5C) {
11791      delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta+2, "subsd", Iop_Sub64F0x2 );
11792      goto decode_success;
11793   }
11794
11795   /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
11796   /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
11797   /* These just appear to be special cases of SHUFPS */
11798   if (have66noF2noF3(pfx)
11799       && sz == 2 /* could be 8 if rex also present */
11800       && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
11801      IRTemp s1 = newTemp(Ity_I64);
11802      IRTemp s0 = newTemp(Ity_I64);
11803      IRTemp d1 = newTemp(Ity_I64);
11804      IRTemp d0 = newTemp(Ity_I64);
11805      IRTemp sV = newTemp(Ity_V128);
11806      IRTemp dV = newTemp(Ity_V128);
11807      Bool   hi = toBool(insn[1] == 0x15);
11808
11809      modrm = insn[2];
11810      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
11811
11812      if (epartIsReg(modrm)) {
11813         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
11814         delta += 2+1;
11815         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
11816                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
11817                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11818      } else {
11819         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
11820         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11821         delta += 2+alen;
11822         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
11823                                  dis_buf,
11824                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11825      }
11826
11827      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
11828      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
11829      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
11830      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
11831
11832      if (hi) {
11833         putXMMReg( gregOfRexRM(pfx,modrm),
11834                    binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
11835      } else {
11836         putXMMReg( gregOfRexRM(pfx,modrm),
11837                    binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
11838      }
11839
11840      goto decode_success;
11841   }
11842
   /* 66 0F 57 = XORPD -- G = G xor E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x57) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "xorpd", Iop_XorV128 );
      goto decode_success;
   }

   /* ---- SSE2 integer pack/add/logical/avg/compare cases ----
      All routed through dis_SSEint_E_to_G.  The trailing Bool appears
      to control operand ordering of the IR binop -- it is True only
      for the non-commutative PACK* narrowing ops below; confirm at
      the definition of dis_SSEint_E_to_G. */

   /* 66 0F 6B = PACKSSDW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6B) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packssdw",
                                 Iop_QNarrowBin32Sto16Sx8, True );
      goto decode_success;
   }

   /* 66 0F 63 = PACKSSWB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x63) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packsswb",
                                 Iop_QNarrowBin16Sto8Sx16, True );
      goto decode_success;
   }

   /* 66 0F 67 = PACKUSWB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x67) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "packuswb",
                                 Iop_QNarrowBin16Sto8Ux16, True );
      goto decode_success;
   }

   /* 66 0F FC = PADDB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddb", Iop_Add8x16, False );
      goto decode_success;
   }

   /* 66 0F FE = PADDD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFE) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddd", Iop_Add32x4, False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F D4 = PADDQ -- add 64x1 */
   /* No 66 prefix: operates on an MMX register, hence the MMX
      preamble (which prepares the x87/MMX state). */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD4) {
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                vbi, pfx, delta+2, insn[1], "paddq", False );
      goto decode_success;
   }

   /* 66 0F D4 = PADDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD4) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddq", Iop_Add64x2, False );
      goto decode_success;
   }

   /* 66 0F FD = PADDW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFD) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddw", Iop_Add16x8, False );
      goto decode_success;
   }

   /* 66 0F EC = PADDSB -- saturating signed add */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddsb", Iop_QAdd8Sx16, False );
      goto decode_success;
   }

   /* 66 0F ED = PADDSW -- saturating signed add */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xED) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddsw", Iop_QAdd16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DC = PADDUSB -- saturating unsigned add */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDC) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddusb", Iop_QAdd8Ux16, False );
      goto decode_success;
   }

   /* 66 0F DD = PADDUSW -- saturating unsigned add */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDD) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "paddusw", Iop_QAdd16Ux8, False );
      goto decode_success;
   }

   /* 66 0F DB = PAND */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDB) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pand", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F DF = PANDN -- G = (not G) and E, via the _invG variant */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDF) {
      delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta+2, "pandn", Iop_AndV128 );
      goto decode_success;
   }

   /* 66 0F E0 = PAVGB -- unsigned rounding average */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE0) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pavgb", Iop_Avg8Ux16, False );
      goto decode_success;
   }

   /* 66 0F E3 = PAVGW -- unsigned rounding average */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE3) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pavgw", Iop_Avg16Ux8, False );
      goto decode_success;
   }

   /* 66 0F 74 = PCMPEQB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x74) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpeqb", Iop_CmpEQ8x16, False );
      goto decode_success;
   }

   /* 66 0F 76 = PCMPEQD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x76) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpeqd", Iop_CmpEQ32x4, False );
      goto decode_success;
   }

   /* 66 0F 75 = PCMPEQW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x75) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpeqw", Iop_CmpEQ16x8, False );
      goto decode_success;
   }

   /* 66 0F 64 = PCMPGTB -- signed compare */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x64) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpgtb", Iop_CmpGT8Sx16, False );
      goto decode_success;
   }

   /* 66 0F 66 = PCMPGTD -- signed compare */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x66) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpgtd", Iop_CmpGT32Sx4, False );
      goto decode_success;
   }

   /* 66 0F 65 = PCMPGTW -- signed compare */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x65) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pcmpgtw", Iop_CmpGT16Sx8, False );
      goto decode_success;
   }
12028
   /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
      zero-extend of it in ireg(G). */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xC5) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         t5 = newTemp(Ity_V128);
         t4 = newTemp(Ity_I16);
         assign(t5, getXMMReg(eregOfRexRM(pfx,modrm)));
         /* Break the 128-bit source into four 32-bit chunks, then pick
            the 16-bit half named by the low 3 bits of the imm8. */
         breakup128to32s( t5, &t3, &t2, &t1, &t0 );
         switch (insn[3] & 7) {
            case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
            case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
            case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
            case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
            case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
            case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
            case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
            case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
            default: vassert(0);
         }
         /* Zero-extend the extracted lane into the 32-bit ireg. */
         putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t4)));
         DIP("pextrw $%d,%s,%s\n",
             (Int)insn[3], nameXMMReg(eregOfRexRM(pfx,modrm)),
                           nameIReg32(gregOfRexRM(pfx,modrm)));
         delta += 4;
         goto decode_success;
      }
      /* else fall through */
      /* note, if memory case is ever filled in, there is 1 byte after
         amode */
   }
12062
12063   /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
12064      put it into the specified lane of xmm(G). */
12065   if (have66noF2noF3(pfx)
12066       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
12067       && insn[0] == 0x0F && insn[1] == 0xC4) {
12068      Int lane;
12069      t4 = newTemp(Ity_I16);
12070      modrm = insn[2];
12071
12072      if (epartIsReg(modrm)) {
12073         assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
12074         delta += 3+1;
12075         lane = insn[3+1-1];
12076         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
12077                                   nameIReg16(eregOfRexRM(pfx,modrm)),
12078                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
12079      } else {
12080         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
12081                           1/*byte after the amode*/ );
12082         delta += 3+alen;
12083         lane = insn[3+alen-1];
12084         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
12085         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
12086                                   dis_buf,
12087                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
12088     }
12089
12090      putXMMRegLane16( gregOfRexRM(pfx,modrm), lane & 7, mkexpr(t4) );
12091      goto decode_success;
12092   }
12093
   /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
      E(xmm or mem) to G(xmm) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF5) {
      IRTemp s1V  = newTemp(Ity_V128);
      IRTemp s2V  = newTemp(Ity_V128);
      IRTemp dV   = newTemp(Ity_V128);
      IRTemp s1Hi = newTemp(Ity_I64);
      IRTemp s1Lo = newTemp(Ity_I64);
      IRTemp s2Hi = newTemp(Ity_I64);
      IRTemp s2Lo = newTemp(Ity_I64);
      IRTemp dHi  = newTemp(Ity_I64);
      IRTemp dLo  = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( s1V, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmaddwd %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      /* No single IROp for this; split each operand into 64-bit
         halves and run the MMX pmaddwd helper on each half. */
      assign( s2V, getXMMReg(gregOfRexRM(pfx,modrm)) );
      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
      assign( dHi, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_calculate_mmx_pmaddwd",
                      &amd64g_calculate_mmx_pmaddwd,
                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
                   ));
      assign( dLo, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_calculate_mmx_pmaddwd",
                      &amd64g_calculate_mmx_pmaddwd,
                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
                   ));
      /* Reassemble the two 64-bit results into the destination. */
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
      goto decode_success;
   }
12141
   /* 66 0F EE = PMAXSW -- 16x8 signed max */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEE) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmaxsw", Iop_Max16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDE) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmaxub", Iop_Max8Ux16, False );
      goto decode_success;
   }

   /* 66 0F EA = PMINSW -- 16x8 signed min */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEA) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pminsw", Iop_Min16Sx8, False );
      goto decode_success;
   }

   /* 66 0F DA = PMINUB -- 8x16 unsigned min */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xDA) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pminub", Iop_Min8Ux16, False );
      goto decode_success;
   }
12173
   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
      xmm(E), turn them into a byte, and put zero-extend of it in
      ireg(G).  Doing this directly is just too cumbersome; give up
      therefore and call a helper. */
   /* ULong amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0xD7) {
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         t0 = newTemp(Ity_I64);
         t1 = newTemp(Ity_I64);
         /* t0 = low 64 bits of E, t1 = high 64 bits. */
         assign(t0, getXMMRegLane64(eregOfRexRM(pfx,modrm), 0));
         assign(t1, getXMMRegLane64(eregOfRexRM(pfx,modrm), 1));
         t5 = newTemp(Ity_I64);
         assign(t5, mkIRExprCCall(
                       Ity_I64, 0/*regparms*/,
                       "amd64g_calculate_sse_pmovmskb",
                       &amd64g_calculate_sse_pmovmskb,
                       mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
         putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t5)));
         DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameIReg32(gregOfRexRM(pfx,modrm)));
         delta += 3;
         goto decode_success;
      }
      /* else fall through */
   }
12202
   /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE4) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmulhuw", Iop_MulHi16Ux8, False );
      goto decode_success;
   }

   /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE5) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmulhw", Iop_MulHi16Sx8, False );
      goto decode_success;
   }

   /* 66 0F D5 = PMULLW -- 16x8 multiply, low halves of results */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD5) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "pmullw", Iop_Mul16x8, False );
      goto decode_success;
   }
12226
   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
      0 to form 64-bit result */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xF4) {
      IRTemp sV = newTemp(Ity_I64);
      IRTemp dV = newTemp(Ity_I64);
      t1 = newTemp(Ity_I32);
      t0 = newTemp(Ity_I32);
      modrm = insn[2];

      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 2+1;
         DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmuludq %s,%s\n", dis_buf,
                                nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Take lane 0 (low 32 bits) of each operand and do an
         unsigned widening 32x32->64 multiply. */
      assign( t0, unop(Iop_64to32, mkexpr(dV)) );
      assign( t1, unop(Iop_64to32, mkexpr(sV)) );
      putMMXReg( gregLO3ofRM(modrm),
                 binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
      goto decode_success;
   }

   /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
      half */
   /* This is a really poor translation -- could be improved if
      performance critical */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF4) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      t1 = newTemp(Ity_I64);
      t0 = newTemp(Ity_I64);
      modrm = insn[2];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("pmuludq %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      /* lane0 x lane0 -> result low half; lane2 x lane2 -> high half
         (lanes 1 and 3 are ignored, per the instruction). */
      assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
      putXMMRegLane64( gregOfRexRM(pfx,modrm), 0, mkexpr(t0) );
      assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
      putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkexpr(t1) );
      goto decode_success;
   }
12300
   /* 66 0F EB = POR */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEB) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "por", Iop_OrV128 );
      goto decode_success;
   }

   /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
      from E(xmm or mem) to G(xmm) */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF6) {
      IRTemp s1V  = newTemp(Ity_V128);
      IRTemp s2V  = newTemp(Ity_V128);
      IRTemp dV   = newTemp(Ity_V128);
      IRTemp s1Hi = newTemp(Ity_I64);
      IRTemp s1Lo = newTemp(Ity_I64);
      IRTemp s2Hi = newTemp(Ity_I64);
      IRTemp s2Lo = newTemp(Ity_I64);
      IRTemp dHi  = newTemp(Ity_I64);
      IRTemp dLo  = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( s1V, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 2+1;
         DIP("psadbw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 2+alen;
         DIP("psadbw %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      /* Same scheme as PMADDWD above: split into 64-bit halves and
         run the MMX psadbw helper on each half. */
      assign( s2V, getXMMReg(gregOfRexRM(pfx,modrm)) );
      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
      assign( dHi, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_calculate_mmx_psadbw",
                      &amd64g_calculate_mmx_psadbw,
                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
                   ));
      assign( dLo, mkIRExprCCall(
                      Ity_I64, 0/*regparms*/,
                      "amd64g_calculate_mmx_psadbw",
                      &amd64g_calculate_mmx_psadbw,
                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
                   ));
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
      goto decode_success;
   }
12355
12356   /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
12357   if (have66noF2noF3(pfx) && sz == 2
12358       && insn[0] == 0x0F && insn[1] == 0x70) {
12359      Int order;
12360      IRTemp sV, dV, s3, s2, s1, s0;
12361      s3 = s2 = s1 = s0 = IRTemp_INVALID;
12362      sV = newTemp(Ity_V128);
12363      dV = newTemp(Ity_V128);
12364      modrm = insn[2];
12365      if (epartIsReg(modrm)) {
12366         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
12367         order = (Int)insn[3];
12368         delta += 3+1;
12369         DIP("pshufd $%d,%s,%s\n", order,
12370                                   nameXMMReg(eregOfRexRM(pfx,modrm)),
12371                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
12372      } else {
12373         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
12374                           1/*byte after the amode*/ );
12375         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12376	 order = (Int)insn[2+alen];
12377         delta += 2+alen+1;
12378         DIP("pshufd $%d,%s,%s\n", order,
12379                                   dis_buf,
12380                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
12381      }
12382      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
12383
12384#     define SEL(n) \
12385                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
12386      assign(dV,
12387	     mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
12388                           SEL((order>>2)&3), SEL((order>>0)&3) )
12389      );
12390      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
12391#     undef SEL
12392      goto decode_success;
12393   }
12394
12395   /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
12396      mem) to G(xmm), and copy lower half */
12397   if (haveF3no66noF2(pfx) && sz == 4
12398       && insn[0] == 0x0F && insn[1] == 0x70) {
12399      Int order;
12400      IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
12401      s3 = s2 = s1 = s0 = IRTemp_INVALID;
12402      sV   = newTemp(Ity_V128);
12403      dV   = newTemp(Ity_V128);
12404      sVhi = newTemp(Ity_I64);
12405      dVhi = newTemp(Ity_I64);
12406      modrm = insn[2];
12407      if (epartIsReg(modrm)) {
12408         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
12409         order = (Int)insn[3];
12410         delta += 3+1;
12411         DIP("pshufhw $%d,%s,%s\n", order,
12412                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
12413                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12414      } else {
12415         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
12416                           1/*byte after the amode*/ );
12417         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12418	 order = (Int)insn[2+alen];
12419         delta += 2+alen+1;
12420         DIP("pshufhw $%d,%s,%s\n", order,
12421                                    dis_buf,
12422                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12423      }
12424      assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
12425      breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
12426
12427#     define SEL(n) \
12428                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
12429      assign(dVhi,
12430	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
12431                          SEL((order>>2)&3), SEL((order>>0)&3) )
12432      );
12433      assign(dV, binop( Iop_64HLtoV128,
12434                        mkexpr(dVhi),
12435                        unop(Iop_V128to64, mkexpr(sV))) );
12436      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
12437#     undef SEL
12438      goto decode_success;
12439   }
12440
   /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
      mem) to G(xmm), and copy upper half */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x70) {
      Int order;
      IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      sV   = newTemp(Ity_V128);
      dV   = newTemp(Ity_V128);
      sVlo = newTemp(Ity_I64);
      dVlo = newTemp(Ity_I64);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         /* imm8 order byte immediately follows the modrm byte. */
         order = (Int)insn[3];
         delta += 3+1;
         DIP("pshuflw $%d,%s,%s\n", order,
                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf,
                           1/*byte after the amode*/ );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         /* imm8 order byte follows the amode. */
         order = (Int)insn[2+alen];
         delta += 2+alen+1;
         DIP("pshuflw $%d,%s,%s\n", order,
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }
      /* Split the low 64 bits of the source into four I16 temps. */
      assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
      breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );

      /* SEL(n) picks the n-th 16-bit lane of the low half. */
#     define SEL(n) \
                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
      /* Each 2-bit field of the immediate selects one source lane
         for the corresponding destination lane. */
      assign(dVlo,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
                          SEL((order>>2)&3), SEL((order>>0)&3) )
      );
      /* Upper 64 bits of the source are copied through unchanged. */
      assign(dV, binop( Iop_64HLtoV128,
                        unop(Iop_V128HIto64, mkexpr(sV)),
                        mkexpr(dVlo) ) );
      putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(dV));
#     undef SEL
      goto decode_success;
   }
12486
   /* 66 0F 72 /6 ib = PSLLD by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 6) {
      /* Shift-by-imm group: the reg field of the modrm (/6) selects
         which shift this is; only the reg-reg form exists. */
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "pslld", Iop_ShlN32x4 );
      goto decode_success;
   }

   /* 66 0F F2 = PSLLD by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF2) {
      /* Shift G left by the count taken from E (xmm or mem). */
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "pslld", Iop_ShlN32x4 );
      goto decode_success;
   }
12502
   /* 66 0F 73 /7 ib = PSLLDQ by immediate */
   /* note, if mem case ever filled in, 1 byte after amode */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 7) {
      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
      Int    imm = (Int)insn[3];
      Int    reg = eregOfRexRM(pfx,insn[2]);
      DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
      vassert(imm >= 0 && imm <= 255);
      delta += 4;

      /* NOTE(review): these temps are also allocated on the imm >= 16
         fast path below, where they go unused -- harmless but wasteful. */
      sV    = newTemp(Ity_V128);
      dV    = newTemp(Ity_V128);
      hi64  = newTemp(Ity_I64);
      lo64  = newTemp(Ity_I64);
      hi64r = newTemp(Ity_I64);
      lo64r = newTemp(Ity_I64);

      /* Shifting by 16 or more bytes clears the register entirely. */
      if (imm >= 16) {
         putXMMReg(reg, mkV128(0x0000));
         goto decode_success;
      }

      assign( sV, getXMMReg(reg) );
      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );

      /* Compute the two result halves.  Whole-register byte shift is
         done as 64-bit shifts plus a carry of bits from the low half
         into the high half. */
      if (imm == 0) {
         /* No shift: pass both halves through. */
         assign( lo64r, mkexpr(lo64) );
         assign( hi64r, mkexpr(hi64) );
      }
      else
      if (imm == 8) {
         /* Exactly one half: low half moves up, low becomes zero. */
         assign( lo64r, mkU64(0) );
         assign( hi64r, mkexpr(lo64) );
      }
      else
      if (imm > 8) {
         /* More than a half: only (shifted) low half reaches the top. */
         assign( lo64r, mkU64(0) );
         assign( hi64r, binop( Iop_Shl64,
                               mkexpr(lo64),
                               mkU8( 8*(imm-8) ) ));
      } else {
         /* 1..7 bytes: shift both halves, OR the bytes that cross the
            64-bit boundary into the high half. */
         assign( lo64r, binop( Iop_Shl64,
                               mkexpr(lo64),
                               mkU8(8 * imm) ));
         assign( hi64r,
                 binop( Iop_Or64,
                        binop(Iop_Shl64, mkexpr(hi64),
                                         mkU8(8 * imm)),
                        binop(Iop_Shr64, mkexpr(lo64),
                                         mkU8(8 * (8 - imm)) )
                      )
               );
      }
      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
      putXMMReg(reg, mkexpr(dV));
      goto decode_success;
   }
12564
   /* 66 0F 73 /6 ib = PSLLQ by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 6) {
      /* Shift-by-imm group /6 on 0F 73: 64x2 lanes. */
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllq", Iop_ShlN64x2 );
      goto decode_success;
   }

   /* 66 0F F3 = PSLLQ by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF3) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psllq", Iop_ShlN64x2 );
      goto decode_success;
   }

   /* 66 0F 71 /6 ib = PSLLW by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 6) {
      /* Shift-by-imm group /6 on 0F 71: 16x8 lanes. */
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psllw", Iop_ShlN16x8 );
      goto decode_success;
   }

   /* 66 0F F1 = PSLLW by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF1) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psllw", Iop_ShlN16x8 );
      goto decode_success;
   }
12596
   /* 66 0F 72 /4 ib = PSRAD by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 4) {
      /* Arithmetic (sign-propagating) right shift, 32x4 lanes. */
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrad", Iop_SarN32x4 );
      goto decode_success;
   }

   /* 66 0F E2 = PSRAD by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE2) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrad", Iop_SarN32x4 );
      goto decode_success;
   }

   /* 66 0F 71 /4 ib = PSRAW by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 4) {
      /* Arithmetic right shift, 16x8 lanes. */
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psraw", Iop_SarN16x8 );
      goto decode_success;
   }

   /* 66 0F E1 = PSRAW by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE1) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psraw", Iop_SarN16x8 );
      goto decode_success;
   }
12628
   /* 66 0F 72 /2 ib = PSRLD by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x72
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 2) {
      /* Logical right shift, 32x4 lanes. */
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrld", Iop_ShrN32x4 );
      goto decode_success;
   }

   /* 66 0F D2 = PSRLD by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD2) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrld", Iop_ShrN32x4 );
      goto decode_success;
   }
12644
   /* 66 0F 73 /3 ib = PSRLDQ by immediate */
   /* note, if mem case ever filled in, 1 byte after amode */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 3) {
      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
      Int    imm = (Int)insn[3];
      Int    reg = eregOfRexRM(pfx,insn[2]);
      DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
      vassert(imm >= 0 && imm <= 255);
      delta += 4;

      /* NOTE(review): these temps are also allocated on the imm >= 16
         fast path below, where they go unused -- harmless but wasteful. */
      sV    = newTemp(Ity_V128);
      dV    = newTemp(Ity_V128);
      hi64  = newTemp(Ity_I64);
      lo64  = newTemp(Ity_I64);
      hi64r = newTemp(Ity_I64);
      lo64r = newTemp(Ity_I64);

      /* Shifting by 16 or more bytes clears the register entirely. */
      if (imm >= 16) {
         putXMMReg(reg, mkV128(0x0000));
         goto decode_success;
      }

      assign( sV, getXMMReg(reg) );
      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );

      /* Mirror image of PSLLDQ: whole-register byte shift right, done
         as 64-bit shifts with bits carried from hi into lo. */
      if (imm == 0) {
         /* No shift: pass both halves through. */
         assign( lo64r, mkexpr(lo64) );
         assign( hi64r, mkexpr(hi64) );
      }
      else
      if (imm == 8) {
         /* Exactly one half: high half moves down, high becomes zero. */
         assign( hi64r, mkU64(0) );
         assign( lo64r, mkexpr(hi64) );
      }
      else
      if (imm > 8) {
         /* More than a half: only (shifted) high half reaches the bottom. */
         assign( hi64r, mkU64(0) );
         assign( lo64r, binop( Iop_Shr64,
                               mkexpr(hi64),
                               mkU8( 8*(imm-8) ) ));
      } else {
         /* 1..7 bytes: shift both halves, OR the bytes that cross the
            64-bit boundary into the low half. */
         assign( hi64r, binop( Iop_Shr64,
                               mkexpr(hi64),
                               mkU8(8 * imm) ));
         assign( lo64r,
                 binop( Iop_Or64,
                        binop(Iop_Shr64, mkexpr(lo64),
                                         mkU8(8 * imm)),
                        binop(Iop_Shl64, mkexpr(hi64),
                                         mkU8(8 * (8 - imm)) )
                      )
               );
      }

      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
      putXMMReg(reg, mkexpr(dV));
      goto decode_success;
   }
12707
   /* 66 0F 73 /2 ib = PSRLQ by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x73
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 2) {
      /* Logical right shift, 64x2 lanes. */
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlq", Iop_ShrN64x2 );
      goto decode_success;
   }

   /* 66 0F D3 = PSRLQ by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD3) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrlq", Iop_ShrN64x2 );
      goto decode_success;
   }

   /* 66 0F 71 /2 ib = PSRLW by immediate */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x71
       && epartIsReg(insn[2])
       && gregLO3ofRM(insn[2]) == 2) {
      /* Logical right shift, 16x8 lanes. */
      delta = dis_SSE_shiftE_imm( pfx, delta+2, "psrlw", Iop_ShrN16x8 );
      goto decode_success;
   }

   /* 66 0F D1 = PSRLW by E */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD1) {
      delta = dis_SSE_shiftG_byE( vbi, pfx, delta+2, "psrlw", Iop_ShrN16x8 );
      goto decode_success;
   }
12739
   /* 66 0F F8 = PSUBB */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF8) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubb", Iop_Sub8x16, False );
      goto decode_success;
   }

   /* 66 0F FA = PSUBD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFA) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubd", Iop_Sub32x4, False );
      goto decode_success;
   }

   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
   /* 0F FB = PSUBQ -- sub 64x1 */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xFB) {
      /* MMX form: switch the FPU into MMX mode first. */
      do_MMX_preamble();
      delta = dis_MMXop_regmem_to_reg (
                vbi, pfx, delta+2, insn[1], "psubq", False );
      goto decode_success;
   }

   /* 66 0F FB = PSUBQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xFB) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubq", Iop_Sub64x2, False );
      goto decode_success;
   }

   /* 66 0F F9 = PSUBW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xF9) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubw", Iop_Sub16x8, False );
      goto decode_success;
   }

   /* 66 0F E8 = PSUBSB -- signed saturating subtract */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE8) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubsb", Iop_QSub8Sx16, False );
      goto decode_success;
   }

   /* 66 0F E9 = PSUBSW -- signed saturating subtract */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xE9) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubsw", Iop_QSub16Sx8, False );
      goto decode_success;
   }

   /* 66 0F D8 = PSUBUSB -- unsigned saturating subtract */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD8) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubusb", Iop_QSub8Ux16, False );
      goto decode_success;
   }

   /* 66 0F D9 = PSUBUSW -- unsigned saturating subtract */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD9) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "psubusw", Iop_QSub16Ux8, False );
      goto decode_success;
   }
12813
   /* PUNPCK* family: interleave lanes of the two operands.  The final
      Bool (True) is an operand-order flag required because interleave
      is non-commutative -- see dis_SSEint_E_to_G (TODO confirm exact
      semantics at its definition). */

   /* 66 0F 68 = PUNPCKHBW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x68) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckhbw",
                                 Iop_InterleaveHI8x16, True );
      goto decode_success;
   }

   /* 66 0F 6A = PUNPCKHDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6A) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckhdq",
                                 Iop_InterleaveHI32x4, True );
      goto decode_success;
   }

   /* 66 0F 6D = PUNPCKHQDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6D) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckhqdq",
                                 Iop_InterleaveHI64x2, True );
      goto decode_success;
   }

   /* 66 0F 69 = PUNPCKHWD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x69) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckhwd",
                                 Iop_InterleaveHI16x8, True );
      goto decode_success;
   }

   /* 66 0F 60 = PUNPCKLBW */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x60) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpcklbw",
                                 Iop_InterleaveLO8x16, True );
      goto decode_success;
   }

   /* 66 0F 62 = PUNPCKLDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x62) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpckldq",
                                 Iop_InterleaveLO32x4, True );
      goto decode_success;
   }

   /* 66 0F 6C = PUNPCKLQDQ */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x6C) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpcklqdq",
                                 Iop_InterleaveLO64x2, True );
      goto decode_success;
   }

   /* 66 0F 61 = PUNPCKLWD */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x61) {
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+2,
                                 "punpcklwd",
                                 Iop_InterleaveLO16x8, True );
      goto decode_success;
   }

   /* 66 0F EF = PXOR */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xEF) {
      delta = dis_SSE_E_to_G_all( vbi, pfx, delta+2, "pxor", Iop_XorV128 );
      goto decode_success;
   }
12892
12893//.. //--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
12894//.. //--    if (insn[0] == 0x0F && insn[1] == 0xAE
12895//.. //--        && (!epartIsReg(insn[2]))
12896//.. //--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
12897//.. //--       Bool store = gregOfRM(insn[2]) == 0;
12898//.. //--       vg_assert(sz == 4);
12899//.. //--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
12900//.. //--       t1   = LOW24(pair);
12901//.. //--       eip += 2+HI8(pair);
12902//.. //--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
12903//.. //--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
12904//.. //--                   Lit16, (UShort)insn[2],
12905//.. //--                   TempReg, t1 );
12906//.. //--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
12907//.. //--       goto decode_success;
12908//.. //--    }
12909
   /* 0F AE /7 = CLFLUSH -- flush cache line */
   if (haveNo66noF2noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xAE
       && !epartIsReg(insn[2]) && gregLO3ofRM(insn[2]) == 7) {

      /* This is something of a hack.  We need to know the size of the
         cache line containing addr.  Since we don't (easily), assume
         256 on the basis that no real cache would have a line that
         big.  It's safe to invalidate more stuff than we need, just
         inefficient. */
      ULong lineszB = 256ULL;

      addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
      delta += 2+alen;

      /* Round addr down to the start of the containing block. */
      stmt( IRStmt_Put(
               OFFB_TISTART,
               binop( Iop_And64,
                      mkexpr(addr),
                      mkU64( ~(lineszB-1) ))) );

      /* TISTART/TILEN describe the guest address range whose cached
         translations must be invalidated. */
      stmt( IRStmt_Put(OFFB_TILEN, mkU64(lineszB) ) );

      /* End the block here with a translation-invalidate jump; the
         next instruction starts a fresh translation. */
      irsb->jumpkind = Ijk_TInval;
      irsb->next     = mkU64(guest_RIP_bbstart+delta);
      dres.whatNext  = Dis_StopHere;

      DIP("clflush %s\n", dis_buf);
      goto decode_success;
   }
12941
12942   /* ---------------------------------------------------- */
12943   /* --- end of the SSE/SSE2 decoder.                 --- */
12944   /* ---------------------------------------------------- */
12945
12946   /* ---------------------------------------------------- */
12947   /* --- start of the SSE3 decoder.                   --- */
12948   /* ---------------------------------------------------- */
12949
   /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
      duplicating some lanes (2:2:0:0). */
   /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
      duplicating some lanes (3:3:1:1). */
   if (haveF3no66noF2(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x12 || insn[1] == 0x16)) {
      IRTemp s3, s2, s1, s0;
      IRTemp sV  = newTemp(Ity_V128);
      Bool   isH = insn[1] == 0x16;   /* 0x16 = the "high" variant */
      s3 = s2 = s1 = s0 = IRTemp_INVALID;

      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         /* Memory form requires 16-byte alignment. */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
             dis_buf,
             nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }

      /* Duplicate the odd (H) or even (L) 32-bit lanes of the source. */
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
      putXMMReg( gregOfRexRM(pfx,modrm),
                 isH ? mk128from32s( s3, s3, s1, s1 )
                     : mk128from32s( s2, s2, s0, s0 ) );
      goto decode_success;
   }
12984
   /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
      duplicating some lanes (0:1:0:1). */
   if (haveF2no66noF3(pfx)
       && (sz == 4 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x12) {
      IRTemp sV = newTemp(Ity_V128);
      IRTemp d0 = newTemp(Ity_I64);   /* the 64 bits to duplicate */

      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
         DIP("movddup %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
         assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
      } else {
         /* Memory form loads only 64 bits (no alignment check). */
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
         DIP("movddup %s,%s\n", dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }

      /* Replicate the low 64 bits into both halves of G. */
      putXMMReg( gregOfRexRM(pfx,modrm),
                 binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
      goto decode_success;
   }
13012
   /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && insn[1] == 0xD0) {
      IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
      IRTemp eV   = newTemp(Ity_V128);
      IRTemp gV   = newTemp(Ity_V128);
      IRTemp addV = newTemp(Ity_V128);
      IRTemp subV = newTemp(Ity_V128);
      a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;

      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
         DIP("addsubps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("addsubps %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }

      assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      /* Compute both full-width results, then pick per lane. */
      assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
      assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );

      breakup128to32s( addV, &a3, &a2, &a1, &a0 );
      breakup128to32s( subV, &s3, &s2, &s1, &s0 );

      /* Odd lanes take the add result, even lanes the sub result. */
      putXMMReg( gregOfRexRM(pfx,modrm), mk128from32s( a3, s2, a1, s0 ));
      goto decode_success;
   }
13048
   /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0xD0) {
      IRTemp eV   = newTemp(Ity_V128);
      IRTemp gV   = newTemp(Ity_V128);
      IRTemp addV = newTemp(Ity_V128);
      IRTemp subV = newTemp(Ity_V128);
      IRTemp a1     = newTemp(Ity_I64);
      IRTemp s0     = newTemp(Ity_I64);

      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
         DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("addsubpd %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }

      assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      /* Compute both full-width results, then pick per lane. */
      assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
      assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );

      assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
      assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));

      /* High lane takes the add result, low lane the sub result. */
      putXMMReg( gregOfRexRM(pfx,modrm),
                 binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
      goto decode_success;
   }
13085
   /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
   /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
   if (haveF2no66noF3(pfx) && sz == 4
       && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
      IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
      IRTemp eV     = newTemp(Ity_V128);
      IRTemp gV     = newTemp(Ity_V128);
      IRTemp leftV  = newTemp(Ity_V128);
      IRTemp rightV = newTemp(Ity_V128);
      Bool   isAdd  = insn[1] == 0x7C;
      HChar* str    = isAdd ? "add" : "sub";
      e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;

      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
         DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("h%sps %s,%s\n", str, dis_buf,
                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 2+alen;
      }

      assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      breakup128to32s( eV, &e3, &e2, &e1, &e0 );
      breakup128to32s( gV, &g3, &g2, &g1, &g0 );

      /* Gather the even lanes of both operands on the left, the odd
         lanes on the right; a single vertical op then computes all
         four horizontal results. */
      assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
      assign( rightV, mk128from32s( e3, e1, g3, g1 ) );

      putXMMReg( gregOfRexRM(pfx,modrm),
                 binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
                       mkexpr(leftV), mkexpr(rightV) ) );
      goto decode_success;
   }
13126
13127   /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
13128   /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
13129   if (have66noF2noF3(pfx) && sz == 2
13130       && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
13131      IRTemp e1     = newTemp(Ity_I64);
13132      IRTemp e0     = newTemp(Ity_I64);
13133      IRTemp g1     = newTemp(Ity_I64);
13134      IRTemp g0     = newTemp(Ity_I64);
13135      IRTemp eV     = newTemp(Ity_V128);
13136      IRTemp gV     = newTemp(Ity_V128);
13137      IRTemp leftV  = newTemp(Ity_V128);
13138      IRTemp rightV = newTemp(Ity_V128);
13139      Bool   isAdd  = insn[1] == 0x7C;
13140      HChar* str    = isAdd ? "add" : "sub";
13141
13142      modrm = insn[2];
13143      if (epartIsReg(modrm)) {
13144         assign( eV, getXMMReg( eregOfRexRM(pfx,modrm)) );
13145         DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
13146                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
13147         delta += 2+1;
13148      } else {
13149         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
13150         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
13151         DIP("h%spd %s,%s\n", str, dis_buf,
13152                              nameXMMReg(gregOfRexRM(pfx,modrm)));
13153         delta += 2+alen;
13154      }
13155
13156      assign( gV, getXMMReg(gregOfRexRM(pfx,modrm)) );
13157
13158      assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
13159      assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
13160      assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
13161      assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
13162
13163      assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
13164      assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
13165
13166      putXMMReg( gregOfRexRM(pfx,modrm),
13167                 binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
13168                       mkexpr(leftV), mkexpr(rightV) ) );
13169      goto decode_success;
13170   }
13171
13172   /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
13173   if (haveF2no66noF3(pfx) && sz == 4
13174       && insn[0] == 0x0F && insn[1] == 0xF0) {
13175      modrm = insn[2];
13176      if (epartIsReg(modrm)) {
13177         goto decode_failure;
13178      } else {
13179         addr = disAMode ( &alen, vbi, pfx, delta+2, dis_buf, 0 );
13180         putXMMReg( gregOfRexRM(pfx,modrm),
13181                    loadLE(Ity_V128, mkexpr(addr)) );
13182         DIP("lddqu %s,%s\n", dis_buf,
13183                              nameXMMReg(gregOfRexRM(pfx,modrm)));
13184         delta += 2+alen;
13185      }
13186      goto decode_success;
13187   }
13188
13189   /* ---------------------------------------------------- */
13190   /* --- end of the SSE3 decoder.                     --- */
13191   /* ---------------------------------------------------- */
13192
13193   /* ---------------------------------------------------- */
13194   /* --- start of the SSSE3 decoder.                  --- */
13195   /* ---------------------------------------------------- */
13196
   /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
      Unsigned Bytes (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
      IRTemp sV        = newTemp(Ity_I64);
      IRTemp dV        = newTemp(Ity_I64);
      IRTemp sVoddsSX  = newTemp(Ity_I64);   /* odd bytes of sV, sign-extended to 16 */
      IRTemp sVevensSX = newTemp(Ity_I64);   /* even bytes of sV, sign-extended to 16 */
      IRTemp dVoddsZX  = newTemp(Ity_I64);   /* odd bytes of dV, zero-extended to 16 */
      IRTemp dVevensZX = newTemp(Ity_I64);   /* even bytes of dV, zero-extended to 16 */

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                  nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmaddubsw %s,%s\n", dis_buf,
                                  nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* compute dV unsigned x sV signed */
      /* Extract bytes into 16-bit lanes by shifting: an arithmetic
         right shift sign-extends, a logical one zero-extends. */
      assign( sVoddsSX,
              binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
      assign( sVevensSX,
              binop(Iop_SarN16x4,
                    binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
                    mkU8(8)) );
      assign( dVoddsZX,
              binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
      assign( dVevensZX,
              binop(Iop_ShrN16x4,
                    binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
                    mkU8(8)) );

      /* Multiply odd/even byte pairs and saturating-add the products. */
      putMMXReg(
         gregLO3ofRM(modrm),
         binop(Iop_QAdd16Sx4,
               binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
               binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
         )
      );
      goto decode_success;
   }
13249
   /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
      Unsigned Bytes (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
      IRTemp sV        = newTemp(Ity_V128);
      IRTemp dV        = newTemp(Ity_V128);
      IRTemp sVoddsSX  = newTemp(Ity_V128);   /* odd bytes of sV, sign-extended to 16 */
      IRTemp sVevensSX = newTemp(Ity_V128);   /* even bytes of sV, sign-extended to 16 */
      IRTemp dVoddsZX  = newTemp(Ity_V128);   /* odd bytes of dV, zero-extended to 16 */
      IRTemp dVevensZX = newTemp(Ity_V128);   /* even bytes of dV, zero-extended to 16 */

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         /* Memory form requires 16-byte alignment. */
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmaddubsw %s,%s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* compute dV unsigned x sV signed */
      /* Extract bytes into 16-bit lanes by shifting: an arithmetic
         right shift sign-extends, a logical one zero-extends. */
      assign( sVoddsSX,
              binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
      assign( sVevensSX,
              binop(Iop_SarN16x8,
                    binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
                    mkU8(8)) );
      assign( dVoddsZX,
              binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
      assign( dVevensZX,
              binop(Iop_ShrN16x8,
                    binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
                    mkU8(8)) );

      /* Multiply odd/even byte pairs and saturating-add the products. */
      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_QAdd16Sx8,
               binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
               binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
         )
      );
      goto decode_success;
   }
13302
   /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
   /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
      mmx) and G to G (mmx). */
   /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
      mmx) and G to G (mmx). */
   /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
      to G (mmx). */
   /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
      to G (mmx). */
   /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
      to G (mmx). */
   /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
      to G (mmx). */

   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
      HChar* str    = "???";
      IROp   opV64  = Iop_INVALID;
      /* Default lane-gather ops for the 16-bit variants; overridden
         below for the 32-bit (PHADDD/PHSUBD) cases. */
      IROp   opCatO = Iop_CatOddLanes16x4;
      IROp   opCatE = Iop_CatEvenLanes16x4;
      IRTemp sV     = newTemp(Ity_I64);
      IRTemp dV     = newTemp(Ity_I64);

      modrm = insn[3];

      /* Map third opcode byte to vertical op and mnemonic suffix. */
      switch (insn[2]) {
         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
         default: vassert(0);
      }
      if (insn[2] == 0x02 || insn[2] == 0x06) {
         opCatO = Iop_InterleaveHI32x2;
         opCatE = Iop_InterleaveLO32x2;
      }

      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                  nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("ph%s %s,%s\n", str, dis_buf,
                                  nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Horizontal op done as: gather even lanes of (s:d) and odd
         lanes of (s:d), then do one vertical op on the two gathers. */
      putMMXReg(
         gregLO3ofRM(modrm),
         binop(opV64,
               binop(opCatE,mkexpr(sV),mkexpr(dV)),
               binop(opCatO,mkexpr(sV),mkexpr(dV))
         )
      );
      goto decode_success;
   }
13370
   /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
      xmm) and G to G (xmm). */
   /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
      xmm) and G to G (xmm). */
   /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
      G to G (xmm). */
   /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
      G to G (xmm). */
   /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
      G to G (xmm). */
   /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
      G to G (xmm). */

   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
      HChar* str    = "???";
      /* Note: 64-bit-lane ops are used; the 128-bit op is synthesised
         from two 64-bit halves below. */
      IROp   opV64  = Iop_INVALID;
      IROp   opCatO = Iop_CatOddLanes16x4;
      IROp   opCatE = Iop_CatEvenLanes16x4;
      IRTemp sV     = newTemp(Ity_V128);
      IRTemp dV     = newTemp(Ity_V128);
      IRTemp sHi    = newTemp(Ity_I64);
      IRTemp sLo    = newTemp(Ity_I64);
      IRTemp dHi    = newTemp(Ity_I64);
      IRTemp dLo    = newTemp(Ity_I64);

      modrm = insn[3];

      /* Map third opcode byte to vertical op and mnemonic suffix. */
      switch (insn[2]) {
         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
         default: vassert(0);
      }
      if (insn[2] == 0x02 || insn[2] == 0x06) {
         opCatO = Iop_InterleaveHI32x2;
         opCatE = Iop_InterleaveLO32x2;
      }

      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg( eregOfRexRM(pfx,modrm)) );
         DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 3+1;
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* Legacy (non-VEX) SSE requires a 16-aligned memory operand. */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         DIP("ph%s %s,%s\n", str, dis_buf,
                             nameXMMReg(gregOfRexRM(pfx,modrm)));
         delta += 3+alen;
      }

      /* Split both operands into 64-bit halves. */
      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      /* This isn't a particularly efficient way to compute the
         result, but at least it avoids a proliferation of IROps,
         hence avoids complication all the backends. */
      /* Result upper half = horizontal op over sV, lower half =
         horizontal op over dV, as the spec requires. */
      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               binop(opV64,
                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
                     binop(opCatO,mkexpr(sHi),mkexpr(sLo))
               ),
               binop(opV64,
                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
                     binop(opCatO,mkexpr(dHi),mkexpr(dLo))
               )
         )
      );
      goto decode_success;
   }
13455
   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
      (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
      IRTemp sV = newTemp(Ity_I64);
      IRTemp dV = newTemp(Ity_I64);

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                                 nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmulhrsw %s,%s\n", dis_buf,
                                 nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* The round-and-scale arithmetic is shared with the XMM
         variant via dis_PMULHRSW_helper. */
      putMMXReg(
         gregLO3ofRM(modrm),
         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
      );
      goto decode_success;
   }
13487
   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
      Scale (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
      IRTemp sV  = newTemp(Ity_V128);
      IRTemp dV  = newTemp(Ity_V128);
      IRTemp sHi = newTemp(Ity_I64);
      IRTemp sLo = newTemp(Ity_I64);
      IRTemp dHi = newTemp(Ity_I64);
      IRTemp dLo = newTemp(Ity_I64);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* Legacy (non-VEX) SSE requires a 16-aligned memory operand. */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmulhrsw %s,%s\n", dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Do the op on each 64-bit half independently, then rejoin. */
      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
         )
      );
      goto decode_success;
   }
13531
   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
      IRTemp sV      = newTemp(Ity_I64);
      IRTemp dV      = newTemp(Ity_I64);
      HChar* str     = "???";
      Int    laneszB = 0;

      /* Map third opcode byte to lane size and mnemonic suffix. */
      switch (insn[2]) {
         case 0x08: laneszB = 1; str = "b"; break;
         case 0x09: laneszB = 2; str = "w"; break;
         case 0x0A: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                     nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("psign%s %s,%s\n", str, dis_buf,
                                     nameMMXReg(gregLO3ofRM(modrm)));
      }

      /* Lane-wise: negate/zero/keep dV according to sign of sV. */
      putMMXReg(
         gregLO3ofRM(modrm),
         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
      );
      goto decode_success;
   }
13574
   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
      IRTemp sV      = newTemp(Ity_V128);
      IRTemp dV      = newTemp(Ity_V128);
      IRTemp sHi     = newTemp(Ity_I64);
      IRTemp sLo     = newTemp(Ity_I64);
      IRTemp dHi     = newTemp(Ity_I64);
      IRTemp dLo     = newTemp(Ity_I64);
      HChar* str     = "???";
      Int    laneszB = 0;

      /* Map third opcode byte to lane size and mnemonic suffix. */
      switch (insn[2]) {
         case 0x08: laneszB = 1; str = "b"; break;
         case 0x09: laneszB = 2; str = "w"; break;
         case 0x0A: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* Legacy (non-VEX) SSE requires a 16-aligned memory operand. */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("psign%s %s,%s\n", str, dis_buf,
                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Do the op on each 64-bit half independently, then rejoin. */
      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
         )
      );
      goto decode_success;
   }
13629
   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
      IRTemp sV      = newTemp(Ity_I64);
      HChar* str     = "???";
      Int    laneszB = 0;

      /* Map third opcode byte to lane size and mnemonic suffix. */
      switch (insn[2]) {
         case 0x1C: laneszB = 1; str = "b"; break;
         case 0x1D: laneszB = 2; str = "w"; break;
         case 0x1E: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];
      do_MMX_preamble();

      /* Unary op: only the source operand is read. */
      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
                                    nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pabs%s %s,%s\n", str, dis_buf,
                                    nameMMXReg(gregLO3ofRM(modrm)));
      }

      putMMXReg(
         gregLO3ofRM(modrm),
         dis_PABS_helper( mkexpr(sV), laneszB )
      );
      goto decode_success;
   }
13670
   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
      IRTemp sV      = newTemp(Ity_V128);
      IRTemp sHi     = newTemp(Ity_I64);
      IRTemp sLo     = newTemp(Ity_I64);
      HChar* str     = "???";
      Int    laneszB = 0;

      /* Map third opcode byte to lane size and mnemonic suffix. */
      switch (insn[2]) {
         case 0x1C: laneszB = 1; str = "b"; break;
         case 0x1D: laneszB = 2; str = "w"; break;
         case 0x1E: laneszB = 4; str = "d"; break;
         default: vassert(0);
      }

      modrm = insn[3];

      /* Unary op: only the source operand is read. */
      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* Legacy (non-VEX) SSE requires a 16-aligned memory operand. */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pabs%s %s,%s\n", str, dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Do the op on each 64-bit half independently, then rejoin. */
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128,
               dis_PABS_helper( mkexpr(sHi), laneszB ),
               dis_PABS_helper( mkexpr(sLo), laneszB )
         )
      );
      goto decode_success;
   }
13719
13720   /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
13721   if (haveNo66noF2noF3(pfx) && sz == 4
13722       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
13723      IRTemp sV  = newTemp(Ity_I64);
13724      IRTemp dV  = newTemp(Ity_I64);
13725      IRTemp res = newTemp(Ity_I64);
13726
13727      modrm = insn[3];
13728      do_MMX_preamble();
13729      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
13730
13731      if (epartIsReg(modrm)) {
13732         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
13733         d64 = (Long)insn[3+1];
13734         delta += 3+1+1;
13735         DIP("palignr $%d,%s,%s\n",  (Int)d64,
13736                                     nameMMXReg(eregLO3ofRM(modrm)),
13737                                     nameMMXReg(gregLO3ofRM(modrm)));
13738      } else {
13739         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 1 );
13740         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
13741         d64 = (Long)insn[3+alen];
13742         delta += 3+alen+1;
13743         DIP("palignr $%d%s,%s\n", (Int)d64,
13744                                   dis_buf,
13745                                   nameMMXReg(gregLO3ofRM(modrm)));
13746      }
13747
13748      if (d64 == 0) {
13749         assign( res, mkexpr(sV) );
13750      }
13751      else if (d64 >= 1 && d64 <= 7) {
13752         assign(res,
13753                binop(Iop_Or64,
13754                      binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
13755                      binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64))
13756                     )));
13757      }
13758      else if (d64 == 8) {
13759        assign( res, mkexpr(dV) );
13760      }
13761      else if (d64 >= 9 && d64 <= 15) {
13762         assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
13763      }
13764      else if (d64 >= 16 && d64 <= 255) {
13765         assign( res, mkU64(0) );
13766      }
13767      else
13768         vassert(0);
13769
13770      putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
13771      goto decode_success;
13772   }
13773
   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
      IRTemp sV  = newTemp(Ity_V128);
      IRTemp dV  = newTemp(Ity_V128);
      IRTemp sHi = newTemp(Ity_I64);
      IRTemp sLo = newTemp(Ity_I64);
      IRTemp dHi = newTemp(Ity_I64);
      IRTemp dLo = newTemp(Ity_I64);
      IRTemp rHi = newTemp(Ity_I64);
      IRTemp rLo = newTemp(Ity_I64);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         d64 = (Long)insn[3+1];   /* imm8 follows the modrm byte */
         delta += 3+1+1;
         DIP("palignr $%d,%s,%s\n", (Int)d64,
                                    nameXMMReg(eregOfRexRM(pfx,modrm)),
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         /* Legacy (non-VEX) SSE requires a 16-aligned memory operand. */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         d64 = (Long)insn[3+alen];   /* imm8 follows the amode */
         delta += 3+alen+1;
         DIP("palignr $%d,%s,%s\n", (Int)d64,
                                    dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      /* Conceptually the 32-byte value dHi:dLo:sHi:sLo is shifted
         right by d64 bytes and the bottom 16 bytes (rHi:rLo) kept.
         Each 8-byte-offset range is handled as its own case, using
         dis_PALIGNR_XMM_helper to splice adjacent 64-bit chunks. */
      if (d64 == 0) {
         assign( rHi, mkexpr(sHi) );
         assign( rLo, mkexpr(sLo) );
      }
      else if (d64 >= 1 && d64 <= 7) {
         assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d64) );
         assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d64) );
      }
      else if (d64 == 8) {
         assign( rHi, mkexpr(dLo) );
         assign( rLo, mkexpr(sHi) );
      }
      else if (d64 >= 9 && d64 <= 15) {
         assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d64-8) );
         assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d64-8) );
      }
      else if (d64 == 16) {
         assign( rHi, mkexpr(dHi) );
         assign( rLo, mkexpr(dLo) );
      }
      else if (d64 >= 17 && d64 <= 23) {
         assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-16))) );
         assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d64-16) );
      }
      else if (d64 == 24) {
         assign( rHi, mkU64(0) );
         assign( rLo, mkexpr(dHi) );
      }
      else if (d64 >= 25 && d64 <= 31) {
         assign( rHi, mkU64(0) );
         assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d64-24))) );
      }
      else if (d64 >= 32 && d64 <= 255) {
         /* Shift amount beyond the concatenated operands: zero. */
         assign( rHi, mkU64(0) );
         assign( rLo, mkU64(0) );
      }
      else
         vassert(0);

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
      );
      goto decode_success;
   }
13858
   /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
   if (haveNo66noF2noF3(pfx)
       && sz == 4
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
      IRTemp sV      = newTemp(Ity_I64);
      IRTemp dV      = newTemp(Ity_I64);

      modrm = insn[3];
      do_MMX_preamble();
      assign( dV, getMMXReg(gregLO3ofRM(modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
         delta += 3+1;
         DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
                               nameMMXReg(gregLO3ofRM(modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pshufb %s,%s\n", dis_buf,
                               nameMMXReg(gregLO3ofRM(modrm)));
      }

      putMMXReg(
         gregLO3ofRM(modrm),
         binop(
            Iop_And64,
            /* permute the lanes: only the low 3 bits of each index
               byte select a source lane in the 8-byte case */
            binop(
               Iop_Perm8x8,
               mkexpr(dV),
               binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
            ),
            /* mask off lanes which have (index & 0x80) == 0x80: the
               SarN8x8 by 7 replicates each index's top bit across its
               lane, and the Not64 turns that into a keep-mask */
            unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
         )
      );
      goto decode_success;
   }
13899
   /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
   if (have66noF2noF3(pfx)
       && (sz == 2 || /*redundant REX.W*/ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
      IRTemp sV         = newTemp(Ity_V128);
      IRTemp dV         = newTemp(Ity_V128);
      IRTemp sHi        = newTemp(Ity_I64);
      IRTemp sLo        = newTemp(Ity_I64);
      IRTemp dHi        = newTemp(Ity_I64);
      IRTemp dLo        = newTemp(Ity_I64);
      IRTemp rHi        = newTemp(Ity_I64);
      IRTemp rLo        = newTemp(Ity_I64);
      IRTemp sevens     = newTemp(Ity_I64);
      IRTemp mask0x80hi = newTemp(Ity_I64);
      IRTemp mask0x80lo = newTemp(Ity_I64);
      IRTemp maskBit3hi = newTemp(Ity_I64);
      IRTemp maskBit3lo = newTemp(Ity_I64);
      IRTemp sAnd7hi    = newTemp(Ity_I64);
      IRTemp sAnd7lo    = newTemp(Ity_I64);
      IRTemp permdHi    = newTemp(Ity_I64);
      IRTemp permdLo    = newTemp(Ity_I64);

      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* Legacy (non-VEX) SSE requires a 16-aligned memory operand. */
         gen_SEGV_if_not_16_aligned( addr );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pshufb %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

      assign( sevens, mkU64(0x0707070707070707ULL) );

      /* The 16-byte shuffle is built from the 8-byte Iop_Perm8x8:
         each index byte's bit 3 chooses which half of dV to pull
         from, bits 0-2 select the byte within that half, and bit 7
         forces the result byte to zero.

      mask0x80hi = Not(SarN8x8(sHi,7))
      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
      sAnd7hi    = And(sHi,sevens)
      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
      rHi        = And(permdHi,mask0x80hi)
      */
      assign(
         mask0x80hi,
         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));

      /* Replicate bit 3 of each index byte across its whole lane. */
      assign(
         maskBit3hi,
         binop(Iop_SarN8x8,
               binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
               mkU8(7)));

      assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));

      /* Select per-lane between the dHi-permute (bit 3 set) and the
         dLo-permute (bit 3 clear). */
      assign(
         permdHi,
         binop(
            Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
                  mkexpr(maskBit3hi)),
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
                  unop(Iop_Not64,mkexpr(maskBit3hi))) ));

      assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );

      /* And the same for the lower half of the result.  What fun. */

      assign(
         mask0x80lo,
         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));

      assign(
         maskBit3lo,
         binop(Iop_SarN8x8,
               binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
               mkU8(7)));

      assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));

      assign(
         permdLo,
         binop(
            Iop_Or64,
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
                  mkexpr(maskBit3lo)),
            binop(Iop_And64,
                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
                  unop(Iop_Not64,mkexpr(maskBit3lo))) ));

      assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );

      putXMMReg(
         gregOfRexRM(pfx,modrm),
         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
      );
      goto decode_success;
   }
14012
14013   /* ---------------------------------------------------- */
14014   /* --- end of the SSSE3 decoder.                    --- */
14015   /* ---------------------------------------------------- */
14016
14017   /* ---------------------------------------------------- */
14018   /* --- start of the SSE4 decoder                    --- */
14019   /* ---------------------------------------------------- */
14020
   /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
      Blend Packed Double Precision Floating-Point Values (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0D ) {

      Int imm8;
      UShort imm8_mask_16;

      IRTemp dst_vec = newTemp(Ity_V128);
      IRTemp src_vec = newTemp(Ity_V128);
      IRTemp imm8_mask = newTemp(Ity_V128);

      modrm = insn[3];
      assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];   /* imm8 follows the modrm byte */
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;
         DIP( "blendpd $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         /* Legacy (non-VEX) SSE requires a 16-aligned memory operand. */
         gen_SEGV_if_not_16_aligned( addr );
         assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "blendpd $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Only the low two imm8 bits matter: each selects one 64-bit
         lane.  mkV128 expands the 16-bit pattern one bit per byte. */
      switch( imm8 & 3 ) {
         case 0:  imm8_mask_16 = 0x0000; break;
         case 1:  imm8_mask_16 = 0x00FF; break;
         case 2:  imm8_mask_16 = 0xFF00; break;
         case 3:  imm8_mask_16 = 0xFFFF; break;
         default: vassert(0);            break;
      }
      assign( imm8_mask, mkV128( imm8_mask_16 ) );

      /* result = (src & mask) | (dst & ~mask) */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128,
                        binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm8_mask) ),
                        binop( Iop_AndV128, mkexpr(dst_vec),
                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );

      goto decode_success;
   }
14072
14073
14074   /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
14075      Blend Packed Single Precision Floating-Point Values (XMM) */
14076   if ( have66noF2noF3( pfx )
14077        && sz == 2
14078        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0C ) {
14079
14080      Int imm8;
14081      IRTemp dst_vec = newTemp(Ity_V128);
14082      IRTemp src_vec = newTemp(Ity_V128);
14083
14084      modrm = insn[3];
14085
14086      assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
14087
14088      if ( epartIsReg( modrm ) ) {
14089         imm8 = (Int)insn[3+1];
14090         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
14091         delta += 3+1+1;
14092         DIP( "blendps $%d, %s,%s\n", imm8,
14093              nameXMMReg( eregOfRexRM(pfx, modrm) ),
14094              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
14095      } else {
14096         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
14097                          1/* imm8 is 1 byte after the amode */ );
14098         gen_SEGV_if_not_16_aligned( addr );
14099         assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
14100         imm8 = (Int)insn[3+alen];
14101         delta += 3+alen+1;
14102         DIP( "blendpd $%d, %s,%s\n",
14103              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
14104      }
14105
14106      UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00, 0x0F0F,
14107                                0x0FF0, 0x0FFF, 0xF000, 0xF00F, 0xF0F0, 0xF0FF,
14108                                0xFF00, 0xFF0F, 0xFFF0, 0xFFFF };
14109      IRTemp imm8_mask = newTemp(Ity_V128);
14110      assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
14111
14112      putXMMReg( gregOfRexRM(pfx, modrm),
14113                 binop( Iop_OrV128,
14114                        binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm8_mask) ),
14115                        binop( Iop_AndV128, mkexpr(dst_vec),
14116                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
14117
14118      goto decode_success;
14119   }
14120
14121
14122   /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
14123      Blend Packed Words (XMM) */
14124   if ( have66noF2noF3( pfx )
14125        && sz == 2
14126        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0E ) {
14127
14128      Int imm8;
14129      IRTemp dst_vec = newTemp(Ity_V128);
14130      IRTemp src_vec = newTemp(Ity_V128);
14131
14132      modrm = insn[3];
14133
14134      assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
14135
14136      if ( epartIsReg( modrm ) ) {
14137         imm8 = (Int)insn[3+1];
14138         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
14139         delta += 3+1+1;
14140         DIP( "pblendw $%d, %s,%s\n", imm8,
14141              nameXMMReg( eregOfRexRM(pfx, modrm) ),
14142              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
14143      } else {
14144         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
14145                          1/* imm8 is 1 byte after the amode */ );
14146         gen_SEGV_if_not_16_aligned( addr );
14147         assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
14148         imm8 = (Int)insn[3+alen];
14149         delta += 3+alen+1;
14150         DIP( "pblendw $%d, %s,%s\n",
14151              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
14152      }
14153
14154      /* Make w be a 16-bit version of imm8, formed by duplicating each
14155         bit in imm8. */
14156      Int i;
14157      UShort imm16 = 0;
14158      for (i = 0; i < 8; i++) {
14159         if (imm8 & (1 << i))
14160             imm16 |= (3 << (2*i));
14161      }
14162      IRTemp imm16_mask = newTemp(Ity_V128);
14163      assign( imm16_mask, mkV128( imm16 ));
14164
14165      putXMMReg( gregOfRexRM(pfx, modrm),
14166                 binop( Iop_OrV128,
14167                        binop( Iop_AndV128, mkexpr(src_vec), mkexpr(imm16_mask) ),
14168                        binop( Iop_AndV128, mkexpr(dst_vec),
14169                               unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
14170
14171      goto decode_success;
14172   }
14173
14174
   /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
    * Carry-less multiplication of selected XMM quadwords into XMM
    * registers (a.k.a multiplication of polynomials over GF(2))
    */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x44 ) {

      Int imm8;
      IRTemp svec = newTemp(Ity_V128);
      IRTemp dvec = newTemp(Ity_V128);

      modrm = insn[3];

      assign( dvec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];
         assign( svec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;
         DIP( "pclmulqdq $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "pclmulqdq $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* imm8 bit 0 selects the dst qword, bit 4 the src qword.
         t0..t3 are I64 temps declared earlier in this function. */
      t0 = newTemp(Ity_I64);
      t1 = newTemp(Ity_I64);
      assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64, mkexpr(dvec)));
      assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64, mkexpr(svec)));

      t2 = newTemp(Ity_I64);
      t3 = newTemp(Ity_I64);

      IRExpr** args;

      /* Call the helper twice: third argument 0 requests the low 64
         bits of the 128-bit carry-less product, 1 the high 64 bits. */
      args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
      assign(t2,
              mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
                                       &amd64g_calculate_pclmul, args));
      args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
      assign(t3,
              mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
                                       &amd64g_calculate_pclmul, args));

      IRTemp res     = newTemp(Ity_V128);
      assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
      putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );

      goto decode_success;
   }

   /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
      Dot Product of Packed Double Precision Floating-Point Values (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x41 ) {

      Int imm8;
      IRTemp src_vec = newTemp(Ity_V128);
      IRTemp dst_vec = newTemp(Ity_V128);
      IRTemp and_vec = newTemp(Ity_V128);
      IRTemp sum_vec = newTemp(Ity_V128);

      modrm = insn[3];

      assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;
         DIP( "dppd $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "dppd $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Byte-level masks selecting 64-bit lanes 0/1. */
      UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };

      /* Multiply, then zero the products not selected by imm8[5:4]. */
      assign( and_vec, binop( Iop_AndV128,
                              binop( Iop_Mul64Fx2,
                                     mkexpr(dst_vec), mkexpr(src_vec) ),
                              mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );

      /* Horizontal add of the two products, computed in the low lane. */
      assign( sum_vec, binop( Iop_Add64F0x2,
                              binop( Iop_InterleaveHI64x2,
                                     mkexpr(and_vec), mkexpr(and_vec) ),
                              binop( Iop_InterleaveLO64x2,
                                     mkexpr(and_vec), mkexpr(and_vec) ) ) );

      /* Duplicate the sum into both lanes, then keep only the lanes
         selected by imm8[1:0]. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_AndV128,
                        binop( Iop_InterleaveLO64x2,
                               mkexpr(sum_vec), mkexpr(sum_vec) ),
                        mkV128( imm8_perms[ (imm8 & 3) ] ) ) );

      goto decode_success;
   }
14290
14291
   /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
      Dot Product of Packed Single Precision Floating-Point Values (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F
        && insn[1] == 0x3A
        && insn[2] == 0x40 ) {

      Int imm8;
      IRTemp xmm1_vec     = newTemp(Ity_V128);
      IRTemp xmm2_vec     = newTemp(Ity_V128);
      IRTemp tmp_prod_vec = newTemp(Ity_V128);
      IRTemp prod_vec     = newTemp(Ity_V128);
      IRTemp sum_vec      = newTemp(Ity_V128);
      IRTemp v3, v2, v1, v0;
      v3 = v2 = v1 = v0   = IRTemp_INVALID;

      modrm = insn[3];

      assign( xmm1_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8 = (Int)insn[4];
         assign( xmm2_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1+1;
         DIP( "dpps $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* imm8 is 1 byte after the amode */ );
         gen_SEGV_if_not_16_aligned( addr );
         assign( xmm2_vec, loadLE( Ity_V128, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         delta += 3+alen+1;
         DIP( "dpps $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Byte-level masks selecting 32-bit lanes 0..3. */
      UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
                                0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
                                0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0, 0xFFFF };

      /* Multiply, then zero the products not selected by imm8[7:4]. */
      assign( tmp_prod_vec,
              binop( Iop_AndV128,
                     binop( Iop_Mul32Fx4, mkexpr(xmm1_vec), mkexpr(xmm2_vec) ),
                     mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
      /* NOTE(review): lanes 1 and 2 appear deliberately swapped here so
         that the two interleave+add steps below end up summing all four
         products into every lane. */
      breakup128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
      assign( prod_vec, mk128from32s( v3, v1, v2, v0 ) );

      /* First horizontal-add reduction step. */
      assign( sum_vec, binop( Iop_Add32Fx4,
                              binop( Iop_InterleaveHI32x4,
                                     mkexpr(prod_vec), mkexpr(prod_vec) ),
                              binop( Iop_InterleaveLO32x4,
                                     mkexpr(prod_vec), mkexpr(prod_vec) ) ) );

      /* Second reduction step, then keep only the result lanes
         selected by imm8[3:0]. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_AndV128,
                        binop( Iop_Add32Fx4,
                               binop( Iop_InterleaveHI32x4,
                                      mkexpr(sum_vec), mkexpr(sum_vec) ),
                               binop( Iop_InterleaveLO32x4,
                                      mkexpr(sum_vec), mkexpr(sum_vec) ) ),
                        mkV128( imm8_perms[ (imm8 & 15) ] ) ) );

      goto decode_success;
   }


   /* 66 0F 3A 21 /r ib = INSERTPS xmm1, xmm2/m32, imm8
      Insert Packed Single Precision Floating-Point Value (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x21 ) {

      Int imm8;
      Int imm8_count_s;   /* imm8[7:6]: source lane (register form only) */
      Int imm8_count_d;   /* imm8[5:4]: destination lane */
      Int imm8_zmask;     /* imm8[3:0]: result lanes to zero */
      IRTemp dstVec   = newTemp(Ity_V128);
      IRTemp srcDWord = newTemp(Ity_I32);

      modrm = insn[3];

      assign( dstVec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

      if ( epartIsReg( modrm ) ) {
         IRTemp src_vec = newTemp(Ity_V128);
         assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );

         IRTemp src_lane_0 = IRTemp_INVALID;
         IRTemp src_lane_1 = IRTemp_INVALID;
         IRTemp src_lane_2 = IRTemp_INVALID;
         IRTemp src_lane_3 = IRTemp_INVALID;
         breakup128to32s( src_vec,
                          &src_lane_3, &src_lane_2, &src_lane_1, &src_lane_0 );

         imm8 = (Int)insn[4];
         imm8_count_s = ((imm8 >> 6) & 3);
         switch( imm8_count_s ) {
           case 0:  assign( srcDWord, mkexpr(src_lane_0) ); break;
           case 1:  assign( srcDWord, mkexpr(src_lane_1) ); break;
           case 2:  assign( srcDWord, mkexpr(src_lane_2) ); break;
           case 3:  assign( srcDWord, mkexpr(src_lane_3) ); break;
           default: vassert(0);                             break;
         }

         delta += 3+1+1;
         DIP( "insertps $%d, %s,%s\n", imm8,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form: the source is a single dword; imm8[7:6] is
            ignored (treated as 0). */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf,
                          1/* const imm8 is 1 byte after the amode */ );
         assign( srcDWord, loadLE( Ity_I32, mkexpr(addr) ) );
         imm8 = (Int)insn[2+alen+1];
         imm8_count_s = 0;
         delta += 3+alen+1;
         DIP( "insertps $%d, %s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp dst_lane_0 = IRTemp_INVALID;
      IRTemp dst_lane_1 = IRTemp_INVALID;
      IRTemp dst_lane_2 = IRTemp_INVALID;
      IRTemp dst_lane_3 = IRTemp_INVALID;
      breakup128to32s( dstVec,
                       &dst_lane_3, &dst_lane_2, &dst_lane_1, &dst_lane_0 );

      /* Overwrite the selected destination lane with the source dword. */
      imm8_count_d = ((imm8 >> 4) & 3);
      switch( imm8_count_d ) {
         case 0:  dst_lane_0 = srcDWord; break;
         case 1:  dst_lane_1 = srcDWord; break;
         case 2:  dst_lane_2 = srcDWord; break;
         case 3:  dst_lane_3 = srcDWord; break;
         default: vassert(0);            break;
      }

      /* Finally, zero any lanes requested by the zmask. */
      imm8_zmask = (imm8 & 15);
      IRTemp zero_32 = newTemp(Ity_I32);
      assign( zero_32, mkU32(0) );

      IRExpr* ire_vec_128 = mk128from32s(
                               ((imm8_zmask & 8) == 8) ? zero_32 : dst_lane_3,
                               ((imm8_zmask & 4) == 4) ? zero_32 : dst_lane_2,
                               ((imm8_zmask & 2) == 2) ? zero_32 : dst_lane_1,
                               ((imm8_zmask & 1) == 1) ? zero_32 : dst_lane_0 );

      putXMMReg( gregOfRexRM(pfx, modrm), ire_vec_128 );

      goto decode_success;
   }
14444
14445
14446  /* 66 0F 3A 14 /r ib = PEXTRB r/m16, xmm, imm8
14447     Extract Byte from xmm, store in mem or zero-extend + store in gen.reg. (XMM) */
14448  if ( have66noF2noF3( pfx )
14449       && sz == 2
14450       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x14 ) {
14451
14452     Int imm8;
14453     IRTemp xmm_vec  = newTemp(Ity_V128);
14454     IRTemp sel_lane = newTemp(Ity_I32);
14455     IRTemp shr_lane = newTemp(Ity_I32);
14456
14457     modrm = insn[3];
14458     assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
14459     breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
14460
14461     if ( epartIsReg( modrm ) ) {
14462        imm8 = (Int)insn[3+1];
14463     } else {
14464        addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
14465        imm8 = (Int)insn[3+alen];
14466     }
14467     switch( (imm8 >> 2) & 3 ) {
14468        case 0:  assign( sel_lane, mkexpr(t0) ); break;
14469        case 1:  assign( sel_lane, mkexpr(t1) ); break;
14470        case 2:  assign( sel_lane, mkexpr(t2) ); break;
14471        case 3:  assign( sel_lane, mkexpr(t3) ); break;
14472        default: vassert(0);
14473     }
14474     assign( shr_lane,
14475             binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
14476
14477     if ( epartIsReg( modrm ) ) {
14478        putIReg64( eregOfRexRM(pfx,modrm),
14479                   unop( Iop_32Uto64,
14480                         binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
14481
14482        delta += 3+1+1;
14483        DIP( "pextrb $%d, %s,%s\n", imm8,
14484             nameXMMReg( gregOfRexRM(pfx, modrm) ),
14485             nameIReg64( eregOfRexRM(pfx, modrm) ) );
14486     } else {
14487        storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
14488        delta += 3+alen+1;
14489        DIP( "$%d, pextrb %s,%s\n",
14490             imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
14491     }
14492
14493     goto decode_success;
14494  }
14495
14496
   /* 66 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
      Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
      Note that this insn has the same opcodes as PEXTRQ, but
      here the REX.W bit is _not_ present */
   if ( have66noF2noF3( pfx )
        && sz == 2  /* REX.W is _not_ present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x16 ) {

      Int imm8_10;   /* imm8 bits 1:0 -- the selected 32-bit lane */
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_dword = newTemp(Ity_I32);

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);
      }

      switch ( imm8_10 ) {
         case 0:  assign( src_dword, mkexpr(t0) ); break;
         case 1:  assign( src_dword, mkexpr(t1) ); break;
         case 2:  assign( src_dword, mkexpr(t2) ); break;
         case 3:  assign( src_dword, mkexpr(t3) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
         delta += 3+1+1;
         DIP( "pextrd $%d, %s,%s\n", imm8_10,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg32( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_dword) );
         delta += 3+alen+1;
         DIP( "pextrd $%d, %s,%s\n",
              imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }


   /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
      Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
      Note that this insn has the same opcodes as PEXTRD, but
      here the REX.W bit is present */
   if ( have66noF2noF3( pfx )
        && sz == 8  /* REX.W is present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x16 ) {

      Int imm8_0;   /* imm8 bit 0 -- the selected 64-bit lane */
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_qword = newTemp(Ity_I64);

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );

      if ( epartIsReg( modrm ) ) {
         imm8_0 = (Int)(insn[3+1] & 1);
      } else {
         addr   = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_0 = (Int)(insn[3+alen] & 1);
      }
      switch ( imm8_0 ) {
         case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) ); break;
         case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
         delta += 3+1+1;
         DIP( "pextrq $%d, %s,%s\n", imm8_0,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg64( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_qword) );
         delta += 3+alen+1;
         DIP( "pextrq $%d, %s,%s\n",
              imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }


   /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
      Extract Word from xmm, store in mem or zero-extend + store in gen.reg. (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x15 ) {

      Int imm8_20;   /* imm8 bits 2:0 -- the selected 16-bit lane */
      IRTemp xmm_vec = newTemp(Ity_V128);
      IRTemp src_word = newTemp(Ity_I16);

      modrm = insn[3];
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      if ( epartIsReg( modrm ) ) {
         imm8_20 = (Int)(insn[3+1] & 7);
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_20 = (Int)(insn[3+alen] & 7);
      }

      /* Pick the word: even indices are the low half of a 32-bit lane,
         odd indices the high half. */
      switch ( imm8_20 ) {
         case 0:  assign( src_word, unop(Iop_32to16,   mkexpr(t0)) ); break;
         case 1:  assign( src_word, unop(Iop_32HIto16, mkexpr(t0)) ); break;
         case 2:  assign( src_word, unop(Iop_32to16,   mkexpr(t1)) ); break;
         case 3:  assign( src_word, unop(Iop_32HIto16, mkexpr(t1)) ); break;
         case 4:  assign( src_word, unop(Iop_32to16,   mkexpr(t2)) ); break;
         case 5:  assign( src_word, unop(Iop_32HIto16, mkexpr(t2)) ); break;
         case 6:  assign( src_word, unop(Iop_32to16,   mkexpr(t3)) ); break;
         case 7:  assign( src_word, unop(Iop_32HIto16, mkexpr(t3)) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         /* Register destination: the word is zero-extended to 64 bits. */
         putIReg64( eregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(src_word)) );
         delta += 3+1+1;
         DIP( "pextrw $%d, %s,%s\n", imm8_20,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg64( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_word) );
         delta += 3+alen+1;
         DIP( "pextrw $%d, %s,%s\n",
              imm8_20, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
14637
14638
   /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
      Insert Quadword int from gen.reg/mem64 into xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 8  /* REX.W is present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x22 ) {

      Int imm8_0;   /* imm8 bit 0 -- the destination 64-bit lane */
      IRTemp src_elems = newTemp(Ity_I64);
      IRTemp src_vec   = newTemp(Ity_V128);

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         imm8_0 = (Int)(insn[3+1] & 1);
         assign( src_elems, getIReg64( eregOfRexRM(pfx,modrm) ) );
         delta += 3+1+1;
         DIP( "pinsrq $%d, %s,%s\n", imm8_0,
              nameIReg64( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_0 = (Int)(insn[3+alen] & 1);
         assign( src_elems, loadLE( Ity_I64, mkexpr(addr) ) );
         delta += 3+alen+1;
         DIP( "pinsrq $%d, %s,%s\n",
              imm8_0, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* mask marks (per byte) the destination bytes to keep. */
      UShort mask = 0;
      if ( imm8_0 == 0 ) {
         mask = 0xFF00;
         assign( src_vec,  binop( Iop_64HLtoV128, mkU64(0), mkexpr(src_elems) ) );
      } else {
         mask = 0x00FF;
         assign( src_vec, binop( Iop_64HLtoV128, mkexpr(src_elems), mkU64(0) ) );
      }

      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128, mkexpr(src_vec),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }


   /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
      Insert Doubleword int from gen.reg/mem32 into xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2 /* REX.W is NOT present */
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x22 ) {

      Int imm8_10;   /* imm8 bits 1:0 -- the destination 32-bit lane */
      IRTemp src_elems = newTemp(Ity_I32);
      IRTemp src_vec   = newTemp(Ity_V128);
      IRTemp z32       = newTemp(Ity_I32);

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);
         assign( src_elems, getIReg32( eregOfRexRM(pfx,modrm) ) );
         delta += 3+1+1;
         DIP( "pinsrd $%d, %s,%s\n", imm8_10,
              nameIReg32( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);
         assign( src_elems, loadLE( Ity_I32, mkexpr(addr) ) );
         delta += 3+alen+1;
         DIP( "pinsrd $%d, %s,%s\n",
              imm8_10, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(z32, mkU32(0));

      /* mask marks (per byte) the destination bytes to keep; src_vec
         holds the new dword in the selected lane, zero elsewhere. */
      UShort mask = 0;
      switch (imm8_10) {
         case 3:  mask = 0x0FFF;
                  assign(src_vec, mk128from32s(src_elems, z32, z32, z32));
                  break;
         case 2:  mask = 0xF0FF;
                  assign(src_vec, mk128from32s(z32, src_elems, z32, z32));
                  break;
         case 1:  mask = 0xFF0F;
                  assign(src_vec, mk128from32s(z32, z32, src_elems, z32));
                  break;
         case 0:  mask = 0xFFF0;
                  assign(src_vec, mk128from32s(z32, z32, z32, src_elems));
                  break;
         default: vassert(0);
      }

      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128, mkexpr(src_vec),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }

   /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
      Insert byte from r32/m8 into xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x20 ) {

      Int    imm8;
      IRTemp new8 = newTemp(Ity_I64);

      modrm = insn[3];

      if ( epartIsReg( modrm ) ) {
         /* Register form: take the low byte of the 32-bit register. */
         imm8 = (Int)(insn[3+1] & 0xF);
         assign( new8, binop(Iop_And64,
                             unop(Iop_32Uto64,
                                  getIReg32(eregOfRexRM(pfx,modrm))),
                             mkU64(0xFF)));
         delta += 3+1+1;
         DIP( "pinsrb $%d,%s,%s\n", imm8,
              nameIReg32( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8 = (Int)(insn[3+alen] & 0xF);
         assign( new8, unop(Iop_8Uto64, loadLE( Ity_I8, mkexpr(addr) )));
         delta += 3+alen+1;
         DIP( "pinsrb $%d,%s,%s\n",
              imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      // Create a V128 value which has the selected byte in the
      // specified lane, and zeroes everywhere else.
      IRTemp tmp128 = newTemp(Ity_V128);
      IRTemp halfshift = newTemp(Ity_I64);
      assign(halfshift, binop(Iop_Shl64,
                              mkexpr(new8), mkU8(8 * (imm8 & 7))));
      vassert(imm8 >= 0 && imm8 <= 15);
      if (imm8 < 8) {
         assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
      } else {
         assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
      }

      /* Byte-select mask with only the target byte's bit cleared. */
      UShort mask = ~(1 << imm8);

      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_OrV128,
                        mkexpr(tmp128),
                        binop( Iop_AndV128,
                               getXMMReg( gregOfRexRM(pfx, modrm) ),
                               mkV128(mask) ) ) );

      goto decode_success;
   }
14797
14798
   /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
      float from xmm reg and store in gen.reg or mem.  This is
      identical to PEXTRD, except that REX.W appears to be ignored.
   */
   if ( have66noF2noF3( pfx )
        && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
        && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x17 ) {

      Int imm8_10;
      IRTemp xmm_vec   = newTemp(Ity_V128);
      IRTemp src_dword = newTemp(Ity_I32);

      modrm = insn[3];
      /* The reg field (G) is the XMM source; split it into its four
         32-bit lanes for selection below. */
      assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
      breakup128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

      /* Only bits 1:0 of the immediate select a lane.  The imm8 byte
         follows the modrm byte (reg case) or the whole amode (mem
         case). */
      if ( epartIsReg( modrm ) ) {
         imm8_10 = (Int)(insn[3+1] & 3);
      } else {
         /* Trailing '1' tells disAMode that one immediate byte
            follows the amode, so rip-relative addressing is computed
            correctly. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 1 );
         imm8_10 = (Int)(insn[3+alen] & 3);
      }

      switch ( imm8_10 ) {
         case 0:  assign( src_dword, mkexpr(t0) ); break;
         case 1:  assign( src_dword, mkexpr(t1) ); break;
         case 2:  assign( src_dword, mkexpr(t2) ); break;
         case 3:  assign( src_dword, mkexpr(t3) ); break;
         default: vassert(0);
      }

      if ( epartIsReg( modrm ) ) {
         /* Destination is always a 32-bit GPR, even with REX.W (see
            header comment above). */
         putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
         delta += 3+1+1;   /* 3 opcode bytes + modrm + imm8 */
         DIP( "extractps $%d, %s,%s\n", imm8_10,
              nameXMMReg( gregOfRexRM(pfx, modrm) ),
              nameIReg32( eregOfRexRM(pfx, modrm) ) );
      } else {
         storeLE( mkexpr(addr), mkexpr(src_dword) );
         delta += 3+alen+1;
         DIP( "extractps $%d, %s,%s\n",
              imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
      }

      goto decode_success;
   }
14845
14846
   /* 66 0F 38 37 = PCMPGTQ
      64x2 comparison (signed, presumably; the Intel docs don't say :-)
   */
   if ( have66noF2noF3( pfx ) && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x37) {
      /* FIXME: this needs an alignment check */
      /* Operand decode, load and register writeback are delegated to
         the common SSE E-to-G integer-op handler; delta+3 skips the
         three opcode bytes 0F 38 37. */
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+3,
                                 "pcmpgtq", Iop_CmpGT64Sx2, False );
      goto decode_success;
   }
14857
14858   /* 66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
14859      Maximum of Packed Signed Double Word Integers (XMM)
14860      66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
14861      Minimum of Packed Signed Double Word Integers (XMM) */
14862   if ( have66noF2noF3( pfx ) && sz == 2
14863        && insn[0] == 0x0F && insn[1] == 0x38
14864        && (insn[2] == 0x3D || insn[2] == 0x39)) {
14865      /* FIXME: this needs an alignment check */
14866      Bool isMAX = insn[2] == 0x3D;
14867      delta = dis_SSEint_E_to_G(
14868                 vbi, pfx, delta+3,
14869                 isMAX ? "pmaxsd" : "pminsd",
14870                 isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
14871                 False
14872              );
14873      goto decode_success;
14874   }
14875
14876   /* 66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
14877      Maximum of Packed Unsigned Doubleword Integers (XMM)
14878      66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
14879      Minimum of Packed Unsigned Doubleword Integers (XMM) */
14880   if ( have66noF2noF3( pfx ) && sz == 2
14881        && insn[0] == 0x0F && insn[1] == 0x38
14882        && (insn[2] == 0x3F || insn[2] == 0x3B)) {
14883      /* FIXME: this needs an alignment check */
14884      Bool isMAX = insn[2] == 0x3F;
14885      delta = dis_SSEint_E_to_G(
14886                 vbi, pfx, delta+3,
14887                 isMAX ? "pmaxud" : "pminud",
14888                 isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
14889                 False
14890              );
14891      goto decode_success;
14892   }
14893
14894   /* 66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
14895      Maximum of Packed Unsigned Word Integers (XMM)
14896      66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
14897      Minimum of Packed Unsigned Word Integers (XMM)
14898   */
14899   if ( have66noF2noF3( pfx ) && sz == 2
14900        && insn[0] == 0x0F && insn[1] == 0x38
14901        && (insn[2] == 0x3E || insn[2] == 0x3A)) {
14902      /* FIXME: this needs an alignment check */
14903      Bool isMAX = insn[2] == 0x3E;
14904      delta = dis_SSEint_E_to_G(
14905                 vbi, pfx, delta+3,
14906                 isMAX ? "pmaxuw" : "pminuw",
14907                 isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
14908                 False
14909              );
14910      goto decode_success;
14911   }
14912
14913   /* 66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128
14914      8Sx16 (signed) max
14915      66 0F 38 38 /r = PMINSB xmm1, xmm2/m128
14916      8Sx16 (signed) min
14917   */
14918   if ( have66noF2noF3( pfx ) && sz == 2
14919        && insn[0] == 0x0F && insn[1] == 0x38
14920        && (insn[2] == 0x3C || insn[2] == 0x38)) {
14921      /* FIXME: this needs an alignment check */
14922      Bool isMAX = insn[2] == 0x3C;
14923      delta = dis_SSEint_E_to_G(
14924                 vbi, pfx, delta+3,
14925                 isMAX ? "pmaxsb" : "pminsb",
14926                 isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
14927                 False
14928              );
14929      goto decode_success;
14930   }
14931
   /* 66 0f 38 20 /r = PMOVSXBW xmm1, xmm2/m64
      Packed Move with Sign Extend from Byte to Word (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x20 ) {

      modrm = insn[3];

      /* Only the low 8 bytes of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxbw %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; the load is not
            alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxbw %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Interleaving with zero widens each source byte to 16 bits
         (byte in the low half of each lane); the shl-by-8 / sar-by-8
         pair then replaces the zero high half with the sign
         extension. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN16x8,
                        binop( Iop_ShlN16x8,
                               binop( Iop_InterleaveLO8x16,
                                      IRExpr_Const( IRConst_V128(0) ),
                                      mkexpr(srcVec) ),
                               mkU8(8) ),
                        mkU8(8) ) );

      goto decode_success;
   }
14968
14969
   /* 66 0f 38 21 /r = PMOVSXBD xmm1, xmm2/m32
      Packed Move with Sign Extend from Byte to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x21 ) {

      modrm = insn[3];

      /* Only the low 4 bytes of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxbd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) )  );
      } else {
         /* Memory form reads only 32 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxbd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Two interleaves with zero widen each source byte to 32 bits
         (byte in the low quarter of each lane); the shl-by-24 /
         sar-by-24 pair then produces the sign extension. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN32x4,
                        binop( Iop_ShlN32x4,
                               binop( Iop_InterleaveLO8x16,
                                      mkexpr(zeroVec),
                                      binop( Iop_InterleaveLO8x16,
                                             mkexpr(zeroVec),
                                             mkexpr(srcVec) ) ),
                               mkU8(24) ), mkU8(24) ) );

      goto decode_success;
   }
15010
15011
   /* 66 0f 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
      Packed Move with Sign Extend from Byte to QWord (XMM) */
   if ( have66noF2noF3(pfx)
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x22 ) {

      modrm = insn[3];

      /* Only two source bytes are consumed, fetched as one I16. */
      IRTemp srcBytes = newTemp(Ity_I16);

      if ( epartIsReg(modrm) ) {
         assign( srcBytes, getXMMRegLane16( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxbq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 16 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxbq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Scalar route: split the 16 bits into its two bytes, sign-
         extend each to 64 bits, and rebuild the vector (high byte to
         the high lane). */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_64HLtoV128,
                        unop( Iop_8Sto64,
                              unop( Iop_16HIto8,
                                    mkexpr(srcBytes) ) ),
                        unop( Iop_8Sto64,
                              unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
15046
15047
   /* 66 0f 38 23 /r = PMOVSXWD xmm1, xmm2/m64
      Packed Move with Sign Extend from Word to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x23 ) {

      modrm = insn[3];

      /* Only the low 4 words of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxwd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovsxwd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Interleave with zero to widen each word to 32 bits (word in
         the low half of each lane), then shl/sar by 16 to sign-extend
         into the high half. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_SarN32x4,
                        binop( Iop_ShlN32x4,
                               binop( Iop_InterleaveLO16x8,
                                      IRExpr_Const( IRConst_V128(0) ),
                                      mkexpr(srcVec) ),
                               mkU8(16) ),
                        mkU8(16) ) );

      goto decode_success;
   }
15084
15085
   /* 66 0f 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
      Packed Move with Sign Extend from Word to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x24 ) {

      modrm = insn[3];

      /* Only two source words are consumed, fetched as one I32. */
      IRTemp srcBytes = newTemp(Ity_I32);

      if ( epartIsReg( modrm ) ) {
         assign( srcBytes, getXMMRegLane32( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxwq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 32 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxwq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Scalar route: split the 32 bits into two words, sign-extend
         each to 64 bits, and rebuild the vector. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_64HLtoV128,
                        unop( Iop_16Sto64,
                              unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
                        unop( Iop_16Sto64,
                              unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
15119
15120
   /* 66 0f 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
      Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x25 ) {

      modrm = insn[3];

      /* Two source dwords, fetched together as one I64. */
      IRTemp srcBytes = newTemp(Ity_I64);

      if ( epartIsReg(modrm) ) {
         assign( srcBytes, getXMMRegLane64( eregOfRexRM(pfx, modrm), 0 ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovsxdq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
         delta += 3+alen;
         DIP( "pmovsxdq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* Split the 64 bits into two dwords, sign-extend each to 64
         bits, and rebuild the vector. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_64HLtoV128,
                        unop( Iop_32Sto64,
                              unop( Iop_64HIto32, mkexpr(srcBytes) ) ),
                        unop( Iop_32Sto64,
                              unop( Iop_64to32, mkexpr(srcBytes) ) ) ) );

      goto decode_success;
   }
15154
15155
   /* 66 0f 38 30 /r = PMOVZXBW xmm1, xmm2/m64
      Packed Move with Zero Extend from Byte to Word (XMM) */
   if ( have66noF2noF3(pfx)
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x30 ) {

      modrm = insn[3];

      /* Only the low 8 bytes of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxbw %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbw %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* A single interleave with zero puts a zero byte above each
         source byte, which is exactly zero-extension to 16 bits. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO8x16,
                        IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );

      goto decode_success;
   }
15187
15188
   /* 66 0f 38 31 /r = PMOVZXBD xmm1, xmm2/m32
      Packed Move with Zero Extend from Byte to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x31 ) {

      modrm = insn[3];

      /* Only the low 4 bytes of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxbd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 32 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Two interleaves with zero place three zero bytes above each
         source byte: zero-extension to 32 bits. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO8x16,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO8x16,
                               mkexpr(zeroVec), mkexpr(srcVec) ) ) );

      goto decode_success;
   }
15225
15226
   /* 66 0f 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
      Packed Move with Zero Extend from Byte to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x32 ) {

      modrm = insn[3];

      /* Only the low 2 bytes of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxbq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 16 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128,
                       unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ) ) ) );
         delta += 3+alen;
         DIP( "pmovzxbq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp(Ity_V128);
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Three interleaves with zero place seven zero bytes above each
         source byte: zero-extension to 64 bits. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO8x16,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO8x16,
                               mkexpr(zeroVec),
                               binop( Iop_InterleaveLO8x16,
                                      mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );

      goto decode_success;
   }
15266
15267
   /* 66 0f 38 33 /r = PMOVZXWD xmm1, xmm2/m64
      Packed Move with Zero Extend from Word to DWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x33 ) {

      modrm = insn[3];

      /* Only the low 4 words of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxwd %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxwd %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* One interleave with zero places a zero word above each source
         word: zero-extension to 32 bits. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO16x8,
                        IRExpr_Const( IRConst_V128(0) ),
                        mkexpr(srcVec) ) );

      goto decode_success;
   }
15300
15301
   /* 66 0f 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
      Packed Move with Zero Extend from Word to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x34 ) {

      modrm = insn[3];

      /* Only the low 2 words of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg( modrm ) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxwq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 32 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxwq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      IRTemp zeroVec = newTemp( Ity_V128 );
      assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

      /* Two interleaves with zero place three zero words above each
         source word: zero-extension to 64 bits. */
      putXMMReg( gregOfRexRM( pfx, modrm ),
                 binop( Iop_InterleaveLO16x8,
                        mkexpr(zeroVec),
                        binop( Iop_InterleaveLO16x8,
                               mkexpr(zeroVec), mkexpr(srcVec) ) ) );

      goto decode_success;
   }
15338
15339
   /* 66 0f 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
      Packed Move with Zero Extend from DWord to QWord (XMM) */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x35 ) {

      modrm = insn[3];

      /* Only the low 2 dwords of srcVec are used. */
      IRTemp srcVec = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( srcVec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmovzxdq %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Memory form reads only 64 bits; not alignment-checked. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( srcVec,
                 unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
         delta += 3+alen;
         DIP( "pmovzxdq %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* One interleave with zero places a zero dword above each
         source dword: zero-extension to 64 bits. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_InterleaveLO32x4,
                        IRExpr_Const( IRConst_V128(0) ),
                        mkexpr(srcVec) ) );

      goto decode_success;
   }
15372
15373
   /* 66 0f 38 40 /r = PMULLD xmm1, xmm2/m128
      32x4 integer multiply from xmm2/m128 to xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x40 ) {

      modrm = insn[3];

      IRTemp argL = newTemp(Ity_V128);
      IRTemp argR = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;   /* 3 opcode bytes + modrm */
         DIP( "pmulld %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand, hence the alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "pmulld %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

      /* Iop_Mul32x4 keeps the low 32 bits of each lane product, which
         is what PMULLD requires. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );

      goto decode_success;
   }
15407
15408
   /* F3 0F B8  = POPCNT{W,L,Q}
      Count the number of 1 bits in a register
    */
   if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
       && insn[0] == 0x0F && insn[1] == 0xB8) {
      /* Operand size comes from the usual 66 / REX.W prefixes. */
      vassert(sz == 2 || sz == 4 || sz == 8);
      /*IRType*/ ty  = szToITy(sz);
      IRTemp     src = newTemp(ty);
      modrm = insn[2];
      if (epartIsReg(modrm)) {
         assign(src, getIRegE(sz, pfx, modrm));
         delta += 2+1;   /* 0F B8 + modrm */
         DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
             nameIRegG(sz, pfx, modrm));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0);
         assign(src, loadLE(ty, mkexpr(addr)));
         delta += 2+alen;
         DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
             nameIRegG(sz, pfx, modrm));
      }

      IRTemp result = gen_POPCOUNT(ty, src);
      putIRegG(sz, pfx, modrm, mkexpr(result));

      // Update flags.  This is pretty lame .. perhaps can do better
      // if this turns out to be performance critical.
      // O S A C P are cleared.  Z is set if SRC == 0.
      // With OP_COPY, DEP1 holds the new flag values verbatim; here
      // that is just the Z bit, derived from the widened source.
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
            binop(Iop_Shl64,
                  unop(Iop_1Uto64,
                       binop(Iop_CmpEQ64,
                             widenUto64(mkexpr(src)),
                             mkU64(0))),
                  mkU8(AMD64G_CC_SHIFT_Z))));

      goto decode_success;
   }
15450
15451
   /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
      66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
   */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A
       && (insn[2] == 0x0B || insn[2] == 0x0A)) {

      /* Opcode byte 0x0B is the double-precision (SD) form. */
      Bool   isD = insn[2] == 0x0B;
      IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
      IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
      Int    imm = 0;

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src,
                 isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
                     : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
         imm = insn[3+1];
         /* Only imm8 values 0..15 are accepted; anything with bits
            7:4 set is treated as undecodable. */
         if (imm & ~15) goto decode_failure;
         delta += 3+1+1;   /* opcode(3) + modrm + imm8 */
         DIP( "rounds%c $%d,%s,%s\n",
              isD ? 'd' : 's',
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Scalar memory operand: no alignment check is done. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
         imm = insn[3+alen];
         if (imm & ~15) goto decode_failure;
         delta += 3+alen+1;
         DIP( "rounds%c $%d,%s,%s\n",
              isD ? 'd' : 's',
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  Bit 2 set means: use the current MXCSR rounding mode
         instead of the immediate. */
      assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                        (imm & 4) ? get_sse_roundingmode()
                                  : mkU32(imm & 3),
                        mkexpr(src)) );

      /* Only the low lane of the destination is written. */
      if (isD)
         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
      else
         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );

      goto decode_success;
   }
15505
15506
   /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x09) {

      IRTemp src0 = newTemp(Ity_F64);
      IRTemp src1 = newTemp(Ity_F64);
      IRTemp res0 = newTemp(Ity_F64);
      IRTemp res1 = newTemp(Ity_F64);
      IRTemp rm   = newTemp(Ity_I32);
      Int    imm  = 0;

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src0,
                 getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
         assign( src1,
                 getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
         imm = insn[3+1];
         /* Only imm8 values 0..15 are accepted; anything with bits
            7:4 set is treated as undecodable. */
         if (imm & ~15) goto decode_failure;
         delta += 3+1+1;   /* opcode(3) + modrm + imm8 */
         DIP( "roundpd $%d,%s,%s\n",
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand, loaded as two F64s at
            offsets 0 and 8; hence the alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned(addr);
         assign( src0, loadLE(Ity_F64,
                              binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
         assign( src1, loadLE(Ity_F64,
                              binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
         imm = insn[3+alen];
         if (imm & ~15) goto decode_failure;
         delta += 3+alen+1;
         DIP( "roundpd $%d,%s,%s\n",
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  Bit 2 set means: use the current MXCSR rounding mode
         instead of the immediate. */
      assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));

      assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
      assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );

      putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
      putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );

      goto decode_success;
   }
15560
15561
   /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x08) {

      IRTemp src0 = newTemp(Ity_F32);
      IRTemp src1 = newTemp(Ity_F32);
      IRTemp src2 = newTemp(Ity_F32);
      IRTemp src3 = newTemp(Ity_F32);
      IRTemp res0 = newTemp(Ity_F32);
      IRTemp res1 = newTemp(Ity_F32);
      IRTemp res2 = newTemp(Ity_F32);
      IRTemp res3 = newTemp(Ity_F32);
      IRTemp rm   = newTemp(Ity_I32);
      Int    imm  = 0;

      modrm = insn[3];

      if (epartIsReg(modrm)) {
         assign( src0,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
         assign( src1,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
         assign( src2,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
         assign( src3,
                 getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
         imm = insn[3+1];
         /* Only imm8 values 0..15 are accepted; anything with bits
            7:4 set is treated as undecodable. */
         if (imm & ~15) goto decode_failure;
         delta += 3+1+1;   /* opcode(3) + modrm + imm8 */
         DIP( "roundps $%d,%s,%s\n",
              imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                   nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         /* Full 128-bit memory operand, loaded as four F32s at
            offsets 0/4/8/12; hence the alignment check. */
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned(addr);
         assign( src0, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
         assign( src1, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
         assign( src2, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
         assign( src3, loadLE(Ity_F32,
                              binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
         imm = insn[3+alen];
         if (imm & ~15) goto decode_failure;
         delta += 3+alen+1;
         DIP( "roundps $%d,%s,%s\n",
              imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
         that encoding is the same as the encoding for IRRoundingMode,
         we can use that value directly in the IR as a rounding
         mode.  Bit 2 set means: use the current MXCSR rounding mode
         instead of the immediate. */
      assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));

      assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
      assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
      assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
      assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );

      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
      putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );

      goto decode_success;
   }
15631
15632
   /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
      which we can only decode if we're sure this is an AMD cpu that
      supports LZCNT, since otherwise it's BSR, which behaves
      differently.  (Without LZCNT capability the F3 prefix would be
      ignored and the bytes would decode as BSR.) */
   if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
       && insn[0] == 0x0F && insn[1] == 0xBD
       && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
      vassert(sz == 2 || sz == 4 || sz == 8);
      /*IRType*/ ty  = szToITy(sz);
      IRTemp     src = newTemp(ty);
      modrm = insn[2];
      /* Fetch the source operand, from register or memory. */
      if (epartIsReg(modrm)) {
         assign(src, getIRegE(sz, pfx, modrm));
         delta += 2+1;
         DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
             nameIRegG(sz, pfx, modrm));
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+2, dis_buf, 0);
         assign(src, loadLE(ty, mkexpr(addr)));
         delta += 2+alen;
         DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
             nameIRegG(sz, pfx, modrm));
      }

      /* Compute the count and write it to the G register. */
      IRTemp res = gen_LZCNT(ty, src);
      putIRegG(sz, pfx, modrm, mkexpr(res));

      // Update flags.  This is pretty lame .. perhaps can do better
      // if this turns out to be performance critical.
      // O S A P are cleared.  Z is set if RESULT == 0.
      // C is set if SRC is zero.
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp res64 = newTemp(Ity_I64);
      assign(src64, widenUto64(mkexpr(src)));
      assign(res64, widenUto64(mkexpr(res)));

      /* Build the new OSZACP word: Z from (result == 0), C from
         (source == 0), everything else zero. */
      IRTemp oszacp = newTemp(Ity_I64);
      assign(
         oszacp,
         binop(Iop_Or64,
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_Z)),
               binop(Iop_Shl64,
                     unop(Iop_1Uto64,
                          binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
                     mkU8(AMD64G_CC_SHIFT_C))
         )
      );

      /* Set the flags thunk to COPY so DEP1 is used verbatim as the
         new OSZACP value. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));

      goto decode_success;
   }
15691
   /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
      66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
      66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
      66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
      (selected special cases that actually occur in glibc,
       not by any means a complete implementation.)
   */
   if (have66noF2noF3(pfx)
       && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x3A
       && (insn[2] >= 0x60 && insn[2] <= 0x63)) {

      /* Bit 1 of the opcode selects the implicit-length ('I')
         variants; inverted bit 0 selects the mask-result ('M')
         variants. */
      UInt  isISTRx = insn[2] & 2;
      UInt  isxSTRM = (insn[2] & 1) ^ 1;
      UInt  regNoL = 0;
      UInt  regNoR = 0;
      UChar imm    = 0;

      /* This is a nasty kludge.  We need to pass 2 x V128 to the
         helper (which is clean).  Since we can't do that, use a dirty
         helper to compute the results directly from the XMM regs in
         the guest state.  That means for the memory case, we need to
         move the left operand into a pseudo-register (XMM16, let's
         call it). */
      modrm = insn[3];
      if (epartIsReg(modrm)) {
         regNoL = eregOfRexRM(pfx, modrm);
         regNoR = gregOfRexRM(pfx, modrm);
         imm = insn[3+1];
         delta += 3+1+1;
      } else {
         regNoL = 16; /* use XMM16 as an intermediary */
         regNoR = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         /* No alignment check; I guess that makes sense, given that
            these insns are for dealing with C style strings. */
         stmt( IRStmt_Put( OFFB_XMM16, loadLE(Ity_V128, mkexpr(addr)) ));
         imm = insn[3+alen];
         delta += 3+alen+1;
      }

      /* Now we know the XMM reg numbers for the operands, and the
         immediate byte.  Is it one we can actually handle? Throw out
         any cases for which the helper function has not been
         verified. */
      switch (imm) {
         case 0x00:
         case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:
         case 0x1A: case 0x38: case 0x3A: case 0x44: case 0x4A:
            break;
         default:
            goto decode_failure;
      }

      /* Who ya gonna call?  Presumably not Ghostbusters. */
      void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
      HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

      /* Round up the arguments.  Note that this is a kludge -- the
         use of mkU64 rather than mkIRExpr_HWord implies the
         assumption that the host's word size is 64-bit. */
      UInt gstOffL = regNoL == 16 ? OFFB_XMM16 : xmmGuestRegOffset(regNoL);
      UInt gstOffR = xmmGuestRegOffset(regNoR);

      /* Pack the opcode's 4th byte and the imm8 into one argument so
         the helper knows exactly which variant to perform. */
      IRExpr*  opc4_and_imm = mkU64((insn[2] << 8) | (imm & 0xFF));
      IRExpr*  gstOffLe     = mkU64(gstOffL);
      IRExpr*  gstOffRe     = mkU64(gstOffR);
      IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
      IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
      IRExpr** args
         = mkIRExprVec_5( opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

      IRTemp   resT = newTemp(Ity_I64);
      IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
      /* It's not really a dirty call, but we can't use the clean
         helper mechanism here for the very lame reason that we can't
         pass 2 x V128s by value to a helper, nor get one back.  Hence
         this roundabout scheme. */
      d->needsBBP = True;
      d->nFxState = 2;
      d->fxState[0].fx     = Ifx_Read;
      d->fxState[0].offset = gstOffL;
      d->fxState[0].size   = sizeof(U128);
      d->fxState[1].fx     = Ifx_Read;
      d->fxState[1].offset = gstOffR;
      d->fxState[1].size   = sizeof(U128);
      if (isxSTRM) {
         /* Declare that the helper writes XMM0. */
         d->nFxState = 3;
         d->fxState[2].fx     = Ifx_Write;
         d->fxState[2].offset = xmmGuestRegOffset(0);
         d->fxState[2].size   = sizeof(U128);
      }

      stmt( IRStmt_Dirty(d) );

      /* Now resT[15:0] holds the new OSZACP values, so the condition
         codes must be updated. And for a xSTRI case, resT[31:16]
         holds the new ECX value, so stash that too. */
      if (!isxSTRM) {
         putIReg64(R_RCX, binop(Iop_And64,
                                binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                                mkU64(0xFFFF)));
      }

      /* Copy the OSZACP bits into the flags thunk. */
      stmt( IRStmt_Put(
               OFFB_CC_DEP1,
               binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
      ));
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

      if (regNoL == 16) {
         DIP("pcmp%cstr%c $%x,%s,%s\n",
             isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
             (UInt)imm, dis_buf, nameXMMReg(regNoR));
      } else {
         DIP("pcmp%cstr%c $%x,%s,%s\n",
             isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
             (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
      }

      goto decode_success;
   }
15817
15818
   /* 66 0f 38 17 /r = PTEST xmm1, xmm2/m128
      Logical compare (set ZF and CF from AND/ANDN of the operands) */
   if (have66noF2noF3( pfx )
       && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x17) {
      modrm = insn[3];
      IRTemp vecE = newTemp(Ity_V128);
      IRTemp vecG = newTemp(Ity_V128);

      /* Fetch the E operand (register or 16-aligned memory). */
      if ( epartIsReg(modrm) ) {
         assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
         delta += 3+1;
         DIP( "ptest %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "ptest %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));

      /* Set Z=1 iff (vecE & vecG) == 0
         Set C=1 iff (vecE & not vecG) == 0
      */

      /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
      IRTemp andV  = newTemp(Ity_V128);
      IRTemp andnV = newTemp(Ity_V128);
      assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
      /* not(vecG) is done as xor with all-ones; mkV128(0xFFFF) sets
         all 16 byte-lanes. */
      assign(andnV, binop(Iop_AndV128,
                          mkexpr(vecE),
                          binop(Iop_XorV128, mkexpr(vecG),
                                             mkV128(0xFFFF))));

      /* The same, but reduced to 64-bit values, by or-ing the top
         and bottom 64-bits together.  It relies on this trick:

          InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence

          InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
          InterleaveHI64x2([a,b],[a,b]) == [a,a]

          and so the OR of the above 2 exprs produces
          [a OR b, a OR b], from which we simply take the lower half.
      */
      IRTemp and64  = newTemp(Ity_I64);
      IRTemp andn64 = newTemp(Ity_I64);

      assign(
         and64,
         unop(Iop_V128to64,
              binop(Iop_OrV128,
                    binop(Iop_InterleaveLO64x2, mkexpr(andV), mkexpr(andV)),
                    binop(Iop_InterleaveHI64x2, mkexpr(andV), mkexpr(andV))
              )
         )
      );

      assign(
         andn64,
         unop(Iop_V128to64,
              binop(Iop_OrV128,
                    binop(Iop_InterleaveLO64x2, mkexpr(andnV), mkexpr(andnV)),
                    binop(Iop_InterleaveHI64x2, mkexpr(andnV), mkexpr(andnV))
              )
          )
       );

      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
         slice out the Z and C bits conveniently.  We use the standard
         trick all-zeroes -> all-zeroes, anything-else -> all-ones
         done by "(x | -x) >>s (word-size - 1)".
      */
      /* Note the Not64: the flag is 1 when the AND result is zero,
         so we need the inverse of the nonzero-detect. */
      IRTemp z64 = newTemp(Ity_I64);
      IRTemp c64 = newTemp(Ity_I64);
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                              mkexpr(and64)
                        ),
                        mkU8(63)))
      );

      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                              mkexpr(andn64)
                        ),
                        mkU8(63)))
      );

      /* And finally, slice out the Z and C flags and set the flags
         thunk to COPY for them.  OSAP are set to zero. */
      IRTemp newOSZACP = newTemp(Ity_I64);
      assign(newOSZACP,
             binop(Iop_Or64,
                   binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
                   binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))
             )
      );

      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

      goto decode_success;
   }
15936
   /* 66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
      66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
      66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
      Blend at various granularities, with XMM0 (implicit operand)
      providing the controlling mask.
   */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38
       && (insn[2] == 0x15 || insn[2] == 0x14 || insn[2] == 0x10)) {
      modrm = insn[3];

      /* Per-variant parameters: name, lane width in bytes, and the
         lane-wise arithmetic shift used to build the mask. */
      HChar* nm    = NULL;
      UInt   gran  = 0;
      IROp   opSAR = Iop_INVALID;
      switch (insn[2]) {
         case 0x15:
            nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
            break;
         case 0x14:
            nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
            break;
         case 0x10:
            nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
            break;
      }
      vassert(nm);

      IRTemp vecE = newTemp(Ity_V128);
      IRTemp vecG = newTemp(Ity_V128);
      IRTemp vec0 = newTemp(Ity_V128);

      if ( epartIsReg(modrm) ) {
         assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
         delta += 3+1;
         DIP( "%s %s,%s\n", nm,
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "%s %s,%s\n", nm,
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
      assign(vec0, getXMMReg(0));

      /* Now the tricky bit is to convert vec0 into a suitable mask,
         by copying the most significant bit of each lane into all
         positions in the lane. */
      IRTemp sh = newTemp(Ity_I8);
      assign(sh, mkU8(8 * gran - 1));

      IRTemp mask = newTemp(Ity_V128);
      assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));

      IRTemp notmask = newTemp(Ity_V128);
      assign(notmask, unop(Iop_NotV128, mkexpr(mask)));

      /* result = (E & mask) | (G & ~mask): lanes whose mask MSB was
         set come from E, the rest keep G's value. */
      IRExpr* res = binop(Iop_OrV128,
                          binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
                          binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask)));
      putXMMReg(gregOfRexRM(pfx, modrm), res);

      goto decode_success;
   }
16005
16006   /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
16007      F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
16008      The decoding on this is a bit unusual.
16009   */
16010   if (haveF2noF3(pfx)
16011       && insn[0] == 0x0F && insn[1] == 0x38
16012       && (insn[2] == 0xF1
16013           || (insn[2] == 0xF0 && !have66(pfx)))) {
16014      modrm = insn[3];
16015
16016      if (insn[2] == 0xF0)
16017         sz = 1;
16018      else
16019         vassert(sz == 2 || sz == 4 || sz == 8);
16020
16021      IRType tyE = szToITy(sz);
16022      IRTemp valE = newTemp(tyE);
16023
16024      if (epartIsReg(modrm)) {
16025         assign(valE, getIRegE(sz, pfx, modrm));
16026         delta += 3+1;
16027         DIP("crc32b %s,%s\n", nameIRegE(sz, pfx, modrm),
16028             nameIRegG(1==getRexW(pfx) ? 8 : 4 ,pfx, modrm));
16029      } else {
16030         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
16031         assign(valE, loadLE(tyE, mkexpr(addr)));
16032         delta += 3+alen;
16033         DIP("crc32b %s,%s\n", dis_buf,
16034             nameIRegG(1==getRexW(pfx) ? 8 : 4 ,pfx, modrm));
16035      }
16036
16037      /* Somewhat funny getting/putting of the crc32 value, in order
16038         to ensure that it turns into 64-bit gets and puts.  However,
16039         mask off the upper 32 bits so as to not get memcheck false
16040         +ves around the helper call. */
16041      IRTemp valG0 = newTemp(Ity_I64);
16042      assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
16043                          mkU64(0xFFFFFFFF)));
16044
16045      HChar* nm = NULL;
16046      void* fn = NULL;
16047      switch (sz) {
16048         case 1: nm = "amd64g_calc_crc32b";
16049                 fn = &amd64g_calc_crc32b; break;
16050         case 2: nm = "amd64g_calc_crc32w";
16051                 fn = &amd64g_calc_crc32w; break;
16052         case 4: nm = "amd64g_calc_crc32l";
16053                 fn = &amd64g_calc_crc32l; break;
16054         case 8: nm = "amd64g_calc_crc32q";
16055                 fn = &amd64g_calc_crc32q; break;
16056      }
16057      vassert(nm && fn);
16058      IRTemp valG1 = newTemp(Ity_I64);
16059      assign(valG1,
16060             mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
16061                           mkIRExprVec_2(mkexpr(valG0),
16062                                         widenUto64(mkexpr(valE)))));
16063
16064      putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
16065      goto decode_success;
16066   }
16067
   /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
      2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
   if ( have66noF2noF3( pfx )
        && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x2B ) {

      modrm = insn[3];

      IRTemp argL = newTemp(Ity_V128);
      IRTemp argR = newTemp(Ity_V128);

      /* Left arg comes from E (register or 16-aligned memory). */
      if ( epartIsReg(modrm) ) {
         assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
         delta += 3+1;
         DIP( "packusdw %s,%s\n",
              nameXMMReg( eregOfRexRM(pfx, modrm) ),
              nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      } else {
         addr = disAMode( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         gen_SEGV_if_not_16_aligned( addr );
         assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
         delta += 3+alen;
         DIP( "packusdw %s,%s\n",
              dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
      }

      assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

      /* Single IR op does the whole signed->unsigned saturating
         narrow; result overwrites the G register. */
      putXMMReg( gregOfRexRM(pfx, modrm),
                 binop( Iop_QNarrowBin32Sto16Ux8,
                        mkexpr(argL), mkexpr(argR)) );

      goto decode_success;
   }
16102
   /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-lanes 0 x
      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
      half.  (NB: the comment previously said PMULUDQ, but the code
      below uses Iop_MullS32 and prints "pmuldq" -- this encoding is
      the SSE4.1 signed PMULDQ; PMULUDQ is 66 0F F4.) */
   /* This is a really poor translation -- could be improved if
      performance critical.  It's a copy-paste of PMULUDQ, too. */
   if (have66noF2noF3(pfx) && sz == 2
       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x28) {
      IRTemp sV, dV;
      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
      sV = newTemp(Ity_V128);
      dV = newTemp(Ity_V128);
      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
      t1 = newTemp(Ity_I64);
      t0 = newTemp(Ity_I64);
      modrm = insn[3];
      assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );

      if (epartIsReg(modrm)) {
         assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
         delta += 3+1;
         DIP("pmuldq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      } else {
         addr = disAMode ( &alen, vbi, pfx, delta+3, dis_buf, 0 );
         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         delta += 3+alen;
         DIP("pmuldq %s,%s\n", dis_buf,
                               nameXMMReg(gregOfRexRM(pfx,modrm)));
      }

      /* Split both vectors into four 32-bit lanes; lanes 1 and 3 are
         unused -- only lanes 0 and 2 feed the multiplies. */
      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
      breakup128to32s( sV, &s3, &s2, &s1, &s0 );

      assign( t0, binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) );
      putXMMRegLane64( gregOfRexRM(pfx,modrm), 0, mkexpr(t0) );
      assign( t1, binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)) );
      putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkexpr(t1) );
      goto decode_success;
   }
16142
   /* 66 0F 38 29 = PCMPEQQ
      64x2 equality comparison
   */
   if ( have66noF2noF3( pfx ) && sz == 2
        && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x29) {
      /* FIXME: this needs an alignment check */
      /* Standard E-to-G integer SSE op; helper handles operand fetch,
         delta advance and DIP printing. */
      delta = dis_SSEint_E_to_G( vbi, pfx, delta+3,
                                 "pcmpeqq", Iop_CmpEQ64x2, False );
      goto decode_success;
   }
16153
16154   /* ---------------------------------------------------- */
16155   /* --- end of the SSE4 decoder                      --- */
16156   /* ---------------------------------------------------- */
16157
16158   /*after_sse_decoders:*/
16159
16160   /* Get the primary opcode. */
16161   opc = getUChar(delta); delta++;
16162
16163   /* We get here if the current insn isn't SSE, or this CPU doesn't
16164      support SSE. */
16165
   switch (opc) {

   /* ------------------------ Control flow --------------- */

   case 0xC2: /* RET imm16 */
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = getUDisp16(delta);
      delta += 2;
      dis_ret(vbi, d64);
      dres.whatNext = Dis_StopHere;
      DIP("ret %lld\n", d64);
      break;

   case 0xC3: /* RET */
      if (have66orF2(pfx)) goto decode_failure;
      /* F3 is acceptable on AMD. */
      dis_ret(vbi, 0);
      dres.whatNext = Dis_StopHere;
      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
      break;

   case 0xE8: /* CALL J4 */
      if (haveF2orF3(pfx)) goto decode_failure;
      d64 = getSDisp32(delta); delta += 4;
      d64 += (guest_RIP_bbstart+delta);
      /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
      /* Push the return address onto the stack. */
      t1 = newTemp(Ity_I64);
      assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t1));
      storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
      t2 = newTemp(Ity_I64);
      assign(t2, mkU64((Addr64)d64));
      make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
      /* If the target is statically acceptable, keep translating at
         the callee; otherwise end the block with a Call jump. */
      if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
         /* follow into the call target. */
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = d64;
      } else {
         jmp_lit(Ijk_Call,d64);
         dres.whatNext = Dis_StopHere;
      }
      DIP("call 0x%llx\n",d64);
      break;
16209
16210//.. //--    case 0xC8: /* ENTER */
16211//.. //--       d32 = getUDisp16(eip); eip += 2;
16212//.. //--       abyte = getUChar(delta); delta++;
16213//.. //--
16214//.. //--       vg_assert(sz == 4);
16215//.. //--       vg_assert(abyte == 0);
16216//.. //--
16217//.. //--       t1 = newTemp(cb); t2 = newTemp(cb);
16218//.. //--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
16219//.. //--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
16220//.. //--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
16221//.. //--       uLiteral(cb, sz);
16222//.. //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
16223//.. //--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
16224//.. //--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
16225//.. //--       if (d32) {
16226//.. //--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
16227//.. //--          uLiteral(cb, d32);
16228//.. //--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
16229//.. //--       }
16230//.. //--       DIP("enter 0x%x, 0x%x", d32, abyte);
16231//.. //--       break;
16232
   case 0xC8: /* ENTER */
      /* Same comments re operand size as for LEAVE below apply.
         Also, only handles the case "enter $imm16, $0"; other cases
         for the second operand (nesting depth) are not handled. */
      if (sz != 4)
         goto decode_failure;
      d64 = getUDisp16(delta);
      delta += 2;
      vassert(d64 >= 0 && d64 <= 0xFFFF);
      /* Second immediate (nesting depth) must be zero. */
      if (getUChar(delta) != 0)
         goto decode_failure;
      delta++;
      /* Intel docs seem to suggest:
           push rbp
           temp = rsp
           rbp = temp
           rsp = rsp - imm16
      */
      t1 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      t2 = newTemp(Ity_I64);
      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
      putIReg64(R_RSP, mkexpr(t2));
      storeLE(mkexpr(t2), mkexpr(t1));
      putIReg64(R_RBP, mkexpr(t2));
      if (d64 > 0) {
         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
      }
      DIP("enter $%u, $0\n", (UInt)d64);
      break;
16263
   case 0xC9: /* LEAVE */
      /* In 64-bit mode this defaults to a 64-bit operand size.  There
         is no way to encode a 32-bit variant.  Hence sz==4 but we do
         it as if sz=8. */
      if (sz != 4)
         goto decode_failure;
      t1 = newTemp(Ity_I64);
      t2 = newTemp(Ity_I64);
      assign(t1, getIReg64(R_RBP));
      /* First PUT RSP looks redundant, but need it because RSP must
         always be up-to-date for Memcheck to work... */
      putIReg64(R_RSP, mkexpr(t1));
      /* rbp = pop(); rsp then points just above the saved rbp. */
      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
      putIReg64(R_RBP, mkexpr(t2));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
      DIP("leave\n");
      break;
16281
16282//.. //--    /* ---------------- Misc weird-ass insns --------------- */
16283//.. //--
16284//.. //--    case 0x27: /* DAA */
16285//.. //--    case 0x2F: /* DAS */
16286//.. //--       t1 = newTemp(cb);
16287//.. //--       uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t1);
16288//.. //--       /* Widen %AL to 32 bits, so it's all defined when we push it. */
16289//.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
16290//.. //--       uWiden(cb, 1, False);
16291//.. //--       uInstr0(cb, CALLM_S, 0);
16292//.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
16293//.. //--       uInstr1(cb, CALLM, 0, Lit16,
16294//.. //--                   opc == 0x27 ? VGOFF_(helper_DAA) : VGOFF_(helper_DAS) );
16295//.. //--       uFlagsRWU(cb, FlagsAC, FlagsSZACP, FlagO);
16296//.. //--       uInstr1(cb, POP, 4, TempReg, t1);
16297//.. //--       uInstr0(cb, CALLM_E, 0);
16298//.. //--       uInstr2(cb, PUT, 1, TempReg, t1, ArchReg, R_AL);
16299//.. //--       DIP(opc == 0x27 ? "daa\n" : "das\n");
16300//.. //--       break;
16301//.. //--
16302//.. //--    case 0x37: /* AAA */
16303//.. //--    case 0x3F: /* AAS */
16304//.. //--       t1 = newTemp(cb);
16305//.. //--       uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
16306//.. //--       /* Widen %AL to 32 bits, so it's all defined when we push it. */
16307//.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
16308//.. //--       uWiden(cb, 2, False);
16309//.. //--       uInstr0(cb, CALLM_S, 0);
16310//.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
16311//.. //--       uInstr1(cb, CALLM, 0, Lit16,
16312//.. //--                   opc == 0x37 ? VGOFF_(helper_AAA) : VGOFF_(helper_AAS) );
16313//.. //--       uFlagsRWU(cb, FlagA, FlagsAC, FlagsEmpty);
16314//.. //--       uInstr1(cb, POP, 4, TempReg, t1);
16315//.. //--       uInstr0(cb, CALLM_E, 0);
16316//.. //--       uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
16317//.. //--       DIP(opc == 0x37 ? "aaa\n" : "aas\n");
16318//.. //--       break;
16319//.. //--
16320//.. //--    case 0xD4: /* AAM */
16321//.. //--    case 0xD5: /* AAD */
16322//.. //--       d32 = getUChar(delta); delta++;
16323//.. //--       if (d32 != 10) VG_(core_panic)("disInstr: AAM/AAD but base not 10 !");
16324//.. //--       t1 = newTemp(cb);
16325//.. //--       uInstr2(cb, GET, 2, ArchReg, R_EAX, TempReg, t1);
16326//.. //--       /* Widen %AX to 32 bits, so it's all defined when we push it. */
16327//.. //--       uInstr1(cb, WIDEN, 4, TempReg, t1);
16328//.. //--       uWiden(cb, 2, False);
16329//.. //--       uInstr0(cb, CALLM_S, 0);
16330//.. //--       uInstr1(cb, PUSH, 4, TempReg, t1);
16331//.. //--       uInstr1(cb, CALLM, 0, Lit16,
16332//.. //--                   opc == 0xD4 ? VGOFF_(helper_AAM) : VGOFF_(helper_AAD) );
16333//.. //--       uFlagsRWU(cb, FlagsEmpty, FlagsSZP, FlagsEmpty);
16334//.. //--       uInstr1(cb, POP, 4, TempReg, t1);
16335//.. //--       uInstr0(cb, CALLM_E, 0);
16336//.. //--       uInstr2(cb, PUT, 2, TempReg, t1, ArchReg, R_EAX);
16337//.. //--       DIP(opc == 0xD4 ? "aam\n" : "aad\n");
16338//.. //--       break;
16339
16340   /* ------------------------ CWD/CDQ -------------------- */
16341
16342   case 0x98: /* CBW */
16343      if (haveF2orF3(pfx)) goto decode_failure;
16344      if (sz == 8) {
16345         putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
16346         DIP(/*"cdqe\n"*/"cltq");
16347         break;
16348      }
16349      if (sz == 4) {
16350         putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
16351         DIP("cwtl\n");
16352         break;
16353      }
16354      if (sz == 2) {
16355         putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
16356         DIP("cbw\n");
16357         break;
16358      }
16359      goto decode_failure;
16360
   case 0x99: /* CWD/CDQ/CQO */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      ty = szToITy(sz);
      /* RDX (at width sz) = all copies of RAX's sign bit, computed by
         arithmetic right-shift of RAX by (width - 1). */
      putIRegRDX( sz,
                  binop(mkSizedOp(ty,Iop_Sar8),
                        getIRegRAX(sz),
                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
      DIP(sz == 2 ? "cwd\n"
                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
                             : "cqo\n"));
      break;
16373
16374   /* ------------------------ FPU ops -------------------- */
16375
16376   case 0x9E: /* SAHF */
16377      codegen_SAHF();
16378      DIP("sahf\n");
16379      break;
16380
16381   case 0x9F: /* LAHF */
16382      codegen_LAHF();
16383      DIP("lahf\n");
16384      break;
16385
16386   case 0x9B: /* FWAIT */
16387      /* ignore? */
16388      DIP("fwait\n");
16389      break;
16390
   case 0xD8:
   case 0xD9:
   case 0xDA:
   case 0xDB:
   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF: {
      /* All x87 FPU escape opcodes: delegate the real decoding to
         dis_FPU. */
      Bool redundantREXWok = False;

      if (haveF2orF3(pfx))
         goto decode_failure;

      /* kludge to tolerate redundant rex.w prefixes (should do this
         properly one day) */
      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
         redundantREXWok = True;

      if ( (sz == 4
           || (sz == 8 && redundantREXWok))
           && haveNo66noF2noF3(pfx)) {
         Long delta0    = delta;
         Bool decode_OK = False;
         delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
         if (!decode_OK) {
            /* dis_FPU may have advanced delta before failing; rewind
               to the saved position before giving up. */
            delta = delta0;
            goto decode_failure;
         }
         break;
      } else {
         goto decode_failure;
      }
   }
16425
16426   /* ------------------------ INT ------------------------ */
16427
   case 0xCC: /* INT 3 */
      /* Breakpoint: raise SIGTRAP, resuming at the next insn. */
      jmp_lit(Ijk_SigTRAP, guest_RIP_bbstart + delta);
      dres.whatNext = Dis_StopHere;
      DIP("int $0x3\n");
      break;

   case 0xCD: { /* INT imm8 */
      IRJumpKind jk = Ijk_Boring;
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = getUChar(delta); delta++;
      /* Only vector 32 ("int $0x80", the 32-bit syscall gate) is
         accepted; any other vector fails the decode. */
      switch (d64) {
         case 32: jk = Ijk_Sys_int32; break;
         default: goto decode_failure;
      }
      guest_RIP_next_mustcheck = True;
      guest_RIP_next_assumed = guest_RIP_bbstart + delta;
      jmp_lit(jk, guest_RIP_next_assumed);
      /* It's important that all ArchRegs carry their up-to-date value
         at this point.  So we declare an end-of-block here, which
         forces any TempRegs caching ArchRegs to be flushed. */
      dres.whatNext = Dis_StopHere;
      DIP("int $0x%02x\n", (UInt)d64);
      break;
   }
16452
16453   /* ------------------------ Jcond, byte offset --------- */
16454
   case 0xEB: /* Jb (jump, byte offset) */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4)
         goto decode_failure; /* JRS added 2004 July 11 */
      /* Target = address of the following insn + sign-extended
         8-bit displacement. */
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (resteerOkFn(callback_opaque,d64)) {
         /* Caller permits chasing through this unconditional jump:
            continue disassembling at the target. */
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = d64;
      } else {
         jmp_lit(Ijk_Boring,d64);
         dres.whatNext = Dis_StopHere;
      }
      DIP("jmp-8 0x%llx\n", d64);
      break;

   case 0xE9: /* Jv (jump, 16/32 offset) */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4)
         goto decode_failure; /* JRS added 2004 July 11 */
      /* Target = address of the following insn + sign-extended
         displacement. */
      d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
      delta += sz;
      if (resteerOkFn(callback_opaque,d64)) {
         dres.whatNext   = Dis_ResteerU;
         dres.continueAt = d64;
      } else {
         jmp_lit(Ijk_Boring,d64);
         dres.whatNext = Dis_StopHere;
      }
      DIP("jmp 0x%llx\n", d64);
      break;
16486
   case 0x70: /* JOb (jump overflow) */
   case 0x71: /* JNOb (jump not overflow) */
   case 0x72: /* JBb/JNAEb (jump below) */
   case 0x73: /* JNBb/JAEb (jump not below) */
   case 0x74: /* JZb/JEb (jump zero) */
   case 0x75: /* JNZb/JNEb (jump not zero) */
   case 0x76: /* JBEb/JNAb (jump below or equal) */
   case 0x77: /* JNBEb/JAb (jump not below or equal) */
   case 0x78: /* JSb (jump negative) */
   case 0x79: /* JNSb (jump not negative) */
   case 0x7A: /* JP (jump parity even) */
   case 0x7B: /* JNP/JPO (jump parity odd) */
   case 0x7C: /* JLb/JNGEb (jump less) */
   case 0x7D: /* JGEb/JNLb (jump greater or equal) */
   case 0x7E: /* JLEb/JNGb (jump less or equal) */
   case 0x7F: /* JGb/JNLEb (jump greater) */
    { /* Conditional jump, 8-bit displacement.  The condition code is
         simply opc - 0x70 (the opcodes are laid out in AMD64Condcode
         order). */
      Long   jmpDelta;
      HChar* comment  = "";
      if (haveF2orF3(pfx)) goto decode_failure;
      jmpDelta = getSDisp8(delta);
      vassert(-128 <= jmpDelta && jmpDelta < 128);
      d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
      delta++;
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
          && jmpDelta < 0
          && resteerOkFn( callback_opaque, d64) ) {
         /* Speculation: assume this backward branch is taken.  So we
            need to emit a side-exit to the insn following this one,
            on the negation of the condition, and continue at the
            branch target address (d64).  If we wind up back at the
            first instruction of the trace, just stop; it's better to
            let the IR loop unroller handle that case. */
         stmt( IRStmt_Exit(
                  mk_amd64g_calculate_condition(
                     (AMD64Condcode)(1 ^ (opc - 0x70))),
                  Ijk_Boring,
                  IRConst_U64(guest_RIP_bbstart+delta) ) );
         dres.whatNext   = Dis_ResteerC;
         dres.continueAt = d64;
         comment = "(assumed taken)";
      }
      else
      if (resteerCisOk
          && vex_control.guest_chase_cond
          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
          && jmpDelta >= 0
          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
         /* Speculation: assume this forward branch is not taken.  So
            we need to emit a side-exit to d64 (the dest) and continue
            disassembling at the insn immediately following this
            one. */
         stmt( IRStmt_Exit(
                  mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
                  Ijk_Boring,
                  IRConst_U64(d64) ) );
         dres.whatNext   = Dis_ResteerC;
         dres.continueAt = guest_RIP_bbstart+delta;
         comment = "(assumed not taken)";
      }
      else {
         /* Conservative default translation - end the block at this
            point. */
         jcc_01( (AMD64Condcode)(opc - 0x70),
                 guest_RIP_bbstart+delta,
                 d64 );
         dres.whatNext = Dis_StopHere;
      }
      DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
      break;
    }
16559
   case 0xE3:
      /* JRCXZ or JECXZ, depending address size override. */
      if (have66orF2orF3(pfx)) goto decode_failure;
      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 32-bit: side-exit to the target iff ECX == 0 */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                            unop(Iop_32Uto64, getIReg32(R_RCX)),
                            mkU64(0)),
               Ijk_Boring,
               IRConst_U64(d64))
             );
         DIP("jecxz 0x%llx\n", d64);
      } else {
         /* 64-bit: side-exit to the target iff RCX == 0 */
         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
                                  getIReg64(R_RCX),
                                  mkU64(0)),
               Ijk_Boring,
               IRConst_U64(d64))
             );
         DIP("jrcxz 0x%llx\n", d64);
      }
      break;
16585
   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as a count depending on the
         address size override, not the operand one. */
      IRExpr* zbit  = NULL;
      IRExpr* count = NULL;
      IRExpr* cond  = NULL;
      HChar*  xtra  = NULL;

      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
      /* So at this point we've rejected any variants which appear to
         be governed by the usual operand-size modifiers.  Hence only
         the address size prefix can have an effect.  It changes the
         size from 64 (default) to 32. */
      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
      delta++;
      if (haveASO(pfx)) {
         /* 64to32 of 64-bit get is merely a get-put improvement
            trick. */
         putIReg32(R_RCX, binop(Iop_Sub32,
                                unop(Iop_64to32, getIReg64(R_RCX)),
                                mkU32(1)));
      } else {
         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
      }

      /* This is correct, both for 32- and 64-bit versions.  If we're
         doing a 32-bit dec and the result is zero then the default
         zero extension rule will cause the upper 32 bits to be zero
         too.  Hence a 64-bit check against zero is OK. */
      count = getIReg64(R_RCX);
      cond = binop(Iop_CmpNE64, count, mkU64(0));
      switch (opc) {
         case 0xE2:
            xtra = "";
            break;
         case 0xE1:
            /* LOOPE: additionally require ZF set */
            xtra = "e";
            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
            cond = mkAnd1(cond, zbit);
            break;
         case 0xE0:
            /* LOOPNE: additionally require ZF clear */
            xtra = "ne";
            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
            cond = mkAnd1(cond, zbit);
            break;
         default:
	    vassert(0);
      }
      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64)) );

      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
      break;
    }
16641
16642   /* ------------------------ IMUL ----------------------- */
16643
16644   case 0x69: /* IMUL Iv, Ev, Gv */
16645      if (haveF2orF3(pfx)) goto decode_failure;
16646      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
16647      break;
16648   case 0x6B: /* IMUL Ib, Ev, Gv */
16649      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
16650      break;
16651
16652   /* ------------------------ MOV ------------------------ */
16653
   case 0x88: /* MOV Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_G_E(vbi, pfx, 1, delta);
      break;

   case 0x89: /* MOV Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_G_E(vbi, pfx, sz, delta);
      break;

   case 0x8A: /* MOV Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_E_G(vbi, pfx, 1, delta);
      break;

   case 0x8B: /* MOV Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_mov_E_G(vbi, pfx, sz, delta);
      break;

   case 0x8D: /* LEA M,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz != 4 && sz != 8)
         goto decode_failure;
      modrm = getUChar(delta);
      /* LEA requires a memory operand; a register E-part is invalid. */
      if (epartIsReg(modrm))
         goto decode_failure;
      /* NOTE!  this is the one place where a segment override prefix
         has no effect on the address calculation.  Therefore we clear
         any segment override bits in pfx. */
      addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
      delta += alen;
      /* This is a hack.  But it isn't clear that really doing the
         calculation at 32 bits is really worth it.  Hence for leal,
         do the full 64-bit calculation and then truncate it. */
      putIRegG( sz, pfx, modrm,
                         sz == 4
                            ? unop(Iop_64to32, mkexpr(addr))
                            : mkexpr(addr)
              );
      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
                            nameIRegG(sz,pfx,modrm));
      break;
16697
16698//..    case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
16699//..       delta = dis_mov_Sw_Ew(sorb, sz, delta);
16700//..       break;
16701//..
16702//..    case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
16703//..       delta = dis_mov_Ew_Sw(sorb, delta);
16704//..       break;
16705
   case 0xA0: /* MOV Ob,AL */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA1: /* MOV Ov,eAX */
      /* NOTE(review): unlike 0xA0, the direct 0xA1 path performs no
         F2/F3 prefix rejection here -- presumably intentional, but
         worth confirming. */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      /* moffs form: an 8-byte absolute address follows the opcode. */
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      /* Apply any segment/address-size overrides to the literal
         address. */
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
                                  segRegTxt(pfx), d64,
                                  nameIRegRAX(sz));
      break;

   case 0xA2: /* MOV AL,Ob */
      if (have66orF2orF3(pfx)) goto decode_failure;
      sz = 1;
      /* Fall through ... */
   case 0xA3: /* MOV eAX,Ov */
      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
         goto decode_failure;
      /* Store direction: rAX -> [moffs64]. */
      d64 = getDisp64(delta);
      delta += 8;
      ty = szToITy(sz);
      addr = newTemp(Ity_I64);
      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
      storeLE( mkexpr(addr), getIRegRAX(sz) );
      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
                                  segRegTxt(pfx), d64);
      break;
16740
16741   /* XXXX be careful here with moves to AH/BH/CH/DH */
   case 0xB0: /* MOV imm,AL */
   case 0xB1: /* MOV imm,CL */
   case 0xB2: /* MOV imm,DL */
   case 0xB3: /* MOV imm,BL */
   case 0xB4: /* MOV imm,AH */
   case 0xB5: /* MOV imm,CH */
   case 0xB6: /* MOV imm,DH */
   case 0xB7: /* MOV imm,BH */
      /* MOV imm8 to byte register; the target register is encoded in
         the low 3 opcode bits (plus REX.B). */
      if (haveF2orF3(pfx)) goto decode_failure;
      d64 = getUChar(delta);
      delta += 1;
      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
      break;
16756
   case 0xB8: /* MOV imm,eAX */
   case 0xB9: /* MOV imm,eCX */
   case 0xBA: /* MOV imm,eDX */
   case 0xBB: /* MOV imm,eBX */
   case 0xBC: /* MOV imm,eSP */
   case 0xBD: /* MOV imm,eBP */
   case 0xBE: /* MOV imm,eSI */
   case 0xBF: /* MOV imm,eDI */
      /* This is the one-and-only place where 64-bit literals are
         allowed in the instruction stream. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (sz == 8) {
         /* movabsq: full 8-byte immediate follows. */
         d64 = getDisp64(delta);
         delta += 8;
         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
         DIP("movabsq $%lld,%s\n", (Long)d64,
                                   nameIRegRexB(8,pfx,opc-0xB8));
      } else {
         /* 16/32-bit form: immediate is at most 4 bytes, masked to
            the operand size before the write. */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegRexB(sz, pfx, opc-0xB8,
                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld,%s\n", nameISize(sz),
                                 (Long)d64,
                                 nameIRegRexB(sz,pfx,opc-0xB8));
      }
      break;
16784
   case 0xC6: /* MOV Ib,Eb */
      sz = 1;
      goto do_Mov_I_E;
   case 0xC7: /* MOV Iv,Ev */
      goto do_Mov_I_E;

   do_Mov_I_E:
      /* MOV immediate to r/m; immediate is sign-extended from at most
         4 bytes, then masked to the operand size. */
      if (haveF2orF3(pfx)) goto decode_failure;
      modrm = getUChar(delta);
      if (epartIsReg(modrm)) {
         delta++; /* mod/rm byte */
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         putIRegE(sz, pfx, modrm,
                      mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld, %s\n", nameISize(sz),
                                  (Long)d64,
                                  nameIRegE(sz,pfx,modrm));
      } else {
         /* Memory destination: the immediate follows the addressing
            bytes, so tell disAMode how many extra bytes to expect. */
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                           /*xtra*/imin(4,sz) );
         delta += alen;
         d64 = getSDisp(imin(4,sz),delta);
         delta += imin(4,sz);
         storeLE(mkexpr(addr),
                 mkU(szToITy(sz), d64 & mkSizeMask(sz)));
         DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
      }
      break;
16814
16815   /* ------------------------ MOVx ------------------------ */
16816
   case 0x63: /* MOVSX */
      /* movslq (movsxd): sign-extend r/m32 into r64.  Only the REX.W
         form is accepted. */
      if (haveF2orF3(pfx)) goto decode_failure;
      if (haveREX(pfx) && 1==getRexW(pfx)) {
         vassert(sz == 8);
         /* movsx r/m32 to r64 */
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putIRegG(8, pfx, modrm,
                             unop(Iop_32Sto64,
                                  getIRegE(4, pfx, modrm)));
            DIP("movslq %s,%s\n",
                nameIRegE(4, pfx, modrm),
                nameIRegG(8, pfx, modrm));
            break;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putIRegG(8, pfx, modrm,
                             unop(Iop_32Sto64,
                                  loadLE(Ity_I32, mkexpr(addr))));
            DIP("movslq %s,%s\n", dis_buf,
                nameIRegG(8, pfx, modrm));
            break;
         }
      } else {
         goto decode_failure;
      }
16845
16846   /* ------------------------ opl imm, A ----------------- */
16847
   /* dis_op_imm_A args are (size, carry-in?, op, keep-result?, delta,
      name): the 2nd Bool is True only for ADC/SBB (the carry flag
      participates), and the 4th is False for CMP/TEST (flags are set
      but the result is discarded). */
   case 0x04: /* ADD Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
      break;
   case 0x05: /* ADD Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
      break;

   case 0x0C: /* OR Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
      break;
   case 0x0D: /* OR Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
      break;

   case 0x14: /* ADC Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
      break;
   case 0x15: /* ADC Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
      break;

   case 0x1C: /* SBB Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
      break;
   case 0x1D: /* SBB Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
      break;

   case 0x24: /* AND Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
      break;
   case 0x25: /* AND Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
      break;

   case 0x2C: /* SUB Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
      break;
   case 0x2D: /* SUB Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
      break;

   case 0x34: /* XOR Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
      break;
   case 0x35: /* XOR Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
      break;

   case 0x3C: /* CMP Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
      break;
   case 0x3D: /* CMP Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
      break;

   case 0xA8: /* TEST Ib, AL */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
      break;
   case 0xA9: /* TEST Iv, eAX */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
      break;
16928
16929   /* ------------------------ opl Ev, Gv ----------------- */
16930
   /* Two-operand ALU ops, E (r/m) source -> G (reg) destination.
      dis_op2_E_G args: (vbi, pfx, carry-in?, op, keep-result?, size,
      delta, name); CMP/TEST pass keep-result?=False (flags only). */
   case 0x02: /* ADD Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
      break;
   case 0x03: /* ADD Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
      break;

   case 0x0A: /* OR Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
      break;
   case 0x0B: /* OR Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
      break;

   case 0x12: /* ADC Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
      break;
   case 0x13: /* ADC Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
      break;

   case 0x1A: /* SBB Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
      break;
   case 0x1B: /* SBB Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
      break;

   case 0x22: /* AND Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
      break;
   case 0x23: /* AND Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
      break;

   case 0x2A: /* SUB Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
      break;
   case 0x2B: /* SUB Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
      break;

   case 0x32: /* XOR Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
      break;
   case 0x33: /* XOR Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
      break;

   case 0x3A: /* CMP Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
      break;
   case 0x3B: /* CMP Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
      break;

   case 0x84: /* TEST Eb,Gb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
      break;
   case 0x85: /* TEST Ev,Gv */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
      break;
17011
17012   /* ------------------------ opl Gv, Ev ----------------- */
17013
   /* Two-operand ALU ops, G (reg) source -> E (r/m) destination.
      Same argument convention as dis_op2_E_G above. */
   case 0x00: /* ADD Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
      break;
   case 0x01: /* ADD Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
      break;

   case 0x08: /* OR Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
      break;
   case 0x09: /* OR Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
      break;

   case 0x10: /* ADC Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
      break;
   case 0x11: /* ADC Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
      break;

   case 0x18: /* SBB Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
      break;
   case 0x19: /* SBB Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
      break;

   case 0x20: /* AND Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
      break;
   case 0x21: /* AND Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
      break;

   case 0x28: /* SUB Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
      break;
   case 0x29: /* SUB Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
      break;

   case 0x30: /* XOR Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
      break;
   case 0x31: /* XOR Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
      break;

   case 0x38: /* CMP Gb,Eb */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
      break;
   case 0x39: /* CMP Gv,Ev */
      if (haveF2orF3(pfx)) goto decode_failure;
      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
      break;
17085
17086   /* ------------------------ POP ------------------------ */
17087
   case 0x58: /* POP eAX */
   case 0x59: /* POP eCX */
   case 0x5A: /* POP eDX */
   case 0x5B: /* POP eBX */
   case 0x5D: /* POP eBP */
   case 0x5E: /* POP eSI */
   case 0x5F: /* POP eDI */
   case 0x5C: /* POP eSP */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4)
         sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
      /* t2 = old RSP; t1 = value loaded from the stack; then bump RSP
         and write t1 into the register encoded by opc (+ REX.B). */
      t1 = newTemp(szToITy(sz));
      t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
      DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
      break;
17108
   case 0x9D: /* POPF */
      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
         So sz==4 actually means sz==8. */
      if (haveF2orF3(pfx)) goto decode_failure;
      vassert(sz == 2 || sz == 4 || sz == 8);
      if (sz == 4) sz = 8;
      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
      assign(t2, getIReg64(R_RSP));
      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
      /* t1 is the flag word.  Mask out everything except OSZACP and
         set the flags thunk to AMD64G_CC_OP_COPY. */
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        binop(Iop_And64,
                              mkexpr(t1),
                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
                             )
                       )
          );

      /* Also need to set the D flag, which is held in bit 10 of t1.
         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG.
         (Mux0X yields its second arg when the condition byte is zero,
         otherwise its third.) */
      stmt( IRStmt_Put(
               OFFB_DFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
                             mkU64(1)))),
                  mkU64(1),
                  mkU64(0xFFFFFFFFFFFFFFFFULL)))
          );

      /* And set the ID flag (bit 21 of the popped word). */
      stmt( IRStmt_Put(
               OFFB_IDFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
                             mkU64(1)))),
                  mkU64(0),
                  mkU64(1)))
          );

      /* And set the AC flag too (bit 18 of the popped word). */
      stmt( IRStmt_Put(
               OFFB_ACFLAG,
               IRExpr_Mux0X(
                  unop(Iop_32to8,
                  unop(Iop_64to32,
                       binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
                             mkU64(1)))),
                  mkU64(0),
                  mkU64(1)))
          );

      DIP("popf%c\n", nameISize(sz));
      break;
17176
17177//..    case 0x61: /* POPA */
17178//..       /* This is almost certainly wrong for sz==2.  So ... */
17179//..       if (sz != 4) goto decode_failure;
17180//..
17181//..       /* t5 is the old %ESP value. */
17182//..       t5 = newTemp(Ity_I32);
17183//..       assign( t5, getIReg(4, R_ESP) );
17184//..
17185//..       /* Reload all the registers, except %esp. */
17186//..       putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
17187//..       putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
17188//..       putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
17189//..       putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
17190//..       /* ignore saved %ESP */
17191//..       putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
17192//..       putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
17193//..       putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
17194//..
17195//..       /* and move %ESP back up */
17196//..       putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
17197//..
17198//..       DIP("popa%c\n", nameISize(sz));
17199//..       break;
17200
   /* 8F /0 = POP r/m: pop from the stack into a memory operand. */
17201   case 0x8F: { /* POPQ m64 / POPW m16 */
17202      Int   len;
17203      UChar rm;
17204      /* There is no encoding for 32-bit pop in 64-bit mode.
17205         So sz==4 actually means sz==8. */
17206      if (haveF2orF3(pfx)) goto decode_failure;
17207      vassert(sz == 2 || sz == 4
17208              || /* tolerate redundant REX.W, see #210481 */ sz == 8);
17209      if (sz == 4) sz = 8;
17210      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
17211
17212      rm = getUChar(delta);
17213
17214      /* check this really is POP: reg field /0, memory operand only */
17215      if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
17216         goto decode_failure;
17217      /* and has correct size */
17218      vassert(sz == 8);
17219
      /* t1 = the old %rsp, t3 = the value popped off the stack. */
17220      t1 = newTemp(Ity_I64);
17221      t3 = newTemp(Ity_I64);
17222      assign( t1, getIReg64(R_RSP) );
17223      assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
17224
17225      /* Increase RSP; must be done before the STORE.  Intel manual
17226         says: If the RSP register is used as a base register for
17227         addressing a destination operand in memory, the POP
17228         instruction computes the effective address of the operand
17229         after it increments the RSP register.  */
17230      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
17231
17232      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
17233      storeLE( mkexpr(addr), mkexpr(t3) );
17234
      /* NOTE(review): prints "popl" although only the 64-bit form is
         accepted above -- debug-output cosmetics only. */
17235      DIP("popl %s\n", dis_buf);
17236
17237      delta += len;
17238      break;
17239   }
17240
17241//.. //--    case 0x1F: /* POP %DS */
17242//.. //--       dis_pop_segreg( cb, R_DS, sz ); break;
17243//.. //--    case 0x07: /* POP %ES */
17244//.. //--       dis_pop_segreg( cb, R_ES, sz ); break;
17245//.. //--    case 0x17: /* POP %SS */
17246//.. //--       dis_pop_segreg( cb, R_SS, sz ); break;
17247
17248   /* ------------------------ PUSH ----------------------- */
17249
17250   case 0x50: /* PUSH eAX */
17251   case 0x51: /* PUSH eCX */
17252   case 0x52: /* PUSH eDX */
17253   case 0x53: /* PUSH eBX */
17254   case 0x55: /* PUSH eBP */
17255   case 0x56: /* PUSH eSI */
17256   case 0x57: /* PUSH eDI */
17257   case 0x54: /* PUSH eSP */
17258      /* This is the Right Way, in that the value to be pushed is
17259         established before %rsp is changed, so that pushq %rsp
17260         correctly pushes the old value. */
17261      if (haveF2orF3(pfx)) goto decode_failure;
17262      vassert(sz == 2 || sz == 4 || sz == 8);
17263      if (sz == 4)
17264         sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
17265      ty = sz==2 ? Ity_I16 : Ity_I64;
      /* t1 = value to push (read before %rsp moves), t2 = new %rsp. */
17266      t1 = newTemp(ty);
17267      t2 = newTemp(Ity_I64);
      /* register selected by the low 3 opcode bits plus REX.B */
17268      assign(t1, getIRegRexB(sz, pfx, opc-0x50));
17269      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
17270      putIReg64(R_RSP, mkexpr(t2) );
17271      storeLE(mkexpr(t2),mkexpr(t1));
17272      DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
17273      break;
17274
17275   case 0x68: /* PUSH Iv */
17276      if (haveF2orF3(pfx)) goto decode_failure;
17277      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
17278      if (sz == 4) sz = 8;
17279      d64 = getSDisp(imin(4,sz),delta);
17280      delta += imin(4,sz);
17281      goto do_push_I;
17282   case 0x6A: /* PUSH Ib, sign-extended to sz */
17283      if (haveF2orF3(pfx)) goto decode_failure;
17284      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
17285      if (sz == 4) sz = 8;
17286      d64 = getSDisp8(delta); delta += 1;
17287      goto do_push_I;
   /* Common tail for PUSH-immediate: d64 holds the sign-extended
      immediate, sz the operand size; t1 = new %rsp. */
17288   do_push_I:
17289      ty = szToITy(sz);
17290      t1 = newTemp(Ity_I64);
17291      t2 = newTemp(ty);
17292      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
17293      putIReg64(R_RSP, mkexpr(t1) );
17294      /* stop mkU16 asserting if d64 is a negative 16-bit number
17295         (bug #132813) */
17296      if (ty == Ity_I16)
17297         d64 &= 0xFFFF;
17298      storeLE( mkexpr(t1), mkU(ty,d64) );
17299      DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
17300      break;
17301
17302   case 0x9C: /* PUSHF */ {
17303      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
17304         mode.  So sz==4 actually means sz==8. */
17305      /* 24 July 06: has also been seen with a redundant REX prefix,
17306         so must also allow sz==8. */
17307      if (haveF2orF3(pfx)) goto decode_failure;
17308      vassert(sz == 2 || sz == 4 || sz == 8);
17309      if (sz == 4) sz = 8;
17310      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
17311
      /* t1 = new %rsp; %rsp is moved before the store (push order). */
17312      t1 = newTemp(Ity_I64);
17313      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
17314      putIReg64(R_RSP, mkexpr(t1) );
17315
      /* t2 = the OSZACP flags, recovered from the flags thunk. */
17316      t2 = newTemp(Ity_I64);
17317      assign( t2, mk_amd64g_calculate_rflags_all() );
17318
17319      /* Patch in the D flag.  This can simply be a copy of bit 10 of
17320         baseBlock[OFFB_DFLAG]. */
      /* (OFFB_DFLAG holds 1 or -1 -- see CLD/STD -- so bit 10 of it
         is set exactly when DF=1.) */
17321      t3 = newTemp(Ity_I64);
17322      assign( t3, binop(Iop_Or64,
17323                        mkexpr(t2),
17324                        binop(Iop_And64,
17325                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
17326                              mkU64(1<<10)))
17327            );
17328
17329      /* And patch in the ID flag. */
      /* OFFB_IDFLAG holds 0 or 1; shift it up to bit position 21. */
17330      t4 = newTemp(Ity_I64);
17331      assign( t4, binop(Iop_Or64,
17332                        mkexpr(t3),
17333                        binop(Iop_And64,
17334                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
17335                                               mkU8(21)),
17336                              mkU64(1<<21)))
17337            );
17338
17339      /* And patch in the AC flag too. */
      /* OFFB_ACFLAG holds 0 or 1; shift it up to bit position 18. */
17340      t5 = newTemp(Ity_I64);
17341      assign( t5, binop(Iop_Or64,
17342                        mkexpr(t4),
17343                        binop(Iop_And64,
17344                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
17345                                               mkU8(18)),
17346                              mkU64(1<<18)))
17347            );
17348
17349      /* if sz==2, the stored value needs to be narrowed. */
17350      if (sz == 2)
17351        storeLE( mkexpr(t1), unop(Iop_32to16,
17352                             unop(Iop_64to32,mkexpr(t5))) );
17353      else
17354        storeLE( mkexpr(t1), mkexpr(t5) );
17355
17356      DIP("pushf%c\n", nameISize(sz));
17357      break;
17358   }
17359
17360//..    case 0x60: /* PUSHA */
17361//..       /* This is almost certainly wrong for sz==2.  So ... */
17362//..       if (sz != 4) goto decode_failure;
17363//..
17364//..       /* This is the Right Way, in that the value to be pushed is
17365//..          established before %esp is changed, so that pusha
17366//..          correctly pushes the old %esp value.  New value of %esp is
17367//..          pushed at start. */
17368//..       /* t0 is the %ESP value we're going to push. */
17369//..       t0 = newTemp(Ity_I32);
17370//..       assign( t0, getIReg(4, R_ESP) );
17371//..
17372//..       /* t5 will be the new %ESP value. */
17373//..       t5 = newTemp(Ity_I32);
17374//..       assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
17375//..
17376//..       /* Update guest state before prodding memory. */
17377//..       putIReg(4, R_ESP, mkexpr(t5));
17378//..
17379//..       /* Dump all the registers. */
17380//..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
17381//..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
17382//..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
17383//..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
17384//..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
17385//..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
17386//..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
17387//..       storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
17388//..
17389//..       DIP("pusha%c\n", nameISize(sz));
17390//..       break;
17391//..
17392//..
17393//.. //--    case 0x0E: /* PUSH %CS */
17394//.. //--       dis_push_segreg( cb, R_CS, sz ); break;
17395//.. //--    case 0x1E: /* PUSH %DS */
17396//.. //--       dis_push_segreg( cb, R_DS, sz ); break;
17397//.. //--    case 0x06: /* PUSH %ES */
17398//.. //--       dis_push_segreg( cb, R_ES, sz ); break;
17399//.. //--    case 0x16: /* PUSH %SS */
17400//.. //--       dis_push_segreg( cb, R_SS, sz ); break;
17401//..
17402//..    /* ------------------------ SCAS et al ----------------- */
17403//..
17404//..    case 0xA4: /* MOVS, no REP prefix */
17405//..    case 0xA5:
17406//..       dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
17407//..       break;
17408//..
17409//..   case 0xA6: /* CMPSb, no REP prefix */
17410//.. //--    case 0xA7:
17411//..      dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
17412//..      break;
17413//.. //--
17414//.. //--
    /* AC/AD: lodsb / lods{w,l,q}, un-REPed forms only. */
17415    case 0xAC: /* LODS, no REP prefix */
17416    case 0xAD:
17417       dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
17418       break;
17419//..
17420//..    case 0xAE: /* SCAS, no REP prefix */
17421//..    case 0xAF:
17422//..       dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
17423//..       break;
17424
17425
   /* CLD/STD: the guest D flag lives in OFFB_DFLAG as +1 / -1
      (a ready-to-use string-op increment), not as a single bit. */
17426   case 0xFC: /* CLD */
17427      if (haveF2orF3(pfx)) goto decode_failure;
17428      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
17429      DIP("cld\n");
17430      break;
17431
17432   case 0xFD: /* STD */
17433      if (haveF2orF3(pfx)) goto decode_failure;
17434      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
17435      DIP("std\n");
17436      break;
17437
17438   case 0xF8: /* CLC */
17439   case 0xF9: /* STC */
17440   case 0xF5: /* CMC */
      /* Materialise all flags into t0, adjust just the carry bit
         into t1, then re-install t1 via a COPY thunk. */
17441      t0 = newTemp(Ity_I64);
17442      t1 = newTemp(Ity_I64);
17443      assign( t0, mk_amd64g_calculate_rflags_all() );
17444      switch (opc) {
17445         case 0xF8:
17446            assign( t1, binop(Iop_And64, mkexpr(t0),
17447                                         mkU64(~AMD64G_CC_MASK_C)));
17448            DIP("clc\n");
17449            break;
17450         case 0xF9:
17451            assign( t1, binop(Iop_Or64, mkexpr(t0),
17452                                        mkU64(AMD64G_CC_MASK_C)));
17453            DIP("stc\n");
17454            break;
17455         case 0xF5:
17456            assign( t1, binop(Iop_Xor64, mkexpr(t0),
17457                                         mkU64(AMD64G_CC_MASK_C)));
17458            DIP("cmc\n");
17459            break;
17460         default:
17461            vpanic("disInstr(x64)(clc/stc/cmc)");
17462      }
17463      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
17464      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
17465      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
17466      /* Set NDEP even though it isn't used.  This makes redundant-PUT
17467         elimination of previous stores to this field work better. */
17468      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
17469      break;
17470
17471//..    /* REPNE prefix insn */
17472//..    case 0xF2: {
17473//..       Addr32 eip_orig = guest_eip_bbstart + delta - 1;
17474//..       vassert(sorb == 0);
17475//..       abyte = getUChar(delta); delta++;
17476//..
17477//..       if (abyte == 0x66) { sz = 2; abyte = getUChar(delta); delta++; }
17478//..       whatNext = Dis_StopHere;
17479//..
17480//..       switch (abyte) {
17481//..       /* According to the Intel manual, "repne movs" should never occur, but
17482//..        * in practice it has happened, so allow for it here... */
17483//..       case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
17484//..         goto decode_failure;
17485//.. //--       case 0xA5:
17486//..         //         dis_REP_op ( CondNZ, dis_MOVS, sz, eip_orig,
17487//..         //                              guest_eip_bbstart+delta, "repne movs" );
17488//..         //         break;
17489//.. //--
17490//.. //--       case 0xA6: sz = 1;   /* REPNE CMPS<sz> */
17491//.. //--       case 0xA7:
17492//.. //--          dis_REP_op ( cb, CondNZ, dis_CMPS, sz, eip_orig, eip, "repne cmps" );
17493//.. //--          break;
17494//.. //--
17495//..       case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
17496//..       case 0xAF:
17497//..          dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
17498//..                                  guest_eip_bbstart+delta, "repne scas" );
17499//..          break;
17500//..
17501//..       default:
17502//..          goto decode_failure;
17503//..       }
17504//..       break;
17505//..    }
17506
17507   /* ------ AE: SCAS variants ------ */
17508   case 0xAE:
17509   case 0xAF:
17510      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
17511      if (haveF2(pfx) && !haveF3(pfx)) {
17512         if (opc == 0xAE)
17513            sz = 1;
17514         dis_REP_op ( AMD64CondNZ, dis_SCAS, sz,
17515                      guest_RIP_curr_instr,
17516                      guest_RIP_bbstart+delta, "repne scas", pfx );
      /* a REP-prefixed insn always ends the block here */
17517         dres.whatNext = Dis_StopHere;
17518         break;
17519      }
17520      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
17521      if (!haveF2(pfx) && haveF3(pfx)) {
17522         if (opc == 0xAE)
17523            sz = 1;
17524         dis_REP_op ( AMD64CondZ, dis_SCAS, sz,
17525                      guest_RIP_curr_instr,
17526                      guest_RIP_bbstart+delta, "repe scas", pfx );
17527         dres.whatNext = Dis_StopHere;
17528         break;
17529      }
17530      /* AE/AF: scasb/scas{w,l,q} */
17531      if (!haveF2(pfx) && !haveF3(pfx)) {
17532         if (opc == 0xAE)
17533            sz = 1;
17534         dis_string_op( dis_SCAS, sz, "scas", pfx );
17535         break;
17536      }
      /* both F2 and F3 present: not decodable */
17537      goto decode_failure;
17538
17539   /* ------ A6, A7: CMPS variants ------ */
17540   case 0xA6:
17541   case 0xA7:
17542      /* F3 A6/A7: repe cmps/rep cmps{w,l,q} */
17543      if (haveF3(pfx) && !haveF2(pfx)) {
17544         if (opc == 0xA6)
17545            sz = 1;
17546         dis_REP_op ( AMD64CondZ, dis_CMPS, sz,
17547                      guest_RIP_curr_instr,
17548                      guest_RIP_bbstart+delta, "repe cmps", pfx );
17549         dres.whatNext = Dis_StopHere;
17550         break;
17551      }
      /* un-REPed and F2 (repne) forms are not handled here */
17552      goto decode_failure;
17553
17554   /* ------ AA, AB: STOS variants ------ */
17555   case 0xAA:
17556   case 0xAB:
17557      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
17558      if (haveF3(pfx) && !haveF2(pfx)) {
17559         if (opc == 0xAA)
17560            sz = 1;
17561         dis_REP_op ( AMD64CondAlways, dis_STOS, sz,
17562                      guest_RIP_curr_instr,
17563                      guest_RIP_bbstart+delta, "rep stos", pfx );
17564        dres.whatNext = Dis_StopHere;
17565        break;
17566      }
17567      /* AA/AB: stosb/stos{w,l,q} */
17568      if (!haveF3(pfx) && !haveF2(pfx)) {
17569         if (opc == 0xAA)
17570            sz = 1;
17571         dis_string_op( dis_STOS, sz, "stos", pfx );
17572         break;
17573      }
      /* F2 (repne) form not handled */
17574      goto decode_failure;
17575
17576   /* ------ A4, A5: MOVS variants ------ */
17577   case 0xA4:
17578   case 0xA5:
17579      /* F3 A4: rep movsb */
17580      if (haveF3(pfx) && !haveF2(pfx)) {
17581         if (opc == 0xA4)
17582            sz = 1;
17583         dis_REP_op ( AMD64CondAlways, dis_MOVS, sz,
17584                      guest_RIP_curr_instr,
17585                      guest_RIP_bbstart+delta, "rep movs", pfx );
17586        dres.whatNext = Dis_StopHere;
17587        break;
17588      }
17589      /* A4: movsb */
17590      if (!haveF3(pfx) && !haveF2(pfx)) {
17591         if (opc == 0xA4)
17592            sz = 1;
17593         dis_string_op( dis_MOVS, sz, "movs", pfx );
17594         break;
17595      }
      /* F2 (repne) form not handled */
17596      goto decode_failure;
17597
17598
17599   /* ------------------------ XCHG ----------------------- */
17600
17601   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
17602      prefix.  Therefore, surround it with a IRStmt_MBE(Imbe_BusLock)
17603      and IRStmt_MBE(Imbe_BusUnlock) pair.  But be careful; if it is
17604      used with an explicit LOCK prefix, we don't want to end up with
17605      two IRStmt_MBE(Imbe_BusLock)s -- one made here and one made by
17606      the generic LOCK logic at the top of disInstr. */
17607   case 0x86: /* XCHG Gb,Eb */
17608      sz = 1;
17609      /* Fall through ... */
17610   case 0x87: /* XCHG Gv,Ev */
17611      if (haveF2orF3(pfx)) goto decode_failure;
17612      modrm = getUChar(delta);
17613      ty = szToITy(sz);
17614      t1 = newTemp(ty); t2 = newTemp(ty);
17615      if (epartIsReg(modrm)) {
      /* reg-reg case: a plain swap, no locking required */
17616         assign(t1, getIRegE(sz, pfx, modrm));
17617         assign(t2, getIRegG(sz, pfx, modrm));
17618         putIRegG(sz, pfx, modrm, mkexpr(t1));
17619         putIRegE(sz, pfx, modrm, mkexpr(t2));
17620         delta++;
17621         DIP("xchg%c %s, %s\n",
17622             nameISize(sz), nameIRegG(sz, pfx, modrm),
17623                            nameIRegE(sz, pfx, modrm));
17624      } else {
      /* reg-mem case: use a CAS to express the implied-LOCK
         semantics described in the comment above */
17625         *expect_CAS = True;
17626         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
17627         assign( t1, loadLE(ty, mkexpr(addr)) );
17628         assign( t2, getIRegG(sz, pfx, modrm) );
17629         casLE( mkexpr(addr),
17630                mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
17631         putIRegG( sz, pfx, modrm, mkexpr(t1) );
17632         delta += alen;
17633         DIP("xchg%c %s, %s\n", nameISize(sz),
17634                                nameIRegG(sz, pfx, modrm), dis_buf);
17635      }
17636      break;
17637
17638   case 0x90: /* XCHG eAX,eAX */
17639      /* detect and handle F3 90 (rep nop) specially */
17640      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
17641         DIP("rep nop (P4 pause)\n");
17642         /* "observe" the hint.  The Vex client needs to be careful not
17643            to cause very long delays as a result, though. */
17644         jmp_lit(Ijk_Yield, guest_RIP_bbstart+delta);
17645         dres.whatNext = Dis_StopHere;
17646         break;
17647      }
17648      /* detect and handle NOPs specially */
17649      if (/* F2/F3 probably change meaning completely */
17650          !haveF2orF3(pfx)
17651          /* If REX.B is 1, we're not exchanging rAX with itself */
17652          && getRexB(pfx)==0 ) {
17653         DIP("nop\n");
17654         break;
17655      }
17656      /* else fall through to normal case. */
17657   case 0x91: /* XCHG rAX,rCX */
17658   case 0x92: /* XCHG rAX,rDX */
17659   case 0x93: /* XCHG rAX,rBX */
17660   case 0x94: /* XCHG rAX,rSP */
17661   case 0x95: /* XCHG rAX,rBP */
17662   case 0x96: /* XCHG rAX,rSI */
17663   case 0x97: /* XCHG rAX,rDI */
17664
17665      /* guard against mutancy */
17666      if (haveF2orF3(pfx)) goto decode_failure;
17667
      /* low 3 opcode bits select the register to swap with rAX */
17668      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
17669      break;
17670
17671//.. //--    /* ------------------------ XLAT ----------------------- */
17672//.. //--
17673//.. //--    case 0xD7: /* XLAT */
17674//.. //--       t1 = newTemp(cb); t2 = newTemp(cb);
17675//.. //--       uInstr2(cb, GET, sz, ArchReg, R_EBX, TempReg, t1); /* get eBX */
17676//.. //--       handleAddrOverrides( cb, sorb, t1 );               /* make t1 DS:eBX */
17677//.. //--       uInstr2(cb, GET, 1, ArchReg, R_AL, TempReg, t2); /* get AL */
17678//.. //--       /* Widen %AL to 32 bits, so it's all defined when we add it. */
17679//.. //--       uInstr1(cb, WIDEN, 4, TempReg, t2);
17680//.. //--       uWiden(cb, 1, False);
17681//.. //--       uInstr2(cb, ADD, sz, TempReg, t2, TempReg, t1);  /* add AL to eBX */
17682//.. //--       uInstr2(cb, LOAD, 1, TempReg, t1,  TempReg, t2); /* get byte at t1 into t2 */
17683//.. //--       uInstr2(cb, PUT, 1, TempReg, t2, ArchReg, R_AL); /* put byte into AL */
17684//.. //--
17685//.. //--       DIP("xlat%c [ebx]\n", nameISize(sz));
17686//.. //--       break;
17687
17688   /* ------------------------ IN / OUT ----------------------- */
17689
17690   case 0xE4: /* IN imm8, AL */
17691      sz = 1;
17692      t1 = newTemp(Ity_I64);
17693      abyte = getUChar(delta); delta++;
17694      assign(t1, mkU64( abyte & 0xFF ));
17695      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
17696      goto do_IN;
17697   case 0xE5: /* IN imm8, eAX */
17698      if (!(sz == 2 || sz == 4)) goto decode_failure;
17699      t1 = newTemp(Ity_I64);
17700      abyte = getUChar(delta); delta++;
17701      assign(t1, mkU64( abyte & 0xFF ));
17702      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
17703      goto do_IN;
17704   case 0xEC: /* IN %DX, AL */
17705      sz = 1;
17706      t1 = newTemp(Ity_I64);
17707      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
17708      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
17709                                         nameIRegRAX(sz));
17710      goto do_IN;
17711   case 0xED: /* IN %DX, eAX */
17712      if (!(sz == 2 || sz == 4)) goto decode_failure;
17713      t1 = newTemp(Ity_I64);
17714      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
17715      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
17716                                         nameIRegRAX(sz));
17717      goto do_IN;
17718   do_IN: {
17719      /* At this point, sz indicates the width, and t1 is a 64-bit
17720         value giving port number. */
17721      IRDirty* d;
17722      if (haveF2orF3(pfx)) goto decode_failure;
17723      vassert(sz == 1 || sz == 2 || sz == 4);
17724      ty = szToITy(sz);
17725      t2 = newTemp(Ity_I64);
      /* the port read itself is performed by a dirty helper call */
17726      d = unsafeIRDirty_1_N(
17727             t2,
17728             0/*regparms*/,
17729             "amd64g_dirtyhelper_IN",
17730             &amd64g_dirtyhelper_IN,
17731             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
17732          );
17733      /* do the call, dumping the result in t2. */
17734      stmt( IRStmt_Dirty(d) );
17735      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
17736      break;
17737   }
17738
17739   case 0xE6: /* OUT AL, imm8 */
17740      sz = 1;
17741      t1 = newTemp(Ity_I64);
17742      abyte = getUChar(delta); delta++;
17743      assign( t1, mkU64( abyte & 0xFF ) );
17744      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
17745      goto do_OUT;
17746   case 0xE7: /* OUT eAX, imm8 */
17747      if (!(sz == 2 || sz == 4)) goto decode_failure;
17748      t1 = newTemp(Ity_I64);
17749      abyte = getUChar(delta); delta++;
17750      assign( t1, mkU64( abyte & 0xFF ) );
17751      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
17752      goto do_OUT;
17753   case 0xEE: /* OUT AL, %DX */
17754      sz = 1;
17755      t1 = newTemp(Ity_I64);
17756      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
17757      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
17758                                          nameIRegRDX(2));
17759      goto do_OUT;
17760   case 0xEF: /* OUT eAX, %DX */
17761      if (!(sz == 2 || sz == 4)) goto decode_failure;
17762      t1 = newTemp(Ity_I64);
17763      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
17764      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
17765                                          nameIRegRDX(2));
17766      goto do_OUT;
17767   do_OUT: {
17768      /* At this point, sz indicates the width, and t1 is a 64-bit
17769         value giving port number. */
17770      IRDirty* d;
17771      if (haveF2orF3(pfx)) goto decode_failure;
17772      vassert(sz == 1 || sz == 2 || sz == 4);
17773      ty = szToITy(sz);
      /* no return value: the port write is a pure side effect,
         done by a dirty helper call */
17774      d = unsafeIRDirty_0_N(
17775             0/*regparms*/,
17776             "amd64g_dirtyhelper_OUT",
17777             &amd64g_dirtyhelper_OUT,
17778             mkIRExprVec_3( mkexpr(t1),
17779                            widenUto64( getIRegRAX(sz) ),
17780                            mkU64(sz) )
17781          );
17782      stmt( IRStmt_Dirty(d) );
17783      break;
17784   }
17785
17786   /* ------------------------ (Grp1 extensions) ---------- */
17787
      /* Grp1: immediate-operand ALU ops; the sub-opcode lives in the
         modrm reg field and is decoded inside dis_Grp1.  d_sz is the
         immediate width, d64 the (sign-extended) immediate itself. */
17788   case 0x80: /* Grp1 Ib,Eb */
17789      if (haveF2orF3(pfx)) goto decode_failure;
17790      modrm = getUChar(delta);
17791      am_sz = lengthAMode(pfx,delta);
17792      sz    = 1;
17793      d_sz  = 1;
17794      d64   = getSDisp8(delta + am_sz);
17795      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
17796      break;
17797
17798   case 0x81: /* Grp1 Iv,Ev */
17799      if (haveF2orF3(pfx)) goto decode_failure;
17800      modrm = getUChar(delta);
17801      am_sz = lengthAMode(pfx,delta);
      /* even for sz==8 the immediate is at most 4 bytes */
17802      d_sz  = imin(sz,4);
17803      d64   = getSDisp(d_sz, delta + am_sz);
17804      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
17805      break;
17806
17807   case 0x83: /* Grp1 Ib,Ev */
17808      if (haveF2orF3(pfx)) goto decode_failure;
17809      modrm = getUChar(delta);
17810      am_sz = lengthAMode(pfx,delta);
17811      d_sz  = 1;
17812      d64   = getSDisp8(delta + am_sz);
17813      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
17814      break;
17815
17816   /* ------------------------ (Grp2 extensions) ---------- */
17817
      /* Grp2: shift/rotate ops; the sub-opcode lives in the modrm reg
         field and is decoded inside dis_Grp2.  The shift amount is an
         imm8 (C0/C1), the constant 1 (D0/D1), or %cl (D2/D3). */
17818   case 0xC0: { /* Grp2 Ib,Eb */
17819      Bool decode_OK = True;
17820      if (haveF2orF3(pfx)) goto decode_failure;
17821      modrm = getUChar(delta);
17822      am_sz = lengthAMode(pfx,delta);
17823      d_sz  = 1;
17824      d64   = getUChar(delta + am_sz);
17825      sz    = 1;
17826      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
17827                         mkU8(d64 & 0xFF), NULL, &decode_OK );
17828      if (!decode_OK) goto decode_failure;
17829      break;
17830   }
17831   case 0xC1: { /* Grp2 Ib,Ev */
17832      Bool decode_OK = True;
17833      if (haveF2orF3(pfx)) goto decode_failure;
17834      modrm = getUChar(delta);
17835      am_sz = lengthAMode(pfx,delta);
17836      d_sz  = 1;
17837      d64   = getUChar(delta + am_sz);
17838      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
17839                         mkU8(d64 & 0xFF), NULL, &decode_OK );
17840      if (!decode_OK) goto decode_failure;
17841      break;
17842   }
17843   case 0xD0: { /* Grp2 1,Eb */
17844      Bool decode_OK = True;
17845      if (haveF2orF3(pfx)) goto decode_failure;
17846      modrm = getUChar(delta);
17847      am_sz = lengthAMode(pfx,delta);
17848      d_sz  = 0;
17849      d64   = 1;
17850      sz    = 1;
17851      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
17852                         mkU8(d64), NULL, &decode_OK );
17853      if (!decode_OK) goto decode_failure;
17854      break;
17855   }
17856   case 0xD1: { /* Grp2 1,Ev */
17857      Bool decode_OK = True;
17858      if (haveF2orF3(pfx)) goto decode_failure;
17859      modrm = getUChar(delta);
17860      am_sz = lengthAMode(pfx,delta);
17861      d_sz  = 0;
17862      d64   = 1;
17863      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
17864                         mkU8(d64), NULL, &decode_OK );
17865      if (!decode_OK) goto decode_failure;
17866      break;
17867   }
17868   case 0xD2: { /* Grp2 CL,Eb */
17869      Bool decode_OK = True;
17870      if (haveF2orF3(pfx)) goto decode_failure;
17871      modrm = getUChar(delta);
17872      am_sz = lengthAMode(pfx,delta);
17873      d_sz  = 0;
17874      sz    = 1;
17875      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
17876                         getIRegCL(), "%cl", &decode_OK );
17877      if (!decode_OK) goto decode_failure;
17878      break;
17879   }
17880   case 0xD3: { /* Grp2 CL,Ev */
17881      Bool decode_OK = True;
17882      if (haveF2orF3(pfx)) goto decode_failure;
17883      modrm = getUChar(delta);
17884      am_sz = lengthAMode(pfx,delta);
17885      d_sz  = 0;
17886      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
17887                         getIRegCL(), "%cl", &decode_OK );
17888      if (!decode_OK) goto decode_failure;
17889      break;
17890   }
17891
17892   /* ------------------------ (Grp3 extensions) ---------- */
17893
      /* Grp3/4/5: sub-opcode is in the modrm reg field; decoding is
         delegated to the dis_Grp* helpers, which report failure via
         decode_OK rather than by asserting. */
17894   case 0xF6: { /* Grp3 Eb */
17895      Bool decode_OK = True;
17896      if (haveF2orF3(pfx)) goto decode_failure;
17897      delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
17898      if (!decode_OK) goto decode_failure;
17899      break;
17900   }
17901   case 0xF7: { /* Grp3 Ev */
17902      Bool decode_OK = True;
17903      if (haveF2orF3(pfx)) goto decode_failure;
17904      delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
17905      if (!decode_OK) goto decode_failure;
17906      break;
17907   }
17908
17909   /* ------------------------ (Grp4 extensions) ---------- */
17910
17911   case 0xFE: { /* Grp4 Eb */
17912      Bool decode_OK = True;
17913      if (haveF2orF3(pfx)) goto decode_failure;
17914      delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
17915      if (!decode_OK) goto decode_failure;
17916      break;
17917   }
17918
17919   /* ------------------------ (Grp5 extensions) ---------- */
17920
17921   case 0xFF: { /* Grp5 Ev */
17922      Bool decode_OK = True;
17923      if (haveF2orF3(pfx)) goto decode_failure;
      /* dis_Grp5 also gets &dres since it may need to end the block
         (e.g. for the call/jmp members of the group). */
17924      delta = dis_Grp5 ( vbi, pfx, sz, delta, &dres, &decode_OK );
17925      if (!decode_OK) goto decode_failure;
17926      break;
17927   }
17928
17929   /* ------------------------ Escapes to 2-byte opcodes -- */
17930
17931   case 0x0F: {
17932      opc = getUChar(delta); delta++;
17933      switch (opc) {
17934
17935      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
17936
17937      case 0xBA: { /* Grp8 Ib,Ev */
17938         Bool decode_OK = False;
17939         if (haveF2orF3(pfx)) goto decode_failure;
17940         modrm = getUChar(delta);
17941         am_sz = lengthAMode(pfx,delta);
17942         d64   = getSDisp8(delta + am_sz);
17943         delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
17944                                &decode_OK );
17945         if (!decode_OK)
17946            goto decode_failure;
17947         break;
17948      }
17949
17950      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
17951
      case 0xBC: /* BSF Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         /* final arg True = scan forwards (from the LSB). */
         delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
         break;
      case 0xBD: /* BSR Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         /* final arg False = scan backwards (from the MSB). */
         delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
         break;
17960
17961      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
17962
      case 0xC8: /* BSWAP %eax */
      case 0xC9:
      case 0xCA:
      case 0xCB:
      case 0xCC:
      case 0xCD:
      case 0xCE:
      case 0xCF: /* BSWAP %edi */
         if (haveF2orF3(pfx)) goto decode_failure;
         /* According to the AMD64 docs, this insn can have size 4 or
            8.  The register number is encoded in the low 3 opcode
            bits (opc-0xC8) plus the REX.B extension. */
         if (sz == 4) {
            /* 32-bit byte reversal, built directly from four
               shift-and-mask terms OR'd together. */
            t1 = newTemp(Ity_I32);
            t2 = newTemp(Ity_I32);
            assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
            assign( t2,
               binop(Iop_Or32,
                  binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
               binop(Iop_Or32,
                  binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
                                   mkU32(0x00FF0000)),
               binop(Iop_Or32,
                  binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
                                   mkU32(0x0000FF00)),
                  binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
                                   mkU32(0x000000FF) )
               )))
            );
            putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
            DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
            break;
         }
	 else if (sz == 8) {
            /* 64-bit byte reversal, done as three successive exchange
               stages: swap adjacent bytes (s8), then adjacent 16-bit
               units (s16), then the two 32-bit halves (t2). */
            IRTemp m8  = newTemp(Ity_I64);
            IRTemp s8  = newTemp(Ity_I64);
            IRTemp m16 = newTemp(Ity_I64);
            IRTemp s16 = newTemp(Ity_I64);
            IRTemp m32 = newTemp(Ity_I64);
            t1 = newTemp(Ity_I64);
            t2 = newTemp(Ity_I64);
            assign( t1, getIRegRexB(8, pfx, opc-0xC8) );

            assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
            assign( s8,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
                                mkU8(8)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
                                mkexpr(m8))
                         )
                  );

            assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
            assign( s16,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
                                mkU8(16)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
                                mkexpr(m16))
                         )
                  );

            assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
            assign( t2,
                    binop(Iop_Or64,
                          binop(Iop_Shr64,
                                binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
                                mkU8(32)),
                          binop(Iop_And64,
                                binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
                                mkexpr(m32))
                         )
                  );

            putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
            DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
            break;
         } else {
            goto decode_failure;
         }
18047
18048      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
18049
      /* All of these are possible at sizes 2, 4 and 8, and all three
         sizes are accepted below. */

      case 0xA3: /* BT Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone );
         break;
      case 0xB3: /* BTR Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset );
         break;
      case 0xAB: /* BTS Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet );
         break;
      case 0xBB: /* BTC Gv,Ev */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
         delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp );
         break;
18073
18074      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
18075
      case 0x40:
      case 0x41:
      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
      case 0x48: /* CMOVSb (cmov negative) */
      case 0x49: /* CMOVNSb (cmov not negative) */
      case 0x4A: /* CMOVP (cmov parity even) */
      case 0x4B: /* CMOVNP (cmov parity odd) */
      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
         if (haveF2orF3(pfx)) goto decode_failure;
         /* The low nibble of the opcode is the condition code. */
         delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
         break;
18095
18096      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
18097
      case 0xB0: { /* CMPXCHG Gb,Eb */
         /* Single-byte compare-and-exchange; all real work, including
            LOCK handling, is inside dis_cmpxchg_G_E. */
         Bool ok = True;
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
         if (!ok) goto decode_failure;
         break;
      }
      case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
         Bool ok = True;
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
         delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
         if (!ok) goto decode_failure;
         break;
      }
18113
18114      case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
18115         IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
18116         IRTemp  expdHi     = newTemp(elemTy);
18117         IRTemp  expdLo     = newTemp(elemTy);
18118         IRTemp  dataHi     = newTemp(elemTy);
18119         IRTemp  dataLo     = newTemp(elemTy);
18120         IRTemp  oldHi      = newTemp(elemTy);
18121         IRTemp  oldLo      = newTemp(elemTy);
18122         IRTemp  flags_old  = newTemp(Ity_I64);
18123         IRTemp  flags_new  = newTemp(Ity_I64);
18124         IRTemp  success    = newTemp(Ity_I1);
18125         IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
18126         IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
18127         IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
18128         IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
18129         IRTemp expdHi64    = newTemp(Ity_I64);
18130         IRTemp expdLo64    = newTemp(Ity_I64);
18131
18132         /* Translate this using a DCAS, even if there is no LOCK
18133            prefix.  Life is too short to bother with generating two
18134            different translations for the with/without-LOCK-prefix
18135            cases. */
18136         *expect_CAS = True;
18137
18138	 /* Decode, and generate address. */
18139         if (have66orF2orF3(pfx)) goto decode_failure;
18140         if (sz != 4 && sz != 8) goto decode_failure;
18141         if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
18142            goto decode_failure;
18143         modrm = getUChar(delta);
18144         if (epartIsReg(modrm)) goto decode_failure;
18145         if (gregLO3ofRM(modrm) != 1) goto decode_failure;
18146         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
18147         delta += alen;
18148
18149         /* cmpxchg16b requires an alignment check. */
18150         if (sz == 8)
18151            gen_SEGV_if_not_16_aligned( addr );
18152
18153         /* Get the expected and new values. */
18154         assign( expdHi64, getIReg64(R_RDX) );
18155         assign( expdLo64, getIReg64(R_RAX) );
18156
18157         /* These are the correctly-sized expected and new values.
18158            However, we also get expdHi64/expdLo64 above as 64-bits
18159            regardless, because we will need them later in the 32-bit
18160            case (paradoxically). */
18161         assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
18162                               : mkexpr(expdHi64) );
18163         assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
18164                               : mkexpr(expdLo64) );
18165         assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
18166         assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
18167
18168         /* Do the DCAS */
18169         stmt( IRStmt_CAS(
18170                  mkIRCAS( oldHi, oldLo,
18171                           Iend_LE, mkexpr(addr),
18172                           mkexpr(expdHi), mkexpr(expdLo),
18173                           mkexpr(dataHi), mkexpr(dataLo)
18174               )));
18175
18176         /* success when oldHi:oldLo == expdHi:expdLo */
18177         assign( success,
18178                 binop(opCasCmpEQ,
18179                       binop(opOR,
18180                             binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
18181                             binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
18182                       ),
18183                       zero
18184                 ));
18185
18186         /* If the DCAS is successful, that is to say oldHi:oldLo ==
18187            expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
18188            which is where they came from originally.  Both the actual
18189            contents of these two regs, and any shadow values, are
18190            unchanged.  If the DCAS fails then we're putting into
18191            RDX:RAX the value seen in memory. */
18192         /* Now of course there's a complication in the 32-bit case
18193            (bah!): if the DCAS succeeds, we need to leave RDX:RAX
18194            unchanged; but if we use the same scheme as in the 64-bit
18195            case, we get hit by the standard rule that a write to the
18196            bottom 32 bits of an integer register zeros the upper 32
18197            bits.  And so the upper halves of RDX and RAX mysteriously
18198            become zero.  So we have to stuff back in the original
18199            64-bit values which we previously stashed in
18200            expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
18201         /* It's just _so_ much fun ... */
18202         putIRegRDX( 8,
18203                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
18204                                   sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
18205                                           : mkexpr(oldHi),
18206                                   mkexpr(expdHi64)
18207                   ));
18208         putIRegRAX( 8,
18209                     IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
18210                                   sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
18211                                           : mkexpr(oldLo),
18212                                   mkexpr(expdLo64)
18213                   ));
18214
18215         /* Copy the success bit into the Z flag and leave the others
18216            unchanged */
18217         assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
18218         assign(
18219            flags_new,
18220            binop(Iop_Or64,
18221                  binop(Iop_And64, mkexpr(flags_old),
18222                                   mkU64(~AMD64G_CC_MASK_Z)),
18223                  binop(Iop_Shl64,
18224                        binop(Iop_And64,
18225                              unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
18226                        mkU8(AMD64G_CC_SHIFT_Z)) ));
18227
18228         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
18229         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
18230         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
18231         /* Set NDEP even though it isn't used.  This makes
18232            redundant-PUT elimination of previous stores to this field
18233            work better. */
18234         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
18235
18236         /* Sheesh.  Aren't you glad it was me and not you that had to
18237	    write and validate all this grunge? */
18238
18239	 DIP("cmpxchg8b %s\n", dis_buf);
18240	 break;
18241
18242      }
18243
18244      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
18245
      case 0xA2: { /* CPUID */
         /* Uses dirty helper:
               void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
            declared to mod rax, wr rbx, rcx, rdx.
            The helper chosen depends on the hwcaps we are emulating,
            so the guest sees a CPUID consistent with the features the
            translator actually supports. */
         IRDirty* d     = NULL;
         HChar*   fName = NULL;
         void*    fAddr = NULL;
         if (haveF2orF3(pfx)) goto decode_failure;
         /* Note: exact equality, not a subset test, on hwcaps. */
         if (archinfo->hwcaps == (VEX_HWCAPS_AMD64_SSE3
                                  |VEX_HWCAPS_AMD64_CX16)) {
            /* A Core-2-like machine (SSE3 and CX16). */
            fName = "amd64g_dirtyhelper_CPUID_sse3_and_cx16";
            fAddr = &amd64g_dirtyhelper_CPUID_sse3_and_cx16;
            /* Alternatively, a Core-i5-like machine (SSE4.2 and
               CX16); swap in the following to use it: */
            //fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
            //fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
         }
         else {
            /* Give a CPUID for at least a baseline machine, SSE2
               only, and no CX16 */
            fName = "amd64g_dirtyhelper_CPUID_baseline";
            fAddr = &amd64g_dirtyhelper_CPUID_baseline;
         }

         vassert(fName); vassert(fAddr);
         d = unsafeIRDirty_0_N ( 0/*regparms*/,
                                 fName, fAddr, mkIRExprVec_0() );
         /* declare guest state effects: the helper reads/writes the
            four GPRs that CPUID uses. */
         d->needsBBP = True;
         d->nFxState = 4;
         d->fxState[0].fx     = Ifx_Modify;
         d->fxState[0].offset = OFFB_RAX;
         d->fxState[0].size   = 8;
         d->fxState[1].fx     = Ifx_Write;
         d->fxState[1].offset = OFFB_RBX;
         d->fxState[1].size   = 8;
         d->fxState[2].fx     = Ifx_Modify;
         d->fxState[2].offset = OFFB_RCX;
         d->fxState[2].size   = 8;
         d->fxState[3].fx     = Ifx_Write;
         d->fxState[3].offset = OFFB_RDX;
         d->fxState[3].size   = 8;
         /* execute the dirty call, side-effecting guest state */
         stmt( IRStmt_Dirty(d) );
         /* CPUID is a serialising insn.  So, just in case someone is
            using it as a memory fence ... */
         stmt( IRStmt_MBE(Imbe_Fence) );
         DIP("cpuid\n");
         break;
      }
18297
18298      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
18299
      case 0xB6: /* MOVZXb Eb,Gv */
         /* Zero-extend a byte into a 16/32/64-bit register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
         break;
      case 0xB7: /* MOVZXw Ew,Gv */
         /* Zero-extend a word into a 32/64-bit register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
         break;

      case 0xBE: /* MOVSXb Eb,Gv */
         /* Sign-extend a byte into a 16/32/64-bit register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 2 && sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
         break;
      case 0xBF: /* MOVSXw Ew,Gv */
         /* Sign-extend a word into a 32/64-bit register. */
         if (haveF2orF3(pfx)) goto decode_failure;
         if (sz != 4 && sz != 8)
            goto decode_failure;
         delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
         break;
18325
18326//.. //--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
18327//.. //--
18328//.. //--       case 0xC3: /* MOVNTI Gv,Ev */
18329//.. //--          vg_assert(sz == 4);
18330//.. //--          modrm = getUChar(eip);
18331//.. //--          vg_assert(!epartIsReg(modrm));
18332//.. //--          t1 = newTemp(cb);
18333//.. //--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
18334//.. //--          pair = disAMode ( cb, sorb, eip, dis_buf );
18335//.. //--          t2 = LOW24(pair);
18336//.. //--          eip += HI8(pair);
18337//.. //--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
18338//.. //--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
18339//.. //--          break;
18340
18341      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
18342
      case 0xAF: /* IMUL Ev, Gv */
         /* Two-operand signed multiply. */
         if (haveF2orF3(pfx)) goto decode_failure;
         delta = dis_mul_E_G ( vbi, pfx, sz, delta );
         break;

      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */

      case 0x1F:
         /* Multi-byte NOP (0F 1F /0 family): decode the amode just to
            find the instruction length; no IR is generated. */
         if (haveF2orF3(pfx)) goto decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) goto decode_failure;
         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("nop%c %s\n", nameISize(sz), dis_buf);
         break;
18358
18359      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
      case 0x80:
      case 0x81:
      case 0x82: /* JBb/JNAEb (jump below) */
      case 0x83: /* JNBb/JAEb (jump not below) */
      case 0x84: /* JZb/JEb (jump zero) */
      case 0x85: /* JNZb/JNEb (jump not zero) */
      case 0x86: /* JBEb/JNAb (jump below or equal) */
      case 0x87: /* JNBEb/JAb (jump not below or equal) */
      case 0x88: /* JSb (jump negative) */
      case 0x89: /* JNSb (jump not negative) */
      case 0x8A: /* JP (jump parity even) */
      case 0x8B: /* JNP/JPO (jump parity odd) */
      case 0x8C: /* JLb/JNGEb (jump less) */
      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
      case 0x8E: /* JLEb/JNGb (jump less or equal) */
      case 0x8F: /* JGb/JNLEb (jump greater) */
       { Long   jmpDelta;
         HChar* comment  = "";
         if (haveF2orF3(pfx)) goto decode_failure;
         /* d64 = absolute target; the low nibble of opc is the
            condition code. */
         jmpDelta = getSDisp32(delta);
         d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
         delta += 4;
         if (resteerCisOk
             && vex_control.guest_chase_cond
             && (Addr64)d64 != (Addr64)guest_RIP_bbstart
             && jmpDelta < 0
             && resteerOkFn( callback_opaque, d64) ) {
            /* Speculation: assume this backward branch is taken.  So
               we need to emit a side-exit to the insn following this
               one, on the negation of the condition, and continue at
               the branch target address (d64).  If we wind up back at
               the first instruction of the trace, just stop; it's
               better to let the IR loop unroller handle that case. */
            /* (1 ^ cond) inverts the condition: condition codes come
               in even/odd complementary pairs. */
            stmt( IRStmt_Exit(
                     mk_amd64g_calculate_condition(
                        (AMD64Condcode)(1 ^ (opc - 0x80))),
                     Ijk_Boring,
                     IRConst_U64(guest_RIP_bbstart+delta) ) );
            dres.whatNext   = Dis_ResteerC;
            dres.continueAt = d64;
            comment = "(assumed taken)";
         }
         else
         if (resteerCisOk
             && vex_control.guest_chase_cond
             && (Addr64)d64 != (Addr64)guest_RIP_bbstart
             && jmpDelta >= 0
             && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
            /* Speculation: assume this forward branch is not taken.
               So we need to emit a side-exit to d64 (the dest) and
               continue disassembling at the insn immediately
               following this one. */
            stmt( IRStmt_Exit(
                     mk_amd64g_calculate_condition((AMD64Condcode)
                                                   (opc - 0x80)),
                     Ijk_Boring,
                     IRConst_U64(d64) ) );
            dres.whatNext   = Dis_ResteerC;
            dres.continueAt = guest_RIP_bbstart+delta;
            comment = "(assumed not taken)";
         }
         else {
            /* Conservative default translation - end the block at
               this point. */
            jcc_01( (AMD64Condcode)(opc - 0x80),
                    guest_RIP_bbstart+delta,
                    d64 );
            dres.whatNext = Dis_StopHere;
         }
         DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
         break;
       }
18432
18433      /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
      case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
                 /* 0F 0D /1 -- prefetchw mem8 */
         /* No IR is generated: the prefetch is only a performance
            hint, so it is decoded (to get the length right) and
            otherwise ignored. */
         if (have66orF2orF3(pfx)) goto decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) goto decode_failure;
         if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
            goto decode_failure;

         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;

         switch (gregLO3ofRM(modrm)) {
            case 0: DIP("prefetch %s\n", dis_buf); break;
            case 1: DIP("prefetchw %s\n", dis_buf); break;
            default: vassert(0); /*NOTREACHED*/
         }
         break;
18451
18452      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
      case 0x31: { /* RDTSC */
         /* Read the time-stamp counter via a dirty helper returning a
            64-bit value, then split it into EDX:EAX.  (The helper is
            constructed before the prefix check; that is harmless
            since on failure the IRDirty is simply never emitted.) */
         IRTemp   val  = newTemp(Ity_I64);
         IRExpr** args = mkIRExprVec_0();
         IRDirty* d    = unsafeIRDirty_1_N (
                            val,
                            0/*regparms*/,
                            "amd64g_dirtyhelper_RDTSC",
                            &amd64g_dirtyhelper_RDTSC,
                            args
                         );
         if (have66orF2orF3(pfx)) goto decode_failure;
         /* execute the dirty call, dumping the result in val. */
         stmt( IRStmt_Dirty(d) );
         putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
         putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
         DIP("rdtsc\n");
         break;
      }
18471
18472//..       /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
18473//..
18474//..       case 0xA1: /* POP %FS */
18475//..          dis_pop_segreg( R_FS, sz ); break;
18476//..       case 0xA9: /* POP %GS */
18477//..          dis_pop_segreg( R_GS, sz ); break;
18478//..
18479//..       case 0xA0: /* PUSH %FS */
18480//..          dis_push_segreg( R_FS, sz ); break;
18481//..       case 0xA8: /* PUSH %GS */
18482//..          dis_push_segreg( R_GS, sz ); break;
18483
18484      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
      case 0x90:
      case 0x91:
      case 0x92: /* set-Bb/set-NAEb (set if below) */
      case 0x93: /* set-NBb/set-AEb (set if not below) */
      case 0x94: /* set-Zb/set-Eb (set if zero) */
      case 0x95: /* set-NZb/set-NEb (set if not zero) */
      case 0x96: /* set-BEb/set-NAb (set if below or equal) */
      case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
      case 0x98: /* set-Sb (set if negative) */
      case 0x99: /* set-NSb (set if not negative) */
      case 0x9A: /* set-P (set if parity even) */
      case 0x9B: /* set-NP (set if parity odd) */
      case 0x9C: /* set-Lb/set-NGEb (set if less) */
      case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
      case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
      case 0x9F: /* set-Gb/set-NLEb (set if greater) */
         if (haveF2orF3(pfx)) goto decode_failure;
         /* t1 = 0 or 1, depending on the condition (low nibble of
            opc); write it to a byte register or to memory. */
         t1 = newTemp(Ity_I8);
         assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putIRegE(1, pfx, modrm, mkexpr(t1));
            DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
                              nameIRegE(1,pfx,modrm));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            storeLE( mkexpr(addr), mkexpr(t1) );
            DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
         }
         break;
18517
18518      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
18519
      case 0xA4: /* SHLDv imm8,Gv,Ev */
         /* Double-width shift left by an immediate.  d64 is made to
            point at the imm8 byte which follows the amode, so the
            shift amount can be fetched (and printed) up front.
            NOTE(review): unlike the neighbouring arms there is no
            haveF2orF3 rejection here -- confirm whether F2/F3-prefixed
            SHLD/SHRD should really be accepted. */
         modrm = getUChar(delta);
         d64   = delta + lengthAMode(pfx, delta);
         vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
         delta = dis_SHLRD_Gv_Ev (
                    vbi, pfx, delta, modrm, sz,
                    mkU8(getUChar(d64)), True, /* literal */
                    dis_buf, True /* left */ );
         break;
      case 0xA5: /* SHLDv %cl,Gv,Ev */
         /* Double-width shift left, amount in %cl. */
         modrm = getUChar(delta);
         delta = dis_SHLRD_Gv_Ev (
                    vbi, pfx, delta, modrm, sz,
                    getIRegCL(), False, /* not literal */
                    "%cl", True /* left */ );
         break;

      case 0xAC: /* SHRDv imm8,Gv,Ev */
         /* Double-width shift right by an immediate (same imm8
            fetching scheme as 0xA4 above). */
         modrm = getUChar(delta);
         d64   = delta + lengthAMode(pfx, delta);
         vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
         delta = dis_SHLRD_Gv_Ev (
                    vbi, pfx, delta, modrm, sz,
                    mkU8(getUChar(d64)), True, /* literal */
                    dis_buf, False /* right */ );
         break;
      case 0xAD: /* SHRDv %cl,Gv,Ev */
         /* Double-width shift right, amount in %cl. */
         modrm = getUChar(delta);
         delta = dis_SHLRD_Gv_Ev (
                    vbi, pfx, delta, modrm, sz,
                    getIRegCL(), False, /* not literal */
                    "%cl", False /* right */);
         break;
18553
18554      /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
      case 0x05: /* SYSCALL */
         guest_RIP_next_mustcheck = True;
         guest_RIP_next_assumed = guest_RIP_bbstart + delta;
         /* SYSCALL stores the return address (the next insn) in RCX. */
         putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
         /* It's important that all guest state is up-to-date
            at this point.  So we declare an end-of-block here, which
            forces any cached guest state to be flushed. */
         jmp_lit(Ijk_Sys_syscall, guest_RIP_next_assumed);
         dres.whatNext = Dis_StopHere;
         DIP("syscall\n");
         break;
18566
18567      /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
18568
      case 0xC0: { /* XADD Gb,Eb */
         /* Exchange-and-add on a byte operand; all work is inside
            dis_xadd_G_E.  NOTE(review): no haveF2orF3 rejection here,
            unlike most neighbouring arms -- confirm whether the
            helper itself rejects prefixed forms. */
         Bool decode_OK = False;
         delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
         if (!decode_OK)
            goto decode_failure;
         break;
      }
      case 0xC1: { /* XADD Gv,Ev */
         /* Exchange-and-add on a 16/32/64-bit operand. */
         Bool decode_OK = False;
         delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
         if (!decode_OK)
            goto decode_failure;
         break;
      }
18583
18584      /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
18585
      case 0x71:
      case 0x72:
      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */

      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xFC:
      case 0xFD:
      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xEC:
      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xDC:
      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xF8:
      case 0xF9:
      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xE8:
      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xD8:
      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */

      case 0x74:
      case 0x75:
      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0x64:
      case 0x65:
      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */

      case 0x68:
      case 0x69:
      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0x60:
      case 0x61:
      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */

      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xF2:
      case 0xF3:

      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xD2:
      case 0xD3:

      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xE2:
      {
         /* All MMX instructions are handed off wholesale to dis_MMX,
            re-presenting the opcode byte (hence delta-1).  delta0
            remembers the pre-opcode position so delta can be rolled
            back if dis_MMX rejects the instruction. */
         Long delta0    = delta-1;
         Bool decode_OK = False;

         /* If sz==2 this is SSE, and we assume sse idec has
            already spotted those cases by now. */
         if (sz != 4 && sz != 8)
            goto decode_failure;
         if (have66orF2orF3(pfx))
            goto decode_failure;

         delta = dis_MMX ( &decode_OK, vbi, pfx, sz, delta-1 );
         if (!decode_OK) {
            delta = delta0;
            goto decode_failure;
         }
         break;
      }
18673
18674      case 0x0E: /* FEMMS */
18675      case 0x77: /* EMMS */
18676         if (sz != 4)
18677            goto decode_failure;
18678         do_EMMS_preamble();
18679         DIP("{f}emms\n");
18680         break;
18681
18682      /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
18683      case 0x01: /* 0F 01 /0 -- SGDT */
18684                 /* 0F 01 /1 -- SIDT */
18685      {
18686          /* This is really revolting, but ... since each processor
18687             (core) only has one IDT and one GDT, just let the guest
18688             see it (pass-through semantics).  I can't see any way to
18689             construct a faked-up value, so don't bother to try. */
18690         modrm = getUChar(delta);
18691         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
18692         delta += alen;
18693         if (epartIsReg(modrm)) goto decode_failure;
18694         if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
18695            goto decode_failure;
18696         switch (gregLO3ofRM(modrm)) {
18697            case 0: DIP("sgdt %s\n", dis_buf); break;
18698            case 1: DIP("sidt %s\n", dis_buf); break;
18699            default: vassert(0); /*NOTREACHED*/
18700         }
18701
18702         IRDirty* d = unsafeIRDirty_0_N (
18703                          0/*regparms*/,
18704                          "amd64g_dirtyhelper_SxDT",
18705                          &amd64g_dirtyhelper_SxDT,
18706                          mkIRExprVec_2( mkexpr(addr),
18707                                         mkU64(gregLO3ofRM(modrm)) )
18708                      );
18709         /* declare we're writing memory */
18710         d->mFx   = Ifx_Write;
18711         d->mAddr = mkexpr(addr);
18712         d->mSize = 6;
18713         stmt( IRStmt_Dirty(d) );
18714         break;
18715      }
18716
18717      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
18718
18719      default:
18720         goto decode_failure;
18721   } /* switch (opc) for the 2-byte opcodes */
18722   goto decode_success;
18723   } /* case 0x0F: of primary opcode */
18724
18725   /* ------------------------ ??? ------------------------ */
18726
18727  default:
18728  decode_failure:
18729   /* All decode failures end up here. */
18730   vex_printf("vex amd64->IR: unhandled instruction bytes: "
18731              "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
18732              (Int)getUChar(delta_start+0),
18733              (Int)getUChar(delta_start+1),
18734              (Int)getUChar(delta_start+2),
18735              (Int)getUChar(delta_start+3),
18736              (Int)getUChar(delta_start+4),
18737              (Int)getUChar(delta_start+5),
18738              (Int)getUChar(delta_start+6),
18739              (Int)getUChar(delta_start+7) );
18740
18741   /* Tell the dispatcher that this insn cannot be decoded, and so has
18742      not been executed, and (is currently) the next to be executed.
      RIP should be up-to-date since it was made so at the start of each
18744      insn, but nevertheless be paranoid and update it again right
18745      now. */
18746   stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
18747   jmp_lit(Ijk_NoDecode, guest_RIP_curr_instr);
18748   dres.whatNext = Dis_StopHere;
18749   dres.len      = 0;
18750   /* We also need to say that a CAS is not expected now, regardless
18751      of what it might have been set to at the start of the function,
      since the IR that we've emitted just above (to synthesize a
18753      SIGILL) does not involve any CAS, and presumably no other IR has
18754      been emitted for this (non-decoded) insn. */
18755   *expect_CAS = False;
18756   return dres;
18757
18758   } /* switch (opc) for the main (primary) opcode switch. */
18759
18760  decode_success:
18761   /* All decode successes end up here. */
18762   DIP("\n");
18763   dres.len = (Int)toUInt(delta - delta_start);
18764   return dres;
18765}
18766
18767#undef DIP
18768#undef DIS
18769
18770
18771/*------------------------------------------------------------*/
18772/*--- Top-level fn                                         ---*/
18773/*------------------------------------------------------------*/
18774
18775/* Disassemble a single instruction into IR.  The instruction
18776   is located in host memory at &guest_code[delta]. */
18777
18778DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
18779                           Bool         put_IP,
18780                           Bool         (*resteerOkFn) ( void*, Addr64 ),
18781                           Bool         resteerCisOk,
18782                           void*        callback_opaque,
18783                           UChar*       guest_code_IN,
18784                           Long         delta,
18785                           Addr64       guest_IP,
18786                           VexArch      guest_arch,
18787                           VexArchInfo* archinfo,
18788                           VexAbiInfo*  abiinfo,
18789                           Bool         host_bigendian_IN )
18790{
18791   Int       i, x1, x2;
18792   Bool      expect_CAS, has_CAS;
18793   DisResult dres;
18794
18795   /* Set globals (see top of this file) */
18796   vassert(guest_arch == VexArchAMD64);
18797   guest_code           = guest_code_IN;
18798   irsb                 = irsb_IN;
18799   host_is_bigendian    = host_bigendian_IN;
18800   guest_RIP_curr_instr = guest_IP;
18801   guest_RIP_bbstart    = guest_IP - delta;
18802
18803   /* We'll consult these after doing disInstr_AMD64_WRK. */
18804   guest_RIP_next_assumed   = 0;
18805   guest_RIP_next_mustcheck = False;
18806
18807   x1 = irsb_IN->stmts_used;
18808   expect_CAS = False;
18809   dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
18810                               resteerCisOk,
18811                               callback_opaque,
18812                               delta, archinfo, abiinfo );
18813   x2 = irsb_IN->stmts_used;
18814   vassert(x2 >= x1);
18815
18816   /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
18817      got it right.  Failure of this assertion is serious and denotes
18818      a bug in disInstr. */
18819   if (guest_RIP_next_mustcheck
18820       && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
18821      vex_printf("\n");
18822      vex_printf("assumed next %%rip = 0x%llx\n",
18823                 guest_RIP_next_assumed );
18824      vex_printf(" actual next %%rip = 0x%llx\n",
18825                 guest_RIP_curr_instr + dres.len );
18826      vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
18827   }
18828
18829   /* See comment at the top of disInstr_AMD64_WRK for meaning of
18830      expect_CAS.  Here, we (sanity-)check for the presence/absence of
18831      IRCAS as directed by the returned expect_CAS value. */
18832   has_CAS = False;
18833   for (i = x1; i < x2; i++) {
18834      if (irsb_IN->stmts[i]->tag == Ist_CAS)
18835         has_CAS = True;
18836   }
18837
18838   if (expect_CAS != has_CAS) {
18839      /* inconsistency detected.  re-disassemble the instruction so as
18840         to generate a useful error message; then assert. */
18841      vex_traceflags |= VEX_TRACE_FE;
18842      dres = disInstr_AMD64_WRK ( &expect_CAS, put_IP, resteerOkFn,
18843                                  resteerCisOk,
18844                                  callback_opaque,
18845                                  delta, archinfo, abiinfo );
18846      for (i = x1; i < x2; i++) {
18847         vex_printf("\t\t");
18848         ppIRStmt(irsb_IN->stmts[i]);
18849         vex_printf("\n");
18850      }
18851      /* Failure of this assertion is serious and denotes a bug in
18852         disInstr. */
18853      vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
18854   }
18855
18856   return dres;
18857}
18858
18859
18860/*------------------------------------------------------------*/
18861/*--- Unused stuff                                         ---*/
18862/*------------------------------------------------------------*/
18863
18864// A potentially more Memcheck-friendly version of gen_LZCNT, if
18865// this should ever be needed.
18866//
18867//static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
18868//{
18869//   /* Scheme is simple: propagate the most significant 1-bit into all
18870//      lower positions in the word.  This gives a word of the form
18871//      0---01---1.  Now invert it, giving a word of the form
18872//      1---10---0, then do a population-count idiom (to count the 1s,
18873//      which is the number of leading zeroes, or the word size if the
18874//      original word was 0.
18875//   */
18876//   Int i;
18877//   IRTemp t[7];
18878//   for (i = 0; i < 7; i++) {
18879//      t[i] = newTemp(ty);
18880//   }
18881//   if (ty == Ity_I64) {
18882//      assign(t[0], binop(Iop_Or64, mkexpr(src),
18883//                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
18884//      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
18885//                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
18886//      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
18887//                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
18888//      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
18889//                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
18890//      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
18891//                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
18892//      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
18893//                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
18894//      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
18895//      return gen_POPCOUNT(ty, t[6]);
18896//   }
18897//   if (ty == Ity_I32) {
18898//      assign(t[0], binop(Iop_Or32, mkexpr(src),
18899//                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
18900//      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
18901//                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
18902//      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
18903//                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
18904//      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
18905//                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
18906//      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
18907//                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
18908//      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
18909//      return gen_POPCOUNT(ty, t[5]);
18910//   }
18911//   if (ty == Ity_I16) {
18912//      assign(t[0], binop(Iop_Or16, mkexpr(src),
18913//                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
18914//      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
18915//                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
18916//      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
18917//                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
18918//      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
18919//                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
18920//      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
18921//      return gen_POPCOUNT(ty, t[4]);
18922//   }
18923//   vassert(0);
18924//}
18925
18926
18927/*--------------------------------------------------------------------*/
18928/*--- end                                       guest_amd64_toIR.c ---*/
18929/*--------------------------------------------------------------------*/
18930