
/*--------------------------------------------------------------------*/
/*--- begin                                     guest_amd64_toIR.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2012 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/
/* Translates AMD64 code to IR. */

/* TODO:

   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
   to ensure a 64-bit value is being written.

   x87 FP Limitations:

   * all arithmetic done at 64 bits

   * no FP exceptions, except for handling stack over/underflow

   * FP rounding mode observed only for float->int conversions and
     int->float conversions which could lose accuracy, and for
     float-to-float rounding.  For all other operations,
     round-to-nearest is used, regardless.

   * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
     simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
     even when it isn't.

   * some of the FCOM cases could do with testing -- not convinced
     that the args are the right way round.

   * FSAVE does not re-initialise the FPU; it should do

   * FINIT not only initialises the FPU environment, it also zeroes
     all the FP registers.  It should leave the registers unchanged.

    RDTSC returns zero, always.

    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
    only way to observe eflags[1], a proper fix would be to make that
    bit be set by PUSHF.

    This module uses global variables and so is not MT-safe (if that
    should ever become relevant).
*/

/* Notes re address size overrides (0x67).

   According to the AMD documentation (24594 Rev 3.09, Sept 2003,
   "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
   and System Instructions"), Section 1.2.3 ("Address-Size Override
   Prefix"):

   0x67 applies to all explicit memory references, causing the top
   32 bits of the effective address to become zero.

   0x67 has no effect on stack references (push/pop); these always
   use a 64-bit address.

   0x67 changes the interpretation of instructions which implicitly
   reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
   instead.  These are:

      cmp{s,sb,sw,sd,sq}
      in{s,sb,sw,sd}
      jcxz, jecxz, jrcxz
      lod{s,sb,sw,sd,sq}
      loop{,e,bz,be,z}
      mov{s,sb,sw,sd,sq}
      out{s,sb,sw,sd}
      rep{,e,ne,nz}
      sca{s,sb,sw,sd,sq}
      sto{s,sb,sw,sd,sq}
      xlat{,b} */

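/* A hand-worked example (not taken from the AMD manual): in 64-bit
   mode the bytes 8B 07 decode as "mov (%rax),%eax", whereas
   67 8B 07 decodes as "mov (%eax),%eax" -- the effective address is
   computed from %rax with its top 32 bits forced to zero. */
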
/* "Special" instructions.

   This instruction decoder can decode three special instructions
   which mean nothing natively (are no-ops as far as regs/mem are
   concerned) but have meaning for supporting Valgrind.  A special
   instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
   48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
   $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi -- the
   rotate amounts sum to 128, a multiple of 64, which is why the
   sequence leaves %rdi unchanged).  Following that, one of the
   following 3 is allowed (standard interpretation in parentheses):

      4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
      4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
      4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX

   Any other bytes following the 16-byte preamble are illegal and
   constitute a failure in instruction decoding.  This all assumes
   that the preamble will never occur except in specific code
   fragments designed for Valgrind to catch.

   No prefixes may precede a "Special" instruction.
*/

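/* For reference, a sketch of what marked-up client code looks like
   at the byte level -- the 16 preamble bytes quoted above, expressed
   as a C array (the array name is purely illustrative):

      static const UChar special_preamble[16] = {
         0x48,0xC1,0xC7,0x03,  0x48,0xC1,0xC7,0x0D,
         0x48,0xC1,0xC7,0x3D,  0x48,0xC1,0xC7,0x33
      };

   followed by, e.g., the bytes 0x48,0x87,0xDB (xchgq %rbx,%rbx) to
   make a client request. */
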
/* casLE (implementation of lock-prefixed insns) and rep-prefixed
   insns: the side-exit back to the start of the insn is done with
   Ijk_Boring.  This is quite wrong; it should be done with
   Ijk_NoRedir, since otherwise the side exit, which is intended to
   restart the instruction for whatever reason, could go somewhere
   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
   no-redir jumps performance critical, at least for rep-prefixed
   instructions, since all iterations thereof would involve such a
   jump.  It's not such a big deal with casLE since the side exit is
   only taken if the CAS fails, that is, the location is contended,
   which is relatively unlikely.

   Note also, the test for CAS success vs failure is done using
   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
   shouldn't definedness-check these comparisons.  See
   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
   background/rationale.
*/

/* LOCK prefixed instructions.  These are translated using IR-level
   CAS statements (IRCAS) and are believed to preserve atomicity, even
   from the point of view of some other process racing against a
   simulated one (presumably they communicate via a shared memory
   segment).

   Handlers which are aware of LOCK prefixes are:
      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
      dis_cmpxchg_G_E  (cmpxchg)
      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
      dis_Grp3         (not, neg)
      dis_Grp4         (inc, dec)
      dis_Grp5         (inc, dec)
      dis_Grp8_Imm     (bts, btc, btr)
      dis_bt_G_E       (bts, btc, btr)
      dis_xadd_G_E     (xadd)
*/


#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "libvex_guest_amd64.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_generic_x87.h"
#include "guest_amd64_defs.h"


/*------------------------------------------------------------*/
/*--- Globals                                              ---*/
/*------------------------------------------------------------*/

/* These are set at the start of the translation of an insn, right
   down in disInstr_AMD64, so that we don't have to pass them around
   endlessly.  They are all constant during the translation of any
   given insn. */

/* These are set at the start of the translation of a BB, so
   that we don't have to pass them around endlessly. */

/* We need to know this to do sub-register accesses correctly. */
static Bool host_is_bigendian;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed). */
static UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr64 guest_RIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr64 guest_RIP_curr_instr;

/* The IRSB* into which we're generating code. */
static IRSB* irsb;

/* For ensuring that %rip-relative addressing is done right.  A read
   of %rip generates the address of the next instruction.  It may be
   that we don't conveniently know that inside disAMode().  For sanity
   checking, if the next insn %rip is needed, we make a guess at what
   it is, record that guess here, and set the accompanying Bool to
   indicate that -- after this insn's decode is finished -- that guess
   needs to be checked.  */

/* At the start of each insn decode, these are set to (0, False).
   After the decode, if _mustcheck is now True, _assumed is
   checked. */

static Addr64 guest_RIP_next_assumed;
static Bool   guest_RIP_next_mustcheck;


/*------------------------------------------------------------*/
/*--- Helpers for constructing IR.                         ---*/
/*------------------------------------------------------------*/

/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}

static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

static IRExpr* mkU ( IRType ty, ULong i )
{
   switch (ty) {
      case Ity_I8:  return mkU8(i);
      case Ity_I16: return mkU16(i);
      case Ity_I32: return mkU32(i);
      case Ity_I64: return mkU64(i);
      default: vpanic("mkU(amd64)");
   }
}

static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}

static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_Not8 );
   switch (ty) {
      case Ity_I8:  return 0 +op8;
      case Ity_I16: return 1 +op8;
      case Ity_I32: return 2 +op8;
      case Ity_I64: return 3 +op8;
      default: vpanic("mkSizedOp(amd64)");
   }
}

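/* Note that mkSizedOp relies on the 8/16/32/64-bit variants of each
   of the listed IROps being declared consecutively in libvex_ir.h, so
   that adding 0..3 to the 8-bit op selects the sized version.  A
   worked example (assuming that enum layout):

      mkSizedOp(Ity_I32, Iop_Add8) == Iop_Add32
      mkSizedOp(Ity_I64, Iop_Xor8) == Iop_Xor64
*/
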
static
IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
{
   if (szSmall == 1 && szBig == 4) {
      return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
   }
   if (szSmall == 1 && szBig == 2) {
      return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
   }
   if (szSmall == 2 && szBig == 4) {
      return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
   }
   if (szSmall == 1 && szBig == 8 && !signd) {
      return unop(Iop_8Uto64, src);
   }
   if (szSmall == 1 && szBig == 8 && signd) {
      return unop(Iop_8Sto64, src);
   }
   if (szSmall == 2 && szBig == 8 && !signd) {
      return unop(Iop_16Uto64, src);
   }
   if (szSmall == 2 && szBig == 8 && signd) {
      return unop(Iop_16Sto64, src);
   }
   vpanic("doScalarWidening(amd64)");
}



/*------------------------------------------------------------*/
/*--- Debugging output                                     ---*/
/*------------------------------------------------------------*/

/* Bomb out if we can't handle something. */
__attribute__ ((noreturn))
static void unimplemented ( HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}

#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)

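/* An illustrative use (helper names as used later in this file): a
   decoder case typically finishes with something like

      DIP("mov%c %s,%s\n", nameISize(sz), nameIRegE(sz,pfx,modrm),
                           nameIRegG(sz,pfx,modrm));

   which produces disassembly output only when VEX_TRACE_FE is set in
   vex_traceflags. */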

/*------------------------------------------------------------*/
/*--- Offsets of various parts of the amd64 guest state.   ---*/
/*------------------------------------------------------------*/

#define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
#define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
#define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
#define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
#define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
#define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
#define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
#define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
#define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
#define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
#define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
#define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
#define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
#define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
#define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
#define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)

#define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)

#define OFFB_FS_ZERO   offsetof(VexGuestAMD64State,guest_FS_ZERO)
#define OFFB_GS_0x60   offsetof(VexGuestAMD64State,guest_GS_0x60)

#define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)

#define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
#define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)

#define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
#define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
#define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
#define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
#define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
#define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
#define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
#define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
#define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
#define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
#define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
#define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
#define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
#define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
#define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
#define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
#define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
#define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)

#define OFFB_EMWARN    offsetof(VexGuestAMD64State,guest_EMWARN)
#define OFFB_TISTART   offsetof(VexGuestAMD64State,guest_TISTART)
#define OFFB_TILEN     offsetof(VexGuestAMD64State,guest_TILEN)

#define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)


/*------------------------------------------------------------*/
/*--- Helper bits and pieces for deconstructing the        ---*/
/*--- amd64 insn stream.                                   ---*/
/*------------------------------------------------------------*/

/* This is the AMD64 register encoding -- integer regs. */
#define R_RAX 0
#define R_RCX 1
#define R_RDX 2
#define R_RBX 3
#define R_RSP 4
#define R_RBP 5
#define R_RSI 6
#define R_RDI 7
#define R_R8  8
#define R_R9  9
#define R_R10 10
#define R_R11 11
#define R_R12 12
#define R_R13 13
#define R_R14 14
#define R_R15 15

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5


/* Various simple conversions */

static ULong extend_s_8to64 ( UChar x )
{
   return (ULong)((((Long)x) << 56) >> 56);
}

static ULong extend_s_16to64 ( UShort x )
{
   return (ULong)((((Long)x) << 48) >> 48);
}

static ULong extend_s_32to64 ( UInt x )
{
   return (ULong)((((Long)x) << 32) >> 32);
}

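/* Worked example: extend_s_8to64(0x80) shifts 0x80 up to the top byte
   of a signed 64-bit value and arithmetic-shifts it back down,
   yielding 0xFFFFFFFFFFFFFF80 -- i.e. -128 as a 64-bit value --
   whereas extend_s_8to64(0x7F) yields 0x000000000000007F. */
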
/* Figure out whether the mod and rm parts of a modRM byte refer to a
   register or memory.  If so, the byte will have the form 11XXXYYY,
   where YYY is the register number. */
inline
static Bool epartIsReg ( UChar mod_reg_rm )
{
   return toBool(0xC0 == (mod_reg_rm & 0xC0));
}

/* Extract the 'g' field from a modRM byte.  This only produces 3
   bits, which is not a complete register number.  You should avoid
   this function if at all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)( (mod_reg_rm >> 3) & 7 );
}

/* Ditto the 'e' field of a modRM byte. */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)(mod_reg_rm & 0x7);
}

/* Get an 8/16/32-bit unsigned value out of the insn stream. */

static inline UChar getUChar ( Long delta )
{
   UChar v = guest_code[delta+0];
   return v;
}

static UInt getUDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v & 0xFFFF;
}

//.. static UInt getUDisp ( Int size, Long delta )
//.. {
//..    switch (size) {
//..       case 4: return getUDisp32(delta);
//..       case 2: return getUDisp16(delta);
//..       case 1: return getUChar(delta);
//..       default: vpanic("getUDisp(x86)");
//..    }
//..    return 0; /*notreached*/
//.. }


/* Get a byte value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}

/* Get a 16-bit value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return extend_s_16to64( (UShort)v );
}

/* Get a 32-bit value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp32 ( Long delta )
{
   UInt v = guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return extend_s_32to64( v );
}

/* Get a 64-bit value out of the insn stream. */
static Long getDisp64 ( Long delta )
{
   ULong v = 0;
   v |= guest_code[delta+7]; v <<= 8;
   v |= guest_code[delta+6]; v <<= 8;
   v |= guest_code[delta+5]; v <<= 8;
   v |= guest_code[delta+4]; v <<= 8;
   v |= guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v;
}

/* Note: because AMD64 doesn't allow 64-bit literals, it is an error
   if this is called with size==8.  Should not happen. */
static Long getSDisp ( Int size, Long delta )
{
   switch (size) {
      case 4: return getSDisp32(delta);
      case 2: return getSDisp16(delta);
      case 1: return getSDisp8(delta);
      default: vpanic("getSDisp(amd64)");
   }
}

static ULong mkSizeMask ( Int sz )
{
   switch (sz) {
      case 1: return 0x00000000000000FFULL;
      case 2: return 0x000000000000FFFFULL;
      case 4: return 0x00000000FFFFFFFFULL;
      case 8: return 0xFFFFFFFFFFFFFFFFULL;
      default: vpanic("mkSizeMask(amd64)");
   }
}

static Int imin ( Int a, Int b )
{
   return (a < b) ? a : b;
}

static IRType szToITy ( Int n )
{
   switch (n) {
      case 1: return Ity_I8;
      case 2: return Ity_I16;
      case 4: return Ity_I32;
      case 8: return Ity_I64;
      default: vex_printf("\nszToITy(%d)\n", n);
               vpanic("szToITy(amd64)");
   }
}


/*------------------------------------------------------------*/
/*--- For dealing with prefixes.                           ---*/
/*------------------------------------------------------------*/

/* The idea is to pass around an int holding a bitmask summarising
   info from the prefixes seen on the current instruction, including
   info from the REX byte.  This info is used in various places, but
   most especially when making sense of register fields in
   instructions.

   The top 8 bits of the prefix are 0x55, just as a hacky way to
   ensure it really is a valid prefix.

   Things you can safely assume about a well-formed prefix:
   * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
   * if REX is not present then REXW,REXR,REXX,REXB will read
     as zero.
   * F2 and F3 will not both be 1.
*/

typedef UInt  Prefix;

#define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
#define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
#define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
#define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
#define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
#define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
#define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
#define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
#define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
#define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
#define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
#define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
#define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
#define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
#define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
#define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
#define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
/* The extra register field VEX.vvvv is encoded (after not-ing it) as
   PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
   positions. */
#define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
#define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
#define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
#define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */


#define PFX_EMPTY 0x55000000

static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFF000000) == PFX_EMPTY);
}

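/* Worked example (not from the manuals): after scanning the prefix
   bytes 66 48 on an instruction, the decoder would be holding the
   value PFX_EMPTY | PFX_66 | PFX_REX | PFX_REXW, for which
   IS_VALID_PFX returns True, and have66(pfx) and getRexW(pfx) == 1
   both hold. */
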
static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}

static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}

/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}

static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}

/* Return True iff pfx has 66 set and F2 and F3 clear */
static Bool have66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
}

/* Return True iff pfx has F2 set and 66 and F3 clear */
static Bool haveF2no66noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has F3 set and 66 and F2 clear */
static Bool haveF3no66noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F3 set and F2 clear */
static Bool haveF3noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F2 set and F3 clear */
static Bool haveF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has 66, F2 and F3 clear */
static Bool haveNo66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
}

/* Return True iff pfx has any of 66, F2 and F3 set */
static Bool have66orF2orF3 ( Prefix pfx )
{
  return toBool( ! haveNo66noF2noF3(pfx) );
}

/* Return True iff pfx has 66 or F2 set */
static Bool have66orF2 ( Prefix pfx )
{
   return toBool((pfx & (PFX_66|PFX_F2)) > 0);
}

/* Clear all the segment-override bits in a prefix. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}

/* Get the (inverted, hence back to "normal") VEX.vvvv field. */
static UInt getVexNvvvv ( Prefix pfx ) {
   UInt r = (UInt)pfx;
   r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
   return r & 0xF;
}

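/* Since PFX_VEXnV0..PFX_VEXnV3 occupy bits 18..21 (adjacent, as
   required above), the division by PFX_VEXnV0 is exactly a right
   shift by 18, so getVexNvvvv simply extracts those four bits as a
   register number in the range 0..15. */
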
static Bool haveVEX ( Prefix pfx ) {
   return toBool(pfx & PFX_VEX);
}

static Int getVexL ( Prefix pfx ) {
   return (pfx & PFX_VEXL) ? 1 : 0;
}


/*------------------------------------------------------------*/
/*--- For dealing with escapes                             ---*/
/*------------------------------------------------------------*/


/* Escapes come after the prefixes, but before the primary opcode
   byte.  They escape the primary opcode byte into a bigger space.
   The 0xF0000000 isn't significant, except so as to make it not
   overlap valid Prefix values, for sanity checking.
*/

typedef
   enum {
      ESC_NONE=0xF0000000, // none
      ESC_0F,              // 0F
      ESC_0F38,            // 0F 38
      ESC_0F3A             // 0F 3A
   }
   Escape;


/*------------------------------------------------------------*/
/*--- For dealing with integer registers                   ---*/
/*------------------------------------------------------------*/

/* This is somewhat complex.  The rules are:

   For 64, 32 and 16 bit register references, the e or g fields in the
   modrm bytes supply the low 3 bits of the register number.  The
   fourth (most-significant) bit of the register number is supplied by
   the REX byte, if it is present; else that bit is taken to be zero.

   The REX.R bit supplies the high bit corresponding to the g register
   field, and the REX.B bit supplies the high bit corresponding to the
   e register field (when the mod part of modrm indicates that modrm's
   e component refers to a register and not to memory).

   The REX.X bit supplies a high register bit for certain registers
   in SIB address modes, and is generally rarely used.

   For 8 bit register references, the presence of the REX byte itself
   has significance.  If there is no REX present, then the 3-bit
   number extracted from the modrm e or g field is treated as an index
   into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
   old x86 encoding scheme.

   But if there is a REX present, the register reference is
   interpreted in the same way as for 64/32/16-bit references: a high
   bit is extracted from REX, giving a 4-bit number, and the denoted
   register is the lowest 8 bits of the 16 integer registers denoted
   by the number.  In particular, values 3 through 7 of this sequence
   do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
   %rsp %rbp %rsi %rdi.

   The REX.W bit has no bearing at all on register numbers.  Instead
   its presence indicates that the operand size is to be overridden
   from its default value (32 bits) to 64 bits instead.  This is in
   the same fashion that an 0x66 prefix indicates the operand size is
   to be overridden from 32 bits down to 16 bits.  When both REX.W and
   0x66 are present there is a conflict, and REX.W takes precedence.

   Rather than try to handle this complexity using a single huge
   function, several smaller ones are provided.  The aim is to make it
   as difficult as possible to screw up register decoding in a subtle
   and hard-to-track-down way.

   Because these routines fish around in the host's memory (that is,
   in the guest state area) for sub-parts of guest registers, their
   correctness depends on the host's endianness.  So far these
   routines only work for little-endian hosts.  Those for which
   endianness is important have assertions to ensure sanity.
*/

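/* Two hand-worked examples of the above rules: with no REX present,
   an 8-bit reference to reg-field value 4 denotes %ah; with a REX
   byte present (even one with all of W/R/X/B zero, i.e. 0x40), the
   same value 4 denotes %spl.  And a 64-bit e-field value 5 combined
   with REX.B == 1 denotes register 13, i.e. %r13. */
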

/* About the simplest question you can ask: where do the 64-bit
   integer registers live (in the guest state) ? */

static Int integerGuestReg64Offset ( UInt reg )
{
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}


/* Produce the name of an integer register, for printing purposes.
   reg is a number in the range 0 .. 15 that has been generated from a
   3-bit reg-field number and a REX extension bit.  irregular denotes
   the case where sz==1 and no REX byte is present. */

static
HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   static HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   static HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}

/* Using the same argument conventions as nameIReg, produce the
   guest state offset of an integer register. */

static
Int offsetIReg ( Int sz, UInt reg, Bool irregular )
{
   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   /* Deal with irregular case -- sz==1 and no REX present */
   if (sz == 1 && irregular) {
      switch (reg) {
         case R_RSP: return 1+ OFFB_RAX;
         case R_RBP: return 1+ OFFB_RCX;
         case R_RSI: return 1+ OFFB_RDX;
         case R_RDI: return 1+ OFFB_RBX;
         default:    break; /* use the normal case */
      }
   }

   /* Normal case */
   return integerGuestReg64Offset(reg);
}

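/* So, continuing the irregular-case example: on a little-endian host,
   offsetIReg(1, R_RBP, True) is 1 + OFFB_RCX, i.e. the second-lowest
   byte of the guest %rcx slot, which is exactly %ch. */
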

/* Read the %CL register :: Ity_I8, for shift/rotate operations. */

static IRExpr* getIRegCL ( void )
{
   vassert(!host_is_bigendian);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}


/* Write to the %AH register. */

static void putIRegAH ( IRExpr* e )
{
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}


/* Read/write various widths of %RAX, as it has various
   special-purpose uses. */

static HChar* nameIRegRAX ( Int sz )
{
   switch (sz) {
      case 1: return "%al";
      case 2: return "%ax";
      case 4: return "%eax";
      case 8: return "%rax";
      default: vpanic("nameIRegRAX(amd64)");
   }
}

static IRExpr* getIRegRAX ( Int sz )
{
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
      default: vpanic("getIRegRAX(amd64)");
   }
}

static void putIRegRAX ( Int sz, IRExpr* e )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, e);
   vassert(!host_is_bigendian);
   switch (sz) {
      case 8: vassert(ty == Ity_I64);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 4: vassert(ty == Ity_I32);
              stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
              break;
      case 2: vassert(ty == Ity_I16);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 1: vassert(ty == Ity_I8);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      default: vpanic("putIRegRAX(amd64)");
   }
}


/* Read/write various widths of %RDX, as it has various
   special-purpose uses. */

static HChar* nameIRegRDX ( Int sz )
{
   switch (sz) {
      case 1: return "%dl";
      case 2: return "%dx";
      case 4: return "%edx";
      case 8: return "%rdx";
      default: vpanic("nameIRegRDX(amd64)");
   }
}

static IRExpr* getIRegRDX ( Int sz )
{
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
      default: vpanic("getIRegRDX(amd64)");
   }
}

static void putIRegRDX ( Int sz, IRExpr* e )
{
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   switch (sz) {
      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
              break;
      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      default: vpanic("putIRegRDX(amd64)");
   }
}


/* Simplistic functions to deal with the integer registers as a
   straightforward bank of 16 64-bit regs. */

static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}

static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}

static HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}


/* Simplistic functions to deal with the lower halves of integer
   registers as a straightforward bank of 16 32-bit regs. */

static IRExpr* getIReg32 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}

static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}

static HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}


/* Simplistic functions to deal with the lower quarters of integer
   registers as a straightforward bank of 16 16-bit regs. */

static IRExpr* getIReg16 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}

static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}

static HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}


/* Sometimes what we know is a 3-bit register number, a REX byte, and
   which field of the REX byte is to be used to extend to a 4-bit
   number.  These functions cater for that situation.
*/
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}

static HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}

static HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}

static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get(
                     offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                     toBool(sz==1 && !haveREX(pfx)) ),
                     szToITy(sz)
                 )
             );
   } else {
      return IRExpr_Get(
                offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                toBool(sz==1 && !haveREX(pfx)) ),
                szToITy(sz)
             );
   }
}

static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   stmt( IRStmt_Put(
            offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                            toBool(sz==1 && !haveREX(pfx)) ),
            sz==4 ? unop(Iop_32Uto64,e) : e
   ));
}


/* Functions for getting register numbers from modrm bytes and REX
   when we don't have to consider the complexities of integer subreg
   accesses.
*/
/* Extract the g reg field from a modRM byte, and augment it using the
   REX.R bit from the supplied REX byte.  The R bit usually is
   associated with the g register field.
*/
static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
{
   Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   reg += (pfx & PFX_REXR) ? 8 : 0;
   return reg;
}

/* Extract the e reg field from a modRM byte, and augment it using the
   REX.B bit from the supplied REX byte.  The B bit usually is
   associated with the e register field (when modrm indicates e is a
   register, that is).
*/
static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
{
   Int rm;
   vassert(epartIsReg(mod_reg_rm));
   rm = (Int)(mod_reg_rm & 0x7);
   rm += (pfx & PFX_REXB) ? 8 : 0;
   return rm;
}


/* General functions for dealing with integer register access. */

/* Produce the guest state offset for a reference to the 'g' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.
*/
static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(!host_is_bigendian);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = gregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

static
IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

static
void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
}

static
HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


/* Produce the guest state offset for a reference to the 'e' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   denotes a memory access rather than a register access.
*/
static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(!host_is_bigendian);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = eregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

static
IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

static
void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
}

static
HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


/*------------------------------------------------------------*/
/*--- For dealing with XMM and YMM registers               ---*/
/*------------------------------------------------------------*/

static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}

static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   return ymmGuestRegOffset( xmmreg );
}

/* Lanes of vector registers are always numbered from zero, with zero
   being the least significant lane (rightmost in the register).  */

static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}

static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}

static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}

static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
}

static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
}

static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
}

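/* Worked example of the lane arithmetic: on a little-endian host,
   ymmGuestRegLane128offset(3, 1) is OFFB_YMM3 + 16, i.e. the upper
   128-bit half of %ymm3, and xmmGuestRegLane32offset(1, 3) is
   OFFB_YMM1 + 12, the most significant 32-bit lane of %xmm1. */
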
static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}

static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}

static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}

static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}

static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}

static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}

static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}

static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

static IRExpr* getYMMReg ( UInt ymmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(ymmreg), Ity_V256 );
}

static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}

static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}

static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}

static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}

static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}

/* Write the low half of a YMM reg and zero out the upper half. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   putYMMRegLane128( ymmreg, 0, e );
   putYMMRegLane128( ymmreg, 1, mkV128(0) );
}

static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
{
   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   return unop(Iop_64to1,
               binop(Iop_And64,
                     unop(Iop_1Uto64,x),
                     unop(Iop_1Uto64,y)));
}

/* Generate a compare-and-swap operation, operating on memory at
   'addr'.  The expected value is 'expVal' and the new value is
   'newVal'.  If the operation fails, then transfer control (with a
   no-redir jump (XXX no -- see comment at top of this file)) to
   'restart_point', which is presumably the address of the guest
   instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point ),
            OFFB_RIP
         ));
}

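/* A minimal sketch (not a quote of any particular handler) of how
   the LOCK-aware handlers listed earlier use casLE: to translate,
   say, "lock incl (mem)", with 'addr' an IRTemp holding the guest
   address, they load the old value, compute the new one, and let
   casLE restart the instruction if the location changed underneath
   us:

      IRTemp oldv = newTemp(Ity_I32);
      IRTemp newv = newTemp(Ity_I32);
      assign(oldv, loadLE(Ity_I32, mkexpr(addr)));
      assign(newv, binop(Iop_Add32, mkexpr(oldv), mkU32(1)));
      casLE(mkexpr(addr), mkexpr(oldv), mkexpr(newv),
            guest_RIP_curr_instr);

   after which the flags thunk is set from oldv/newv as usual. */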

/*------------------------------------------------------------*/
/*--- Helpers for %rflags.                                 ---*/
/*------------------------------------------------------------*/

/* -------------- Evaluating the flags-thunk. -------------- */

/* Build IR to calculate all the eflags from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_I64. */
static IRExpr* mk_amd64g_calculate_rflags_all ( void )
{
   IRExpr** args
      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
           args
        );
   /* Exclude OP and NDEP from definedness checking.  We're only
      interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
   return call;
}

/* Build IR to calculate some particular condition from stored
   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_I1. */
static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
{
   IRExpr** args
      = mkIRExprVec_5( mkU64(cond),
                       IRExpr_Get(OFFB_CC_OP,   Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
   IRExpr* call
      = mkIRExprCCall(
           Ity_I64,
           0/*regparm*/,
           "amd64g_calculate_condition", &amd64g_calculate_condition,
           args
        );
   /* Exclude the requested condition, OP and NDEP from definedness
      checking.  We're only interested in DEP1 and DEP2. */
   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
   return unop(Iop_64to1, call);
}
1647
1648/* Build IR to calculate just the carry flag from stored
1649   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
1650static IRExpr* mk_amd64g_calculate_rflags_c ( void )
1651{
1652   IRExpr** args
1653      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1654                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1655                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1656                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1657   IRExpr* call
1658      = mkIRExprCCall(
1659           Ity_I64,
1660           0/*regparm*/,
1661           "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
1662           args
1663        );
1664   /* Exclude OP and NDEP from definedness checking.  We're only
1665      interested in DEP1 and DEP2. */
1666   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1667   return call;
1668}
1669
1670
1671/* -------------- Building the flags-thunk. -------------- */
1672
1673/* The machinery in this section builds the flag-thunk following a
1674   flag-setting operation.  Hence the various setFlags_* functions.
1675*/
1676
1677static Bool isAddSub ( IROp op8 )
1678{
1679   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
1680}
1681
1682static Bool isLogic ( IROp op8 )
1683{
1684   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
1685}
1686
1687/* U-widen 8/16/32/64 bit int expr to 64. */
1688static IRExpr* widenUto64 ( IRExpr* e )
1689{
1690   switch (typeOfIRExpr(irsb->tyenv,e)) {
1691      case Ity_I64: return e;
1692      case Ity_I32: return unop(Iop_32Uto64, e);
1693      case Ity_I16: return unop(Iop_16Uto64, e);
1694      case Ity_I8:  return unop(Iop_8Uto64, e);
1695      default: vpanic("widenUto64");
1696   }
1697}
1698
/* S-widen 8/16/32/64 bit int expr to 64. */
static IRExpr* widenSto64 ( IRExpr* e )
{
   switch (typeOfIRExpr(irsb->tyenv,e)) {
      case Ity_I64: return e;
      case Ity_I32: return unop(Iop_32Sto64, e);
      case Ity_I16: return unop(Iop_16Sto64, e);
      case Ity_I8:  return unop(Iop_8Sto64, e);
      default: vpanic("widenSto64");
   }
}

/* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
   of these combinations make sense. */
static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
{
   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
   if (src_ty == dst_ty)
      return e;
   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
      return unop(Iop_32to16, e);
   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
      return unop(Iop_32to8, e);
   if (src_ty == Ity_I64 && dst_ty == Ity_I32)
      return unop(Iop_64to32, e);
   if (src_ty == Ity_I64 && dst_ty == Ity_I16)
      return unop(Iop_64to16, e);
   if (src_ty == Ity_I64 && dst_ty == Ity_I8)
      return unop(Iop_64to8, e);

   vex_printf("\nsrc, dst tys are: ");
   ppIRType(src_ty);
   vex_printf(", ");
   ppIRType(dst_ty);
   vex_printf("\n");
   vpanic("narrowTo(amd64)");
}
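
/* Illustrative only (not part of the original file): these helpers
   compose in the obvious way.  For example, to zero-extend an Ity_I8
   expression to 64 bits and then truncate it back:

      IRExpr* w = widenUto64(e8);        // Iop_8Uto64(e8) :: Ity_I64
      IRExpr* n = narrowTo(Ity_I8, w);   // Iop_64to8(w)   :: Ity_I8

   widenSto64 differs only in using the sign-extending Iop_8Sto64 /
   Iop_16Sto64 / Iop_32Sto64 variants. */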


/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
   auto-sized up to the real op. */

static
void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
{
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }
   switch (op8) {
      case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
      case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
      default:       ppIROp(op8);
                     vpanic("setFlags_DEP1_DEP2(amd64)");
   }
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
}
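
/* Illustrative only (not part of the original file): the
   "auto-sizing" relies on the thunk opcodes being laid out in
   B/W/L/Q order.  For a 32-bit add, say, the call

      setFlags_DEP1_DEP2( Iop_Add8, dep1, dep2, Ity_I32 );

   computes ccOp = AMD64G_CC_OP_ADDB + 2, i.e. AMD64G_CC_OP_ADDL, and
   stores the two (zero-widened) operands for later lazy
   evaluation. */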


/* Set the OP and DEP1 fields only, and write zero to DEP2. */

static
void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
{
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }
   switch (op8) {
      case Iop_Or8:
      case Iop_And8:
      case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
      default:       ppIROp(op8);
                     vpanic("setFlags_DEP1(amd64)");
   }
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
}

/* For shift operations, we store in the thunk both the result and the
   undershifted result (the value shifted by one bit less), since the
   flags helper needs the latter to compute the carry and overflow
   flags.  If the shift amount is zero, however, the thunk is left
   unchanged. */

static void setFlags_DEP1_DEP2_shift ( IROp    op64,
                                       IRTemp  res,
                                       IRTemp  resUS,
                                       IRType  ty,
                                       IRTemp  guard )
{
   Int ccOp = 0;
   switch (ty) {
      case Ity_I8:  ccOp = 0; break;
      case Ity_I16: ccOp = 1; break;
      case Ity_I32: ccOp = 2; break;
      case Ity_I64: ccOp = 3; break;
      default: vassert(0);
   }

   vassert(guard);

   /* Both kinds of right shifts are handled by the same thunk
      operation. */
   switch (op64) {
      case Iop_Shr64:
      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
      default:        ppIROp(op64);
                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
   }

   /* DEP1 contains the result, DEP2 contains the undershifted value. */
   stmt( IRStmt_Put( OFFB_CC_OP,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                   mkU64(ccOp))) );
   stmt( IRStmt_Put( OFFB_CC_DEP1,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                   widenUto64(mkexpr(res)))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                   widenUto64(mkexpr(resUS)))) );
}
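
/* Illustrative only (not part of the original file): Mux0X selects
   its second argument when the condition is zero and its third
   otherwise, so each Put above is effectively

      CC_OP = guard ? ccOp : CC_OP;   // likewise for DEP1/DEP2

   which is how a zero shift amount leaves the whole thunk intact. */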


/* For the inc/dec case, we store in DEP1 the result value and in NDEP
   the former value of the carry flag, which unfortunately we have to
   compute. */

static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
{
   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;

   switch (ty) {
      case Ity_I8:  ccOp += 0; break;
      case Ity_I16: ccOp += 1; break;
      case Ity_I32: ccOp += 2; break;
      case Ity_I64: ccOp += 3; break;
      default: vassert(0);
   }

   /* This has to come first, because calculating the C flag
      may require reading all four thunk fields. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
}


/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
   two arguments. */

static
void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
{
   switch (ty) {
      case Ity_I8:
         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
         break;
      case Ity_I16:
         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
         break;
      case Ity_I32:
         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
         break;
      case Ity_I64:
         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
         break;
      default:
         vpanic("setFlags_MUL(amd64)");
   }
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
}


/* -------------- Condition codes. -------------- */

/* Condition codes, using the AMD encoding.  */

static HChar* name_AMD64Condcode ( AMD64Condcode cond )
{
   switch (cond) {
      case AMD64CondO:      return "o";
      case AMD64CondNO:     return "no";
      case AMD64CondB:      return "b";
      case AMD64CondNB:     return "ae"; /*"nb";*/
      case AMD64CondZ:      return "e"; /*"z";*/
      case AMD64CondNZ:     return "ne"; /*"nz";*/
      case AMD64CondBE:     return "be";
      case AMD64CondNBE:    return "a"; /*"nbe";*/
      case AMD64CondS:      return "s";
      case AMD64CondNS:     return "ns";
      case AMD64CondP:      return "p";
      case AMD64CondNP:     return "np";
      case AMD64CondL:      return "l";
      case AMD64CondNL:     return "ge"; /*"nl";*/
      case AMD64CondLE:     return "le";
      case AMD64CondNLE:    return "g"; /*"nle";*/
      case AMD64CondAlways: return "ALWAYS";
      default: vpanic("name_AMD64Condcode");
   }
}

static
AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
                                          /*OUT*/Bool*   needInvert )
{
   vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
   if (cond & 1) {
      *needInvert = True;
      return cond-1;
   } else {
      *needInvert = False;
      return cond;
   }
}
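
/* Illustrative only (not part of the original file): in the AMD
   encoding, conditions come in even/odd pairs where the odd member is
   the negation of the even one.  So, for example,

      Bool inv;
      AMD64Condcode c = positiveIse_AMD64Condcode(AMD64CondNZ, &inv);
      // c == AMD64CondZ, inv == True

   lets later code test only the positive form and invert the branch
   sense as needed. */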


/* -------------- Helpers for ADD/SUB with carry. -------------- */

/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
   appropriately.

   Optionally, generate a store for the 'tres' value.  This can either
   be a normal store, or it can be a cas-with-possible-failure style
   store:

   if taddr is IRTemp_INVALID, then no store is generated.

   if taddr is not IRTemp_INVALID, then a store (using taddr as
   the address) is generated:

     if texpVal is IRTemp_INVALID then a normal store is
     generated, and restart_point must be zero (it is irrelevant).

     if texpVal is not IRTemp_INVALID then a cas-style store is
     generated.  texpVal is the expected value, restart_point
     is the restart point if the store fails, and texpVal must
     have the same type as tres.

*/
static void helper_ADC ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);
   IRTemp  oldcn = newTemp(ty);
   IROp    plus  = mkSizedOp(ty, Iop_Add8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc,  binop(Iop_And64,
                        mk_amd64g_calculate_rflags_c(),
                        mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   assign( tres, binop(plus,
                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}
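
/* Illustrative only (not part of the original file): note the DEP2
   encoding above.  The thunk stores ta2 ^ oldC rather than ta2
   itself, with the old carry kept separately in NDEP, so the flags
   helper can recover the real second operand later:

      ta2 == (ta2 ^ oldC) ^ oldC   // since x ^ c ^ c == x

   while definedness checking of DEP2 still covers both ta2 and the
   carry-in. */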


/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
   appropriately.  As with helper_ADC, possibly generate a store of
   the result -- see comments on helper_ADC for details.
*/
static void helper_SBB ( Int sz,
                         IRTemp tres, IRTemp ta1, IRTemp ta2,
                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
{
   UInt    thunkOp;
   IRType  ty    = szToITy(sz);
   IRTemp  oldc  = newTemp(Ity_I64);
   IRTemp  oldcn = newTemp(ty);
   IROp    minus = mkSizedOp(ty, Iop_Sub8);
   IROp    xor   = mkSizedOp(ty, Iop_Xor8);

   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);

   switch (sz) {
      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
      default: vassert(0);
   }

   /* oldc = old carry flag, 0 or 1 */
   assign( oldc, binop(Iop_And64,
                       mk_amd64g_calculate_rflags_c(),
                       mkU64(1)) );

   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );

   assign( tres, binop(minus,
                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
                       mkexpr(oldcn)) );

   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
      start of this function. */
   if (taddr != IRTemp_INVALID) {
      if (texpVal == IRTemp_INVALID) {
         vassert(restart_point == 0);
         storeLE( mkexpr(taddr), mkexpr(tres) );
      } else {
         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
         /* .. and hence 'texpVal' has the same type as 'tres'. */
         casLE( mkexpr(taddr),
                mkexpr(texpVal), mkexpr(tres), restart_point );
      }
   }

   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
                                                         mkexpr(oldcn)) )) );
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
}


/* -------------- Helpers for disassembly printing. -------------- */

static HChar* nameGrp1 ( Int opc_aux )
{
   static HChar* grp1_names[8]
     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
   return grp1_names[opc_aux];
}

static HChar* nameGrp2 ( Int opc_aux )
{
   static HChar* grp2_names[8]
     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
   return grp2_names[opc_aux];
}

static HChar* nameGrp4 ( Int opc_aux )
{
   static HChar* grp4_names[8]
     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
   return grp4_names[opc_aux];
}

static HChar* nameGrp5 ( Int opc_aux )
{
   static HChar* grp5_names[8]
     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
   return grp5_names[opc_aux];
}

static HChar* nameGrp8 ( Int opc_aux )
{
   static HChar* grp8_names[8]
      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
   return grp8_names[opc_aux];
}

//.. static HChar* nameSReg ( UInt sreg )
//.. {
//..    switch (sreg) {
//..       case R_ES: return "%es";
//..       case R_CS: return "%cs";
//..       case R_SS: return "%ss";
//..       case R_DS: return "%ds";
//..       case R_FS: return "%fs";
//..       case R_GS: return "%gs";
//..       default: vpanic("nameSReg(x86)");
//..    }
//.. }

static HChar* nameMMXReg ( Int mmxreg )
{
   static HChar* mmx_names[8]
     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
   return mmx_names[mmxreg];
}

static HChar* nameXMMReg ( Int xmmreg )
{
   static HChar* xmm_names[16]
     = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
         "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
         "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
         "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
   if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
   return xmm_names[xmmreg];
}

static HChar* nameMMXGran ( Int gran )
{
   switch (gran) {
      case 0: return "b";
      case 1: return "w";
      case 2: return "d";
      case 3: return "q";
      default: vpanic("nameMMXGran(amd64,guest)");
   }
}

static HChar nameISize ( Int size )
{
   switch (size) {
      case 8: return 'q';
      case 4: return 'l';
      case 2: return 'w';
      case 1: return 'b';
      default: vpanic("nameISize(amd64)");
   }
}

static HChar* nameYMMReg ( Int ymmreg )
{
   static HChar* ymm_names[16]
     = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
         "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
         "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
         "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
   if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
   return ymm_names[ymmreg];
}


/*------------------------------------------------------------*/
/*--- JMP helpers                                          ---*/
/*------------------------------------------------------------*/

static void jmp_lit( /*MOD*/DisResult* dres,
                     IRJumpKind kind, Addr64 d64 )
{
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = kind;
   stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
}

static void jmp_treg( /*MOD*/DisResult* dres,
                      IRJumpKind kind, IRTemp t )
{
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = kind;
   stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
}

static
void jcc_01 ( /*MOD*/DisResult* dres,
              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
{
   Bool          invert;
   AMD64Condcode condPos;
   vassert(dres->whatNext    == Dis_Continue);
   vassert(dres->len         == 0);
   vassert(dres->continueAt  == 0);
   vassert(dres->jk_StopHere == Ijk_INVALID);
   dres->whatNext    = Dis_StopHere;
   dres->jk_StopHere = Ijk_Boring;
   condPos = positiveIse_AMD64Condcode ( cond, &invert );
   if (invert) {
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_false),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
   } else {
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
                         Ijk_Boring,
                         IRConst_U64(d64_true),
                         OFFB_RIP ) );
      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
   }
}
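
/* Illustrative only (not part of the original file): for, say,
   "jnz target", jcc_01 is called with cond = AMD64CondNZ.  That is an
   odd (negated) code, so it is positivised to AMD64CondZ with
   invert == True, and the generated IR reads

      if (Z) goto fallthrough;   // IRStmt_Exit
      goto target;               // Put(RIP) + StopHere

   i.e. the exit tests the positive condition and the two destinations
   swap roles. */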

/* Let new_rsp be the %rsp value after a call/return.  Let nia be the
   guest address of the next instruction to be executed.

   This function generates an AbiHint to say that -128(%rsp)
   .. -1(%rsp) should now be regarded as uninitialised.
*/
static
void make_redzone_AbiHint ( VexAbiInfo* vbi,
                            IRTemp new_rsp, IRTemp nia, HChar* who )
{
   Int szB = vbi->guest_stack_redzone_size;
   vassert(szB >= 0);

   /* A bit of a kludge.  Currently the only ABI we've guested AMD64
      for is ELF.  So just check it's the expected 128 value
      (paranoia). */
   vassert(szB == 128);

   if (0) vex_printf("AbiHint: %s\n", who);
   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
   if (szB > 0)
      stmt( IRStmt_AbiHint(
               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
               szB,
               mkexpr(nia)
            ));
}


/*------------------------------------------------------------*/
/*--- Disassembling addressing modes                       ---*/
/*------------------------------------------------------------*/

static
HChar* segRegTxt ( Prefix pfx )
{
   if (pfx & PFX_CS) return "%cs:";
   if (pfx & PFX_DS) return "%ds:";
   if (pfx & PFX_ES) return "%es:";
   if (pfx & PFX_FS) return "%fs:";
   if (pfx & PFX_GS) return "%gs:";
   if (pfx & PFX_SS) return "%ss:";
   return ""; /* no override */
}

/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
   linear address by adding any required segment override as indicated
   by pfx, and also dealing with any address size override
   present. */
static
IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
                              Prefix pfx, IRExpr* virtual )
{
   /* --- segment overrides --- */
   if (pfx & PFX_FS) {
      if (vbi->guest_amd64_assume_fs_is_zero) {
         /* Note that this is a linux-kernel specific hack that relies
            on the assumption that %fs is always zero. */
         /* return virtual + guest_FS_ZERO. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
      } else {
         unimplemented("amd64 %fs segment override");
      }
   }

   if (pfx & PFX_GS) {
      if (vbi->guest_amd64_assume_gs_is_0x60) {
         /* Note that this is a darwin-kernel specific hack that relies
            on the assumption that %gs is always 0x60. */
         /* return virtual + guest_GS_0x60. */
         virtual = binop(Iop_Add64, virtual,
                                    IRExpr_Get(OFFB_GS_0x60, Ity_I64));
      } else {
         unimplemented("amd64 %gs segment override");
      }
   }

   /* cs, ds, es and ss are simply ignored in 64-bit mode. */

   /* --- address size override --- */
   if (haveASO(pfx))
      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));

   return virtual;
}
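
/* Illustrative only (not part of the original file): under the
   assume-fs-is-zero scheme, a guest access such as

      mov %fs:0x28(%rdi), %rax

   is translated with an effective address of

      Get(OFFB_FS_ZERO) + (%rdi + 0x28)

   where guest_FS_ZERO caches the segment base established for the
   guest, rather than consulting a descriptor table at runtime. */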

//.. {
//..    Int    sreg;
//..    IRType hWordTy;
//..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
//..
//..    if (sorb == 0)
//..       /* the common case - no override */
//..       return virtual;
//..
//..    switch (sorb) {
//..       case 0x3E: sreg = R_DS; break;
//..       case 0x26: sreg = R_ES; break;
//..       case 0x64: sreg = R_FS; break;
//..       case 0x65: sreg = R_GS; break;
//..       default: vpanic("handleAddrOverrides(x86,guest)");
//..    }
//..
//..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
//..
//..    seg_selector = newTemp(Ity_I32);
//..    ldt_ptr      = newTemp(hWordTy);
//..    gdt_ptr      = newTemp(hWordTy);
//..    r64          = newTemp(Ity_I64);
//..
//..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
//..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
//..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
//..
//..    /*
//..    Call this to do the translation and limit checks:
//..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
//..                                  UInt seg_selector, UInt virtual_addr )
//..    */
//..    assign(
//..       r64,
//..       mkIRExprCCall(
//..          Ity_I64,
//..          0/*regparms*/,
//..          "x86g_use_seg_selector",
//..          &x86g_use_seg_selector,
//..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
//..                         mkexpr(seg_selector), virtual)
//..       )
//..    );
//..
//..    /* If the high 32 of the result are non-zero, there was a
//..       failure in address translation.  In which case, make a
//..       quick exit.
//..    */
//..    stmt(
//..       IRStmt_Exit(
//..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
//..          Ijk_MapFail,
//..          IRConst_U32( guest_eip_curr_instr )
//..       )
//..    );
//..
//..    /* otherwise, here's the translated result. */
//..    return unop(Iop_64to32, mkexpr(r64));
//.. }


/* Generate IR to calculate an address indicated by a ModRM and
   following SIB bytes.  The expression, and the number of bytes in
   the address mode, are returned (the latter in *len).  Note that
   this fn should not be called if the R/M part of the address denotes
   a register instead of memory.  Text of the addressing mode is
   placed in buf.

   The computed address is stored in a new tempreg, and the
   identity of the tempreg is returned.

   extra_bytes holds the number of bytes after the amode, as supplied
   by the caller.  This is needed to make sense of %rip-relative
   addresses.  Note that the value that *len is set to is only the
   length of the amode itself and does not include the value supplied
   in extra_bytes.
 */

static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
{
   IRTemp tmp = newTemp(Ity_I64);
   assign( tmp, addr64 );
   return tmp;
}

static
IRTemp disAMode ( /*OUT*/Int* len,
                  VexAbiInfo* vbi, Prefix pfx, Long delta,
                  /*OUT*/HChar* buf, Int extra_bytes )
{
   UChar mod_reg_rm = getUChar(delta);
   delta++;

   buf[0] = (UChar)0;
   vassert(extra_bytes >= 0 && extra_bytes < 10);

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         { UChar rm = toUChar(mod_reg_rm & 7);
           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           *len = 1;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
         }

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long d   = getSDisp8(delta);
           if (d == 0) {
              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
           } else {
              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           }
           *len = 2;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         { UChar rm = toUChar(mod_reg_rm & 7);
           Long  d  = getSDisp32(delta);
           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
           *len = 5;
           return disAMode_copy2tmp(
                  handleAddrOverrides(vbi, pfx,
                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
         }

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         vpanic("disAMode(amd64): not an addr!");

      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
         correctly at the start of handling each instruction. */
      case 0x05:
         { Long d = getSDisp32(delta);
           *len = 5;
           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
           /* We need to know the next instruction's start address.
              Try and figure out what it is, record the guess, and ask
              the top-level driver logic (bbToIR_AMD64) to check we
              guessed right, after the instruction is completely
              decoded. */
           guest_RIP_next_mustcheck = True;
           guest_RIP_next_assumed = guest_RIP_bbstart
                                    + delta+4 + extra_bytes;
           return disAMode_copy2tmp(
                     handleAddrOverrides(vbi, pfx,
                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
                                         mkU64(d))));
         }

      case 0x04: {
         /* SIB, with no displacement.  Special cases:
            -- %rsp cannot act as an index value.
               If index_r indicates %rsp, zero is used for the index.
            -- when mod is zero and base indicates RBP or R13, base is
               instead a 32-bit sign-extended literal.
            It's all madness, I tell you.  Extract %index, %base and
            scale from the SIB byte.  The value denoted is then:
               | %index == %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte
               | %index == %RSP && !(%base == %RBP || %base == %R13)
               = %base
               | %index != %RSP && (%base == %RBP || %base == %R13)
               = d32 following SIB byte + (%index << scale)
               | %index != %RSP && !(%base == %RBP || %base == %R13)
               = %base + (%index << scale)
         */
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
         delta++;

         if ((!index_is_SP) && (!base_is_BPor13)) {
            if (scale == 0) {
               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 2;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        getIRegRexB(8,pfx,base_r),
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                              mkU8(scale)))));
         }

         if ((!index_is_SP) && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
                      nameIReg64rexX(pfx,index_r), 1<<scale);
            *len = 6;
            return
               disAMode_copy2tmp(
               handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
                                         mkU8(scale)),
                        mkU64(d))));
         }

         if (index_is_SP && (!base_is_BPor13)) {
            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
            *len = 2;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
         }

         if (index_is_SP && base_is_BPor13) {
            Long d = getSDisp32(delta);
            DIS(buf, "%s%lld", segRegTxt(pfx), d);
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx, mkU64(d)));
         }

         vassert(0);
      }

      /* SIB, with 8-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d8 + %base
            | %index != %RSP
            = d8 + %base + (%index << scale)
      */
      case 0x0C: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp8(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 3;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 3;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      /* SIB, with 32-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d32 + %base
            | %index != %RSP
            = d32 + %base + (%index << scale)
      */
      case 0x14: {
         UChar sib     = getUChar(delta);
         UChar scale   = toUChar((sib >> 6) & 3);
         UChar index_r = toUChar((sib >> 3) & 7);
         UChar base_r  = toUChar(sib & 7);
         Long d        = getSDisp32(delta+1);

         if (index_r == R_RSP && 0==getRexX(pfx)) {
            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
                                   d, nameIRegRexB(8,pfx,base_r));
            *len = 6;
            return disAMode_copy2tmp(
                   handleAddrOverrides(vbi, pfx,
                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
         } else {
            if (scale == 0) {
               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r));
            } else {
               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
                         nameIRegRexB(8,pfx,base_r),
                         nameIReg64rexX(pfx,index_r), 1<<scale);
            }
            *len = 6;
            return
                disAMode_copy2tmp(
                handleAddrOverrides(vbi, pfx,
                  binop(Iop_Add64,
                        binop(Iop_Add64,
                              getIRegRexB(8,pfx,base_r),
                              binop(Iop_Shl64,
                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
                        mkU64(d))));
         }
         vassert(0); /*NOTREACHED*/
      }

      default:
         vpanic("disAMode(amd64)");
         return 0; /*notreached*/
   }
}
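
/* Illustrative only (not part of the original file): a worked decode.
   For the bytes "modRM = 0x44, SIB = 0x9C, disp8 = 0x10" (mod=01,
   rm=100, then scale=2, index=011, base=100, no REX):

      index_r = 3 (%rbx), base_r = 4 (%rsp), d = 0x10

   so the amode is 16(%rsp,%rbx,4), the generated expression is
   (%rsp + (%rbx << 2)) + 16, and *len = 3 (modRM + SIB + disp8). */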


/* Figure out the number of (insn-stream) bytes constituting the amode
   beginning at delta.  Is useful for getting hold of literals beyond
   the end of the amode before it has been disassembled.  */

static UInt lengthAMode ( Prefix pfx, Long delta )
{
   UChar mod_reg_rm = getUChar(delta);
   delta++;

   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
      jump table seems a bit excessive.
   */
   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
                                               /* is now XX0XXYYY */
   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
   switch (mod_reg_rm) {

      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
      */
      case 0x00: case 0x01: case 0x02: case 0x03:
      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
         return 1;

      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
      */
      case 0x08: case 0x09: case 0x0A: case 0x0B:
      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
         return 2;

      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
      */
      case 0x10: case 0x11: case 0x12: case 0x13:
      /* ! 14 */ case 0x15: case 0x16: case 0x17:
         return 5;

      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
      /* Not an address, but still handled. */
      case 0x18: case 0x19: case 0x1A: case 0x1B:
      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
         return 1;

      /* RIP + disp32. */
      case 0x05:
         return 5;

      case 0x04: {
         /* SIB, with no displacement. */
         UChar sib     = getUChar(delta);
         UChar base_r  = toUChar(sib & 7);
         /* correct since #(R13) == 8 + #(RBP) */
         Bool  base_is_BPor13 = toBool(base_r == R_RBP);

         if (base_is_BPor13) {
            return 6;
         } else {
            return 2;
         }
      }

      /* SIB, with 8-bit displacement. */
      case 0x0C:
         return 3;

      /* SIB, with 32-bit displacement. */
      case 0x14:
         return 6;

      default:
         vpanic("lengthAMode(amd64)");
         return 0; /*notreached*/
   }
}


/*------------------------------------------------------------*/
/*--- Disassembling common idioms                          ---*/
/*------------------------------------------------------------*/

/* Handle binary integer instructions of the form
      op E, G  meaning
      op reg-or-mem, reg
   Is passed a pointer to the modRM byte, the actual operation, and the
   data size.  Returns the address advanced completely over this
   instruction.

   E(src) is reg-or-mem
   G(dst) is reg.

   If E is reg, -->    GET %G,  tmp
                       OP %E,   tmp
                       PUT tmp, %G

   If E is mem and OP is not reversible,
                -->    (getAddr E) -> tmpa
                       LD (tmpa), tmpa
                       GET %G, tmp2
                       OP tmpa, tmp2
                       PUT tmp2, %G

   If E is mem and OP is reversible
                -->    (getAddr E) -> tmpa
                       LD (tmpa), tmpa
                       OP %G, tmpa
                       PUT tmpa, %G
*/
static
ULong dis_op2_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         if (False && op8 == Iop_Sub8)
            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
         putIRegG(size,pfx,rm, mkU(ty,0));
      }

      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  getIRegE(size,pfx,rm) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegE(size,pfx,rm),
                          nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      /* E refers to memory */
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign( dst0, getIRegG(size,pfx,rm) );
      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegG(size, pfx, rm, mkexpr(dst1));
      } else {
         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegG(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          dis_buf, nameIRegG(size, pfx, rm));
      return len+delta0;
   }
}
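
/* Illustrative only (not part of the original file): in the E/G
   naming scheme, G is the reg field of the modRM byte and E is the
   r/m field (a register or a memory amode).  For the bytes "03 D8",
   opcode 0x03 is "add E,G" at the default 32-bit size, and modRM 0xD8
   decodes as mod=11 (register), reg=011 (G = %ebx), rm=000
   (E = %eax), so this routine emits IR for

      add %eax,%ebx   // dst1 = G + E; flags via setFlags_DEP1_DEP2

   and returns delta0 advanced over the single modRM byte. */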



/* Handle binary integer instructions of the form
      op G, E  meaning
      op reg, reg-or-mem
   Is passed a pointer to the modRM byte, the actual operation, and the
   data size.  Returns the address advanced completely over this
   instruction.

   G(src) is reg.
   E(dst) is reg-or-mem

   If E is reg, -->    GET %E,  tmp
                       OP %G,   tmp
                       PUT tmp, %E

   If E is mem, -->    (getAddr E) -> tmpa
                       LD (tmpa), tmpv
                       OP %G, tmpv
                       ST tmpv, (tmpa)
*/
static
ULong dis_op2_G_E ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Bool        addSubCarry,
                    IROp        op8,
                    Bool        keep,
                    Int         size,
                    Long        delta0,
                    HChar*      t_amd64opc )
{
   HChar   dis_buf[50];
   Int     len;
   IRType  ty   = szToITy(size);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   UChar   rm   = getUChar(delta0);
   IRTemp  addr = IRTemp_INVALID;

   /* addSubCarry == True indicates the intended operation is
      add-with-carry or subtract-with-borrow. */
   if (addSubCarry) {
      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
      vassert(keep);
   }

   if (epartIsReg(rm)) {
      /* Specially handle XOR reg,reg, because that doesn't really
         depend on reg, and doing the obvious thing potentially
         generates a spurious value check failure due to the bogus
         dependency.  Ditto SBB reg,reg. */
      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
         putIRegE(size,pfx,rm, mkU(ty,0));
      }

      assign(dst0, getIRegE(size,pfx,rm));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         helper_ADC( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         helper_SBB( size, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
         putIRegE(size, pfx, rm, mkexpr(dst1));
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
         if (keep)
            putIRegE(size, pfx, rm, mkexpr(dst1));
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm),
                          nameIRegE(size,pfx,rm));
      return 1+delta0;
   }

   /* E refers to memory */
   {
      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src,  getIRegG(size,pfx,rm));

      if (addSubCarry && op8 == Iop_Add8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (addSubCarry && op8 == Iop_Sub8) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( size, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (keep) {
            if (pfx & PFX_LOCK) {
               if (0) vex_printf("locked case\n" );
               casLE( mkexpr(addr),
                      mkexpr(dst0)/*expval*/,
                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
            } else {
               if (0) vex_printf("nonlocked case\n");
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
                          nameIRegG(size,pfx,rm), dis_buf);
      return len+delta0;
   }
}


/* Handle move instructions of the form
      mov E, G  meaning
      mov reg-or-mem, reg
   Is passed a pointer to the modRM byte, and the data size.  Returns
   the address advanced completely over this instruction.

   E(src) is reg-or-mem
   G(dst) is reg.

   If E is reg, -->    GET %E,  tmpv
                       PUT tmpv, %G

   If E is mem  -->    (getAddr E) -> tmpa
                       LD (tmpa), tmpb
                       PUT tmpb, %G
*/
static
ULong dis_mov_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Int         size,
                    Long        delta0 )
{
   Int len;
   UChar rm = getUChar(delta0);
   HChar dis_buf[50];

   if (epartIsReg(rm)) {
      putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
      DIP("mov%c %s,%s\n", nameISize(size),
                           nameIRegE(size,pfx,rm),
                           nameIRegG(size,pfx,rm));
      return 1+delta0;
   }

   /* E refers to memory */
   {
      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
      putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
      DIP("mov%c %s,%s\n", nameISize(size),
                           dis_buf,
                           nameIRegG(size,pfx,rm));
      return delta0+len;
   }
}


/* Handle move instructions of the form
      mov G, E  meaning
      mov reg, reg-or-mem
   Is passed a pointer to the modRM byte, and the data size.  Returns
3070   the address advanced completely over this instruction.
3071
3072   G(src) is reg.
3073   E(dst) is reg-or-mem
3074
3075   If E is reg, -->    GET %G,  tmp
3076                       PUT tmp, %E
3077
3078   If E is mem, -->    (getAddr E) -> tmpa
3079                       GET %G, tmpv
3080                       ST tmpv, (tmpa)
3081*/
3082static
3083ULong dis_mov_G_E ( VexAbiInfo* vbi,
3084                    Prefix      pfx,
3085                    Int         size,
3086                    Long        delta0 )
3087{
3088   Int len;
3089   UChar rm = getUChar(delta0);
3090   HChar dis_buf[50];
3091
3092   if (epartIsReg(rm)) {
3093      putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
3094      DIP("mov%c %s,%s\n", nameISize(size),
3095                           nameIRegG(size,pfx,rm),
3096                           nameIRegE(size,pfx,rm));
3097      return 1+delta0;
3098   }
3099
3100   /* E refers to memory */
3101   {
3102      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3103      storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
3104      DIP("mov%c %s,%s\n", nameISize(size),
3105                           nameIRegG(size,pfx,rm),
3106                           dis_buf);
3107      return len+delta0;
3108   }
3109}


/* op $immediate, AL/AX/EAX/RAX. */
static
ULong dis_op_imm_A ( Int    size,
                     Bool   carrying,
                     IROp   op8,
                     Bool   keep,
                     Long   delta,
                     HChar* t_amd64opc )
{
   Int    size4 = imin(size,4);
   IRType ty    = szToITy(size);
   IRTemp dst0  = newTemp(ty);
   IRTemp src   = newTemp(ty);
   IRTemp dst1  = newTemp(ty);
   Long  lit    = getSDisp(size4,delta);
   assign(dst0, getIRegRAX(size));
   assign(src,  mkU(ty,lit & mkSizeMask(size)));

   if (isAddSub(op8) && !carrying) {
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1_DEP2(op8, dst0, src, ty);
   }
   else
   if (isLogic(op8)) {
      vassert(!carrying);
      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
      setFlags_DEP1(op8, dst1, ty);
   }
   else
   if (op8 == Iop_Add8 && carrying) {
      helper_ADC( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
   if (op8 == Iop_Sub8 && carrying) {
      helper_SBB( size, dst1, dst0, src,
                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
   }
   else
      vpanic("dis_op_imm_A(amd64,guest)");

   if (keep)
      putIRegRAX(size, mkexpr(dst1));

   DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
                           lit, nameIRegRAX(size));
   return delta+size4;
}
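
/* Worked example (illustrative): "05 D2 04 00 00" (addl $0x4D2, %eax)
   arrives here with size=4, op8=Iop_Add8, carrying=False, keep=True:
   lit = 0x4D2, dst1 = Add32(GET %eax, 0x4D2), an add-style flags
   thunk is built and dst1 is written back to %eax.  The CMP variant
   passes keep=False, so only the flags thunk survives. */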


/* Sign- and Zero-extending moves. */
static
ULong dis_movx_E_G ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, Int szs, Int szd, Bool sign_extend )
{
   UChar rm = getUChar(delta);
   if (epartIsReg(rm)) {
      putIRegG(szd, pfx, rm,
                    doScalarWidening(
                       szs,szd,sign_extend,
                       getIRegE(szs,pfx,rm)));
      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
                               nameISize(szs),
                               nameISize(szd),
                               nameIRegE(szs,pfx,rm),
                               nameIRegG(szd,pfx,rm));
      return 1+delta;
   }

   /* E refers to memory */
   {
      Int    len;
      HChar  dis_buf[50];
      IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      putIRegG(szd, pfx, rm,
                    doScalarWidening(
                       szs,szd,sign_extend,
                       loadLE(szToITy(szs),mkexpr(addr))));
      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
                               nameISize(szs),
                               nameISize(szd),
                               dis_buf,
                               nameIRegG(szd,pfx,rm));
      return len+delta;
   }
}


/* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
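
/* Example (illustrative): "48 F7 F3" (divq %rbx) takes the sz==8
   special case: DivModU128to64(RDX:RAX, RBX) yields the quotient in
   the low 64 bits (-> RAX) and the remainder in the high 64 bits
   (-> RDX).  For sz==1 the dividend is AX and the quotient/remainder
   end up in AL/AH. */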

static
ULong dis_Grp1 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (pfx & PFX_LOCK) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (gregLO3ofRM(modrm) < 7) {
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
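
/* Example (illustrative): "83 E8 05" (subl $5, %eax) arrives with
   modrm=0xE8, so gregLO3ofRM==5 selects Iop_Sub8; d64==5 is masked
   to the operand size, the subtract and its flags thunk are
   generated, and, since 5 < 7, the result is written back.  For CMP
   (/7) the write-back is skipped, leaving only the flags update. */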


/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   expression. */

static
ULong dis_Grp2 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);
   IRTemp dst1  = newTemp(ty);
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 31, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_OP,Ity_I64),
                                      mkU64(ccOp))) );
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I64),
                                      widenUto64(mkexpr(dst1)))) );
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I64),
                                      mkU64(0))) );
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_Mux0X( mkexpr(rot_amt64),
                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I64),
                                      mkexpr(oldFlags))) );
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
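
/* Example (illustrative): for "shlq $65, %rax" the shift amount is
   masked to 65 & 63 == 1 before use, matching hardware behaviour.
   For a 32-bit rotate such as "roll $33, %eax", rot_amt64 = 33 & 31
   == 1, so the Shl/Shr pair above never shifts by a full word size;
   the Mux0X thunk also leaves the flags untouched when the masked
   rotate amount is zero. */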


/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
static
ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);
   IRTemp t2m    = newTemp(Ity_I64);
   IRTemp t_addr = IRTemp_INVALID;
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs however do not indicate any use for 0 .. 3, so
            we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
      default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
         putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (pfx & PFX_LOCK) {
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
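
/* Example (illustrative): "48 0F BA E8 46" (btsq $70, %rax): the bit
   offset 70 is masked to 70 & 63 == 6, mask becomes 1 << 6, the new
   value t2 | mask is written back to %rax, and CF is set to bit 6 of
   the original value. */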


/* Signed/unsigned widening multiply.  Generate IR to multiply the
   value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   RDX:RAX/EDX:EAX/DX:AX/AX.
*/
static void codegen_mulL_A_D ( Int sz, Bool syned,
                               IRTemp tmp, HChar* tmp_txt )
{
   IRType ty = szToITy(sz);
   IRTemp t1 = newTemp(ty);

   assign( t1, getIRegRAX(sz) );

   switch (ty) {
      case Ity_I64: {
         IRTemp res128  = newTemp(Ity_I128);
         IRTemp resHi   = newTemp(Ity_I64);
         IRTemp resLo   = newTemp(Ity_I64);
         IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
         assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
         assign( resLo, unop(Iop_128to64,mkexpr(res128)));
         putIReg64(R_RDX, mkexpr(resHi));
         putIReg64(R_RAX, mkexpr(resLo));
         break;
      }
      case Ity_I32: {
         IRTemp res64   = newTemp(Ity_I64);
         IRTemp resHi   = newTemp(Ity_I32);
         IRTemp resLo   = newTemp(Ity_I32);
         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
         putIRegRDX(4, mkexpr(resHi));
         putIRegRAX(4, mkexpr(resLo));
         break;
      }
      case Ity_I16: {
         IRTemp res32   = newTemp(Ity_I32);
         IRTemp resHi   = newTemp(Ity_I16);
         IRTemp resLo   = newTemp(Ity_I16);
         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
         putIRegRDX(2, mkexpr(resHi));
         putIRegRAX(2, mkexpr(resLo));
         break;
      }
      case Ity_I8: {
         IRTemp res16   = newTemp(Ity_I16);
         IRTemp resHi   = newTemp(Ity_I8);
         IRTemp resLo   = newTemp(Ity_I8);
         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
         putIRegRAX(2, mkexpr(res16));
         break;
      }
      default:
         ppIRType(ty);
         vpanic("codegen_mulL_A_D(amd64)");
   }
   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
}
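
/* Example (illustrative): "48 F7 E3" (mulq %rbx) reaches the Ity_I64
   case with syned==False: the full 128-bit product MullU64(RAX, RBX)
   is parked as RDX:RAX.  The 8-bit case differs in that the whole
   16-bit product is written to AX in one go rather than being split
   across two registers. */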


/* Group 3 extended opcodes. */
static
ULong dis_Grp3 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            delta++;
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
                           bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      t1   = newTemp(ty);
      delta += len;
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
}
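
/* Example (illustrative): "F7 05 10 00 00 00 78 56 34 12" is
   testl $0x12345678, 0x10(%rip).  disAMode is told that 4 immediate
   bytes follow the amode, since a RIP-relative displacement is
   computed relative to the end of the whole instruction, not the
   end of the displacement field. */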


/* Group 4 extended opcodes. */
static
ULong dis_Grp4 ( VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;
   IRTemp t1 = newTemp(ty);
   IRTemp t2 = newTemp(ty);

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
}


/* Group 5 extended opcodes. */
static
ULong dis_Grp5 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
{
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp  t2 = IRTemp_INVALID;
   IRTemp  t3 = IRTemp_INVALID;
   Bool    showSz = True;

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (pfx & PFX_LOCK) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandled;
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (!(sz == 8 || sz == 2)) goto unhandled;
            if (sz == 8) {
               t3 = newTemp(Ity_I64);
               assign(t3, loadLE(Ity_I64,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandled; /* awaiting test case */
            }
         default:
         unhandled:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
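
/* Example (illustrative): "FF D0" (callq *%rax) takes the register
   case /2 above: the target is read from %rax, RSP is decremented by
   8, the return address guest_RIP_bbstart+delta+1 is stored there, an
   AbiHint marks the red zone, and the block ends with an Ijk_Call. */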


/*------------------------------------------------------------*/
/*--- Disassembling string ops (including REP prefixes)    ---*/
/*------------------------------------------------------------*/

/* Code shared by all the string ops */
static
void dis_string_op_increment ( Int sz, IRTemp t_inc )
{
   UChar logSz;
   if (sz == 8 || sz == 4 || sz == 2) {
      logSz = 1;
      if (sz == 4) logSz = 2;
      if (sz == 8) logSz = 3;
      assign( t_inc,
              binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
                               mkU8(logSz) ) );
   } else {
      assign( t_inc,
              IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   }
}
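
/* Example (illustrative): guest_DFLAG holds +1 (DF clear) or -1 (DF
   set).  For sz==4 it is shifted left by 2, giving a per-iteration
   increment of +4 or -4; shifting -1 left also yields the correctly
   signed negative step. */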

static
void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
                    Int sz, HChar* name, Prefix pfx )
{
   IRTemp t_inc = newTemp(Ity_I64);
   /* Really we ought to inspect the override prefixes, but we don't.
      The following assertion catches any resulting silliness. */
   vassert(pfx == clearSegBits(pfx));
   dis_string_op_increment(sz, t_inc);
   dis_OP( sz, t_inc, pfx );
   DIP("%s%c\n", name, nameISize(sz));
}

static
void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty = szToITy(sz);
   IRTemp td = newTemp(Ity_I64);   /* RDI */
   IRTemp ts = newTemp(Ity_I64);   /* RSI */
   IRExpr *incd, *incs;

   if (haveASO(pfx)) {
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   } else {
      assign( td, getIReg64(R_RDI) );
      assign( ts, getIReg64(R_RSI) );
   }

   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   if (haveASO(pfx)) {
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   }
   putIReg64( R_RDI, incd );
   putIReg64( R_RSI, incs );
}
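
/* Note (illustrative): with an address-size override (0x67,
   haveASO), RDI/RSI are read as their 32-bit halves zero-extended,
   and the updated values are wrapped back to 32 bits by the
   64to32/32Uto64 round trip above, so the pointers behave as
   EDI/ESI. */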

static
void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty = szToITy(sz);
   IRTemp ts = newTemp(Ity_I64);   /* RSI */
   IRExpr *incs;

   if (haveASO(pfx))
      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   else
      assign( ts, getIReg64(R_RSI) );

   putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );

   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   if (haveASO(pfx))
      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   putIReg64( R_RSI, incs );
}

static
void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty = szToITy(sz);
   IRTemp ta = newTemp(ty);        /* rAX */
   IRTemp td = newTemp(Ity_I64);   /* RDI */
   IRExpr *incd;

   assign( ta, getIRegRAX(sz) );

   if (haveASO(pfx))
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   else
      assign( td, getIReg64(R_RDI) );

   storeLE( mkexpr(td), mkexpr(ta) );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   if (haveASO(pfx))
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   putIReg64( R_RDI, incd );
}

static
void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty  = szToITy(sz);
   IRTemp tdv = newTemp(ty);      /* (RDI) */
   IRTemp tsv = newTemp(ty);      /* (RSI) */
   IRTemp td  = newTemp(Ity_I64); /*  RDI  */
   IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
   IRExpr *incd, *incs;

   if (haveASO(pfx)) {
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
   } else {
      assign( td, getIReg64(R_RDI) );
      assign( ts, getIReg64(R_RSI) );
   }

   assign( tdv, loadLE(ty,mkexpr(td)) );

   assign( tsv, loadLE(ty,mkexpr(ts)) );

   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
   if (haveASO(pfx)) {
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
   }
   putIReg64( R_RDI, incd );
   putIReg64( R_RSI, incs );
}

static
void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
{
   IRType ty  = szToITy(sz);
   IRTemp ta  = newTemp(ty);       /*  rAX  */
   IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
   IRTemp tdv = newTemp(ty);       /* (RDI) */
   IRExpr *incd;

   assign( ta, getIRegRAX(sz) );

   if (haveASO(pfx))
      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
   else
      assign( td, getIReg64(R_RDI) );

   assign( tdv, loadLE(ty,mkexpr(td)) );

   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );

   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
   if (haveASO(pfx))
      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
   putIReg64( R_RDI, incd );
}


/* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   the insn is the last one in the basic block, and so emit a jump to
   the next insn, rather than just falling through. */
static
void dis_REP_op ( /*MOD*/DisResult* dres,
                  AMD64Condcode cond,
                  void (*dis_OP)(Int, IRTemp, Prefix),
                  Int sz, Addr64 rip, Addr64 rip_next, HChar* name,
                  Prefix pfx )
{
   IRTemp t_inc = newTemp(Ity_I64);
   IRTemp tc;
   IRExpr* cmp;

   /* Really we ought to inspect the override prefixes, but we don't.
      The following assertion catches any resulting silliness. */
   vassert(pfx == clearSegBits(pfx));

   if (haveASO(pfx)) {
      tc = newTemp(Ity_I32);  /*  ECX  */
      assign( tc, getIReg32(R_RCX) );
      cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   } else {
      tc = newTemp(Ity_I64);  /*  RCX  */
      assign( tc, getIReg64(R_RCX) );
      cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   }

   stmt( IRStmt_Exit( cmp, Ijk_Boring,
                      IRConst_U64(rip_next), OFFB_RIP ) );

   if (haveASO(pfx))
      putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   else
      putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );

   dis_string_op_increment(sz, t_inc);
   dis_OP (sz, t_inc, pfx);

   if (cond == AMD64CondAlways) {
      jmp_lit(dres, Ijk_Boring, rip);
      vassert(dres->whatNext == Dis_StopHere);
   } else {
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
                         Ijk_Boring,
                         IRConst_U64(rip),
                         OFFB_RIP ) );
      jmp_lit(dres, Ijk_Boring, rip_next);
      vassert(dres->whatNext == Dis_StopHere);
   }
   DIP("%s%c\n", name, nameISize(sz));
}
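
/* Control flow (illustrative), e.g. for "repe cmpsb" with
   cond==AMD64CondZ:
      if (RCX == 0) goto rip_next;    -- the Exit above
      RCX := RCX - 1;
      <one CMPSB iteration>;
      if (ZF) goto rip;               -- re-decode and loop
      goto rip_next;
   Unconditional REP (MOVS/STOS/LODS) jumps straight back to rip
   instead. */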


/*------------------------------------------------------------*/
/*--- Arithmetic, etc.                                     ---*/
/*------------------------------------------------------------*/

/* IMUL E, G.  Supplied rip points to the modR/M byte. */
static
ULong dis_mul_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Int         size,
                    Long        delta0 )
{
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta0);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);
   IRTemp tg = newTemp(ty);
   IRTemp resLo = newTemp(ty);

   assign( tg, getIRegG(size, pfx, rm) );
   if (epartIsReg(rm)) {
      assign( te, getIRegE(size, pfx, rm) );
   } else {
      IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
      assign( te, loadLE(ty,mkexpr(addr)) );
   }

   setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );

   putIRegG(size, pfx, rm, mkexpr(resLo) );

   if (epartIsReg(rm)) {
      DIP("imul%c %s, %s\n", nameISize(size),
                             nameIRegE(size,pfx,rm),
                             nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      DIP("imul%c %s, %s\n", nameISize(size),
                             dis_buf,
                             nameIRegG(size,pfx,rm));
      return alen+delta0;
   }
}


/* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
static
ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
                       Prefix      pfx,
                       Int         size,
                       Long        delta,
                       Int         litsize )
{
   Long   d64;
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);
   IRTemp tl = newTemp(ty);
   IRTemp resLo = newTemp(ty);

   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);

   if (epartIsReg(rm)) {
      assign(te, getIRegE(size, pfx, rm));
      delta++;
   } else {
      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                                     imin(4,litsize) );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   d64 = getSDisp(imin(4,litsize),delta);
   delta += imin(4,litsize);

   d64 &= mkSizeMask(size);
   assign(tl, mkU(ty,d64));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );

   putIRegG(size, pfx, rm, mkexpr(resLo));

   DIP("imul%c $%lld, %s, %s\n",
       nameISize(size), d64,
       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
       nameIRegG(size,pfx,rm) );
   return delta;
}


/* Generate an IR sequence to do a popcount operation on the supplied
   IRTemp, and return a new IRTemp holding the result.  'ty' may be
   Ity_I16, Ity_I32 or Ity_I64 only. */
static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
{
   Int i;
   if (ty == Ity_I16) {
      IRTemp old = IRTemp_INVALID;
      IRTemp nyu = IRTemp_INVALID;
      IRTemp mask[4], shift[4];
      for (i = 0; i < 4; i++) {
         mask[i]  = newTemp(ty);
         shift[i] = 1 << i;
      }
      assign(mask[0], mkU16(0x5555));
      assign(mask[1], mkU16(0x3333));
      assign(mask[2], mkU16(0x0F0F));
      assign(mask[3], mkU16(0x00FF));
      old = src;
      for (i = 0; i < 4; i++) {
         nyu = newTemp(ty);
         assign(nyu,
                binop(Iop_Add16,
                      binop(Iop_And16,
                            mkexpr(old),
                            mkexpr(mask[i])),
                      binop(Iop_And16,
                            binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
                            mkexpr(mask[i]))));
         old = nyu;
      }
      return nyu;
   }
   if (ty == Ity_I32) {
      IRTemp old = IRTemp_INVALID;
      IRTemp nyu = IRTemp_INVALID;
      IRTemp mask[5], shift[5];
      for (i = 0; i < 5; i++) {
         mask[i]  = newTemp(ty);
         shift[i] = 1 << i;
      }
      assign(mask[0], mkU32(0x55555555));
      assign(mask[1], mkU32(0x33333333));
      assign(mask[2], mkU32(0x0F0F0F0F));
      assign(mask[3], mkU32(0x00FF00FF));
      assign(mask[4], mkU32(0x0000FFFF));
      old = src;
      for (i = 0; i < 5; i++) {
         nyu = newTemp(ty);
         assign(nyu,
                binop(Iop_Add32,
                      binop(Iop_And32,
                            mkexpr(old),
                            mkexpr(mask[i])),
                      binop(Iop_And32,
                            binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
                            mkexpr(mask[i]))));
         old = nyu;
      }
      return nyu;
   }
   if (ty == Ity_I64) {
      IRTemp old = IRTemp_INVALID;
      IRTemp nyu = IRTemp_INVALID;
      IRTemp mask[6], shift[6];
      for (i = 0; i < 6; i++) {
         mask[i]  = newTemp(ty);
         shift[i] = 1 << i;
      }
      assign(mask[0], mkU64(0x5555555555555555ULL));
      assign(mask[1], mkU64(0x3333333333333333ULL));
      assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
      assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
      assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
      assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
      old = src;
      for (i = 0; i < 6; i++) {
         nyu = newTemp(ty);
         assign(nyu,
                binop(Iop_Add64,
                      binop(Iop_And64,
                            mkexpr(old),
                            mkexpr(mask[i])),
                      binop(Iop_And64,
                            binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
                            mkexpr(mask[i]))));
         old = nyu;
      }
      return nyu;
   }
   /*NOTREACHED*/
   vassert(0);
}
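
/* The ladder above is the standard bit-slice popcount.  As a plain-C
   model of the 16-bit case (illustrative only, not part of the
   translator):

      static unsigned short popcount16 ( unsigned short x )
      {
         x = (x & 0x5555) + ((x >> 1) & 0x5555);
         x = (x & 0x3333) + ((x >> 2) & 0x3333);
         x = (x & 0x0F0F) + ((x >> 4) & 0x0F0F);
         x = (x & 0x00FF) + ((x >> 8) & 0x00FF);
         return x;
      }

   Each step sums adjacent bit-fields of twice the previous width;
   e.g. popcount16(0xFFFF) == 16.  The 32- and 64-bit cases just add
   one and two more mask/shift steps respectively. */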


/* Generate an IR sequence to do a count-leading-zeroes operation on
   the supplied IRTemp, and return a new IRTemp holding the result.
   'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   the argument is zero, return the number of bits in the word (the
   natural semantics). */
static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
{
   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);

   IRTemp src64 = newTemp(Ity_I64);
   assign(src64, widenUto64( mkexpr(src) ));

   IRTemp src64x = newTemp(Ity_I64);
   assign(src64x,
          binop(Iop_Shl64, mkexpr(src64),
                           mkU8(64 - 8 * sizeofIRType(ty))));

   // Clz64 has undefined semantics when its input is zero, so
   // special-case around that.
   IRTemp res64 = newTemp(Ity_I64);
   assign(res64,
          IRExpr_Mux0X(
             unop(Iop_1Uto8,
                  binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0))),
             unop(Iop_Clz64, mkexpr(src64x)),
             mkU64(8 * sizeofIRType(ty))
   ));

   IRTemp res = newTemp(ty);
   assign(res, narrowTo(ty, mkexpr(res64)));
   return res;
}
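
/* For example, with ty == Ity_I16 and src == 0x00F0: src64x is
   0x00F0000000000000 (shifted left by 64-16 == 48 bits), and Clz64 of
   that is 8, which is exactly the leading-zero count of the 16-bit
   value.  A zero input takes the Mux0X's other arm and yields 16.
   The same idea as a minimal standalone C sketch (assumes a compiler
   providing __builtin_clzll; the name lzcnt16 is ours, for
   illustration only):

      static unsigned lzcnt16 ( unsigned short x )
      {
         if (x == 0) return 16;   // the "natural semantics" case
         return __builtin_clzll((unsigned long long)x << 48);
      }
*/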


/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

/* --- Helper functions for dealing with the register stack. --- */

/* --- Set the emulation-warning pseudo-register. --- */

static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
}

/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */

static IRExpr* mkQNaN64 ( void )
{
   /* The default QNaN: sign 0, exponent 2047 (eleven 1s), and a
      mantissa with only the top bit set:
         0 11111111111 1 0(51 times)
      == 0x7FF8 0000 0000 0000
   */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}
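
/* A quick host-side sanity check of that bit pattern (plain C, not
   part of the translation path; for illustration only):

      #include <string.h>
      static int is_qnan_bits ( void )
      {
         unsigned long long bits = 0x7FF8000000000000ULL;
         double d;
         memcpy(&d, &bits, sizeof d);  // reinterpret, strict-alias safe
         return d != d;                // only NaNs compare unequal to
                                       // themselves, so this returns 1
      }
*/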

/* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */

static IRExpr* get_ftop ( void )
{
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}

static void put_ftop ( IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}

/* --------- Get/put the C3210 bits. --------- */

static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}

static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}

/* --------- Get/put the FPU rounding mode. --------- */

static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}

static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}


/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
/* Produces a value in 0 .. 3, which is encoded as per the type
   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   per IRRoundingMode, we merely need to get it and mask it for
   safety.
*/
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}

static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   return mkU32(Irrm_NEAREST);
}
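
/* The masking works because IRRoundingMode uses the same 2-bit
   encoding as the x87 RC field:

      0 == Irrm_NEAREST   (RC == 00, round to nearest even)
      1 == Irrm_NegINF    (RC == 01, round down)
      2 == Irrm_PosINF    (RC == 10, round up)
      3 == Irrm_ZERO      (RC == 11, truncate)
*/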


/* --------- Get/set FP register tag bytes. --------- */

/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */

static void put_ST_TAG ( Int i, IRExpr* value )
{
   IRRegArray* descr;
   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
}

/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */

static IRExpr* get_ST_TAG ( Int i )
{
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}


/* --------- Get/set FP registers. --------- */

/* Given i, and some expression e, emit 'ST(i) = e' and set the
   register's tag to indicate the register is full.  The previous
   state of the register is not checked. */

static void put_ST_UNCHECKED ( Int i, IRExpr* value )
{
   IRRegArray* descr;
   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   /* Mark the register as in-use. */
   put_ST_TAG(i, mkU8(1));
}

/* Given i, and some expression e, emit
      ST(i) = is_full(i) ? NaN : e
   and set the tag accordingly.
*/

static void put_ST ( Int i, IRExpr* value )
{
   put_ST_UNCHECKED( i,
                     IRExpr_Mux0X( get_ST_TAG(i),
                                   /* 0 means empty */
                                   value,
                                   /* non-0 means full */
                                   mkQNaN64()
                   )
   );
}


/* Given i, generate an expression yielding 'ST(i)'. */

static IRExpr* get_ST_UNCHECKED ( Int i )
{
   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}


/* Given i, generate an expression yielding
      is_full(i) ? ST(i) : NaN
*/

static IRExpr* get_ST ( Int i )
{
   return
      IRExpr_Mux0X( get_ST_TAG(i),
                    /* 0 means empty */
                    mkQNaN64(),
                    /* non-0 means full */
                    get_ST_UNCHECKED(i));
}
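
/* Taken together these give the stack-fault approximation: put_ST
   into a slot that is already full stores the QNaN rather than the
   new value, and get_ST from an empty slot likewise produces the
   QNaN.  For example, assuming the slot starts out empty:

      fp_push();              // defined just below
      put_ST(0, e);           // slot empty: e is stored, tag becomes 1
      put_ST(0, e2);          // slot now full: QNaN stored instead

   so over/underflow is made visible in the data without generating FP
   exceptions. */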


/* Adjust FTOP downwards by one register. */

static void fp_push ( void )
{
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}

/* Adjust FTOP upwards by one register, and mark the vacated register
   as empty.  */

static void fp_pop ( void )
{
   put_ST_TAG(0, mkU8(0));
   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
}
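
/* Note the hardware-style direction: pushing decrements FTOP and
   popping increments it, with wraparound.  E.g. if FTOP is 0, fp_push
   leaves guest_FTOP == 0xFFFFFFFF; the "& 7" in get_FPU_sw below and
   the modulo-8 indexing of the 8-element register arrays treat that
   as slot 7. */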

/* Clear the C2 bit of the FPU status register, for
   sin/cos/tan/sincos. */

static void clear_C2 ( void )
{
   put_C3210( binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2)) );
}

/* Invent a plausible-looking FPU status word value:
      ((ftop & 7) << 11) | (c3210 & 0x4700)
 */
static IRExpr* get_FPU_sw ( void )
{
   return
      unop(Iop_32to16,
           binop(Iop_Or32,
                 binop(Iop_Shl32,
                       binop(Iop_And32, get_ftop(), mkU32(7)),
                       mkU8(11)),
                 binop(Iop_And32, unop(Iop_64to32, get_C3210()),
                       mkU32(0x4700))
      ));
}
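
/* Worked example: with FTOP == 5 and C3 and C0 set in the C3210 word
   (0x4000 | 0x0100 == 0x4100), this yields (5 << 11) | 0x4100 ==
   0x6900: TOP occupies bits 13..11, C3 is bit 14, C2/C1/C0 are bits
   10..8, and the exception and busy bits are deliberately left at
   zero. */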


/* ------------------------------------------------------- */
/* Given all that stack-mangling junk, we can now go ahead
   and describe FP instructions.
*/

/* ST(0) = ST(0) `op` mem64/32(addr)
   Need to check ST(0)'s tag on read, but not on write.
*/
static
void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
                         IROp op, Bool dbl )
{
   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   if (dbl) {
      put_ST_UNCHECKED(0,
         triop( op,
                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                get_ST(0),
                loadLE(Ity_F64,mkexpr(addr))
         ));
   } else {
      put_ST_UNCHECKED(0,
         triop( op,
                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                get_ST(0),
                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
         ));
   }
}


/* ST(0) = mem64/32(addr) `op` ST(0)
   Need to check ST(0)'s tag on read, but not on write.
*/
static
void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
                            IROp op, Bool dbl )
{
   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   if (dbl) {
      put_ST_UNCHECKED(0,
         triop( op,
                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                loadLE(Ity_F64,mkexpr(addr)),
                get_ST(0)
         ));
   } else {
      put_ST_UNCHECKED(0,
         triop( op,
                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
                get_ST(0)
         ));
   }
}


/* ST(dst) = ST(dst) `op` ST(src).
   Check dst and src tags when reading but not on write.
*/
static
void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
                      Bool pop_after )
{
   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   put_ST_UNCHECKED(
      st_dst,
      triop( op,
             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
             get_ST(st_dst),
             get_ST(st_src) )
   );
   if (pop_after)
      fp_pop();
}

/* ST(dst) = ST(src) `op` ST(dst).
   Check dst and src tags when reading but not on write.
*/
static
void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
                         Bool pop_after )
{
   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   put_ST_UNCHECKED(
      st_dst,
      triop( op,
             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
             get_ST(st_src),
             get_ST(st_dst) )
   );
   if (pop_after)
      fp_pop();
}

/* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
{
   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   /* This is a bit of a hack (and isn't really right).  It sets
      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
      documentation implies A and S are unchanged.
   */
   /* It's also fishy in that it is used both for COMIP and
      UCOMIP, and they aren't the same (although similar). */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
                   mkU64(0x45)
        )));
   if (pop_after)
      fp_pop();
}
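
/* Why the 0x45 mask works: Iop_CmpF64 produces an IRCmpF64Result,
   whose encoding is chosen to line up with the x86 flags layout:

      Ircr_UN == 0x45  (unordered)  -> ZF, PF, CF all set
      Ircr_EQ == 0x40  (equal)      -> ZF set
      Ircr_LT == 0x01  (less than)  -> CF set
      Ircr_GT == 0x00  (greater)    -> all clear

   ZF is rflags bit 6 (0x40), PF bit 2 (0x04) and CF bit 0 (0x01), so
   masking the comparison result with 0x45 and COPYing it into CC_DEP1
   produces exactly the Z/P/C assignments UCOMI specifies. */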


/* returns
   32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
*/
static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
{
   IRTemp t32 = newTemp(Ity_I32);
   assign( t32, e32 );
   return
      IRExpr_Mux0X(
         unop(Iop_1Uto8,
              binop(Iop_CmpLT64U,
                    unop(Iop_32Uto64,
                         binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
                    mkU64(65536))),
         mkU16( 0x8000 ),
         unop(Iop_32to16, mkexpr(t32)));
}
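
/* The bias-then-unsigned-compare trick: adding 32768 maps the signed
   range -32768..32767 exactly onto 0..65535, so a single unsigned
   compare against 65536 is the whole range test.  E.g. e32 == 40000
   gives 72768, out of range, so the saturated 0x8000 is chosen;
   e32 == -5 gives 32763, in range, so the low 16 bits (0xFFFB) pass
   through unchanged. */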


static
ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   Int    len;
   UInt   r_src, r_dst;
   HChar  dis_buf[50];
   IRTemp t1, t2;

   /* On entry, delta points at the second byte of the insn (the modrm
      byte). */
   UChar first_opcode = getUChar(delta-1);
   UChar modrm        = getUChar(delta+0);
   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */

   if (first_opcode == 0xD8) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FADD single-real */
               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
               break;

            case 1: /* FMUL single-real */
               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
               break;

            case 2: /* FCOM single-real */
               DIP("fcoms %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* The AMD documentation suggests that forcing C1 to
                  zero is correct (Eliot Moss) */
               put_C3210(
                   unop( Iop_32Uto64,
                       binop( Iop_And32,
                              binop(Iop_Shl32,
                                    binop(Iop_CmpF64,
                                          get_ST(0),
                                          unop(Iop_F32toF64,
                                               loadLE(Ity_F32,mkexpr(addr)))),
                                    mkU8(8)),
                              mkU32(0x4500)
                   )));
               break;
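
            /* A note that applies to this and all the similar
               FCOM-family sequences below: shifting the Iop_CmpF64
               result (0x45, 0x40, 0x01 or 0x00; see the comment after
               fp_do_ucomi_ST0_STi) left by 8 lands it on the C0/C2/C3
               positions of the status word, and the 0x4500 mask keeps
               just C3 (bit 14), C2 (bit 10) and C0 (bit 8), with C1
               forced to zero. */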

            case 3: /* FCOMP single-real */
               /* The AMD documentation suggests that forcing C1 to
                  zero is correct (Eliot Moss) */
               DIP("fcomps %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop( Iop_32Uto64,
                       binop( Iop_And32,
                              binop(Iop_Shl32,
                                    binop(Iop_CmpF64,
                                          get_ST(0),
                                          unop(Iop_F32toF64,
                                               loadLE(Ity_F32,mkexpr(addr)))),
                                    mkU8(8)),
                              mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 4: /* FSUB single-real */
               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
               break;

            case 5: /* FSUBR single-real */
               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
               break;

            case 6: /* FDIV single-real */
               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
               break;

            case 7: /* FDIVR single-real */
               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xD8\n");
               goto decode_fail;
         }
      } else {
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
               break;

            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
               break;

            /* Dunno if this is right */
            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
               r_dst = (UInt)modrm - 0xD0;
               DIP("fcom %%st(0),%%st(%u)\n", r_dst);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               break;

            /* Dunno if this is right */
            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
               r_dst = (UInt)modrm - 0xD8;
               DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
               break;

            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
               break;

            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
               break;

            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
               break;

            default:
               goto decode_fail;
         }
      }
   }

   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xD9) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FLD single-real */
               DIP("flds %s\n", dis_buf);
               fp_push();
               put_ST(0, unop(Iop_F32toF64,
                              loadLE(Ity_F32, mkexpr(addr))));
               break;

            case 2: /* FST single-real */
               DIP("fsts %s\n", dis_buf);
               storeLE(mkexpr(addr),
                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
               break;

            case 3: /* FSTP single-real */
               DIP("fstps %s\n", dis_buf);
               storeLE(mkexpr(addr),
                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
               fp_pop();
               break;

            case 4: { /* FLDENV m28 */
               /* Uses dirty helper:
                     VexEmWarn amd64g_dirtyhelper_FLDENV
                                  ( VexGuestAMD64State*, HWord ) */
               IRTemp    ew = newTemp(Ity_I32);
               IRTemp   w64 = newTemp(Ity_I64);
               IRDirty*   d = unsafeIRDirty_0_N (
                                 0/*regparms*/,
                                 "amd64g_dirtyhelper_FLDENV",
                                 &amd64g_dirtyhelper_FLDENV,
                                 mkIRExprVec_1( mkexpr(addr) )
                              );
               d->needsBBP = True;
               d->tmp      = w64;
               /* declare we're reading memory */
               d->mFx   = Ifx_Read;
               d->mAddr = mkexpr(addr);
               d->mSize = 28;

               /* declare we're writing guest state */
               d->nFxState = 4;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Write;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Write;
               d->fxState[1].offset = OFFB_FPTAGS;
               d->fxState[1].size   = 8 * sizeof(UChar);

               d->fxState[2].fx     = Ifx_Write;
               d->fxState[2].offset = OFFB_FPROUND;
               d->fxState[2].size   = sizeof(ULong);

               d->fxState[3].fx     = Ifx_Write;
               d->fxState[3].offset = OFFB_FC3210;
               d->fxState[3].size   = sizeof(ULong);

               stmt( IRStmt_Dirty(d) );

               /* ew contains any emulation warning we may need to
                  issue.  If needed, side-exit to the next insn,
                  reporting the warning, so that Valgrind's dispatcher
                  sees the warning. */
               assign( ew, unop(Iop_64to32, mkexpr(w64)) );
               put_emwarn( mkexpr(ew) );
               stmt(
                  IRStmt_Exit(
                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                     Ijk_EmWarn,
                     IRConst_U64( guest_RIP_bbstart+delta ),
                     OFFB_RIP
                  )
               );

               DIP("fldenv %s\n", dis_buf);
               break;
            }

            case 5: { /* FLDCW */
               /* The only thing we observe in the control word is the
                  rounding mode.  Therefore, pass the 16-bit value
                  (x87 native-format control word) to a clean helper,
                  getting back a 64-bit value, the lower half of which
                  is the FPROUND value to store, and the upper half of
                  which is the emulation-warning token which may be
                  generated.
               */
               /* ULong amd64g_check_fldcw ( ULong ); */
               IRTemp t64 = newTemp(Ity_I64);
               IRTemp ew = newTemp(Ity_I32);
               DIP("fldcw %s\n", dis_buf);
               assign( t64, mkIRExprCCall(
                               Ity_I64, 0/*regparms*/,
                               "amd64g_check_fldcw",
                               &amd64g_check_fldcw,
                               mkIRExprVec_1(
                                  unop( Iop_16Uto64,
                                        loadLE(Ity_I16, mkexpr(addr)))
                               )
                            )
                     );

               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
               put_emwarn( mkexpr(ew) );
               /* Finally, if an emulation warning was reported,
                  side-exit to the next insn, reporting the warning,
                  so that Valgrind's dispatcher sees the warning. */
               stmt(
                  IRStmt_Exit(
                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                     Ijk_EmWarn,
                     IRConst_U64( guest_RIP_bbstart+delta ),
                     OFFB_RIP
                  )
               );
               break;
            }

            case 6: { /* FNSTENV m28 */
               /* Uses dirty helper:
                     void amd64g_dirtyhelper_FSTENV
                             ( VexGuestAMD64State*, HWord ) */
               IRDirty* d = unsafeIRDirty_0_N (
                               0/*regparms*/,
                               "amd64g_dirtyhelper_FSTENV",
                               &amd64g_dirtyhelper_FSTENV,
                               mkIRExprVec_1( mkexpr(addr) )
                            );
               d->needsBBP = True;
               /* declare we're writing memory */
               d->mFx   = Ifx_Write;
               d->mAddr = mkexpr(addr);
               d->mSize = 28;

               /* declare we're reading guest state */
               d->nFxState = 4;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Read;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Read;
               d->fxState[1].offset = OFFB_FPTAGS;
               d->fxState[1].size   = 8 * sizeof(UChar);

               d->fxState[2].fx     = Ifx_Read;
               d->fxState[2].offset = OFFB_FPROUND;
               d->fxState[2].size   = sizeof(ULong);

               d->fxState[3].fx     = Ifx_Read;
               d->fxState[3].offset = OFFB_FC3210;
               d->fxState[3].size   = sizeof(ULong);

               stmt( IRStmt_Dirty(d) );

               DIP("fnstenv %s\n", dis_buf);
               break;
            }

            case 7: /* FNSTCW */
               /* Fake up a native x87 FPU control word.  The only
                  thing it depends on is FPROUND[1:0], so call a clean
                  helper to cook it up. */
               /* ULong amd64g_create_fpucw ( ULong fpround ) */
               DIP("fnstcw %s\n", dis_buf);
               storeLE(
                  mkexpr(addr),
                  unop( Iop_64to16,
                        mkIRExprCCall(
                           Ity_I64, 0/*regp*/,
                           "amd64g_create_fpucw", &amd64g_create_fpucw,
                           mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
                        )
                  )
               );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xD9\n");
               goto decode_fail;
         }

      } else {
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FLD %st(?) */
               r_src = (UInt)modrm - 0xC0;
               DIP("fld %%st(%u)\n", r_src);
               t1 = newTemp(Ity_F64);
               assign(t1, get_ST(r_src));
               fp_push();
               put_ST(0, mkexpr(t1));
               break;

            case 0xC8 ... 0xCF: /* FXCH %st(?) */
               r_src = (UInt)modrm - 0xC8;
               DIP("fxch %%st(%u)\n", r_src);
               t1 = newTemp(Ity_F64);
               t2 = newTemp(Ity_F64);
               assign(t1, get_ST(0));
               assign(t2, get_ST(r_src));
               put_ST_UNCHECKED(0, mkexpr(t2));
               put_ST_UNCHECKED(r_src, mkexpr(t1));
               break;

            case 0xE0: /* FCHS */
               DIP("fchs\n");
               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
               break;

            case 0xE1: /* FABS */
               DIP("fabs\n");
               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
               break;

            case 0xE5: { /* FXAM */
               /* This is an interesting one.  It examines %st(0),
                  regardless of whether the tag says it's empty or not.
                  Here, just pass both the tag (in our format) and the
                  value (as a double, actually a ULong) to a helper
                  function. */
               IRExpr** args
                  = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
                                   unop(Iop_ReinterpF64asI64,
                                        get_ST_UNCHECKED(0)) );
               put_C3210(mkIRExprCCall(
                            Ity_I64,
                            0/*regparm*/,
                            "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
                            args
                        ));
               DIP("fxam\n");
               break;
            }

            case 0xE8: /* FLD1 */
               DIP("fld1\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
               break;

            case 0xE9: /* FLDL2T */
               DIP("fldl2t\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
               break;

            case 0xEA: /* FLDL2E */
               DIP("fldl2e\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
               break;

            case 0xEB: /* FLDPI */
               DIP("fldpi\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
               break;

            case 0xEC: /* FLDLG2 */
               DIP("fldlg2\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
               break;

            case 0xED: /* FLDLN2 */
               DIP("fldln2\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
               break;

            case 0xEE: /* FLDZ */
               DIP("fldz\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
               break;

            case 0xF0: /* F2XM1 */
               DIP("f2xm1\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_2xm1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               break;

            case 0xF1: /* FYL2X */
               DIP("fyl2x\n");
               put_ST_UNCHECKED(1,
                  triop(Iop_Yl2xF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;

            case 0xF2: /* FPTAN */
               DIP("fptan\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_TanF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               fp_push();
               put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
               clear_C2(); /* HACK */
               break;

            case 0xF3: /* FPATAN */
               DIP("fpatan\n");
               put_ST_UNCHECKED(1,
                  triop(Iop_AtanF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;

            case 0xF4: { /* FXTRACT */
               IRTemp argF = newTemp(Ity_F64);
               IRTemp sigF = newTemp(Ity_F64);
               IRTemp expF = newTemp(Ity_F64);
               IRTemp argI = newTemp(Ity_I64);
               IRTemp sigI = newTemp(Ity_I64);
               IRTemp expI = newTemp(Ity_I64);
               DIP("fxtract\n");
               assign( argF, get_ST(0) );
               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
               assign( sigI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(0)/*sig*/ ))
               );
               assign( expI,
                       mkIRExprCCall(
                          Ity_I64, 0/*regparms*/,
                          "x86amd64g_calculate_FXTRACT",
                          &x86amd64g_calculate_FXTRACT,
                          mkIRExprVec_2( mkexpr(argI),
                                         mkIRExpr_HWord(1)/*exp*/ ))
               );
               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
               /* exponent */
               put_ST_UNCHECKED(0, mkexpr(expF) );
               fp_push();
               /* significand */
               put_ST(0, mkexpr(sigF) );
               break;
            }

            case 0xF5: { /* FPREM1 -- IEEE compliant */
               IRTemp a1 = newTemp(Ity_F64);
               IRTemp a2 = newTemp(Ity_F64);
               DIP("fprem1\n");
               /* Do FPREM1 twice, once to get the remainder, and once
                  to get the C3210 flag values. */
               assign( a1, get_ST(0) );
               assign( a2, get_ST(1) );
               put_ST_UNCHECKED(0,
                  triop(Iop_PRem1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)));
               put_C3210(
                  unop(Iop_32Uto64,
                  triop(Iop_PRem1C3210F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)) ));
               break;
            }

            case 0xF7: /* FINCSTP */
               DIP("fincstp\n");
               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
               break;

            case 0xF8: { /* FPREM -- not IEEE compliant */
               IRTemp a1 = newTemp(Ity_F64);
               IRTemp a2 = newTemp(Ity_F64);
               DIP("fprem\n");
               /* Do FPREM twice, once to get the remainder, and once
                  to get the C3210 flag values. */
               assign( a1, get_ST(0) );
               assign( a2, get_ST(1) );
               put_ST_UNCHECKED(0,
                  triop(Iop_PRemF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)));
               put_C3210(
                  unop(Iop_32Uto64,
                  triop(Iop_PRemC3210F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1),
                        mkexpr(a2)) ));
               break;
            }

            case 0xF9: /* FYL2XP1 */
               DIP("fyl2xp1\n");
               put_ST_UNCHECKED(1,
                  triop(Iop_Yl2xp1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;

            case 0xFA: /* FSQRT */
               DIP("fsqrt\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_SqrtF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               break;

            case 0xFB: { /* FSINCOS */
               IRTemp a1 = newTemp(Ity_F64);
               assign( a1, get_ST(0) );
               DIP("fsincos\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_SinF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1)));
               fp_push();
               put_ST(0,
                  binop(Iop_CosF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        mkexpr(a1)));
               clear_C2(); /* HACK */
               break;
            }

            case 0xFC: /* FRNDINT */
               DIP("frndint\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
               break;

            case 0xFD: /* FSCALE */
               DIP("fscale\n");
               put_ST_UNCHECKED(0,
                  triop(Iop_ScaleF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0),
                        get_ST(1)));
               break;

            case 0xFE: /* FSIN */
               DIP("fsin\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_SinF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               clear_C2(); /* HACK */
               break;

            case 0xFF: /* FCOS */
               DIP("fcos\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_CosF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               clear_C2(); /* HACK */
               break;

            default:
               goto decode_fail;
         }
      }
   }

   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDA) {

      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IROp   fop;
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;
         switch (gregLO3ofRM(modrm)) {

            case 0: /* FIADD m32int */ /* ST(0) += m32int */
               DIP("fiaddl %s\n", dis_buf);
               fop = Iop_AddF64;
               goto do_fop_m32;

            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
               DIP("fimull %s\n", dis_buf);
               fop = Iop_MulF64;
               goto do_fop_m32;

            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
               DIP("fisubl %s\n", dis_buf);
               fop = Iop_SubF64;
               goto do_fop_m32;

            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
               DIP("fisubrl %s\n", dis_buf);
               fop = Iop_SubF64;
               goto do_foprev_m32;

            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
               DIP("fidivl %s\n", dis_buf);
               fop = Iop_DivF64;
               goto do_fop_m32;

            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
               DIP("fidivrl %s\n", dis_buf);
               fop = Iop_DivF64;
               goto do_foprev_m32;

            do_fop_m32:
               put_ST_UNCHECKED(0,
                  triop(fop,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0),
                        unop(Iop_I32StoF64,
                             loadLE(Ity_I32, mkexpr(addr)))));
               break;

            do_foprev_m32:
               put_ST_UNCHECKED(0,
                  triop(fop,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        unop(Iop_I32StoF64,
                             loadLE(Ity_I32, mkexpr(addr))),
                        get_ST(0)));
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDA\n");
               goto decode_fail;
         }

      } else {

         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC0;
               DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondB)),
                                    get_ST(0), get_ST(r_src)) );
               break;

            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC8;
               DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondZ)),
                                    get_ST(0), get_ST(r_src)) );
               break;

            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD0;
               DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondBE)),
                                    get_ST(0), get_ST(r_src)) );
               break;

            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD8;
               DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondP)),
                                    get_ST(0), get_ST(r_src)) );
               break;

            case 0xE9: /* FUCOMPP %st(0),%st(1) */
               DIP("fucompp %%st(0),%%st(1)\n");
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               fp_pop();
               break;

            default:
               goto decode_fail;
         }

      }
   }

   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDB) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FILD m32int */
               DIP("fildl %s\n", dis_buf);
               fp_push();
               put_ST(0, unop(Iop_I32StoF64,
                              loadLE(Ity_I32, mkexpr(addr))));
               break;

            case 1: /* FISTTPL m32 (SSE3) */
               DIP("fisttpl %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
               fp_pop();
               break;

            case 2: /* FIST m32 */
               DIP("fistl %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
               break;

            case 3: /* FISTP m32 */
               DIP("fistpl %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
               fp_pop();
               break;

            case 5: { /* FLD extended-real */
               /* Uses dirty helper:
                     ULong amd64g_dirtyhelper_loadF80le ( ULong )
                  addr holds the address.  First, do a dirty call to
                  get hold of the data. */
               IRTemp   val  = newTemp(Ity_I64);
               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );

               IRDirty* d = unsafeIRDirty_1_N (
                               val,
                               0/*regparms*/,
                               "amd64g_dirtyhelper_loadF80le",
                               &amd64g_dirtyhelper_loadF80le,
                               args
                            );
               /* declare that we're reading memory */
               d->mFx   = Ifx_Read;
               d->mAddr = mkexpr(addr);
               d->mSize = 10;

               /* execute the dirty call, dumping the result in val. */
               stmt( IRStmt_Dirty(d) );
               fp_push();
               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));

               DIP("fldt %s\n", dis_buf);
               break;
            }

            case 7: { /* FSTP extended-real */
               /* Uses dirty helper:
                     void amd64g_dirtyhelper_storeF80le ( ULong addr, ULong data )
               */
               IRExpr** args
                  = mkIRExprVec_2( mkexpr(addr),
                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );

               IRDirty* d = unsafeIRDirty_0_N (
                               0/*regparms*/,
                               "amd64g_dirtyhelper_storeF80le",
                               &amd64g_dirtyhelper_storeF80le,
                               args
                            );
               /* declare we're writing memory */
               d->mFx   = Ifx_Write;
               d->mAddr = mkexpr(addr);
               d->mSize = 10;

               /* execute the dirty call. */
               stmt( IRStmt_Dirty(d) );
               fp_pop();

               DIP("fstpt %s\n", dis_buf);
               break;
            }

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDB\n");
               goto decode_fail;
         }

      } else {

         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC0;
               DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(0,
                                IRExpr_Mux0X(
                                    unop(Iop_1Uto8,
                                         mk_amd64g_calculate_condition(AMD64CondNB)),
                                    get_ST(0), get_ST(r_src)) );
               break;

            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
               r_src = (UInt)modrm - 0xC8;
               DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(
                  0,
                  IRExpr_Mux0X(
                     unop(Iop_1Uto8,
                          mk_amd64g_calculate_condition(AMD64CondNZ)),
                     get_ST(0),
                     get_ST(r_src)
                  )
               );
               break;

            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD0;
               DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(
                  0,
                  IRExpr_Mux0X(
                     unop(Iop_1Uto8,
                          mk_amd64g_calculate_condition(AMD64CondNBE)),
                     get_ST(0),
                     get_ST(r_src)
                  )
               );
               break;

            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
               r_src = (UInt)modrm - 0xD8;
               DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
               put_ST_UNCHECKED(
                  0,
                  IRExpr_Mux0X(
                     unop(Iop_1Uto8,
                          mk_amd64g_calculate_condition(AMD64CondNP)),
                     get_ST(0),
                     get_ST(r_src)
                  )
               );
               break;

            case 0xE2: /* FNCLEX */
               DIP("fnclex\n");
               break;

            case 0xE3: { /* FNINIT */
               /* Uses dirty helper:
                     void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
               IRDirty* d  = unsafeIRDirty_0_N (
                                0/*regparms*/,
                                "amd64g_dirtyhelper_FINIT",
                                &amd64g_dirtyhelper_FINIT,
                                mkIRExprVec_0()
                             );
               d->needsBBP = True;

               /* declare we're writing guest state */
               d->nFxState = 5;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Write;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Write;
               d->fxState[1].offset = OFFB_FPREGS;
               d->fxState[1].size   = 8 * sizeof(ULong);

               d->fxState[2].fx     = Ifx_Write;
               d->fxState[2].offset = OFFB_FPTAGS;
               d->fxState[2].size   = 8 * sizeof(UChar);

               d->fxState[3].fx     = Ifx_Write;
               d->fxState[3].offset = OFFB_FPROUND;
               d->fxState[3].size   = sizeof(ULong);

               d->fxState[4].fx     = Ifx_Write;
               d->fxState[4].offset = OFFB_FC3210;
               d->fxState[4].size   = sizeof(ULong);

               stmt( IRStmt_Dirty(d) );

               DIP("fninit\n");
               break;
            }

            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
               break;

            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
               break;

            default:
               goto decode_fail;
         }
      }
   }

   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDC) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FADD double-real */
               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
               break;

            case 1: /* FMUL double-real */
               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
               break;

//..             case 2: /* FCOM double-real */
//..                DIP("fcoml %s\n", dis_buf);
//..                /* This forces C1 to zero, which isn't right. */
//..                put_C3210(
//..                    binop( Iop_And32,
//..                           binop(Iop_Shl32,
//..                                 binop(Iop_CmpF64,
//..                                       get_ST(0),
//..                                       loadLE(Ity_F64,mkexpr(addr))),
//..                                 mkU8(8)),
//..                           mkU32(0x4500)
//..                    ));
//..                break;

            case 3: /* FCOMP double-real */
               DIP("fcompl %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64,
                                      get_ST(0),
                                      loadLE(Ity_F64,mkexpr(addr))),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 4: /* FSUB double-real */
               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
               break;

            case 5: /* FSUBR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
               break;

            case 6: /* FDIV double-real */
               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
               break;

            case 7: /* FDIVR double-real */
               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDC\n");
               goto decode_fail;
         }

      } else {

         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
               break;

            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
               break;

            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
               break;

            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
               break;

            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
               break;

            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
               break;

            default:
               goto decode_fail;
         }

      }
   }
6125
6126   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
6127   else
6128   if (first_opcode == 0xDD) {
6129
6130      if (modrm < 0xC0) {
6131
6132         /* bits 5,4,3 are an opcode extension, and the modRM also
6133            specifies an address. */
6134         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6135         delta += len;
6136
6137         switch (gregLO3ofRM(modrm)) {
6138
6139            case 0: /* FLD double-real */
6140               DIP("fldl %s\n", dis_buf);
6141               fp_push();
6142               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
6143               break;
6144
6145            case 1: /* FISTTPQ m64 (SSE3) */
               DIP("fisttpll %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
               fp_pop();
               break;

            case 2: /* FST double-real */
               DIP("fstl %s\n", dis_buf);
               storeLE(mkexpr(addr), get_ST(0));
               break;

            case 3: /* FSTP double-real */
               DIP("fstpl %s\n", dis_buf);
               storeLE(mkexpr(addr), get_ST(0));
               fp_pop();
               break;

            case 4: { /* FRSTOR m94/m108 */
               IRTemp   ew = newTemp(Ity_I32);
               IRTemp  w64 = newTemp(Ity_I64);
               IRDirty*  d;
               if ( have66(pfx) ) {
                  /* Uses dirty helper:
                     VexEmWarn amd64g_dirtyhelper_FRSTORS
                                  ( VexGuestAMD64State*, HWord ) */
                  d = unsafeIRDirty_0_N (
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FRSTORS",
                         &amd64g_dirtyhelper_FRSTORS,
                         mkIRExprVec_1( mkexpr(addr) )
                      );
                  d->mSize = 94;
               } else {
                  /* Uses dirty helper:
                     VexEmWarn amd64g_dirtyhelper_FRSTOR
                                  ( VexGuestAMD64State*, HWord ) */
                  d = unsafeIRDirty_0_N (
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FRSTOR",
                         &amd64g_dirtyhelper_FRSTOR,
                         mkIRExprVec_1( mkexpr(addr) )
                      );
                  d->mSize = 108;
               }

               d->needsBBP = True;
               d->tmp      = w64;
               /* declare we're reading memory */
               d->mFx   = Ifx_Read;
               d->mAddr = mkexpr(addr);
               /* d->mSize set above */

               /* declare we're writing guest state */
               d->nFxState = 5;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Write;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Write;
               d->fxState[1].offset = OFFB_FPREGS;
               d->fxState[1].size   = 8 * sizeof(ULong);

               d->fxState[2].fx     = Ifx_Write;
               d->fxState[2].offset = OFFB_FPTAGS;
               d->fxState[2].size   = 8 * sizeof(UChar);

               d->fxState[3].fx     = Ifx_Write;
               d->fxState[3].offset = OFFB_FPROUND;
               d->fxState[3].size   = sizeof(ULong);

               d->fxState[4].fx     = Ifx_Write;
               d->fxState[4].offset = OFFB_FC3210;
               d->fxState[4].size   = sizeof(ULong);

               stmt( IRStmt_Dirty(d) );

               /* ew contains any emulation warning we may need to
                  issue.  If needed, side-exit to the next insn,
                  reporting the warning, so that Valgrind's dispatcher
                  sees the warning. */
               assign(ew, unop(Iop_64to32,mkexpr(w64)) );
               put_emwarn( mkexpr(ew) );
               stmt(
                  IRStmt_Exit(
                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                     Ijk_EmWarn,
                     IRConst_U64( guest_RIP_bbstart+delta ),
                     OFFB_RIP
                  )
               );

               if ( have66(pfx) ) {
                  DIP("frstors %s\n", dis_buf);
               } else {
                  DIP("frstor %s\n", dis_buf);
               }
               break;
            }

            case 6: { /* FNSAVE m94/m108 */
               IRDirty *d;
               if ( have66(pfx) ) {
                 /* Uses dirty helper:
                    void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*, HWord ) */
                  d = unsafeIRDirty_0_N (
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FNSAVES",
                         &amd64g_dirtyhelper_FNSAVES,
                         mkIRExprVec_1( mkexpr(addr) )
                         );
                  d->mSize = 94;
               } else {
                 /* Uses dirty helper:
                    void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*, HWord ) */
                  d = unsafeIRDirty_0_N (
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FNSAVE",
                         &amd64g_dirtyhelper_FNSAVE,
                         mkIRExprVec_1( mkexpr(addr) )
                         );
                  d->mSize = 108;
               }
               d->needsBBP = True;
               /* declare we're writing memory */
               d->mFx   = Ifx_Write;
               d->mAddr = mkexpr(addr);
               /* d->mSize set above */

               /* declare we're reading guest state */
               d->nFxState = 5;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Read;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Read;
               d->fxState[1].offset = OFFB_FPREGS;
               d->fxState[1].size   = 8 * sizeof(ULong);

               d->fxState[2].fx     = Ifx_Read;
               d->fxState[2].offset = OFFB_FPTAGS;
               d->fxState[2].size   = 8 * sizeof(UChar);

               d->fxState[3].fx     = Ifx_Read;
               d->fxState[3].offset = OFFB_FPROUND;
               d->fxState[3].size   = sizeof(ULong);

               d->fxState[4].fx     = Ifx_Read;
               d->fxState[4].offset = OFFB_FC3210;
               d->fxState[4].size   = sizeof(ULong);

               stmt( IRStmt_Dirty(d) );

               if ( have66(pfx) ) {
                 DIP("fnsaves %s\n", dis_buf);
               } else {
                 DIP("fnsave %s\n", dis_buf);
               }
               break;
            }

            case 7: { /* FNSTSW m16 */
               IRExpr* sw = get_FPU_sw();
               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
               storeLE( mkexpr(addr), sw );
               DIP("fnstsw %s\n", dis_buf);
               break;
            }

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDD\n");
               goto decode_fail;
         }
      } else {
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FFREE %st(?) */
               r_dst = (UInt)modrm - 0xC0;
               DIP("ffree %%st(%u)\n", r_dst);
               put_ST_TAG ( r_dst, mkU8(0) );
               break;

            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
               r_dst = (UInt)modrm - 0xD0;
               DIP("fst %%st(0),%%st(%u)\n", r_dst);
               /* P4 manual says: "If the destination operand is a
                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
               put_ST_UNCHECKED(r_dst, get_ST(0));
               break;

            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
               r_dst = (UInt)modrm - 0xD8;
               DIP("fstp %%st(0),%%st(%u)\n", r_dst);
               /* P4 manual says: "If the destination operand is a
                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
               put_ST_UNCHECKED(r_dst, get_ST(0));
               fp_pop();
               break;

            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
               r_dst = (UInt)modrm - 0xE0;
               DIP("fucom %%st(0),%%st(%u)\n", r_dst);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               break;

            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
               r_dst = (UInt)modrm - 0xE8;
               DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               break;

            default:
               goto decode_fail;
         }
      }
   }

   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDE) {

      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IROp   fop;
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FIADD m16int */ /* ST(0) += m16int */
               DIP("fiaddw %s\n", dis_buf);
               fop = Iop_AddF64;
               goto do_fop_m16;

            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
               DIP("fimulw %s\n", dis_buf);
               fop = Iop_MulF64;
               goto do_fop_m16;

            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
               DIP("fisubw %s\n", dis_buf);
               fop = Iop_SubF64;
               goto do_fop_m16;

            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
               DIP("fisubrw %s\n", dis_buf);
               fop = Iop_SubF64;
               goto do_foprev_m16;

            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
               DIP("fidivw %s\n", dis_buf);
               fop = Iop_DivF64;
               goto do_fop_m16;

            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
               DIP("fidivrw %s\n", dis_buf);
               fop = Iop_DivF64;
               goto do_foprev_m16;

            do_fop_m16:
               put_ST_UNCHECKED(0,
                  triop(fop,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0),
                        unop(Iop_I32StoF64,
                             unop(Iop_16Sto32,
                                  loadLE(Ity_I16, mkexpr(addr))))));
               break;

            do_foprev_m16:
               put_ST_UNCHECKED(0,
                  triop(fop,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        unop(Iop_I32StoF64,
                             unop(Iop_16Sto32,
                                  loadLE(Ity_I16, mkexpr(addr)))),
                        get_ST(0)));
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDE\n");
               goto decode_fail;
         }

      } else {

         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
               break;

            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
               break;

            case 0xD9: /* FCOMPP %st(0),%st(1) */
               DIP("fcompp %%st(0),%%st(1)\n");
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               fp_pop();
               break;

            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
               break;

            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
               break;

            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
               break;

            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
               break;

            default:
               goto decode_fail;
         }

      }
   }

   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xDF) {

      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FILD m16int */
               DIP("fildw %s\n", dis_buf);
               fp_push();
               put_ST(0, unop(Iop_I32StoF64,
                              unop(Iop_16Sto32,
                                   loadLE(Ity_I16, mkexpr(addr)))));
               break;

            case 1: /* FISTTPS m16 (SSE3) */
               DIP("fisttps %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        x87ishly_qnarrow_32_to_16(
                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
               fp_pop();
               break;

            case 2: /* FIST m16 */
               DIP("fists %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        x87ishly_qnarrow_32_to_16(
                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
               break;

            case 3: /* FISTP m16 */
               DIP("fistps %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        x87ishly_qnarrow_32_to_16(
                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
               fp_pop();
               break;

            case 5: /* FILD m64 */
               DIP("fildll %s\n", dis_buf);
               fp_push();
               put_ST(0, binop(Iop_I64StoF64,
                               get_roundingmode(),
                               loadLE(Ity_I64, mkexpr(addr))));
               break;

            case 7: /* FISTP m64 */
               DIP("fistpll %s\n", dis_buf);
               storeLE( mkexpr(addr),
                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
               fp_pop();
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xDF\n");
               goto decode_fail;
         }

      } else {

         delta++;
         switch (modrm) {

            case 0xC0: /* FFREEP %st(0) */
               DIP("ffreep %%st(%d)\n", 0);
               put_ST_TAG ( 0, mkU8(0) );
               fp_pop();
               break;

            case 0xE0: /* FNSTSW %ax */
               DIP("fnstsw %%ax\n");
               /* Invent a plausible-looking FPU status word value and
                  dump it in %AX:
                     ((ftop & 7) << 11) | (c3210 & 0x4700)
               */
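               /* Illustrative example: if FTOP is 5 and C3210 holds
                  0x4500, the value dumped in %AX is
                  ((5 & 7) << 11) | (0x4500 & 0x4700)
                  = 0x2800 | 0x4500 = 0x6D00. */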
               putIRegRAX(
                  2,
                  unop(Iop_32to16,
                       binop(Iop_Or32,
                             binop(Iop_Shl32,
                                   binop(Iop_And32, get_ftop(), mkU32(7)),
                                   mkU8(11)),
                             binop(Iop_And32,
                                   unop(Iop_64to32, get_C3210()),
                                   mkU32(0x4700))
               )));
               break;

            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
               break;

            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
               /* not really right since COMIP != UCOMIP */
               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
               break;

            default:
               goto decode_fail;
         }
      }

   }

   else
      goto decode_fail;

   *decode_ok = True;
   return delta;

  decode_fail:
   *decode_ok = False;
   return delta;
}


/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- MMX INSTRUCTIONS                                     ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

/* Effect of MMX insns on x87 FPU state (table 11-2 of
   IA32 arch manual, volume 3):

   Read from, or write to MMX register (viz, any insn except EMMS):
   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
   * FP stack pointer set to zero

   EMMS:
   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
   * FP stack pointer set to zero
*/
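
/* Illustratively: after any MMX data op, e.g. "paddw %mm1,%mm0", FTOP
   becomes 0 and all eight FPTAGS entries become nonzero, so the x87
   stack appears full; after "emms", FTOP is 0 and all FPTAGS entries
   are zero, so it appears empty.  do_MMX_preamble and do_EMMS_preamble
   below generate IR for exactly these two state changes. */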

static void do_MMX_preamble ( void )
{
   Int         i;
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   IRExpr*     zero  = mkU32(0);
   IRExpr*     tag1  = mkU8(1);
   put_ftop(zero);
   for (i = 0; i < 8; i++)
      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
}

static void do_EMMS_preamble ( void )
{
   Int         i;
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   IRExpr*     zero  = mkU32(0);
   IRExpr*     tag0  = mkU8(0);
   put_ftop(zero);
   for (i = 0; i < 8; i++)
      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
}


static IRExpr* getMMXReg ( UInt archreg )
{
   vassert(archreg < 8);
   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
}


static void putMMXReg ( UInt archreg, IRExpr* e )
{
   vassert(archreg < 8);
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
}


/* Helper for non-shift MMX insns.  Note this is incomplete in the
   sense that it does not first call do_MMX_preamble() -- that is the
   responsibility of its caller. */

static
ULong dis_MMXop_regmem_to_reg ( VexAbiInfo* vbi,
                                Prefix      pfx,
                                Long        delta,
                                UChar       opc,
                                HChar*      name,
                                Bool        show_granularity )
{
   HChar   dis_buf[50];
   UChar   modrm = getUChar(delta);
   Bool    isReg = epartIsReg(modrm);
   IRExpr* argL  = NULL;
   IRExpr* argR  = NULL;
   IRExpr* argG  = NULL;
   IRExpr* argE  = NULL;
   IRTemp  res   = newTemp(Ity_I64);

   Bool    invG  = False;
   IROp    op    = Iop_INVALID;
   void*   hAddr = NULL;
   HChar*  hName = NULL;
   Bool    eLeft = False;

#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)

   switch (opc) {
      /* Original MMX ones */
      case 0xFC: op = Iop_Add8x8; break;
      case 0xFD: op = Iop_Add16x4; break;
      case 0xFE: op = Iop_Add32x2; break;

      case 0xEC: op = Iop_QAdd8Sx8; break;
      case 0xED: op = Iop_QAdd16Sx4; break;

      case 0xDC: op = Iop_QAdd8Ux8; break;
      case 0xDD: op = Iop_QAdd16Ux4; break;

      case 0xF8: op = Iop_Sub8x8;  break;
      case 0xF9: op = Iop_Sub16x4; break;
      case 0xFA: op = Iop_Sub32x2; break;

      case 0xE8: op = Iop_QSub8Sx8; break;
      case 0xE9: op = Iop_QSub16Sx4; break;

      case 0xD8: op = Iop_QSub8Ux8; break;
      case 0xD9: op = Iop_QSub16Ux4; break;

      case 0xE5: op = Iop_MulHi16Sx4; break;
      case 0xD5: op = Iop_Mul16x4; break;
      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;

      case 0x74: op = Iop_CmpEQ8x8; break;
      case 0x75: op = Iop_CmpEQ16x4; break;
      case 0x76: op = Iop_CmpEQ32x2; break;

      case 0x64: op = Iop_CmpGT8Sx8; break;
      case 0x65: op = Iop_CmpGT16Sx4; break;
      case 0x66: op = Iop_CmpGT32Sx2; break;

      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;

      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;

      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;

      case 0xDB: op = Iop_And64; break;
      case 0xDF: op = Iop_And64; invG = True; break;
      case 0xEB: op = Iop_Or64; break;
      case 0xEF: /* Possibly do better here if argL and argR are the
                    same reg */
                 op = Iop_Xor64; break;

      /* Introduced in SSE1 */
      case 0xE0: op = Iop_Avg8Ux8;    break;
      case 0xE3: op = Iop_Avg16Ux4;   break;
      case 0xEE: op = Iop_Max16Sx4;   break;
      case 0xDE: op = Iop_Max8Ux8;    break;
      case 0xEA: op = Iop_Min16Sx4;   break;
      case 0xDA: op = Iop_Min8Ux8;    break;
      case 0xE4: op = Iop_MulHi16Ux4; break;
      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;

      /* Introduced in SSE2 */
      case 0xD4: op = Iop_Add64; break;
      case 0xFB: op = Iop_Sub64; break;

      default:
         vex_printf("\n0x%x\n", (Int)opc);
         vpanic("dis_MMXop_regmem_to_reg");
   }

#  undef XXX

   argG = getMMXReg(gregLO3ofRM(modrm));
   if (invG)
      argG = unop(Iop_Not64, argG);

   if (isReg) {
      delta++;
      argE = getMMXReg(eregLO3ofRM(modrm));
   } else {
      Int    len;
      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
      delta += len;
      argE = loadLE(Ity_I64, mkexpr(addr));
   }

   if (eLeft) {
      argL = argE;
      argR = argG;
   } else {
      argL = argG;
      argR = argE;
   }

   if (op != Iop_INVALID) {
      vassert(hName == NULL);
      vassert(hAddr == NULL);
      assign(res, binop(op, argL, argR));
   } else {
      vassert(hName != NULL);
      vassert(hAddr != NULL);
      assign( res,
              mkIRExprCCall(
                 Ity_I64,
                 0/*regparms*/, hName, hAddr,
                 mkIRExprVec_2( argL, argR )
              )
            );
   }

   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );

   DIP("%s%s %s, %s\n",
       name, show_granularity ? nameMMXGran(opc & 3) : "",
       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
       nameMMXReg(gregLO3ofRM(modrm)) );

   return delta;
}


/* Vector by scalar shift of G by the amount specified at the bottom
   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
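/* For example, "psllw %mm1,%mm0" shifts each 16-bit lane of %mm0 left
   by the count held in %mm1 (viewed as a 64-bit value); per the IR
   built below, a count >= the lane width gives zero for the logical
   shifts, while the arithmetic right shifts behave as a shift by
   lanewidth-1, replicating each lane's sign bit. */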

static ULong dis_MMX_shiftG_byE ( VexAbiInfo* vbi,
                                  Prefix pfx, Long delta,
                                  HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen, size;
   IRTemp  addr;
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  g0   = newTemp(Ity_I64);
   IRTemp  g1   = newTemp(Ity_I64);
   IRTemp  amt  = newTemp(Ity_I64);
   IRTemp  amt8 = newTemp(Ity_I8);

   if (epartIsReg(rm)) {
      assign( amt, getMMXReg(eregLO3ofRM(rm)) );
      DIP("%s %s,%s\n", opname,
                        nameMMXReg(eregLO3ofRM(rm)),
                        nameMMXReg(gregLO3ofRM(rm)) );
      delta++;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameMMXReg(gregLO3ofRM(rm)) );
      delta += alen;
   }
   assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );

   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x4: shl = True; size = 16; break;
      case Iop_ShlN32x2: shl = True; size = 32; break;
      case Iop_Shl64:    shl = True; size = 64; break;
      case Iop_ShrN16x4: shr = True; size = 16; break;
      case Iop_ShrN32x2: shr = True; size = 32; break;
      case Iop_Shr64:    shr = True; size = 64; break;
      case Iop_SarN16x4: sar = True; size = 16; break;
      case Iop_SarN32x2: sar = True; size = 32; break;
      default: vassert(0);
   }

   if (shl || shr) {
     assign(
        g1,
        IRExpr_Mux0X(
           unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
           mkU64(0),
           binop(op, mkexpr(g0), mkexpr(amt8))
        )
     );
   } else
   if (sar) {
     assign(
        g1,
        IRExpr_Mux0X(
           unop(Iop_1Uto8,binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size))),
           binop(op, mkexpr(g0), mkU8(size-1)),
           binop(op, mkexpr(g0), mkexpr(amt8))
        )
     );
   } else {
      vassert(0);
   }

   putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
   return delta;
}


/* Vector by scalar shift of E by an immediate byte.  This is a
   straight copy of dis_SSE_shiftE_imm. */
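/* For example, "psrlw $20, %mm5": since 20 >= 16, every 16-bit lane
   becomes zero, whereas "psraw $20, %mm5" behaves as a shift by 15,
   filling each lane with copies of its sign bit. */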

static
ULong dis_MMX_shiftE_imm ( Long delta, HChar* opname, IROp op )
{
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  e0   = newTemp(Ity_I64);
   IRTemp  e1   = newTemp(Ity_I64);
   UChar   amt, size;
   vassert(epartIsReg(rm));
   vassert(gregLO3ofRM(rm) == 2
           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   amt = getUChar(delta+1);
   delta += 2;
   DIP("%s $%d,%s\n", opname,
                      (Int)amt,
                      nameMMXReg(eregLO3ofRM(rm)) );

   assign( e0, getMMXReg(eregLO3ofRM(rm)) );

   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x4: shl = True; size = 16; break;
      case Iop_ShlN32x2: shl = True; size = 32; break;
      case Iop_Shl64:    shl = True; size = 64; break;
      case Iop_SarN16x4: sar = True; size = 16; break;
      case Iop_SarN32x2: sar = True; size = 32; break;
      case Iop_ShrN16x4: shr = True; size = 16; break;
      case Iop_ShrN32x2: shr = True; size = 32; break;
      case Iop_Shr64:    shr = True; size = 64; break;
      default: vassert(0);
   }

   if (shl || shr) {
     assign( e1, amt >= size
                    ? mkU64(0)
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else
   if (sar) {
     assign( e1, amt >= size
                    ? binop(op, mkexpr(e0), mkU8(size-1))
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else {
      vassert(0);
   }

   putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
   return delta;
}


/* Completely handle all MMX instructions except emms. */

static
ULong dis_MMX ( Bool* decode_ok,
                VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
{
   Int   len;
   UChar modrm;
   HChar dis_buf[50];
   UChar opc = getUChar(delta);
   delta++;

   /* dis_MMX handles all insns except emms. */
   do_MMX_preamble();

   switch (opc) {

      case 0x6E:
         if (sz == 4) {
            /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putMMXReg(
                  gregLO3ofRM(modrm),
                  binop( Iop_32HLto64,
                         mkU32(0),
                         getIReg32(eregOfRexRM(pfx,modrm)) ) );
               DIP("movd %s, %s\n",
                   nameIReg32(eregOfRexRM(pfx,modrm)),
                   nameMMXReg(gregLO3ofRM(modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               putMMXReg(
                  gregLO3ofRM(modrm),
                  binop( Iop_32HLto64,
                         mkU32(0),
                         loadLE(Ity_I32, mkexpr(addr)) ) );
               DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
            }
         }
         else
         if (sz == 8) {
            /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putMMXReg( gregLO3ofRM(modrm),
                          getIReg64(eregOfRexRM(pfx,modrm)) );
               DIP("movd %s, %s\n",
                   nameIReg64(eregOfRexRM(pfx,modrm)),
                   nameMMXReg(gregLO3ofRM(modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               putMMXReg( gregLO3ofRM(modrm),
                          loadLE(Ity_I64, mkexpr(addr)) );
               DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
            }
         }
         else {
            goto mmx_decode_failure;
         }
         break;

      case 0x7E:
         if (sz == 4) {
            /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putIReg32( eregOfRexRM(pfx,modrm),
                          unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
               DIP("movd %s, %s\n",
                   nameMMXReg(gregLO3ofRM(modrm)),
                   nameIReg32(eregOfRexRM(pfx,modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               storeLE( mkexpr(addr),
                        unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
               DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
            }
         }
         else
         if (sz == 8) {
            /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
            modrm = getUChar(delta);
            if (epartIsReg(modrm)) {
               delta++;
               putIReg64( eregOfRexRM(pfx,modrm),
                          getMMXReg(gregLO3ofRM(modrm)) );
               DIP("movd %s, %s\n",
                   nameMMXReg(gregLO3ofRM(modrm)),
                   nameIReg64(eregOfRexRM(pfx,modrm)));
            } else {
               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
               delta += len;
               storeLE( mkexpr(addr),
                        getMMXReg(gregLO3ofRM(modrm)) );
               DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
            }
         } else {
            goto mmx_decode_failure;
         }
         break;

      case 0x6F:
         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
            DIP("movq %s, %s\n",
                nameMMXReg(eregLO3ofRM(modrm)),
                nameMMXReg(gregLO3ofRM(modrm)));
         } else {
            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
            delta += len;
            putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
            DIP("movq %s, %s\n",
                dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
         }
         break;

      case 0x7F:
         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta++;
            putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("movq %s, %s\n",
                nameMMXReg(gregLO3ofRM(modrm)),
                nameMMXReg(eregLO3ofRM(modrm)));
         } else {
            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
            delta += len;
            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
            DIP("mov(nt)q %s, %s\n",
                nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
         }
         break;

      case 0xFC:
      case 0xFD:
      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
         break;

      case 0xEC:
      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
         break;

      case 0xDC:
      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
         break;

      case 0xF8:
      case 0xF9:
      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
         break;

      case 0xE8:
      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
         break;

      case 0xD8:
      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
         break;

      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
         break;

      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
         break;

      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
         break;

      case 0x74:
      case 0x75:
      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
         break;

      case 0x64:
      case 0x65:
      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
         break;

      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
         break;

      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
         break;

      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
         break;

      case 0x68:
      case 0x69:
      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
         break;

      case 0x60:
      case 0x61:
      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4
             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
         break;

      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
         break;

      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
         break;

      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
         break;

      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
         break;

#     define SHIFT_BY_REG(_name,_op)                                     \
                delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
                break;

      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);

      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);

      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);

#     undef SHIFT_BY_REG

      case 0x71:
      case 0x72:
      case 0x73: {
         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
         UChar byte2, subopc;
         if (sz != 4)
            goto mmx_decode_failure;
         byte2  = getUChar(delta);      /* amode / sub-opcode */
         subopc = toUChar( (byte2 >> 3) & 7 );

#        define SHIFT_BY_IMM(_name,_op)                        \
            do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
            } while (0)

              if (subopc == 2 /*SRL*/ && opc == 0x71)
                 SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
         else if (subopc == 2 /*SRL*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
         else if (subopc == 2 /*SRL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psrlq", Iop_Shr64);

         else if (subopc == 4 /*SAR*/ && opc == 0x71)
                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
         else if (subopc == 4 /*SAR*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);

         else if (subopc == 6 /*SHL*/ && opc == 0x71)
                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
         else if (subopc == 6 /*SHL*/ && opc == 0x72)
                 SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
         else if (subopc == 6 /*SHL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psllq", Iop_Shl64);

         else goto mmx_decode_failure;

#        undef SHIFT_BY_IMM
         break;
      }

      case 0xF7: {
         IRTemp addr    = newTemp(Ity_I64);
         IRTemp regD    = newTemp(Ity_I64);
         IRTemp regM    = newTemp(Ity_I64);
         IRTemp mask    = newTemp(Ity_I64);
         IRTemp olddata = newTemp(Ity_I64);
         IRTemp newdata = newTemp(Ity_I64);

         modrm = getUChar(delta);
         if (sz != 4 || (!epartIsReg(modrm)))
            goto mmx_decode_failure;
         delta++;

         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
         assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
         assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
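         /* Each byte of mask is 0xFF if the corresponding regM byte
            has its top bit set, else 0x00; e.g. a regM byte of 0x80
            selects the regD byte for storing, while 0x7F keeps the
            old memory byte. */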
         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
         assign( newdata,
                 binop(Iop_Or64,
                       binop(Iop_And64,
                             mkexpr(regD),
                             mkexpr(mask) ),
                       binop(Iop_And64,
                             mkexpr(olddata),
                             unop(Iop_Not64, mkexpr(mask)))) );
         storeLE( mkexpr(addr), mkexpr(newdata) );
         DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
                                 nameMMXReg( gregLO3ofRM(modrm) ) );
         break;
      }

      /* --- MMX decode failure --- */
      default:
      mmx_decode_failure:
         *decode_ok = False;
         return delta; /* ignored */

   }

   *decode_ok = True;
   return delta;
}


/*------------------------------------------------------------*/
/*--- More misc arithmetic and other obscure insns.        ---*/
/*------------------------------------------------------------*/

/* Generate base << amt with vacated places filled with stuff
   from xtra.  amt guaranteed in 0 .. 63. */
static
IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
{
   /* if   amt == 0
      then base
      else (base << amt) | (xtra >>u (64-amt))
   */
   return
      IRExpr_Mux0X(
         mkexpr(amt),
         mkexpr(base),
         binop(Iop_Or64,
               binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
               binop(Iop_Shr64, mkexpr(xtra),
                                binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
         )
      );
}
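
/* A worked example, illustrative only: with base = 0xAAAAAAAAAAAAAAAA,
   xtra = 0xFFFFFFFFFFFFFFFF and amt = 4, the result is
   (base << 4) | (xtra >>u 60) = 0xAAAAAAAAAAAAAAA0 | 0xF
   = 0xAAAAAAAAAAAAAAAF.  The Mux0X special-cases amt == 0 to return
   base unchanged, which also avoids the out-of-range shift
   xtra >>u 64. */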

/* Generate base >>u amt with vacated places filled with stuff
   from xtra.  amt guaranteed in 0 .. 63. */
static
IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
{
   /* if   amt == 0
      then base
      else (base >>u amt) | (xtra << (64-amt))
   */
   return
      IRExpr_Mux0X(
         mkexpr(amt),
         mkexpr(base),
         binop(Iop_Or64,
               binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
               binop(Iop_Shl64, mkexpr(xtra),
                                binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
         )
      );
}
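
/* Again illustrative only: with base = 0x1122334455667788,
   xtra = 0x99AABBCCDDEEFF11 and amt = 8, the result is
   (base >>u 8) | (xtra << 56)
   = 0x0011223344556677 | 0x1100000000000000 = 0x1111223344556677. */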

/* Double length left and right shifts.  Apparently only required in
   v-size (no b- variant). */
static
ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi,
                        Prefix pfx,
                        Long delta, UChar modrm,
                        Int sz,
                        IRExpr* shift_amt,
                        Bool amt_is_literal,
                        HChar* shift_amt_txt,
                        Bool left_shift )
{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.  And the guest RIP on entry points at the modrm
      byte. */
   Int len;
   HChar dis_buf[50];

   IRType ty     = szToITy(sz);
   IRTemp gsrc   = newTemp(ty);
   IRTemp esrc   = newTemp(ty);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp tmpSH  = newTemp(Ity_I8);
   IRTemp tmpSS  = newTemp(Ity_I8);
   IRTemp tmp64  = IRTemp_INVALID;
   IRTemp res64  = IRTemp_INVALID;
   IRTemp rss64  = IRTemp_INVALID;
   IRTemp resTy  = IRTemp_INVALID;
   IRTemp rssTy  = IRTemp_INVALID;
   Int    mask   = sz==8 ? 63 : 31;

   vassert(sz == 2 || sz == 4 || sz == 8);

   /* The E-part is the destination; this is shifted.  The G-part
      supplies bits to be shifted into the E-part, but is not
      changed.

      If shifting left, form a double-length word with E at the top
      and G at the bottom, and shift this left.  The result is then in
      the high part.

      If shifting right, form a double-length word with G at the top
      and E at the bottom, and shift this right.  The result is then
      at the bottom.  */
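
   /* Illustrative example: for a 32-bit SHLD with esrc = 0xDEADBEEF,
      gsrc = 0xCAFEBABE and shift amount 8, the double-length word is
      0xDEADBEEFCAFEBABE; shifted left by 8 it becomes
      0xADBEEFCAFEBABE00, whose high 32 bits, 0xADBEEFCA, are the
      result. */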

   /* Fetch the operands. */

   assign( gsrc, getIRegG(sz, pfx, modrm) );

   if (epartIsReg(modrm)) {
      delta++;
      assign( esrc, getIRegE(sz, pfx, modrm) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* # bytes following amode */
                        amt_is_literal ? 1 : 0 );
      delta += len;
      assign( esrc, loadLE(ty, mkexpr(addr)) );
      DIP("sh%cd%c %s, %s, %s\n",
          ( left_shift ? 'l' : 'r' ), nameISize(sz),
          shift_amt_txt,
          nameIRegG(sz, pfx, modrm), dis_buf);
   }

   /* Calculate the masked shift amount (tmpSH), the masked subshift
      amount (tmpSS), the shifted value (res64) and the subshifted
      value (rss64). */

   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
   assign( tmpSS, binop(Iop_And8,
                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
                        mkU8(mask)));

   tmp64 = newTemp(Ity_I64);
   res64 = newTemp(Ity_I64);
   rss64 = newTemp(Ity_I64);

   if (sz == 2 || sz == 4) {

      /* G is xtra; E is data */
      /* what a freaking nightmare: */
      if (sz == 4 && left_shift) {
         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(32)) );
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
                       mkU8(32)) );
      }
      else
      if (sz == 4 && !left_shift) {
         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
      }
      else
      if (sz == 2 && left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
         ));
         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
         assign( res64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
                       mkU8(48)) );
         /* subshift formed by shifting [esrc'0000'0000'0000] */
         assign( rss64,
                 binop(Iop_Shr64,
                       binop(Iop_Shl64,
                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
                                              mkU8(48)),
                             mkexpr(tmpSS)),
                       mkU8(48)) );
      }
      else
      if (sz == 2 && !left_shift) {
         assign( tmp64,
                 binop(Iop_32HLto64,
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
         ));
         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
         /* subshift formed by shifting [0000'0000'0000'esrc] */
         assign( rss64, binop(Iop_Shr64,
                              unop(Iop_16Uto64, mkexpr(esrc)),
                              mkexpr(tmpSS)) );
      }

   } else {

      vassert(sz == 8);
      if (left_shift) {
         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
      } else {
         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
      }

   }

   resTy = newTemp(ty);
   rssTy = newTemp(ty);
   assign( resTy, narrowTo(ty, mkexpr(res64)) );
   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );

   /* Put result back and write the flags thunk. */
   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
                              resTy, rssTy, ty, tmpSH );

   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(resTy));
   } else {
      storeLE( mkexpr(addr), mkexpr(resTy) );
   }

   if (amt_is_literal) delta++;
   return delta;
}


/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
   required. */

typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;

static HChar* nameBtOp ( BtOp op )
{
   switch (op) {
      case BtOpNone:  return "";
      case BtOpSet:   return "s";
      case BtOpReset: return "r";
      case BtOpComp:  return "c";
      default: vpanic("nameBtOp(amd64)");
   }
}


static
ULong dis_bt_G_E ( VexAbiInfo* vbi,
                   Prefix pfx, Int sz, Long delta, BtOp op )
{
   HChar  dis_buf[50];
   UChar  modrm;
   Int    len;
   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
     t_addr1, t_rsp, t_mask, t_new;

   vassert(sz == 2 || sz == 4 || sz == 8);

   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
             = t_addr0 = t_addr1 = t_rsp
             = t_mask = t_new = IRTemp_INVALID;

   t_fetched = newTemp(Ity_I8);
   t_new     = newTemp(Ity_I8);
   t_bitno0  = newTemp(Ity_I64);
   t_bitno1  = newTemp(Ity_I64);
   t_bitno2  = newTemp(Ity_I8);
   t_addr1   = newTemp(Ity_I64);
   modrm     = getUChar(delta);

   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );

   if (epartIsReg(modrm)) {
      delta++;
      /* Get it onto the client's stack.  Oh, this is a horrible
         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
         Because of the ELF ABI stack redzone, there may be live data
         up to 128 bytes below %RSP.  So we can't just push it on the
         stack, else we may wind up trashing live data, and causing
         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP by at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
7632         fast-case painting code.  Ideally we should drop more than
7633         128, to reduce the chances of breaking buggy programs that
7634         have live data below -128(%RSP).  Memcheck fast-cases moves
7635         of 288 bytes due to the need to handle ppc64-linux quickly,
7636         so let's use 288.  Of course the real fix is to get rid of
7637         this kludge entirely.  */
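      /* Illustratively, after the drop of 288 (addresses decrease
         downwards):

            old_RSP-288                sz-byte scratch slot for the reg
            old_RSP-128 .. old_RSP-1   ELF ABI redzone (may be live)

         so the scratch slot sits safely clear of the redzone. */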
7638      t_rsp = newTemp(Ity_I64);
7639      t_addr0 = newTemp(Ity_I64);
7640
7641      vassert(vbi->guest_stack_redzone_size == 128);
7642      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
7643      putIReg64(R_RSP, mkexpr(t_rsp));
7644
7645      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );
7646
7647      /* Make t_addr0 point at it. */
7648      assign( t_addr0, mkexpr(t_rsp) );
7649
      /* Mask out upper bits of the bit number, since for the
         register form it is taken modulo the operand width. */
7652      assign( t_bitno1, binop(Iop_And64,
7653                              mkexpr(t_bitno0),
7654                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );
7655
7656   } else {
7657      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
7658      delta += len;
7659      assign( t_bitno1, mkexpr(t_bitno0) );
7660   }
7661
7662   /* At this point: t_addr0 is the address being operated on.  If it
7663      was a reg, we will have pushed it onto the client's stack.
7664      t_bitno1 is the bit number, suitably masked in the case of a
7665      reg.  */
7666
7667   /* Now the main sequence. */
7668   assign( t_addr1,
7669           binop(Iop_Add64,
7670                 mkexpr(t_addr0),
7671                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );
7672
7673   /* t_addr1 now holds effective address */
7674
7675   assign( t_bitno2,
7676           unop(Iop_64to8,
7677                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );
7678
7679   /* t_bitno2 contains offset of bit within byte */
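   /* Worked example: if t_bitno1 is 35, then t_addr1 becomes
      t_addr0 + (35 >>s 3) = t_addr0 + 4 and t_bitno2 becomes
      35 & 7 = 3, i.e. bit 3 of the byte 4 bytes along. */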
7680
7681   if (op != BtOpNone) {
7682      t_mask = newTemp(Ity_I8);
7683      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
7684   }
7685
7686   /* t_mask is now a suitable byte mask */
7687
7688   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
7689
7690   if (op != BtOpNone) {
7691      switch (op) {
7692         case BtOpSet:
7693            assign( t_new,
7694                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
7695            break;
7696         case BtOpComp:
7697            assign( t_new,
7698                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
7699            break;
7700         case BtOpReset:
7701            assign( t_new,
7702                    binop(Iop_And8, mkexpr(t_fetched),
7703                                    unop(Iop_Not8, mkexpr(t_mask))) );
7704            break;
7705         default:
7706            vpanic("dis_bt_G_E(amd64)");
7707      }
7708      if ((pfx & PFX_LOCK) && !epartIsReg(modrm)) {
7709         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
7710                                 mkexpr(t_new)/*new*/,
7711                                 guest_RIP_curr_instr );
7712      } else {
7713         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
7714      }
7715   }
7716
7717   /* Side effect done; now get selected bit into Carry flag */
7718   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
7719   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
7720   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
7721   stmt( IRStmt_Put(
7722            OFFB_CC_DEP1,
7723            binop(Iop_And64,
7724                  binop(Iop_Shr64,
7725                        unop(Iop_8Uto64, mkexpr(t_fetched)),
7726                        mkexpr(t_bitno2)),
7727                  mkU64(1)))
7728       );
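   /* For example, if the fetched byte is 0xB2 (10110010b) and
      t_bitno2 is 4, DEP1 becomes (0xB2 >> 4) & 1 = 1, and the COPY
      thunk presents that bit directly as the carry flag. */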
7729   /* Set NDEP even though it isn't used.  This makes redundant-PUT
7730      elimination of previous stores to this field work better. */
7731   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
7732
7733   /* Move reg operand from stack back to reg */
7734   if (epartIsReg(modrm)) {
7735      /* t_rsp still points at it. */
7736      /* only write the reg if actually modifying it; doing otherwise
7737         zeroes the top half erroneously when doing btl due to
7738         standard zero-extend rule */
7739      if (op != BtOpNone)
7740         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
7741      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
7742   }
7743
7744   DIP("bt%s%c %s, %s\n",
7745       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
7746       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );
7747
7748   return delta;
7749}
7750
7751
7752
7753/* Handle BSF/BSR.  Only v-size seems necessary. */
7754static
7755ULong dis_bs_E_G ( VexAbiInfo* vbi,
7756                   Prefix pfx, Int sz, Long delta, Bool fwds )
7757{
7758   Bool   isReg;
7759   UChar  modrm;
7760   HChar  dis_buf[50];
7761
7762   IRType ty    = szToITy(sz);
7763   IRTemp src   = newTemp(ty);
7764   IRTemp dst   = newTemp(ty);
7765   IRTemp src64 = newTemp(Ity_I64);
7766   IRTemp dst64 = newTemp(Ity_I64);
7767   IRTemp src8  = newTemp(Ity_I8);
7768
7769   vassert(sz == 8 || sz == 4 || sz == 2);
7770
7771   modrm = getUChar(delta);
7772   isReg = epartIsReg(modrm);
7773   if (isReg) {
7774      delta++;
7775      assign( src, getIRegE(sz, pfx, modrm) );
7776   } else {
7777      Int    len;
7778      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7779      delta += len;
7780      assign( src, loadLE(ty, mkexpr(addr)) );
7781   }
7782
7783   DIP("bs%c%c %s, %s\n",
7784       fwds ? 'f' : 'r', nameISize(sz),
7785       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
7786       nameIRegG(sz, pfx, modrm));
7787
7788   /* First, widen src to 64 bits if it is not already. */
7789   assign( src64, widenUto64(mkexpr(src)) );
7790
7791   /* Generate an 8-bit expression which is zero iff the
7792      original is zero, and nonzero otherwise */
7793   assign( src8,
7794           unop(Iop_1Uto8,
7795                binop(Iop_CmpNE64,
7796                      mkexpr(src64), mkU64(0))) );
7797
7798   /* Flags: Z is 1 iff source value is zero.  All others
7799      are undefined -- we force them to zero. */
7800   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
7801   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
7802   stmt( IRStmt_Put(
7803            OFFB_CC_DEP1,
7804            IRExpr_Mux0X( mkexpr(src8),
7805                          /* src==0 */
7806                          mkU64(AMD64G_CC_MASK_Z),
7807                          /* src!=0 */
7808                          mkU64(0)
7809                        )
7810       ));
7811   /* Set NDEP even though it isn't used.  This makes redundant-PUT
7812      elimination of previous stores to this field work better. */
7813   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
7814
   /* Result: if the source value is zero, we can't use
7816      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
7817      But anyway, amd64 semantics say the result is undefined in
7818      such situations.  Hence handle the zero case specially. */
7819
7820   /* Bleh.  What we compute:
7821
7822          bsf64:  if src == 0 then {dst is unchanged}
7823                              else Ctz64(src)
7824
7825          bsr64:  if src == 0 then {dst is unchanged}
7826                              else 63 - Clz64(src)
7827
7828          bsf32:  if src == 0 then {dst is unchanged}
7829                              else Ctz64(32Uto64(src))
7830
7831          bsr32:  if src == 0 then {dst is unchanged}
7832                              else 63 - Clz64(32Uto64(src))
7833
7834          bsf16:  if src == 0 then {dst is unchanged}
7835                              else Ctz64(32Uto64(16Uto32(src)))
7836
7837          bsr16:  if src == 0 then {dst is unchanged}
7838                              else 63 - Clz64(32Uto64(16Uto32(src)))
7839   */
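   /* Worked example: for src = 0xF0, bsf gives Ctz64(0xF0) = 4 and
      bsr gives 63 - Clz64(0xF0) = 63 - 56 = 7, matching the indices
      of the lowest and highest set bits. */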
7840
7841   /* The main computation, guarding against zero. */
7842   assign( dst64,
7843           IRExpr_Mux0X(
7844              mkexpr(src8),
7845              /* src == 0 -- leave dst unchanged */
7846              widenUto64( getIRegG( sz, pfx, modrm ) ),
7847              /* src != 0 */
7848              fwds ? unop(Iop_Ctz64, mkexpr(src64))
7849                   : binop(Iop_Sub64,
7850                           mkU64(63),
7851                           unop(Iop_Clz64, mkexpr(src64)))
7852           )
7853         );
7854
7855   if (sz == 2)
7856      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
7857   else
7858   if (sz == 4)
7859      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
7860   else
7861      assign( dst, mkexpr(dst64) );
7862
7863   /* dump result back */
7864   putIRegG( sz, pfx, modrm, mkexpr(dst) );
7865
7866   return delta;
7867}
7868
7869
7870/* swap rAX with the reg specified by reg and REX.B */
7871static
7872void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
7873{
7874   IRType ty = szToITy(sz);
7875   IRTemp t1 = newTemp(ty);
7876   IRTemp t2 = newTemp(ty);
7877   vassert(sz == 2 || sz == 4 || sz == 8);
7878   vassert(regLo3 < 8);
7879   if (sz == 8) {
7880      assign( t1, getIReg64(R_RAX) );
7881      assign( t2, getIRegRexB(8, pfx, regLo3) );
7882      putIReg64( R_RAX, mkexpr(t2) );
7883      putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
7884   } else if (sz == 4) {
7885      assign( t1, getIReg32(R_RAX) );
7886      assign( t2, getIRegRexB(4, pfx, regLo3) );
7887      putIReg32( R_RAX, mkexpr(t2) );
7888      putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
7889   } else {
7890      assign( t1, getIReg16(R_RAX) );
7891      assign( t2, getIRegRexB(2, pfx, regLo3) );
7892      putIReg16( R_RAX, mkexpr(t2) );
7893      putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
7894   }
7895   DIP("xchg%c %s, %s\n",
7896       nameISize(sz), nameIRegRAX(sz),
7897                      nameIRegRexB(sz,pfx, regLo3));
7898}
7899
7900
7901static
7902void codegen_SAHF ( void )
7903{
7904   /* Set the flags to:
7905      (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
7906                                    -- retain the old O flag
7907      | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
                |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C))
7909   */
7910   ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
7911                       |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
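   /* Assuming the thunk masks mirror the rflags bit positions
      (S=0x80, Z=0x40, A=0x10, P=0x04, C=0x01), mask_SZACP is 0xD5,
      exactly the bits SAHF may copy out of %AH. */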
7912   IRTemp oldflags   = newTemp(Ity_I64);
7913   assign( oldflags, mk_amd64g_calculate_rflags_all() );
7914   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
7915   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
7916   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
7917   stmt( IRStmt_Put( OFFB_CC_DEP1,
7918         binop(Iop_Or64,
7919               binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
7920               binop(Iop_And64,
7921                     binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
7922                     mkU64(mask_SZACP))
7923              )
7924   ));
7925}
7926
7927
7928static
7929void codegen_LAHF ( void  )
7930{
7931   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
7932   IRExpr* rax_with_hole;
7933   IRExpr* new_byte;
7934   IRExpr* new_rax;
7935   ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
7936                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
7937
7938   IRTemp  flags = newTemp(Ity_I64);
7939   assign( flags, mk_amd64g_calculate_rflags_all() );
7940
7941   rax_with_hole
7942      = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
7943   new_byte
7944      = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
7945                        mkU64(1<<1));
7946   new_rax
7947      = binop(Iop_Or64, rax_with_hole,
7948                        binop(Iop_Shl64, new_byte, mkU8(8)));
7949   putIReg64(R_RAX, new_rax);
7950}
7951
7952
7953static
7954ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
7955                        VexAbiInfo*  vbi,
7956                        Prefix       pfx,
7957                        Int          size,
7958                        Long         delta0 )
7959{
7960   HChar dis_buf[50];
7961   Int   len;
7962
7963   IRType ty    = szToITy(size);
7964   IRTemp acc   = newTemp(ty);
7965   IRTemp src   = newTemp(ty);
7966   IRTemp dest  = newTemp(ty);
7967   IRTemp dest2 = newTemp(ty);
7968   IRTemp acc2  = newTemp(ty);
7969   IRTemp cond8 = newTemp(Ity_I8);
7970   IRTemp addr  = IRTemp_INVALID;
7971   UChar  rm    = getUChar(delta0);
7972
7973   /* There are 3 cases to consider:
7974
7975      reg-reg: ignore any lock prefix, generate sequence based
7976               on Mux0X
7977
7978      reg-mem, not locked: ignore any lock prefix, generate sequence
7979                           based on Mux0X
7980
7981      reg-mem, locked: use IRCAS
7982   */
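   /* In all three cases the net effect is the usual CMPXCHG
      semantics, sketched as:

         if (dest == %rAX) { ZF = 1; dest = src; }
         else              { ZF = 0; %rAX = dest; }

      with the other flags set as for the comparison %rAX - dest. */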
7983
7984   if (epartIsReg(rm)) {
7985      /* case 1 */
7986      assign( dest, getIRegE(size, pfx, rm) );
7987      delta0++;
7988      assign( src, getIRegG(size, pfx, rm) );
7989      assign( acc, getIRegRAX(size) );
7990      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
7991      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
7992      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
7993      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
7994      putIRegRAX(size, mkexpr(acc2));
7995      putIRegE(size, pfx, rm, mkexpr(dest2));
7996      DIP("cmpxchg%c %s,%s\n", nameISize(size),
7997                               nameIRegG(size,pfx,rm),
7998                               nameIRegE(size,pfx,rm) );
7999   }
8000   else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
8001      /* case 2 */
8002      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8003      assign( dest, loadLE(ty, mkexpr(addr)) );
8004      delta0 += len;
8005      assign( src, getIRegG(size, pfx, rm) );
8006      assign( acc, getIRegRAX(size) );
8007      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8008      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
8009      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
8010      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
8011      putIRegRAX(size, mkexpr(acc2));
8012      storeLE( mkexpr(addr), mkexpr(dest2) );
8013      DIP("cmpxchg%c %s,%s\n", nameISize(size),
8014                               nameIRegG(size,pfx,rm), dis_buf);
8015   }
8016   else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
8017      /* case 3 */
8018      /* src is new value.  acc is expected value.  dest is old value.
8019         Compute success from the output of the IRCAS, and steer the
8020         new value for RAX accordingly: in case of success, RAX is
8021         unchanged. */
8022      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8023      delta0 += len;
8024      assign( src, getIRegG(size, pfx, rm) );
8025      assign( acc, getIRegRAX(size) );
8026      stmt( IRStmt_CAS(
8027         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
8028                  NULL, mkexpr(acc), NULL, mkexpr(src) )
8029      ));
8030      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8031      assign( cond8, unop(Iop_1Uto8, mk_amd64g_calculate_condition(AMD64CondZ)) );
8032      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
8033      putIRegRAX(size, mkexpr(acc2));
8034      DIP("cmpxchg%c %s,%s\n", nameISize(size),
8035                               nameIRegG(size,pfx,rm), dis_buf);
8036   }
8037   else vassert(0);
8038
8039   *ok = True;
8040   return delta0;
8041}
8042
8043
8044/* Handle conditional move instructions of the form
8045      cmovcc E(reg-or-mem), G(reg)
8046
8047   E(src) is reg-or-mem
8048   G(dst) is reg.
8049
8050   If E is reg, -->    GET %E, tmps
8051                       GET %G, tmpd
8052                       CMOVcc tmps, tmpd
8053                       PUT tmpd, %G
8054
8055   If E is mem  -->    (getAddr E) -> tmpa
8056                       LD (tmpa), tmps
8057                       GET %G, tmpd
8058                       CMOVcc tmps, tmpd
8059                       PUT tmpd, %G
8060*/
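/* Note: IRExpr_Mux0X(cond, expr0, exprX) yields expr0 when cond is
   zero and exprX otherwise, so the uses below read as "keep tmpd
   unless the condition holds, in which case take tmps". */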
8061static
8062ULong dis_cmov_E_G ( VexAbiInfo* vbi,
8063                     Prefix        pfx,
8064                     Int           sz,
8065                     AMD64Condcode cond,
8066                     Long          delta0 )
8067{
8068   UChar rm  = getUChar(delta0);
8069   HChar dis_buf[50];
8070   Int   len;
8071
8072   IRType ty   = szToITy(sz);
8073   IRTemp tmps = newTemp(ty);
8074   IRTemp tmpd = newTemp(ty);
8075
8076   if (epartIsReg(rm)) {
8077      assign( tmps, getIRegE(sz, pfx, rm) );
8078      assign( tmpd, getIRegG(sz, pfx, rm) );
8079
8080      putIRegG( sz, pfx, rm,
8081                IRExpr_Mux0X( unop(Iop_1Uto8,
8082                                   mk_amd64g_calculate_condition(cond)),
8083                              mkexpr(tmpd),
8084                              mkexpr(tmps) )
8085              );
8086      DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
8087                            nameIRegE(sz,pfx,rm),
8088                            nameIRegG(sz,pfx,rm));
8089      return 1+delta0;
8090   }
8091
8092   /* E refers to memory */
8093   {
8094      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8095      assign( tmps, loadLE(ty, mkexpr(addr)) );
8096      assign( tmpd, getIRegG(sz, pfx, rm) );
8097
8098      putIRegG( sz, pfx, rm,
8099                IRExpr_Mux0X( unop(Iop_1Uto8,
8100                                   mk_amd64g_calculate_condition(cond)),
8101                              mkexpr(tmpd),
8102                              mkexpr(tmps) )
8103              );
8104
8105      DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
8106                            dis_buf,
8107                            nameIRegG(sz,pfx,rm));
8108      return len+delta0;
8109   }
8110}
8111
8112
8113static
8114ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
8115                     VexAbiInfo* vbi,
8116                     Prefix pfx, Int sz, Long delta0 )
8117{
8118   Int   len;
8119   UChar rm = getUChar(delta0);
8120   HChar dis_buf[50];
8121
8122   IRType ty    = szToITy(sz);
8123   IRTemp tmpd  = newTemp(ty);
8124   IRTemp tmpt0 = newTemp(ty);
8125   IRTemp tmpt1 = newTemp(ty);
8126
8127   /* There are 3 cases to consider:
8128
8129      reg-reg: ignore any lock prefix,
8130               generate 'naive' (non-atomic) sequence
8131
8132      reg-mem, not locked: ignore any lock prefix, generate 'naive'
8133                           (non-atomic) sequence
8134
8135      reg-mem, locked: use IRCAS
8136   */
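   /* In all three cases the arithmetic is the usual XADD semantics,
      sketched as:

         tmp = dst + src ; src(G) = dst ; dst(E) = tmp

      with flags set as for the ADD; only the atomicity of the
      read-modify-write differs. */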
8137
8138   if (epartIsReg(rm)) {
8139      /* case 1 */
8140      assign( tmpd, getIRegE(sz, pfx, rm) );
8141      assign( tmpt0, getIRegG(sz, pfx, rm) );
8142      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8143                           mkexpr(tmpd), mkexpr(tmpt0)) );
8144      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8145      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8146      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
8147      DIP("xadd%c %s, %s\n",
8148          nameISize(sz), nameIRegG(sz,pfx,rm),
          nameIRegE(sz,pfx,rm));
8150      *decode_ok = True;
8151      return 1+delta0;
8152   }
8153   else if (!epartIsReg(rm) && !(pfx & PFX_LOCK)) {
8154      /* case 2 */
8155      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8156      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
8157      assign( tmpt0, getIRegG(sz, pfx, rm) );
8158      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8159                           mkexpr(tmpd), mkexpr(tmpt0)) );
8160      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8161      storeLE( mkexpr(addr), mkexpr(tmpt1) );
8162      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8163      DIP("xadd%c %s, %s\n",
8164          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
8165      *decode_ok = True;
8166      return len+delta0;
8167   }
8168   else if (!epartIsReg(rm) && (pfx & PFX_LOCK)) {
8169      /* case 3 */
8170      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8171      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
8172      assign( tmpt0, getIRegG(sz, pfx, rm) );
8173      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8174                           mkexpr(tmpd), mkexpr(tmpt0)) );
8175      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
8176                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
8177      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8178      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8179      DIP("xadd%c %s, %s\n",
8180          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
8181      *decode_ok = True;
8182      return len+delta0;
8183   }
8184   /*UNREACHED*/
8185   vassert(0);
8186}
8187
8188//.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
8189//..
8190//.. static
8191//.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
8192//.. {
8193//..    Int    len;
8194//..    IRTemp addr;
8195//..    UChar  rm  = getUChar(delta0);
8196//..    HChar  dis_buf[50];
8197//..
8198//..    if (epartIsReg(rm)) {
8199//..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
8200//..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
8201//..       return 1+delta0;
8202//..    } else {
8203//..       addr = disAMode ( &len, sorb, delta0, dis_buf );
8204//..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
8205//..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
8206//..       return len+delta0;
8207//..    }
8208//.. }
8209//..
8210//.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
8211//..    dst is ireg and sz==4, zero out top half of it.  */
8212//..
8213//.. static
8214//.. UInt dis_mov_Sw_Ew ( UChar sorb,
8215//..                      Int   sz,
8216//..                      UInt  delta0 )
8217//.. {
8218//..    Int    len;
8219//..    IRTemp addr;
8220//..    UChar  rm  = getUChar(delta0);
8221//..    HChar  dis_buf[50];
8222//..
8223//..    vassert(sz == 2 || sz == 4);
8224//..
8225//..    if (epartIsReg(rm)) {
8226//..       if (sz == 4)
8227//..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
8228//..       else
8229//..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
8230//..
8231//..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
8232//..       return 1+delta0;
8233//..    } else {
8234//..       addr = disAMode ( &len, sorb, delta0, dis_buf );
8235//..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
8236//..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
8237//..       return len+delta0;
8238//..    }
8239//.. }
8240//..
8241//..
8242//.. static
8243//.. void dis_push_segreg ( UInt sreg, Int sz )
8244//.. {
8245//..     IRTemp t1 = newTemp(Ity_I16);
8246//..     IRTemp ta = newTemp(Ity_I32);
8247//..     vassert(sz == 2 || sz == 4);
8248//..
8249//..     assign( t1, getSReg(sreg) );
8250//..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
8251//..     putIReg(4, R_ESP, mkexpr(ta));
8252//..     storeLE( mkexpr(ta), mkexpr(t1) );
8253//..
8254//..     DIP("pushw %s\n", nameSReg(sreg));
8255//.. }
8256//..
8257//.. static
8258//.. void dis_pop_segreg ( UInt sreg, Int sz )
8259//.. {
8260//..     IRTemp t1 = newTemp(Ity_I16);
8261//..     IRTemp ta = newTemp(Ity_I32);
8262//..     vassert(sz == 2 || sz == 4);
8263//..
8264//..     assign( ta, getIReg(4, R_ESP) );
8265//..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
8266//..
8267//..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
8268//..     putSReg( sreg, mkexpr(t1) );
8269//..     DIP("pop %s\n", nameSReg(sreg));
8270//.. }
8271
8272static
8273void dis_ret ( /*MOD*/DisResult* dres, VexAbiInfo* vbi, ULong d64 )
8274{
8275   IRTemp t1 = newTemp(Ity_I64);
8276   IRTemp t2 = newTemp(Ity_I64);
8277   IRTemp t3 = newTemp(Ity_I64);
8278   assign(t1, getIReg64(R_RSP));
8279   assign(t2, loadLE(Ity_I64,mkexpr(t1)));
8280   assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
8281   putIReg64(R_RSP, mkexpr(t3));
8282   make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
8283   jmp_treg(dres, Ijk_Ret, t2);
8284   vassert(dres->whatNext == Dis_StopHere);
8285}
8286
8287
8288/*------------------------------------------------------------*/
8289/*--- SSE/SSE2/SSE3 helpers                                ---*/
8290/*------------------------------------------------------------*/
8291
8292/* Worker function; do not call directly.
8293   Handles full width G = G `op` E   and   G = (not G) `op` E.
8294*/
8295
8296static ULong dis_SSE_E_to_G_all_wrk (
8297                VexAbiInfo* vbi,
8298                Prefix pfx, Long delta,
8299                HChar* opname, IROp op,
8300                Bool   invertG
8301             )
8302{
8303   HChar   dis_buf[50];
8304   Int     alen;
8305   IRTemp  addr;
8306   UChar   rm = getUChar(delta);
8307   IRExpr* gpart
8308      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
8309                : getXMMReg(gregOfRexRM(pfx,rm));
8310   if (epartIsReg(rm)) {
8311      putXMMReg( gregOfRexRM(pfx,rm),
8312                 binop(op, gpart,
8313                           getXMMReg(eregOfRexRM(pfx,rm))) );
8314      DIP("%s %s,%s\n", opname,
8315                        nameXMMReg(eregOfRexRM(pfx,rm)),
8316                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8317      return delta+1;
8318   } else {
8319      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8320      putXMMReg( gregOfRexRM(pfx,rm),
8321                 binop(op, gpart,
8322                           loadLE(Ity_V128, mkexpr(addr))) );
8323      DIP("%s %s,%s\n", opname,
8324                        dis_buf,
8325                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8326      return delta+alen;
8327   }
8328}
8329
8330
8331/* All lanes SSE binary operation, G = G `op` E. */
8332
8333static
8334ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
8335                           Prefix pfx, Long delta,
8336                           HChar* opname, IROp op )
8337{
8338   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
8339}
8340
8341/* All lanes SSE binary operation, G = (not G) `op` E. */
8342
8343static
8344ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
8345                                Prefix pfx, Long delta,
8346                                HChar* opname, IROp op )
8347{
8348   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
8349}
8350
8351
8352/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
8353
8354static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
8355                                   Prefix pfx, Long delta,
8356                                   HChar* opname, IROp op )
8357{
8358   HChar   dis_buf[50];
8359   Int     alen;
8360   IRTemp  addr;
8361   UChar   rm = getUChar(delta);
8362   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
8363   if (epartIsReg(rm)) {
8364      putXMMReg( gregOfRexRM(pfx,rm),
8365                 binop(op, gpart,
8366                           getXMMReg(eregOfRexRM(pfx,rm))) );
8367      DIP("%s %s,%s\n", opname,
8368                        nameXMMReg(eregOfRexRM(pfx,rm)),
8369                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8370      return delta+1;
8371   } else {
8372      /* We can only do a 32-bit memory read, so the upper 3/4 of the
8373         E operand needs to be made simply of zeroes. */
8374      IRTemp epart = newTemp(Ity_V128);
8375      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8376      assign( epart, unop( Iop_32UtoV128,
8377                           loadLE(Ity_I32, mkexpr(addr))) );
8378      putXMMReg( gregOfRexRM(pfx,rm),
8379                 binop(op, gpart, mkexpr(epart)) );
8380      DIP("%s %s,%s\n", opname,
8381                        dis_buf,
8382                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8383      return delta+alen;
8384   }
8385}
8386
8387
8388/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
8389
8390static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
8391                                   Prefix pfx, Long delta,
8392                                   HChar* opname, IROp op )
8393{
8394   HChar   dis_buf[50];
8395   Int     alen;
8396   IRTemp  addr;
8397   UChar   rm = getUChar(delta);
8398   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
8399   if (epartIsReg(rm)) {
8400      putXMMReg( gregOfRexRM(pfx,rm),
8401                 binop(op, gpart,
8402                           getXMMReg(eregOfRexRM(pfx,rm))) );
8403      DIP("%s %s,%s\n", opname,
8404                        nameXMMReg(eregOfRexRM(pfx,rm)),
8405                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8406      return delta+1;
8407   } else {
8408      /* We can only do a 64-bit memory read, so the upper half of the
8409         E operand needs to be made simply of zeroes. */
8410      IRTemp epart = newTemp(Ity_V128);
8411      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8412      assign( epart, unop( Iop_64UtoV128,
8413                           loadLE(Ity_I64, mkexpr(addr))) );
8414      putXMMReg( gregOfRexRM(pfx,rm),
8415                 binop(op, gpart, mkexpr(epart)) );
8416      DIP("%s %s,%s\n", opname,
8417                        dis_buf,
8418                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8419      return delta+alen;
8420   }
8421}
8422
8423
8424/* All lanes unary SSE operation, G = op(E). */
8425
8426static ULong dis_SSE_E_to_G_unary_all (
8427                VexAbiInfo* vbi,
8428                Prefix pfx, Long delta,
8429                HChar* opname, IROp op
8430             )
8431{
8432   HChar   dis_buf[50];
8433   Int     alen;
8434   IRTemp  addr;
8435   UChar   rm = getUChar(delta);
8436   if (epartIsReg(rm)) {
8437      putXMMReg( gregOfRexRM(pfx,rm),
8438                 unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
8439      DIP("%s %s,%s\n", opname,
8440                        nameXMMReg(eregOfRexRM(pfx,rm)),
8441                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8442      return delta+1;
8443   } else {
8444      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8445      putXMMReg( gregOfRexRM(pfx,rm),
8446                 unop(op, loadLE(Ity_V128, mkexpr(addr))) );
8447      DIP("%s %s,%s\n", opname,
8448                        dis_buf,
8449                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8450      return delta+alen;
8451   }
8452}
8453
8454
8455/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
8456
8457static ULong dis_SSE_E_to_G_unary_lo32 (
8458                VexAbiInfo* vbi,
8459                Prefix pfx, Long delta,
8460                HChar* opname, IROp op
8461             )
8462{
8463   /* First we need to get the old G value and patch the low 32 bits
8464      of the E operand into it.  Then apply op and write back to G. */
8465   HChar   dis_buf[50];
8466   Int     alen;
8467   IRTemp  addr;
8468   UChar   rm = getUChar(delta);
8469   IRTemp  oldG0 = newTemp(Ity_V128);
8470   IRTemp  oldG1 = newTemp(Ity_V128);
8471
8472   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
8473
8474   if (epartIsReg(rm)) {
8475      assign( oldG1,
8476              binop( Iop_SetV128lo32,
8477                     mkexpr(oldG0),
8478                     getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
8479      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
8480      DIP("%s %s,%s\n", opname,
8481                        nameXMMReg(eregOfRexRM(pfx,rm)),
8482                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8483      return delta+1;
8484   } else {
8485      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8486      assign( oldG1,
8487              binop( Iop_SetV128lo32,
8488                     mkexpr(oldG0),
8489                     loadLE(Ity_I32, mkexpr(addr)) ));
8490      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
8491      DIP("%s %s,%s\n", opname,
8492                        dis_buf,
8493                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8494      return delta+alen;
8495   }
8496}
8497
8498
8499/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
8500
8501static ULong dis_SSE_E_to_G_unary_lo64 (
8502                VexAbiInfo* vbi,
8503                Prefix pfx, Long delta,
8504                HChar* opname, IROp op
8505             )
8506{
8507   /* First we need to get the old G value and patch the low 64 bits
8508      of the E operand into it.  Then apply op and write back to G. */
8509   HChar   dis_buf[50];
8510   Int     alen;
8511   IRTemp  addr;
8512   UChar   rm = getUChar(delta);
8513   IRTemp  oldG0 = newTemp(Ity_V128);
8514   IRTemp  oldG1 = newTemp(Ity_V128);
8515
8516   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
8517
8518   if (epartIsReg(rm)) {
8519      assign( oldG1,
8520              binop( Iop_SetV128lo64,
8521                     mkexpr(oldG0),
8522                     getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
8523      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
8524      DIP("%s %s,%s\n", opname,
8525                        nameXMMReg(eregOfRexRM(pfx,rm)),
8526                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8527      return delta+1;
8528   } else {
8529      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8530      assign( oldG1,
8531              binop( Iop_SetV128lo64,
8532                     mkexpr(oldG0),
8533                     loadLE(Ity_I64, mkexpr(addr)) ));
8534      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
8535      DIP("%s %s,%s\n", opname,
8536                        dis_buf,
8537                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8538      return delta+alen;
8539   }
8540}
8541
8542
8543/* SSE integer binary operation:
8544      G = G `op` E   (eLeft == False)
8545      G = E `op` G   (eLeft == True)
8546*/
8547static ULong dis_SSEint_E_to_G(
8548                VexAbiInfo* vbi,
8549                Prefix pfx, Long delta,
8550                HChar* opname, IROp op,
8551                Bool   eLeft
8552             )
8553{
8554   HChar   dis_buf[50];
8555   Int     alen;
8556   IRTemp  addr;
8557   UChar   rm = getUChar(delta);
8558   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
8559   IRExpr* epart = NULL;
8560   if (epartIsReg(rm)) {
8561      epart = getXMMReg(eregOfRexRM(pfx,rm));
8562      DIP("%s %s,%s\n", opname,
8563                        nameXMMReg(eregOfRexRM(pfx,rm)),
8564                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8565      delta += 1;
8566   } else {
8567      addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8568      epart = loadLE(Ity_V128, mkexpr(addr));
8569      DIP("%s %s,%s\n", opname,
8570                        dis_buf,
8571                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8572      delta += alen;
8573   }
8574   putXMMReg( gregOfRexRM(pfx,rm),
8575              eLeft ? binop(op, epart, gpart)
                    : binop(op, gpart, epart) );
8577   return delta;
8578}
8579
8580
8581/* Helper for doing SSE FP comparisons.  False return ==> unhandled.
8582   This is all a bit of a kludge in that it ignores the subtleties of
8583   ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
8584   spec. */
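/* Worked example: imm8 == 0xD (GE_OS) yields (preSwap=True,
   op=Iop_CmpLE32Fx4, postNot=False); that is, a >= b is computed as
   b <= a, with swapped operands and no final inversion. */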
8585static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
8586                           /*OUT*/IROp* opP,
8587                           /*OUT*/Bool* postNotP,
8588                           UInt imm8, Bool all_lanes, Int sz )
8589{
8590   if (imm8 >= 32) return False;
8591
8592   /* First, compute a (preSwap, op, postNot) triple from
8593      the supplied imm8. */
8594   Bool pre = False;
8595   IROp op  = Iop_INVALID;
8596   Bool not = False;
8597
8598#  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
8599   // If you add a case here, add a corresponding test for both VCMPSD_128
8600   // and VCMPSS_128 in avx-1.c.
8601   switch (imm8) {
8602      // "O" = ordered, "U" = unordered
8603      // "Q" = non-signalling (quiet), "S" = signalling
8604      //
8605      //             swap operands?
8606      //             |
8607      //             |      cmp op          invert after?
8608      //             |      |               |
8609      //             v      v               v
8610      case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
8611      case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
8612      case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
8613      case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
8614      case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
8615      case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
8616      case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
8617      case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
8618      // 0x8  EQ_UQ
8619      case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
      /* "Enhanced Comparison Predicate[s] for VEX-Encoded [insns]" */
8621      case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
8622      // 0xB  FALSE_OQ
8623      // 0xC: this isn't really right because it returns all-1s when
8624      // either operand is a NaN, and it should return all-0s.
8625      case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
8626      case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
8627      case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
8628      // 0xF  TRUE_UQ
8629      // 0x10  EQ_OS
8630      case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
8631      case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
8632      // 0x13  UNORD_S
8633      // 0x14  NEQ_US
8634      // 0x15  NLT_UQ
8635      case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
8636      // 0x17  ORD_S
8637      // 0x18  EQ_US
8638      // 0x19  NGE_UQ
8639      // 0x1A  NGT_UQ
8640      // 0x1B  FALSE_OS
8641      // 0x1C  NEQ_OS
8642      // 0x1D  GE_OQ
8643      case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
8644      // 0x1F  TRUE_US
8645      /* Don't forget to add test cases to VCMPSS_128_<imm8> in
8646         avx-1.c if new cases turn up. */
8647      default: break;
8648   }
8649#  undef XXX
8650   if (op == Iop_INVALID) return False;
8651
8652   /* Now convert the op into one with the same arithmetic but that is
8653      correct for the width and laneage requirements. */
8654
8655   /**/ if (sz == 4 && all_lanes) {
8656      switch (op) {
8657         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
8658         case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
8659         case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
8660         case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
8661         default: vassert(0);
8662      }
8663   }
8664   else if (sz == 4 && !all_lanes) {
8665      switch (op) {
8666         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
8667         case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
8668         case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
8669         case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
8670         default: vassert(0);
8671      }
8672   }
8673   else if (sz == 8 && all_lanes) {
8674      switch (op) {
8675         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
8676         case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
8677         case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
8678         case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
8679         default: vassert(0);
8680      }
8681   }
8682   else if (sz == 8 && !all_lanes) {
8683      switch (op) {
8684         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
8685         case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
8686         case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
8687         case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
8688         default: vassert(0);
8689      }
8690   }
8691   else {
8692      vpanic("findSSECmpOp(amd64,guest)");
8693   }
8694
8695   *preSwapP = pre; *opP = op; *postNotP = not;
8696   return True;
8697}
8698
8699
8700/* Handles SSE 32F/64F comparisons.  It can fail, in which case it
8701   returns the original delta to indicate failure. */
8702
8703static Long dis_SSE_cmp_E_to_G ( VexAbiInfo* vbi,
8704                                 Prefix pfx, Long delta,
8705                                 HChar* opname, Bool all_lanes, Int sz )
8706{
8707   Long    delta0 = delta;
8708   HChar   dis_buf[50];
8709   Int     alen;
8710   UInt    imm8;
8711   IRTemp  addr;
8712   Bool    preSwap = False;
8713   IROp    op      = Iop_INVALID;
8714   Bool    postNot = False;
8715   IRTemp  plain   = newTemp(Ity_V128);
8716   UChar   rm      = getUChar(delta);
8717   UShort  mask    = 0;
8718   vassert(sz == 4 || sz == 8);
8719   if (epartIsReg(rm)) {
8720      imm8 = getUChar(delta+1);
8721      if (imm8 >= 8) return delta0; /* FAIL */
8722      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
8723      if (!ok) return delta0; /* FAIL */
8724      vassert(!preSwap); /* never needed for imm8 < 8 */
8725      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
8726                               getXMMReg(eregOfRexRM(pfx,rm))) );
8727      delta += 2;
8728      DIP("%s $%d,%s,%s\n", opname,
8729                            (Int)imm8,
8730                            nameXMMReg(eregOfRexRM(pfx,rm)),
8731                            nameXMMReg(gregOfRexRM(pfx,rm)) );
8732   } else {
8733      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
8734      imm8 = getUChar(delta+alen);
8735      if (imm8 >= 8) return delta0; /* FAIL */
8736      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
8737      if (!ok) return delta0; /* FAIL */
8738      vassert(!preSwap); /* never needed for imm8 < 8 */
8739      assign( plain,
8740              binop(
8741                 op,
8742                 getXMMReg(gregOfRexRM(pfx,rm)),
8743                   all_lanes
8744                      ? loadLE(Ity_V128, mkexpr(addr))
8745                   : sz == 8
8746                      ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
8747                   : /*sz==4*/
8748                      unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
              )
8750      );
8751      delta += alen+1;
8752      DIP("%s $%d,%s,%s\n", opname,
8753                            (Int)imm8,
8754                            dis_buf,
8755                            nameXMMReg(gregOfRexRM(pfx,rm)) );
8756   }
8757
8758   if (postNot && all_lanes) {
8759      putXMMReg( gregOfRexRM(pfx,rm),
8760                 unop(Iop_NotV128, mkexpr(plain)) );
8761   }
8762   else
8763   if (postNot && !all_lanes) {
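      /* mkV128 expands each bit of its 16-bit argument to a whole
         byte (0x00 or 0xFF), so 0x000F covers exactly the low 32-bit
         lane and 0x00FF the low 64-bit lane. */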
8764      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
8765      putXMMReg( gregOfRexRM(pfx,rm),
8766                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
8767   }
8768   else {
8769      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
8770   }
8771
8772   return delta;
8773}
8774
8775
8776/* Vector by scalar shift of G by the amount specified at the bottom
8777   of E. */
8778
8779static ULong dis_SSE_shiftG_byE ( VexAbiInfo* vbi,
8780                                  Prefix pfx, Long delta,
8781                                  HChar* opname, IROp op )
8782{
8783   HChar   dis_buf[50];
8784   Int     alen, size;
8785   IRTemp  addr;
8786   Bool    shl, shr, sar;
8787   UChar   rm   = getUChar(delta);
8788   IRTemp  g0   = newTemp(Ity_V128);
8789   IRTemp  g1   = newTemp(Ity_V128);
8790   IRTemp  amt  = newTemp(Ity_I64);
8791   IRTemp  amt8 = newTemp(Ity_I8);
8792   if (epartIsReg(rm)) {
8793      assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
8794      DIP("%s %s,%s\n", opname,
8795                        nameXMMReg(eregOfRexRM(pfx,rm)),
8796                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8797      delta++;
8798   } else {
8799      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8800      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
8801      DIP("%s %s,%s\n", opname,
8802                        dis_buf,
8803                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8804      delta += alen;
8805   }
8806   assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
8807   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
8808
8809   shl = shr = sar = False;
8810   size = 0;
8811   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
8813      case Iop_ShlN32x4: shl = True; size = 32; break;
8814      case Iop_ShlN64x2: shl = True; size = 64; break;
8815      case Iop_SarN16x8: sar = True; size = 16; break;
8816      case Iop_SarN32x4: sar = True; size = 32; break;
8817      case Iop_ShrN16x8: shr = True; size = 16; break;
8818      case Iop_ShrN32x4: shr = True; size = 32; break;
8819      case Iop_ShrN64x2: shr = True; size = 64; break;
8820      default: vassert(0);
8821   }
8822
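   /* x86 semantics for out-of-range shift counts: logical shifts
      (shl/shr) by >= the lane width produce zero, whereas arithmetic
      shifts (sar) behave like a shift by lane-width minus 1,
      replicating the sign bit.  The Mux0Xs below implement that. */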
8823   if (shl || shr) {
8824     assign(
8825        g1,
8826        IRExpr_Mux0X(
8827           unop(Iop_1Uto8,
8828                binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size))),
8829           mkV128(0x0000),
8830           binop(op, mkexpr(g0), mkexpr(amt8))
8831        )
8832     );
8833   } else
8834   if (sar) {
8835     assign(
8836        g1,
8837        IRExpr_Mux0X(
8838           unop(Iop_1Uto8,
8839                binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size))),
8840           binop(op, mkexpr(g0), mkU8(size-1)),
8841           binop(op, mkexpr(g0), mkexpr(amt8))
8842        )
8843     );
8844   } else {
8845      vassert(0);
8846   }
8847
8848   putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
8849   return delta;
8850}
8851
8852
8853/* Vector by scalar shift of E by an immediate byte. */
8854
8855static
8856ULong dis_SSE_shiftE_imm ( Prefix pfx,
8857                           Long delta, HChar* opname, IROp op )
8858{
8859   Bool    shl, shr, sar;
8860   UChar   rm   = getUChar(delta);
8861   IRTemp  e0   = newTemp(Ity_V128);
8862   IRTemp  e1   = newTemp(Ity_V128);
8863   UChar   amt, size;
8864   vassert(epartIsReg(rm));
8865   vassert(gregLO3ofRM(rm) == 2
8866           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
8867   amt = getUChar(delta+1);
8868   delta += 2;
8869   DIP("%s $%d,%s\n", opname,
8870                      (Int)amt,
8871                      nameXMMReg(eregOfRexRM(pfx,rm)) );
8872   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
8873
8874   shl = shr = sar = False;
8875   size = 0;
8876   switch (op) {
8877      case Iop_ShlN16x8: shl = True; size = 16; break;
8878      case Iop_ShlN32x4: shl = True; size = 32; break;
8879      case Iop_ShlN64x2: shl = True; size = 64; break;
8880      case Iop_SarN16x8: sar = True; size = 16; break;
8881      case Iop_SarN32x4: sar = True; size = 32; break;
8882      case Iop_ShrN16x8: shr = True; size = 16; break;
8883      case Iop_ShrN32x4: shr = True; size = 32; break;
8884      case Iop_ShrN64x2: shr = True; size = 64; break;
8885      default: vassert(0);
8886   }
8887
8888   if (shl || shr) {
8889     assign( e1, amt >= size
8890                    ? mkV128(0x0000)
8891                    : binop(op, mkexpr(e0), mkU8(amt))
8892     );
8893   } else
8894   if (sar) {
8895     assign( e1, amt >= size
8896                    ? binop(op, mkexpr(e0), mkU8(size-1))
8897                    : binop(op, mkexpr(e0), mkU8(amt))
8898     );
8899   } else {
8900      vassert(0);
8901   }
8902
8903   putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
8904   return delta;
8905}
8906
8907
8908/* Get the current SSE rounding mode. */
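/* The guest state field at OFFB_SSEROUND is assumed to hold an
   IRRoundingMode-encoded value in its low two bits (0 = nearest,
   1 = -inf, 2 = +inf, 3 = toward zero); only those bits are kept. */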
8909
8910static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
8911{
8912   return
8913      unop( Iop_64to32,
8914            binop( Iop_And64,
8915                   IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
8916                   mkU64(3) ));
8917}
8918
8919static void put_sse_roundingmode ( IRExpr* sseround )
8920{
8921   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
8922   stmt( IRStmt_Put( OFFB_SSEROUND,
8923                     unop(Iop_32Uto64,sseround) ) );
8924}
8925
8926/* Break a V128-bit value up into four 32-bit ints. */
8927
8928static void breakupV128to32s ( IRTemp t128,
8929                               /*OUTs*/
8930                               IRTemp* t3, IRTemp* t2,
8931                               IRTemp* t1, IRTemp* t0 )
8932{
8933   IRTemp hi64 = newTemp(Ity_I64);
8934   IRTemp lo64 = newTemp(Ity_I64);
8935   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
8936   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
8937
8938   vassert(t0 && *t0 == IRTemp_INVALID);
8939   vassert(t1 && *t1 == IRTemp_INVALID);
8940   vassert(t2 && *t2 == IRTemp_INVALID);
8941   vassert(t3 && *t3 == IRTemp_INVALID);
8942
8943   *t0 = newTemp(Ity_I32);
8944   *t1 = newTemp(Ity_I32);
8945   *t2 = newTemp(Ity_I32);
8946   *t3 = newTemp(Ity_I32);
8947   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
8948   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
8949   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
8950   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
8951}
8952
8953/* Construct a V128-bit value from four 32-bit ints. */
8954
8955static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
8956                               IRTemp t1, IRTemp t0 )
8957{
8958   return
8959      binop( Iop_64HLtoV128,
8960             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
8961             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
8962   );
8963}
8964
8965/* Break a 64-bit value up into four 16-bit ints. */
8966
8967static void breakup64to16s ( IRTemp t64,
8968                             /*OUTs*/
8969                             IRTemp* t3, IRTemp* t2,
8970                             IRTemp* t1, IRTemp* t0 )
8971{
8972   IRTemp hi32 = newTemp(Ity_I32);
8973   IRTemp lo32 = newTemp(Ity_I32);
8974   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
8975   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
8976
8977   vassert(t0 && *t0 == IRTemp_INVALID);
8978   vassert(t1 && *t1 == IRTemp_INVALID);
8979   vassert(t2 && *t2 == IRTemp_INVALID);
8980   vassert(t3 && *t3 == IRTemp_INVALID);
8981
8982   *t0 = newTemp(Ity_I16);
8983   *t1 = newTemp(Ity_I16);
8984   *t2 = newTemp(Ity_I16);
8985   *t3 = newTemp(Ity_I16);
8986   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
8987   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
8988   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
8989   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
8990}
8991
8992/* Construct a 64-bit value from four 16-bit ints. */
8993
8994static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
8995                             IRTemp t1, IRTemp t0 )
8996{
8997   return
8998      binop( Iop_32HLto64,
8999             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
9000             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
9001   );
9002}
9003
9004/* Break a V256-bit value up into four 64-bit ints. */
9005
9006static void breakupV256to64s ( IRTemp t256,
9007                               /*OUTs*/
9008                               IRTemp* t3, IRTemp* t2,
9009                               IRTemp* t1, IRTemp* t0 )
9010{
9011   vassert(t0 && *t0 == IRTemp_INVALID);
9012   vassert(t1 && *t1 == IRTemp_INVALID);
9013   vassert(t2 && *t2 == IRTemp_INVALID);
9014   vassert(t3 && *t3 == IRTemp_INVALID);
9015   *t0 = newTemp(Ity_I64);
9016   *t1 = newTemp(Ity_I64);
9017   *t2 = newTemp(Ity_I64);
9018   *t3 = newTemp(Ity_I64);
9019   assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
9020   assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
9021   assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
9022   assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
9023}
9024
9025/* Break a V256-bit value up into two V128s. */
9026
9027static void breakupV256toV128s ( IRTemp t256,
9028                                 /*OUTs*/
9029                                 IRTemp* t1, IRTemp* t0 )
9030{
9031   vassert(t0 && *t0 == IRTemp_INVALID);
9032   vassert(t1 && *t1 == IRTemp_INVALID);
9033   *t0 = newTemp(Ity_V128);
9034   *t1 = newTemp(Ity_V128);
9035   assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
9036   assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
9037}
9038
9039/* Break a V256-bit value up into eight 32-bit ints.  */
9040
9041static void breakupV256to32s ( IRTemp t256,
9042                               /*OUTs*/
9043                               IRTemp* t7, IRTemp* t6,
9044                               IRTemp* t5, IRTemp* t4,
9045                               IRTemp* t3, IRTemp* t2,
9046                               IRTemp* t1, IRTemp* t0 )
9047{
9048   IRTemp t128_1 = IRTemp_INVALID;
9049   IRTemp t128_0 = IRTemp_INVALID;
9050   breakupV256toV128s( t256, &t128_1, &t128_0 );
9051   breakupV128to32s( t128_1, t7, t6, t5, t4 );
9052   breakupV128to32s( t128_0, t3, t2, t1, t0 );
9053}
9054
9055/* Break a V128-bit value up into two 64-bit ints. */
9056
9057static void breakupV128to64s ( IRTemp t128,
9058                               /*OUTs*/
9059                               IRTemp* t1, IRTemp* t0 )
9060{
9061   vassert(t0 && *t0 == IRTemp_INVALID);
9062   vassert(t1 && *t1 == IRTemp_INVALID);
9063   *t0 = newTemp(Ity_I64);
9064   *t1 = newTemp(Ity_I64);
9065   assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
9066   assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
9067}
9068
9069/* Construct a V256-bit value from eight 32-bit ints. */
9070
9071static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
9072                               IRTemp t5, IRTemp t4,
9073                               IRTemp t3, IRTemp t2,
9074                               IRTemp t1, IRTemp t0 )
9075{
9076   return
9077      binop( Iop_V128HLtoV256,
9078             binop( Iop_64HLtoV128,
9079                    binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
9080                    binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
9081             binop( Iop_64HLtoV128,
9082                    binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
9083                    binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
9084   );
9085}
9086
9087/* Construct a V256-bit value from four 64-bit ints. */
9088
9089static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
9090                               IRTemp t1, IRTemp t0 )
9091{
9092   return
9093      binop( Iop_V128HLtoV256,
9094             binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
9095             binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
9096   );
9097}
9098
9099/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
9100   values (aa,bb), computes, for each of the 4 16-bit lanes:
9101
9102   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
9103*/
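/* Worked example: with both lanes holding 0x4000 (0.5 in Q15), the
   product is 0x10000000; >>u 14 gives 0x4000, +1 then >>u 1 rounds
   to 0x2000, i.e. 0.25 in Q15, as expected. */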
9104static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
9105{
9106   IRTemp aa      = newTemp(Ity_I64);
9107   IRTemp bb      = newTemp(Ity_I64);
9108   IRTemp aahi32s = newTemp(Ity_I64);
9109   IRTemp aalo32s = newTemp(Ity_I64);
9110   IRTemp bbhi32s = newTemp(Ity_I64);
9111   IRTemp bblo32s = newTemp(Ity_I64);
9112   IRTemp rHi     = newTemp(Ity_I64);
9113   IRTemp rLo     = newTemp(Ity_I64);
9114   IRTemp one32x2 = newTemp(Ity_I64);
9115   assign(aa, aax);
9116   assign(bb, bbx);
9117   assign( aahi32s,
9118           binop(Iop_SarN32x2,
9119                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
9120                 mkU8(16) ));
9121   assign( aalo32s,
9122           binop(Iop_SarN32x2,
9123                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
9124                 mkU8(16) ));
9125   assign( bbhi32s,
9126           binop(Iop_SarN32x2,
9127                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
9128                 mkU8(16) ));
9129   assign( bblo32s,
9130           binop(Iop_SarN32x2,
9131                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
9132                 mkU8(16) ));
9133   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
9134   assign(
9135      rHi,
9136      binop(
9137         Iop_ShrN32x2,
9138         binop(
9139            Iop_Add32x2,
9140            binop(
9141               Iop_ShrN32x2,
9142               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
9143               mkU8(14)
9144            ),
9145            mkexpr(one32x2)
9146         ),
9147         mkU8(1)
9148      )
9149   );
9150   assign(
9151      rLo,
9152      binop(
9153         Iop_ShrN32x2,
9154         binop(
9155            Iop_Add32x2,
9156            binop(
9157               Iop_ShrN32x2,
9158               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
9159               mkU8(14)
9160            ),
9161            mkexpr(one32x2)
9162         ),
9163         mkU8(1)
9164      )
9165   );
9166   return
9167      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
9168}
9169
9170/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
9171   values (aa,bb), computes, for each lane:
9172
9173          if aa_lane < 0 then - bb_lane
9174     else if aa_lane > 0 then bb_lane
9175     else 0
9176*/
9177static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
9178{
9179   IRTemp aa       = newTemp(Ity_I64);
9180   IRTemp bb       = newTemp(Ity_I64);
9181   IRTemp zero     = newTemp(Ity_I64);
9182   IRTemp bbNeg    = newTemp(Ity_I64);
9183   IRTemp negMask  = newTemp(Ity_I64);
9184   IRTemp posMask  = newTemp(Ity_I64);
9185   IROp   opSub    = Iop_INVALID;
9186   IROp   opCmpGTS = Iop_INVALID;
9187
9188   switch (laneszB) {
9189      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
9190      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
9191      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
9192      default: vassert(0);
9193   }
9194
9195   assign( aa,      aax );
9196   assign( bb,      bbx );
9197   assign( zero,    mkU64(0) );
9198   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
9199   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
9200   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
9201
9202   return
9203      binop(Iop_Or64,
9204            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
9205            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
9206
9207}
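
/* For reference: a scalar model of one PSIGN lane (hypothetical
   helper, illustration only).  Note that the negation wraps, so a
   bb_lane holding the most negative value negates to itself. */
#if 0
static Short ref_PSIGN_lane ( Short a, Short b )
{
   if (a < 0) return (Short)(0 - b);   /* wraps for b == -32768 */
   if (a > 0) return b;
   return 0;
}
#endif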
9208
9209
9210/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
9211   value aa, computes, for each lane
9212
9213   if aa < 0 then -aa else aa
9214
9215   Note that the result is interpreted as unsigned, so that the
9216   absolute value of the most negative signed input can be
9217   represented.
9218*/
9219static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
9220{
9221   IRTemp res     = newTemp(Ity_I64);
9222   IRTemp zero    = newTemp(Ity_I64);
9223   IRTemp aaNeg   = newTemp(Ity_I64);
9224   IRTemp negMask = newTemp(Ity_I64);
9225   IRTemp posMask = newTemp(Ity_I64);
9226   IROp   opSub   = Iop_INVALID;
9227   IROp   opSarN  = Iop_INVALID;
9228
9229   switch (laneszB) {
9230      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
9231      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
9232      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
9233      default: vassert(0);
9234   }
9235
9236   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
9237   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
9238   assign( zero,    mkU64(0) );
9239   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
9240   assign( res,
9241           binop(Iop_Or64,
9242                 binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
9243                 binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
9244   return res;
9245}
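
/* The branch-free selection above relies on opSarN filling each lane
   of negMask with copies of its sign bit: all ones where the lane is
   negative, all zeroes otherwise.  A scalar sketch for one 8-bit lane
   follows (hypothetical helper, illustration only; assumes the usual
   arithmetic behaviour of >> on signed values). */
#if 0
static UChar ref_PABS_lane8 ( Char a )
{
   Char neg = (Char)(a >> 7);  /* 0x00 or 0xFF (sign fill) */
   return (UChar)((a & ~neg) | ((0 - a) & neg));
}
#endif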
9246
9247/* XMM version of math_PABS_MMX. */
9248static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
9249{
9250   IRTemp res  = newTemp(Ity_V128);
9251   IRTemp aaHi = newTemp(Ity_I64);
9252   IRTemp aaLo = newTemp(Ity_I64);
9253   assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
9254   assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
9255   assign(res, binop(Iop_64HLtoV128,
9256                     mkexpr(math_PABS_MMX(aaHi, laneszB)),
9257                     mkexpr(math_PABS_MMX(aaLo, laneszB))));
9258   return res;
9259}
9260
9261/* Specialisations of math_PABS_XMM, since there's no easy way to do
9262   partial applications in C :-( */
9263static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
9264   return math_PABS_XMM(aa, 4);
9265}
9266
9267static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
9268   return math_PABS_XMM(aa, 2);
9269}
9270
9271static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
9272   return math_PABS_XMM(aa, 1);
9273}
9274
9275static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
9276                                        IRTemp lo64, Long byteShift )
9277{
9278   vassert(byteShift >= 1 && byteShift <= 7);
9279   return
9280      binop(Iop_Or64,
9281            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
9282            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
9283      );
9284}
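
/* Example: with hi64 == 0xFFEEDDCCBBAA9988, lo64 == 0x7766554433221100
   and byteShift == 3, this computes (hi64 << 40) | (lo64 >> 24)
   == 0xAA99887766554433, i.e. the low 3 bytes of hi64 concatenated
   with the high 5 bytes of lo64. */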
9285
9286static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
9287{
9288   IRTemp res = newTemp(Ity_V128);
9289   IRTemp sHi = newTemp(Ity_I64);
9290   IRTemp sLo = newTemp(Ity_I64);
9291   IRTemp dHi = newTemp(Ity_I64);
9292   IRTemp dLo = newTemp(Ity_I64);
9293   IRTemp rHi = newTemp(Ity_I64);
9294   IRTemp rLo = newTemp(Ity_I64);
9295
9296   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
9297   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
9298   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
9299   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
9300
9301   if (imm8 == 0) {
9302      assign( rHi, mkexpr(sHi) );
9303      assign( rLo, mkexpr(sLo) );
9304   }
9305   else if (imm8 >= 1 && imm8 <= 7) {
9306      assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
9307      assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
9308   }
9309   else if (imm8 == 8) {
9310      assign( rHi, mkexpr(dLo) );
9311      assign( rLo, mkexpr(sHi) );
9312   }
9313   else if (imm8 >= 9 && imm8 <= 15) {
9314      assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
9315      assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
9316   }
9317   else if (imm8 == 16) {
9318      assign( rHi, mkexpr(dHi) );
9319      assign( rLo, mkexpr(dLo) );
9320   }
9321   else if (imm8 >= 17 && imm8 <= 23) {
9322      assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
9323      assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
9324   }
9325   else if (imm8 == 24) {
9326      assign( rHi, mkU64(0) );
9327      assign( rLo, mkexpr(dHi) );
9328   }
9329   else if (imm8 >= 25 && imm8 <= 31) {
9330      assign( rHi, mkU64(0) );
9331      assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
9332   }
9333   else if (imm8 >= 32 && imm8 <= 255) {
9334      assign( rHi, mkU64(0) );
9335      assign( rLo, mkU64(0) );
9336   }
9337   else
9338      vassert(0);
9339
9340   assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
9341   return res;
9342}
9343
9344
/* Generate a SIGSEGV followed by a restart of the current instruction
   if effective_addr is not aligned as required, where mask is the
   alignment requirement minus 1 (so 15 checks 16-alignment and 31
   checks 32-alignment).  Such checks are required behaviour for some
   SSE3 instructions and all 128-bit SSSE3 instructions.  This assumes
   that guest_RIP_curr_instr is set correctly! */
9349static
9350void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
9351{
9352   stmt(
9353      IRStmt_Exit(
9354         binop(Iop_CmpNE64,
9355               binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
9356               mkU64(0)),
9357         Ijk_SigSEGV,
9358         IRConst_U64(guest_RIP_curr_instr),
9359         OFFB_RIP
9360      )
9361   );
9362}
9363
9364static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
9365   gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
9366}
9367
9368static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
9369   gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
9370}
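
/* Example: gen_SEGV_if_not_16_aligned(ea) emits, in effect,
   "if ((ea & 15) != 0) raise SIGSEGV at guest_RIP_curr_instr", so
   that the faulting instruction restarts if the handler returns. */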
9371
9372/* Helper for deciding whether a given insn (starting at the opcode
9373   byte) may validly be used with a LOCK prefix.  The following insns
9374   may be used with LOCK when their destination operand is in memory.
9375   AFAICS this is exactly the same for both 32-bit and 64-bit mode.
9376
9377   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
9378   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
9379   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
9381   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
9382   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
9383   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
9384
9385   DEC        FE /1,  FF /1
9386   INC        FE /0,  FF /0
9387
9388   NEG        F6 /3,  F7 /3
9389   NOT        F6 /2,  F7 /2
9390
9391   XCHG       86, 87
9392
9393   BTC        0F BB,  0F BA /7
9394   BTR        0F B3,  0F BA /6
9395   BTS        0F AB,  0F BA /5
9396
9397   CMPXCHG    0F B0,  0F B1
9398   CMPXCHG8B  0F C7 /1
9399
9400   XADD       0F C0,  0F C1
9401
9402   ------------------------------
9403
9404   80 /0  =  addb $imm8,  rm8
9405   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
9406   82 /0  =  addb $imm8,  rm8
9407   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
9408
9409   00     =  addb r8,  rm8
9410   01     =  addl r32, rm32  and  addw r16, rm16
9411
9412   Same for ADD OR ADC SBB AND SUB XOR
9413
9414   FE /1  = dec rm8
9415   FF /1  = dec rm32  and  dec rm16
9416
9417   FE /0  = inc rm8
9418   FF /0  = inc rm32  and  inc rm16
9419
9420   F6 /3  = neg rm8
9421   F7 /3  = neg rm32  and  neg rm16
9422
9423   F6 /2  = not rm8
9424   F7 /2  = not rm32  and  not rm16
9425
9426   0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
9428
9429   Same for BTS, BTR
9430*/
9431static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
9432{
9433   switch (opc[0]) {
9434      case 0x00: case 0x01: case 0x08: case 0x09:
9435      case 0x10: case 0x11: case 0x18: case 0x19:
9436      case 0x20: case 0x21: case 0x28: case 0x29:
9437      case 0x30: case 0x31:
9438         if (!epartIsReg(opc[1]))
9439            return True;
9440         break;
9441
9442      case 0x80: case 0x81: case 0x82: case 0x83:
9443         if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
9444             && !epartIsReg(opc[1]))
9445            return True;
9446         break;
9447
9448      case 0xFE: case 0xFF:
9449         if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
9450             && !epartIsReg(opc[1]))
9451            return True;
9452         break;
9453
9454      case 0xF6: case 0xF7:
9455         if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
9456             && !epartIsReg(opc[1]))
9457            return True;
9458         break;
9459
9460      case 0x86: case 0x87:
9461         if (!epartIsReg(opc[1]))
9462            return True;
9463         break;
9464
9465      case 0x0F: {
9466         switch (opc[1]) {
9467            case 0xBB: case 0xB3: case 0xAB:
9468               if (!epartIsReg(opc[2]))
9469                  return True;
9470               break;
9471            case 0xBA:
9472               if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
9473                   && !epartIsReg(opc[2]))
9474                  return True;
9475               break;
9476            case 0xB0: case 0xB1:
9477               if (!epartIsReg(opc[2]))
9478                  return True;
9479               break;
9480            case 0xC7:
9481               if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
9482                  return True;
9483               break;
9484            case 0xC0: case 0xC1:
9485               if (!epartIsReg(opc[2]))
9486                  return True;
9487               break;
9488            default:
9489               break;
9490         } /* switch (opc[1]) */
9491         break;
9492      }
9493
9494      default:
9495         break;
9496   } /* switch (opc[0]) */
9497
9498   return False;
9499}
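
/* Example: for the bytes 80 0D .. (orb $imm8 to a RIP-relative memory
   operand), opc[0] == 0x80, gregLO3ofRM(0x0D) == 1 (OR) and
   epartIsReg(0x0D) is False, so LOCK is accepted; for 80 C8 ..
   (orb $imm8, %al) the destination is a register and LOCK is
   rejected. */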
9500
9501
9502/*------------------------------------------------------------*/
9503/*---                                                      ---*/
9504/*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
9505/*---                                                      ---*/
9506/*------------------------------------------------------------*/
9507
9508static Long dis_COMISD ( VexAbiInfo* vbi, Prefix pfx,
9509                         Long delta, Bool isAvx, UChar opc )
9510{
9511   vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
9512   Int    alen  = 0;
9513   HChar  dis_buf[50];
9514   IRTemp argL  = newTemp(Ity_F64);
9515   IRTemp argR  = newTemp(Ity_F64);
9516   UChar  modrm = getUChar(delta);
9517   IRTemp addr  = IRTemp_INVALID;
9518   if (epartIsReg(modrm)) {
9519      assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
9520                                      0/*lowest lane*/ ) );
9521      delta += 1;
9522      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
9523                                opc==0x2E ? "u" : "",
9524                                nameXMMReg(eregOfRexRM(pfx,modrm)),
9525                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
9526   } else {
9527      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9528      assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
9529      delta += alen;
9530      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
9531                                opc==0x2E ? "u" : "",
9532                                dis_buf,
9533                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
9534   }
9535   assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
9536                                   0/*lowest lane*/ ) );
9537
9538   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
9539   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
9540   stmt( IRStmt_Put(
9541            OFFB_CC_DEP1,
9542            binop( Iop_And64,
9543                   unop( Iop_32Uto64,
9544                         binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
9545                   mkU64(0x45)
9546       )));
9547   return delta;
9548}
9549
9550
9551static Long dis_COMISS ( VexAbiInfo* vbi, Prefix pfx,
9552                         Long delta, Bool isAvx, UChar opc )
9553{
9554   vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
9555   Int    alen  = 0;
9556   HChar  dis_buf[50];
9557   IRTemp argL  = newTemp(Ity_F32);
9558   IRTemp argR  = newTemp(Ity_F32);
9559   UChar  modrm = getUChar(delta);
9560   IRTemp addr  = IRTemp_INVALID;
9561   if (epartIsReg(modrm)) {
9562      assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
9563                                      0/*lowest lane*/ ) );
9564      delta += 1;
9565      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
9566                                opc==0x2E ? "u" : "",
9567                                nameXMMReg(eregOfRexRM(pfx,modrm)),
9568                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
9569   } else {
9570      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9571      assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
9572      delta += alen;
9573      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
9574                                opc==0x2E ? "u" : "",
9575                                dis_buf,
9576                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
9577   }
9578   assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
9579                                   0/*lowest lane*/ ) );
9580
9581   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
9582   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
9583   stmt( IRStmt_Put(
9584            OFFB_CC_DEP1,
9585            binop( Iop_And64,
9586                   unop( Iop_32Uto64,
9587                         binop(Iop_CmpF64,
9588                               unop(Iop_F32toF64,mkexpr(argL)),
9589                               unop(Iop_F32toF64,mkexpr(argR)))),
9590                   mkU64(0x45)
9591       )));
9592   return delta;
9593}
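
/* In both comparisons above, the 0x45 mask (binary 01000101) keeps
   bits 6, 2 and 0 of the Iop_CmpF64 result, which line up with ZF,
   PF and CF in the eflags encoding.  The IRCmpF64Result values are
   chosen so that this yields ZF,PF,CF = 1,1,1 for unordered, 0,0,1
   for less-than, 0,0,0 for greater-than and 1,0,0 for equal, as the
   architected (U)COMISD/(U)COMISS behaviour requires. */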
9594
9595
9596static Long dis_PSHUFD_32x4 ( VexAbiInfo* vbi, Prefix pfx,
9597                              Long delta, Bool writesYmm )
9598{
9599   Int    order;
9600   Int    alen  = 0;
9601   HChar  dis_buf[50];
9602   IRTemp sV    = newTemp(Ity_V128);
9603   UChar  modrm = getUChar(delta);
9604   HChar* strV  = writesYmm ? "v" : "";
9605   IRTemp addr  = IRTemp_INVALID;
9606   if (epartIsReg(modrm)) {
9607      assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
9608      order = (Int)getUChar(delta+1);
9609      delta += 1+1;
9610      DIP("%spshufd $%d,%s,%s\n", strV, order,
9611                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
9612                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
9613   } else {
9614      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
9615                        1/*byte after the amode*/ );
9616      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9617      order = (Int)getUChar(delta+alen);
9618      delta += alen+1;
9619      DIP("%spshufd $%d,%s,%s\n", strV, order,
9620                                 dis_buf,
9621                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
9622   }
9623
9624   IRTemp s3, s2, s1, s0;
9625   s3 = s2 = s1 = s0 = IRTemp_INVALID;
9626   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
9627
9628#  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
9629   IRTemp dV = newTemp(Ity_V128);
9630   assign(dV,
9631          mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
9632                         SEL((order>>2)&3), SEL((order>>0)&3) )
9633   );
9634#  undef SEL
9635
9636   (writesYmm ? putYMMRegLoAndZU : putXMMReg)
9637      (gregOfRexRM(pfx,modrm), mkexpr(dV));
9638   return delta;
9639}
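
/* Example: order == 0x1B (binary 00 01 10 11) places s0 in the top
   lane down to s3 in the bottom lane, i.e. it reverses the four
   32-bit lanes of the source. */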
9640
9641
9642static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
9643{
9644   IRTemp dV    = newTemp(Ity_V128);
9645   IRTemp hi64  = newTemp(Ity_I64);
9646   IRTemp lo64  = newTemp(Ity_I64);
9647   IRTemp hi64r = newTemp(Ity_I64);
9648   IRTemp lo64r = newTemp(Ity_I64);
9649
9650   vassert(imm >= 0 && imm <= 255);
9651   if (imm >= 16) {
9652      assign(dV, mkV128(0x0000));
9653      return dV;
9654   }
9655
9656   assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
9657   assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
9658
9659   if (imm == 0) {
9660      assign( lo64r, mkexpr(lo64) );
9661      assign( hi64r, mkexpr(hi64) );
9662   }
9663   else
9664   if (imm == 8) {
9665      assign( hi64r, mkU64(0) );
9666      assign( lo64r, mkexpr(hi64) );
9667   }
9668   else
9669   if (imm > 8) {
9670      assign( hi64r, mkU64(0) );
9671      assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
9672   } else {
9673      assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
9674      assign( lo64r,
9675              binop( Iop_Or64,
9676                     binop(Iop_Shr64, mkexpr(lo64),
9677                           mkU8(8 * imm)),
9678                     binop(Iop_Shl64, mkexpr(hi64),
9679                           mkU8(8 * (8 - imm)) )
9680                     )
9681              );
9682   }
9683
9684   assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
9685   return dV;
9686}
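
/* Example: imm == 3 gives
      hi64r = hi64 >>u 24
      lo64r = (lo64 >>u 24) | (hi64 << 40)
   i.e. the 128-bit value shifted right by 3 bytes with zero fill at
   the top. */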
9687
9688
9689static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
9690{
9691   IRTemp       dV    = newTemp(Ity_V128);
9692   IRTemp       hi64  = newTemp(Ity_I64);
9693   IRTemp       lo64  = newTemp(Ity_I64);
9694   IRTemp       hi64r = newTemp(Ity_I64);
9695   IRTemp       lo64r = newTemp(Ity_I64);
9696
9697   vassert(imm >= 0 && imm <= 255);
9698   if (imm >= 16) {
9699      assign(dV, mkV128(0x0000));
9700      return dV;
9701   }
9702
9703   assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
9704   assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
9705
9706   if (imm == 0) {
9707      assign( lo64r, mkexpr(lo64) );
9708      assign( hi64r, mkexpr(hi64) );
9709   }
9710   else
9711   if (imm == 8) {
9712      assign( lo64r, mkU64(0) );
9713      assign( hi64r, mkexpr(lo64) );
9714   }
9715   else
9716   if (imm > 8) {
9717      assign( lo64r, mkU64(0) );
9718      assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
9719   } else {
9720      assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
9721      assign( hi64r,
9722              binop( Iop_Or64,
9723                     binop(Iop_Shl64, mkexpr(hi64),
9724                           mkU8(8 * imm)),
9725                     binop(Iop_Shr64, mkexpr(lo64),
9726                           mkU8(8 * (8 - imm)) )
9727                     )
9728              );
9729   }
9730
9731   assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
9732   return dV;
9733}
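
/* math_PSLLDQ mirrors math_PSRLDQ: e.g. imm == 3 gives
      lo64r = lo64 << 24
      hi64r = (hi64 << 24) | (lo64 >>u 40)
   i.e. the 128-bit value shifted left by 3 bytes with zero fill at
   the bottom. */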
9734
9735
9736static Long dis_CVTxSD2SI ( VexAbiInfo* vbi, Prefix pfx,
9737                            Long delta, Bool isAvx, UChar opc, Int sz )
9738{
9739   vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
9740   HChar  dis_buf[50];
9741   Int    alen   = 0;
9742   UChar  modrm  = getUChar(delta);
9743   IRTemp addr   = IRTemp_INVALID;
9744   IRTemp rmode  = newTemp(Ity_I32);
9745   IRTemp f64lo  = newTemp(Ity_F64);
9746   Bool   r2zero = toBool(opc == 0x2C);
9747
9748   if (epartIsReg(modrm)) {
9749      delta += 1;
9750      assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
9751      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
9752                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
9753                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
9754                                           False));
9755   } else {
9756      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9757      assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9758      delta += alen;
9759      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
9760                                  dis_buf,
9761                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
9762                                           False));
9763   }
9764
9765   if (r2zero) {
9766      assign( rmode, mkU32((UInt)Irrm_ZERO) );
9767   } else {
9768      assign( rmode, get_sse_roundingmode() );
9769   }
9770
9771   if (sz == 4) {
9772      putIReg32( gregOfRexRM(pfx,modrm),
9773                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
9774   } else {
9775      vassert(sz == 8);
9776      putIReg64( gregOfRexRM(pfx,modrm),
9777                 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
9778   }
9779
9780   return delta;
9781}
9782
9783
9784static Long dis_CVTxSS2SI ( VexAbiInfo* vbi, Prefix pfx,
9785                            Long delta, Bool isAvx, UChar opc, Int sz )
9786{
9787   vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
9788   HChar  dis_buf[50];
9789   Int    alen   = 0;
9790   UChar  modrm  = getUChar(delta);
9791   IRTemp addr   = IRTemp_INVALID;
9792   IRTemp rmode  = newTemp(Ity_I32);
9793   IRTemp f32lo  = newTemp(Ity_F32);
9794   Bool   r2zero = toBool(opc == 0x2C);
9795
9796   if (epartIsReg(modrm)) {
9797      delta += 1;
9798      assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
9799      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
9800                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
9801                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
9802                                           False));
9803   } else {
9804      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9805      assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
9806      delta += alen;
9807      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
9808                                  dis_buf,
9809                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
9810                                           False));
9811   }
9812
9813   if (r2zero) {
9814      assign( rmode, mkU32((UInt)Irrm_ZERO) );
9815   } else {
9816      assign( rmode, get_sse_roundingmode() );
9817   }
9818
9819   if (sz == 4) {
9820      putIReg32( gregOfRexRM(pfx,modrm),
9821                 binop( Iop_F64toI32S,
9822                        mkexpr(rmode),
9823                        unop(Iop_F32toF64, mkexpr(f32lo))) );
9824   } else {
9825      vassert(sz == 8);
9826      putIReg64( gregOfRexRM(pfx,modrm),
9827                 binop( Iop_F64toI64S,
9828                        mkexpr(rmode),
9829                        unop(Iop_F32toF64, mkexpr(f32lo))) );
9830   }
9831
9832   return delta;
9833}
9834
9835
9836static Long dis_CVTPS2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
9837                               Long delta, Bool isAvx )
9838{
9839   IRTemp addr  = IRTemp_INVALID;
9840   Int    alen  = 0;
9841   HChar  dis_buf[50];
9842   IRTemp f32lo = newTemp(Ity_F32);
9843   IRTemp f32hi = newTemp(Ity_F32);
9844   UChar  modrm = getUChar(delta);
9845   UInt   rG    = gregOfRexRM(pfx,modrm);
9846   if (epartIsReg(modrm)) {
9847      UInt rE = eregOfRexRM(pfx,modrm);
9848      assign( f32lo, getXMMRegLane32F(rE, 0) );
9849      assign( f32hi, getXMMRegLane32F(rE, 1) );
9850      delta += 1;
9851      DIP("%scvtps2pd %s,%s\n",
9852          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
9853   } else {
9854      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9855      assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
9856      assign( f32hi, loadLE(Ity_F32,
9857                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
9858      delta += alen;
9859      DIP("%scvtps2pd %s,%s\n",
9860          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
9861   }
9862
9863   putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
9864   putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
9865   if (isAvx)
9866      putYMMRegLane128( rG, 1, mkV128(0));
9867   return delta;
9868}
9869
9870
9871static Long dis_CVTPS2PD_256 ( VexAbiInfo* vbi, Prefix pfx,
9872                               Long delta )
9873{
9874   IRTemp addr  = IRTemp_INVALID;
9875   Int    alen  = 0;
9876   HChar  dis_buf[50];
9877   IRTemp f32_0 = newTemp(Ity_F32);
9878   IRTemp f32_1 = newTemp(Ity_F32);
9879   IRTemp f32_2 = newTemp(Ity_F32);
9880   IRTemp f32_3 = newTemp(Ity_F32);
9881   UChar  modrm = getUChar(delta);
9882   UInt   rG    = gregOfRexRM(pfx,modrm);
9883   if (epartIsReg(modrm)) {
9884      UInt rE = eregOfRexRM(pfx,modrm);
9885      assign( f32_0, getXMMRegLane32F(rE, 0) );
9886      assign( f32_1, getXMMRegLane32F(rE, 1) );
9887      assign( f32_2, getXMMRegLane32F(rE, 2) );
9888      assign( f32_3, getXMMRegLane32F(rE, 3) );
9889      delta += 1;
9890      DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
9891   } else {
9892      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9893      assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
9894      assign( f32_1, loadLE(Ity_F32,
9895                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
9896      assign( f32_2, loadLE(Ity_F32,
9897                            binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
9898      assign( f32_3, loadLE(Ity_F32,
9899                            binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
9900      delta += alen;
9901      DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
9902   }
9903
9904   putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
9905   putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
9906   putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
9907   putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
9908   return delta;
9909}
9910
9911
9912static Long dis_CVTPD2PS_128 ( VexAbiInfo* vbi, Prefix pfx,
9913                               Long delta, Bool isAvx )
9914{
9915   IRTemp addr  = IRTemp_INVALID;
9916   Int    alen  = 0;
9917   HChar  dis_buf[50];
9918   UChar  modrm = getUChar(delta);
9919   UInt   rG    = gregOfRexRM(pfx,modrm);
9920   IRTemp argV  = newTemp(Ity_V128);
9921   IRTemp rmode = newTemp(Ity_I32);
9922   if (epartIsReg(modrm)) {
9923      UInt rE = eregOfRexRM(pfx,modrm);
9924      assign( argV, getXMMReg(rE) );
9925      delta += 1;
9926      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
9927          nameXMMReg(rE), nameXMMReg(rG));
9928   } else {
9929      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9930      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9931      delta += alen;
9932      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
9933          dis_buf, nameXMMReg(rG) );
9934   }
9935
9936   assign( rmode, get_sse_roundingmode() );
9937   IRTemp t0 = newTemp(Ity_F64);
9938   IRTemp t1 = newTemp(Ity_F64);
9939   assign( t0, unop(Iop_ReinterpI64asF64,
9940                    unop(Iop_V128to64, mkexpr(argV))) );
9941   assign( t1, unop(Iop_ReinterpI64asF64,
9942                    unop(Iop_V128HIto64, mkexpr(argV))) );
9943
9944#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
9945   putXMMRegLane32(  rG, 3, mkU32(0) );
9946   putXMMRegLane32(  rG, 2, mkU32(0) );
9947   putXMMRegLane32F( rG, 1, CVT(t1) );
9948   putXMMRegLane32F( rG, 0, CVT(t0) );
9949#  undef CVT
9950   if (isAvx)
9951      putYMMRegLane128( rG, 1, mkV128(0) );
9952
9953   return delta;
9954}
9955
9956
9957static Long dis_CVTxPS2DQ_128 ( VexAbiInfo* vbi, Prefix pfx,
9958                                Long delta, Bool isAvx, Bool r2zero )
9959{
9960   IRTemp addr  = IRTemp_INVALID;
9961   Int    alen  = 0;
9962   HChar  dis_buf[50];
9963   UChar  modrm = getUChar(delta);
9964   IRTemp argV  = newTemp(Ity_V128);
9965   IRTemp rmode = newTemp(Ity_I32);
9966   UInt   rG    = gregOfRexRM(pfx,modrm);
9967   IRTemp t0, t1, t2, t3;
9968
9969   if (epartIsReg(modrm)) {
9970      UInt rE = eregOfRexRM(pfx,modrm);
9971      assign( argV, getXMMReg(rE) );
9972      delta += 1;
9973      DIP("%scvt%sps2dq %s,%s\n",
9974          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
9975   } else {
9976      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9977      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9978      delta += alen;
9979      DIP("%scvt%sps2dq %s,%s\n",
9980          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
9981   }
9982
9983   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
9984                         : get_sse_roundingmode() );
9985   t0 = t1 = t2 = t3 = IRTemp_INVALID;
9986   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
9987   /* This is less than ideal.  If it turns out to be a performance
9988      bottleneck it can be improved. */
9989#  define CVT(_t)                             \
9990      binop( Iop_F64toI32S,                   \
9991             mkexpr(rmode),                   \
9992             unop( Iop_F32toF64,              \
9993                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
9994
9995   putXMMRegLane32( rG, 3, CVT(t3) );
9996   putXMMRegLane32( rG, 2, CVT(t2) );
9997   putXMMRegLane32( rG, 1, CVT(t1) );
9998   putXMMRegLane32( rG, 0, CVT(t0) );
9999#  undef CVT
10000   if (isAvx)
10001      putYMMRegLane128( rG, 1, mkV128(0) );
10002
10003   return delta;
10004}
10005
10006
10007static Long dis_CVTxPS2DQ_256 ( VexAbiInfo* vbi, Prefix pfx,
10008                                Long delta, Bool r2zero )
10009{
10010   IRTemp addr  = IRTemp_INVALID;
10011   Int    alen  = 0;
10012   HChar  dis_buf[50];
10013   UChar  modrm = getUChar(delta);
10014   IRTemp argV  = newTemp(Ity_V256);
10015   IRTemp rmode = newTemp(Ity_I32);
10016   UInt   rG    = gregOfRexRM(pfx,modrm);
10017   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
10018
10019   if (epartIsReg(modrm)) {
10020      UInt rE = eregOfRexRM(pfx,modrm);
10021      assign( argV, getYMMReg(rE) );
10022      delta += 1;
10023      DIP("vcvt%sps2dq %s,%s\n",
10024          r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
10025   } else {
10026      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10027      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
10028      delta += alen;
10029      DIP("vcvt%sps2dq %s,%s\n",
10030          r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
10031   }
10032
10033   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
10034                         : get_sse_roundingmode() );
10035   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
10036   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
10037   /* This is less than ideal.  If it turns out to be a performance
10038      bottleneck it can be improved. */
10039#  define CVT(_t)                             \
10040      binop( Iop_F64toI32S,                   \
10041             mkexpr(rmode),                   \
10042             unop( Iop_F32toF64,              \
10043                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
10044
10045   putYMMRegLane32( rG, 7, CVT(t7) );
10046   putYMMRegLane32( rG, 6, CVT(t6) );
10047   putYMMRegLane32( rG, 5, CVT(t5) );
10048   putYMMRegLane32( rG, 4, CVT(t4) );
10049   putYMMRegLane32( rG, 3, CVT(t3) );
10050   putYMMRegLane32( rG, 2, CVT(t2) );
10051   putYMMRegLane32( rG, 1, CVT(t1) );
10052   putYMMRegLane32( rG, 0, CVT(t0) );
10053#  undef CVT
10054
10055   return delta;
10056}
10057
10058
10059static Long dis_CVTxPD2DQ_128 ( VexAbiInfo* vbi, Prefix pfx,
10060                                Long delta, Bool isAvx, Bool r2zero )
10061{
10062   IRTemp addr  = IRTemp_INVALID;
10063   Int    alen  = 0;
10064   HChar  dis_buf[50];
10065   UChar  modrm = getUChar(delta);
10066   IRTemp argV  = newTemp(Ity_V128);
10067   IRTemp rmode = newTemp(Ity_I32);
10068   UInt   rG    = gregOfRexRM(pfx,modrm);
10069   IRTemp t0, t1;
10070
10071   if (epartIsReg(modrm)) {
10072      UInt rE = eregOfRexRM(pfx,modrm);
10073      assign( argV, getXMMReg(rE) );
10074      delta += 1;
10075      DIP("%scvt%spd2dq %s,%s\n",
10076          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
10077   } else {
10078      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10079      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10080      delta += alen;
10081      DIP("%scvt%spd2dqx %s,%s\n",
10082          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
10083   }
10084
10085   if (r2zero) {
10086      assign(rmode, mkU32((UInt)Irrm_ZERO) );
10087   } else {
10088      assign( rmode, get_sse_roundingmode() );
10089   }
10090
10091   t0 = newTemp(Ity_F64);
10092   t1 = newTemp(Ity_F64);
10093   assign( t0, unop(Iop_ReinterpI64asF64,
10094                    unop(Iop_V128to64, mkexpr(argV))) );
10095   assign( t1, unop(Iop_ReinterpI64asF64,
10096                    unop(Iop_V128HIto64, mkexpr(argV))) );
10097
10098#  define CVT(_t)  binop( Iop_F64toI32S,                   \
10099                          mkexpr(rmode),                   \
10100                          mkexpr(_t) )
10101
10102   putXMMRegLane32( rG, 3, mkU32(0) );
10103   putXMMRegLane32( rG, 2, mkU32(0) );
10104   putXMMRegLane32( rG, 1, CVT(t1) );
10105   putXMMRegLane32( rG, 0, CVT(t0) );
10106#  undef CVT
10107   if (isAvx)
10108      putYMMRegLane128( rG, 1, mkV128(0) );
10109
10110   return delta;
10111}
10112
10113
10114static Long dis_CVTxPD2DQ_256 ( VexAbiInfo* vbi, Prefix pfx,
10115                                Long delta, Bool r2zero )
10116{
10117   IRTemp addr  = IRTemp_INVALID;
10118   Int    alen  = 0;
10119   HChar  dis_buf[50];
10120   UChar  modrm = getUChar(delta);
10121   IRTemp argV  = newTemp(Ity_V256);
10122   IRTemp rmode = newTemp(Ity_I32);
10123   UInt   rG    = gregOfRexRM(pfx,modrm);
10124   IRTemp t0, t1, t2, t3;
10125
10126   if (epartIsReg(modrm)) {
10127      UInt rE = eregOfRexRM(pfx,modrm);
10128      assign( argV, getYMMReg(rE) );
10129      delta += 1;
10130      DIP("vcvt%spd2dq %s,%s\n",
10131          r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
10132   } else {
10133      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10134      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
10135      delta += alen;
10136      DIP("vcvt%spd2dqy %s,%s\n",
10137          r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
10138   }
10139
10140   if (r2zero) {
10141      assign(rmode, mkU32((UInt)Irrm_ZERO) );
10142   } else {
10143      assign( rmode, get_sse_roundingmode() );
10144   }
10145
10146   t0 = IRTemp_INVALID;
10147   t1 = IRTemp_INVALID;
10148   t2 = IRTemp_INVALID;
10149   t3 = IRTemp_INVALID;
10150   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
10151
10152#  define CVT(_t)  binop( Iop_F64toI32S,                   \
10153                          mkexpr(rmode),                   \
10154                          unop( Iop_ReinterpI64asF64,      \
10155                                mkexpr(_t) ) )
10156
10157   putXMMRegLane32( rG, 3, CVT(t3) );
10158   putXMMRegLane32( rG, 2, CVT(t2) );
10159   putXMMRegLane32( rG, 1, CVT(t1) );
10160   putXMMRegLane32( rG, 0, CVT(t0) );
10161#  undef CVT
10162   putYMMRegLane128( rG, 1, mkV128(0) );
10163
10164   return delta;
10165}
10166
10167
10168static Long dis_CVTDQ2PS_128 ( VexAbiInfo* vbi, Prefix pfx,
10169                               Long delta, Bool isAvx )
10170{
10171   IRTemp addr  = IRTemp_INVALID;
10172   Int    alen  = 0;
10173   HChar  dis_buf[50];
10174   UChar  modrm = getUChar(delta);
10175   IRTemp argV  = newTemp(Ity_V128);
10176   IRTemp rmode = newTemp(Ity_I32);
10177   UInt   rG    = gregOfRexRM(pfx,modrm);
10178   IRTemp t0, t1, t2, t3;
10179
10180   if (epartIsReg(modrm)) {
10181      UInt rE = eregOfRexRM(pfx,modrm);
10182      assign( argV, getXMMReg(rE) );
10183      delta += 1;
10184      DIP("%scvtdq2ps %s,%s\n",
10185          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
10186   } else {
10187      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10188      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10189      delta += alen;
10190      DIP("%scvtdq2ps %s,%s\n",
10191          isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
10192   }
10193
10194   assign( rmode, get_sse_roundingmode() );
10195   t0 = IRTemp_INVALID;
10196   t1 = IRTemp_INVALID;
10197   t2 = IRTemp_INVALID;
10198   t3 = IRTemp_INVALID;
10199   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
10200
10201#  define CVT(_t)  binop( Iop_F64toF32,                    \
10202                          mkexpr(rmode),                   \
10203                          unop(Iop_I32StoF64,mkexpr(_t)))
10204
10205   putXMMRegLane32F( rG, 3, CVT(t3) );
10206   putXMMRegLane32F( rG, 2, CVT(t2) );
10207   putXMMRegLane32F( rG, 1, CVT(t1) );
10208   putXMMRegLane32F( rG, 0, CVT(t0) );
10209#  undef CVT
10210   if (isAvx)
10211      putYMMRegLane128( rG, 1, mkV128(0) );
10212
10213   return delta;
10214}
10215
10216static Long dis_CVTDQ2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
10217                               Long delta )
10218{
10219   IRTemp addr   = IRTemp_INVALID;
10220   Int    alen   = 0;
10221   HChar  dis_buf[50];
10222   UChar  modrm  = getUChar(delta);
10223   IRTemp argV   = newTemp(Ity_V256);
10224   IRTemp rmode  = newTemp(Ity_I32);
10225   UInt   rG     = gregOfRexRM(pfx,modrm);
10226   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
10227
10228   if (epartIsReg(modrm)) {
10229      UInt rE = eregOfRexRM(pfx,modrm);
10230      assign( argV, getYMMReg(rE) );
10231      delta += 1;
10232      DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
10233   } else {
10234      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10235      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
10236      delta += alen;
10237      DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
10238   }
10239
10240   assign( rmode, get_sse_roundingmode() );
10241   t0 = IRTemp_INVALID;
10242   t1 = IRTemp_INVALID;
10243   t2 = IRTemp_INVALID;
10244   t3 = IRTemp_INVALID;
10245   t4 = IRTemp_INVALID;
10246   t5 = IRTemp_INVALID;
10247   t6 = IRTemp_INVALID;
10248   t7 = IRTemp_INVALID;
10249   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
10250
10251#  define CVT(_t)  binop( Iop_F64toF32,                    \
10252                          mkexpr(rmode),                   \
10253                          unop(Iop_I32StoF64,mkexpr(_t)))
10254
10255   putYMMRegLane32F( rG, 7, CVT(t7) );
10256   putYMMRegLane32F( rG, 6, CVT(t6) );
10257   putYMMRegLane32F( rG, 5, CVT(t5) );
10258   putYMMRegLane32F( rG, 4, CVT(t4) );
10259   putYMMRegLane32F( rG, 3, CVT(t3) );
10260   putYMMRegLane32F( rG, 2, CVT(t2) );
10261   putYMMRegLane32F( rG, 1, CVT(t1) );
10262   putYMMRegLane32F( rG, 0, CVT(t0) );
10263#  undef CVT
10264
10265   return delta;
10266}
10267
10268
10269static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx,
10270                               Long delta, Bool isAvx )
10271{
   /* UInt amd64g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
10273   UChar modrm = getUChar(delta);
10274   vassert(epartIsReg(modrm)); /* ensured by caller */
10275   UInt   rE = eregOfRexRM(pfx,modrm);
10276   UInt   rG = gregOfRexRM(pfx,modrm);
10277   IRTemp t0 = newTemp(Ity_I64);
10278   IRTemp t1 = newTemp(Ity_I64);
10279   IRTemp t5 = newTemp(Ity_I64);
10280   assign(t0, getXMMRegLane64(rE, 0));
10281   assign(t1, getXMMRegLane64(rE, 1));
10282   assign(t5, mkIRExprCCall( Ity_I64, 0/*regparms*/,
10283                             "amd64g_calculate_sse_pmovmskb",
10284                             &amd64g_calculate_sse_pmovmskb,
10285                             mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
10286   putIReg32(rG, unop(Iop_64to32,mkexpr(t5)));
10287   DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
10288       nameIReg32(rG));
10289   delta += 1;
10290   return delta;
10291}
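
/* The helper computes the architected PMOVMSKB result: bit i of the
   32-bit destination is the most significant bit of byte i of the
   XMM source (bytes 0..7 from the low half, 8..15 from the high
   half), and bits 16..31 are zero. */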
10292
10293
10294/* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
10295   relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
10296/* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
10297static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
10298{
10299   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10300   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10301   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
10302   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10303   IRTemp res = newTemp(Ity_V128);
10304   assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
10305                     : mkV128from32s( s1, d1, s0, d0 ));
10306   return res;
10307}
10308
10309
10310/* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
10311/* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
10312static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
10313{
10314   IRTemp s1 = newTemp(Ity_I64);
10315   IRTemp s0 = newTemp(Ity_I64);
10316   IRTemp d1 = newTemp(Ity_I64);
10317   IRTemp d0 = newTemp(Ity_I64);
10318   assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10319   assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10320   assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10321   assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10322   IRTemp res = newTemp(Ity_V128);
10323   assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
10324                    : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
10325   return res;
10326}
10327
10328
10329/* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
10330   Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
10331   or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
10332   way. */
10333static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
10334{
10335   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10336   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10337   breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
10338   breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
10339   IRTemp res = newTemp(Ity_V256);
10340   assign(res, xIsH
10341               ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
10342                                            mkexpr(s1), mkexpr(d1))
10343               : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
10344                                            mkexpr(s0), mkexpr(d0)));
10345   return res;
10346}
10347
10348
10349/* FIXME: this is really bad.  Surely can do something better here?
10350   One observation is that the steering in the upper and lower 128 bit
10351   halves is the same as with math_UNPCKxPS_128, so we simply split
10352   into two halves, and use that.  Consequently any improvement in
10353   math_UNPCKxPS_128 (probably, to use interleave-style primops)
10354   benefits this too. */
10355static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
10356{
10357   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10358   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10359   breakupV256toV128s( sV, &sVhi, &sVlo );
10360   breakupV256toV128s( dV, &dVhi, &dVlo );
10361   IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
10362   IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
10363   IRTemp rV   = newTemp(Ity_V256);
10364   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10365   return rV;
10366}
10367
10368
10369static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10370{
10371   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10372   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10373   vassert(imm8 < 256);
10374
10375   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
10376   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10377
10378#  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
10379#  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
10380   IRTemp res = newTemp(Ity_V128);
10381   assign(res,
10382          mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
10383                         SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
10384#  undef SELD
10385#  undef SELS
10386   return res;
10387}
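
/* Example: imm8 == 0x4E (binary 01 00 11 10) yields
   mkV128from32s(s1, s0, d3, d2): the two low result lanes are always
   steered from dV and the two high lanes from sV, per the SHUFPS
   semantics. */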
10388
10389
10390/* 256-bit SHUFPS appears to steer each of the 128-bit halves
10391   identically.  Hence do the clueless thing and use math_SHUFPS_128
10392   twice. */
10393static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
10394{
10395   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10396   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10397   breakupV256toV128s( sV, &sVhi, &sVlo );
10398   breakupV256toV128s( dV, &dVhi, &dVlo );
10399   IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
10400   IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
10401   IRTemp rV   = newTemp(Ity_V256);
10402   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10403   return rV;
10404}
10405
10406
10407static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10408{
10409   IRTemp s1 = newTemp(Ity_I64);
10410   IRTemp s0 = newTemp(Ity_I64);
10411   IRTemp d1 = newTemp(Ity_I64);
10412   IRTemp d0 = newTemp(Ity_I64);
10413
10414   assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10415   assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10416   assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10417   assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10418
10419#  define SELD(n) mkexpr((n)==0 ? d0 : d1)
10420#  define SELS(n) mkexpr((n)==0 ? s0 : s1)
10421
10422   IRTemp res = newTemp(Ity_V128);
10423   assign(res, binop( Iop_64HLtoV128,
10424                      SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
10425
10426#  undef SELD
10427#  undef SELS
10428   return res;
10429}
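
/* Example: imm8 == 1 yields binop(Iop_64HLtoV128, s0, d1): bit 0 of
   imm8 selects which half of dV goes to the low result lane, and
   bit 1 selects which half of sV goes to the high result lane. */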
10430
10431
10432static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
10433{
10434   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10435   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10436   breakupV256toV128s( sV, &sVhi, &sVlo );
10437   breakupV256toV128s( dV, &dVhi, &dVlo );
10438   IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
10439   IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
10440   IRTemp rV   = newTemp(Ity_V256);
10441   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10442   return rV;
10443}
10444
10445
10446static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10447{
10448   UShort imm8_mask_16;
10449   IRTemp imm8_mask = newTemp(Ity_V128);
10450
10451   switch( imm8 & 3 ) {
10452      case 0:  imm8_mask_16 = 0x0000; break;
10453      case 1:  imm8_mask_16 = 0x00FF; break;
10454      case 2:  imm8_mask_16 = 0xFF00; break;
10455      case 3:  imm8_mask_16 = 0xFFFF; break;
10456      default: vassert(0);            break;
10457   }
10458   assign( imm8_mask, mkV128( imm8_mask_16 ) );
10459
10460   IRTemp res = newTemp(Ity_V128);
10461   assign ( res, binop( Iop_OrV128,
10462                        binop( Iop_AndV128, mkexpr(sV),
10463                                            mkexpr(imm8_mask) ),
10464                        binop( Iop_AndV128, mkexpr(dV),
10465                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
10466   return res;
10467}
10468
10469
10470static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
10471{
10472   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10473   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10474   breakupV256toV128s( sV, &sVhi, &sVlo );
10475   breakupV256toV128s( dV, &dVhi, &dVlo );
10476   IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
10477   IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
10478   IRTemp rV   = newTemp(Ity_V256);
10479   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10480   return rV;
10481}
10482
10483
10484static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10485{
10486   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
10487                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
10488                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
10489                             0xFFFF };
10490   IRTemp imm8_mask = newTemp(Ity_V128);
10491   assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
10492
10493   IRTemp res = newTemp(Ity_V128);
10494   assign ( res, binop( Iop_OrV128,
10495                        binop( Iop_AndV128, mkexpr(sV),
10496                                            mkexpr(imm8_mask) ),
10497                        binop( Iop_AndV128, mkexpr(dV),
10498                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
10499   return res;
10500}
10501
10502
10503static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
10504{
10505   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10506   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10507   breakupV256toV128s( sV, &sVhi, &sVlo );
10508   breakupV256toV128s( dV, &dVhi, &dVlo );
10509   IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
10510   IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
10511   IRTemp rV   = newTemp(Ity_V256);
10512   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10513   return rV;
10514}
10515
10516
10517static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10518{
   /* Make imm16 be a 16-bit version of imm8, formed by duplicating
      each bit of imm8. */
10521   Int i;
10522   UShort imm16 = 0;
10523   for (i = 0; i < 8; i++) {
10524      if (imm8 & (1 << i))
10525         imm16 |= (3 << (2*i));
10526   }
10527   IRTemp imm16_mask = newTemp(Ity_V128);
10528   assign( imm16_mask, mkV128( imm16 ));
10529
10530   IRTemp res = newTemp(Ity_V128);
10531   assign ( res, binop( Iop_OrV128,
10532                        binop( Iop_AndV128, mkexpr(sV),
10533                                            mkexpr(imm16_mask) ),
10534                        binop( Iop_AndV128, mkexpr(dV),
10535                               unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
10536   return res;
10537}
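
/* Example: imm8 == 0xA5 (binary 10100101) expands to imm16 == 0xCC33.
   Each imm8 bit is doubled because the mkV128 mask operates in units
   of bytes, and each bit of imm8 controls one whole 16-bit lane. */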
10538
10539
10540static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
10541{
10542   /* This is a really poor translation -- could be improved if
10543      performance critical */
10544   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10545   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10546   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
10547   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10548   IRTemp res = newTemp(Ity_V128);
10549   assign(res, binop(Iop_64HLtoV128,
10550                     binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
10551                     binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
10552   return res;
10553}
10554
10555
10556static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
10557{
10558   /* This is a really poor translation -- could be improved if
10559      performance critical */
10560   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10561   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10562   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
10563   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10564   IRTemp res = newTemp(Ity_V128);
10565   assign(res, binop(Iop_64HLtoV128,
10566                     binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
10567                     binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
10568   return res;
10569}
10570
10571
10572static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
10573{
10574   IRTemp sVhi, sVlo, dVhi, dVlo;
10575   IRTemp resHi = newTemp(Ity_I64);
10576   IRTemp resLo = newTemp(Ity_I64);
10577   sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
10578   breakupV128to64s( sV, &sVhi, &sVlo );
10579   breakupV128to64s( dV, &dVhi, &dVlo );
10580   assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
10581                                "amd64g_calculate_mmx_pmaddwd",
10582                                &amd64g_calculate_mmx_pmaddwd,
10583                                mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
10584   assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
10585                                "amd64g_calculate_mmx_pmaddwd",
10586                                &amd64g_calculate_mmx_pmaddwd,
10587                                mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
10588   IRTemp res = newTemp(Ity_V128);
10589   assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo))) ;
10590   return res;
10591}
10592
10593
10594static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
10595{
10596   IRTemp addV = newTemp(Ity_V128);
10597   IRTemp subV = newTemp(Ity_V128);
10598   IRTemp a1   = newTemp(Ity_I64);
10599   IRTemp s0   = newTemp(Ity_I64);
10600
10601   assign( addV, binop(Iop_Add64Fx2, mkexpr(dV), mkexpr(sV)) );
10602   assign( subV, binop(Iop_Sub64Fx2, mkexpr(dV), mkexpr(sV)) );
10603
10604   assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
10605   assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
10606
10607   IRTemp res = newTemp(Ity_V128);
10608   assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
10609   return res;
10610}
10611
10612
10613static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
10614{
10615   IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
10616   IRTemp addV = newTemp(Ity_V256);
10617   IRTemp subV = newTemp(Ity_V256);
10618   a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
10619
10620   assign( addV, binop(Iop_Add64Fx4, mkexpr(dV), mkexpr(sV)) );
10621   assign( subV, binop(Iop_Sub64Fx4, mkexpr(dV), mkexpr(sV)) );
10622
10623   breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
10624   breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
10625
10626   IRTemp res = newTemp(Ity_V256);
10627   assign( res, mkV256from64s( a3, s2, a1, s0 ) );
10628   return res;
10629}
10630
10631
10632static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
10633{
10634   IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
10635   IRTemp addV = newTemp(Ity_V128);
10636   IRTemp subV = newTemp(Ity_V128);
10637   a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
10638
10639   assign( addV, binop(Iop_Add32Fx4, mkexpr(dV), mkexpr(sV)) );
10640   assign( subV, binop(Iop_Sub32Fx4, mkexpr(dV), mkexpr(sV)) );
10641
10642   breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
10643   breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
10644
10645   IRTemp res = newTemp(Ity_V128);
10646   assign( res, mkV128from32s( a3, s2, a1, s0 ) );
10647   return res;
10648}
10649
10650
10651static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
10652{
10653   IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
10654   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
10655   IRTemp addV = newTemp(Ity_V256);
10656   IRTemp subV = newTemp(Ity_V256);
10657   a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
10658   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
10659
10660   assign( addV, binop(Iop_Add32Fx8, mkexpr(dV), mkexpr(sV)) );
10661   assign( subV, binop(Iop_Sub32Fx8, mkexpr(dV), mkexpr(sV)) );
10662
10663   breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
10664   breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
10665
10666   IRTemp res = newTemp(Ity_V256);
10667   assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
10668   return res;
10669}
10670
10671
10672/* Handle 128 bit PSHUFLW and PSHUFHW. */
10673static Long dis_PSHUFxW_128 ( VexAbiInfo* vbi, Prefix pfx,
10674                              Long delta, Bool isAvx, Bool xIsH )
10675{
10676   IRTemp addr  = IRTemp_INVALID;
10677   Int    alen  = 0;
10678   HChar  dis_buf[50];
10679   UChar  modrm = getUChar(delta);
10680   UInt   rG = gregOfRexRM(pfx,modrm);
10681   UInt   imm8;
10682   IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
10683   s3 = s2 = s1 = s0 = IRTemp_INVALID;
10684   sV    = newTemp(Ity_V128);
10685   dV    = newTemp(Ity_V128);
10686   sVmut = newTemp(Ity_I64);
10687   dVmut = newTemp(Ity_I64);
10688   sVcon = newTemp(Ity_I64);
10689   if (epartIsReg(modrm)) {
10690      UInt rE = eregOfRexRM(pfx,modrm);
10691      assign( sV, getXMMReg(rE) );
10692      imm8 = (UInt)getUChar(delta+1);
10693      delta += 1+1;
10694      DIP("%spshuf%cw $%u,%s,%s\n",
10695          isAvx ? "v" : "", xIsH ? 'h' : 'l',
10696          imm8, nameXMMReg(rE), nameXMMReg(rG));
10697   } else {
10698      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
10699      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10700      imm8 = (UInt)getUChar(delta+alen);
10701      delta += alen+1;
10702      DIP("%spshuf%cw $%u,%s,%s\n",
10703          isAvx ? "v" : "", xIsH ? 'h' : 'l',
10704          imm8, dis_buf, nameXMMReg(rG));
10705   }
10706
10707   /* Get the to-be-changed (mut) and unchanging (con) bits of the
10708      source. */
10709   assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
10710   assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );
10711
10712   breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
10713#  define SEL(n) \
10714             ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
10715   assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
10716                              SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
10717#  undef SEL
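   /* Worked example (derived from the selectors above): for
      pshuflw $0x1B,%xmm1,%xmm0 (xIsH==False), imm8 = 0x1B = 00 01 10 11b,
      so word lanes 3..0 of dVmut receive s0,s1,s2,s3 -- the four low
      words are reversed -- while the high 64 bits pass through
      unchanged via sVcon. */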
10718
10719   assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
10720                   : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );
10721
10722   (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
10723   return delta;
10724}
10725
10726
10727static Long dis_PEXTRW_128_EregOnly_toG ( VexAbiInfo* vbi, Prefix pfx,
10728                                          Long delta, Bool isAvx )
10729{
10730   Long   deltaIN = delta;
10731   UChar  modrm   = getUChar(delta);
10732   UInt   rG      = gregOfRexRM(pfx,modrm);
10733   IRTemp sV      = newTemp(Ity_V128);
10734   IRTemp d16     = newTemp(Ity_I16);
10735   UInt   imm8;
10736   IRTemp s0, s1, s2, s3;
10737   if (epartIsReg(modrm)) {
10738      UInt rE = eregOfRexRM(pfx,modrm);
10739      assign(sV, getXMMReg(rE));
10740      imm8 = getUChar(delta+1) & 7;
10741      delta += 1+1;
10742      DIP("%spextrw $%d,%s,%s\n", isAvx ? "v" : "",
10743          (Int)imm8, nameXMMReg(rE), nameIReg32(rG));
10744   } else {
10745      /* The memory case is disallowed, apparently. */
10746      return deltaIN; /* FAIL */
10747   }
10748   s3 = s2 = s1 = s0 = IRTemp_INVALID;
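   /* Split the source into four 32-bit lanes; imm8's upper bits pick
      the lane and its low bit picks the 16-bit half of that lane. */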
10749   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10750   switch (imm8) {
10751      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
10752      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
10753      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
10754      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
10755      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
10756      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
10757      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
10758      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
10759      default: vassert(0);
10760   }
10761   putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
10762   return delta;
10763}
10764
10765
10766static Long dis_CVTDQ2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
10767                               Long delta, Bool isAvx )
10768{
10769   IRTemp addr  = IRTemp_INVALID;
10770   Int    alen  = 0;
10771   HChar  dis_buf[50];
10772   UChar  modrm = getUChar(delta);
10773   IRTemp arg64 = newTemp(Ity_I64);
10774   UInt   rG    = gregOfRexRM(pfx,modrm);
10775   HChar* mbV   = isAvx ? "v" : "";
10776   if (epartIsReg(modrm)) {
10777      UInt rE = eregOfRexRM(pfx,modrm);
10778      assign( arg64, getXMMRegLane64(rE, 0) );
10779      delta += 1;
10780      DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
10781   } else {
10782      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10783      assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
10784      delta += alen;
10785      DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
10786   }
10787   putXMMRegLane64F(
10788      rG, 0,
10789      unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
10790   );
10791   putXMMRegLane64F(
10792      rG, 1,
10793      unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
10794   );
10795   if (isAvx)
10796      putYMMRegLane128(rG, 1, mkV128(0));
10797   return delta;
10798}
10799
10800
10801static Long dis_STMXCSR ( VexAbiInfo* vbi, Prefix pfx,
10802                          Long delta, Bool isAvx )
10803{
10804   IRTemp addr  = IRTemp_INVALID;
10805   Int    alen  = 0;
10806   HChar  dis_buf[50];
10807   UChar  modrm = getUChar(delta);
10808   vassert(!epartIsReg(modrm)); /* ensured by caller */
10809   vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */
10810
10811   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10812   delta += alen;
10813
10814   /* Fake up a native SSE mxcsr word.  The only thing it depends on
10815      is SSEROUND[1:0], so call a clean helper to cook it up.
10816   */
10817   /* ULong amd64g_create_mxcsr ( ULong sseround ) */
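   /* A sketch of what that helper plausibly looks like (an assumption
      for illustration; the real definition lives in the helpers file,
      not here):
         ULong amd64g_create_mxcsr ( ULong sseround ) {
            sseround &= 3;
            return 0x1F80 | (sseround << 13);
         }
      0x1F80 is the MXCSR reset value (all exceptions masked), and
      bits 14:13 hold the rounding-control field, which conveniently
      uses the same encoding as SSEROUND. */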
10818   DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
10819   storeLE(
10820      mkexpr(addr),
10821      unop(Iop_64to32,
10822           mkIRExprCCall(
10823              Ity_I64, 0/*regp*/,
10824              "amd64g_create_mxcsr", &amd64g_create_mxcsr,
10825              mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
10826           )
10827      )
10828   );
10829   return delta;
10830}
10831
10832
10833static Long dis_LDMXCSR ( VexAbiInfo* vbi, Prefix pfx,
10834                          Long delta, Bool isAvx )
10835{
10836   IRTemp addr  = IRTemp_INVALID;
10837   Int    alen  = 0;
10838   HChar  dis_buf[50];
10839   UChar  modrm = getUChar(delta);
10840   vassert(!epartIsReg(modrm)); /* ensured by caller */
10841   vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */
10842
10843   IRTemp t64 = newTemp(Ity_I64);
10844   IRTemp ew  = newTemp(Ity_I32);
10845
10846   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10847   delta += alen;
10848   DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
10849
10850   /* The only thing we observe in %mxcsr is the rounding mode.
10851      Therefore, pass the 32-bit value (SSE native-format control
10852      word) to a clean helper, getting back a 64-bit value, the
10853      lower half of which is the SSEROUND value to store, and the
10854      upper half of which is the emulation-warning token which may
10855      be generated.
10856   */
10857   /* ULong amd64g_check_ldmxcsr ( ULong ); */
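   /* A sketch of the checker's likely shape (an assumption for
      illustration; all this code relies on is the packing convention,
      i.e. warning token in the top 32 bits, SSEROUND in the bottom 32):
         ULong amd64g_check_ldmxcsr ( ULong mxcsr ) {
            ULong rmode = (mxcsr >> 13) & 3;
            ULong ew    = EmWarn_NONE;
            if ((mxcsr & 0x1F80) != 0x1F80)
               ew = EmWarn_X86_sseExns;
            return (ew << 32) | rmode;
         }
      Here rmode is the RC field (bits 14:13, same encoding as
      SSEROUND) and the 0x1F80 test detects unmasked exceptions. */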
10858   assign( t64, mkIRExprCCall(
10859                   Ity_I64, 0/*regparms*/,
10860                   "amd64g_check_ldmxcsr",
10861                   &amd64g_check_ldmxcsr,
10862                   mkIRExprVec_1(
10863                      unop(Iop_32Uto64,
10864                           loadLE(Ity_I32, mkexpr(addr))
10865                      )
10866                   )
10867                )
10868         );
10869
10870   put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
10871   assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
10872   put_emwarn( mkexpr(ew) );
10873   /* Finally, if an emulation warning was reported, side-exit to
10874      the next insn, reporting the warning, so that Valgrind's
10875      dispatcher sees the warning. */
10876   stmt(
10877      IRStmt_Exit(
10878         binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
10879         Ijk_EmWarn,
10880         IRConst_U64(guest_RIP_bbstart+delta),
10881         OFFB_RIP
10882      )
10883   );
10884   return delta;
10885}
10886
10887
10888static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
10889{
10890   vassert(imm8 <= 7);  /* imm8 is unsigned, so >= 0 is vacuous */
10891
10892   // Create a V128 value which has the selected word in the
10893   // specified lane, and zeroes everywhere else.
10894   IRTemp tmp128    = newTemp(Ity_V128);
10895   IRTemp halfshift = newTemp(Ity_I64);
10896   assign(halfshift, binop(Iop_Shl64,
10897                           unop(Iop_16Uto64, mkexpr(u16)),
10898                           mkU8(16 * (imm8 & 3))));
10899   if (imm8 < 4) {
10900      assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
10901   } else {
10902      assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
10903   }
10904
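   /* mkV128 expands a 16-bit immediate so that bit i becomes byte i
      of the V128 (0x00 or 0xFF).  Clearing the two bits covering word
      lane imm8 therefore punches a 16-bit hole in v128, which tmp128
      then fills. */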
10905   UShort mask = ~(3 << (imm8 * 2));
10906   IRTemp res  = newTemp(Ity_V128);
10907   assign( res, binop(Iop_OrV128,
10908                      mkexpr(tmp128),
10909                      binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
10910   return res;
10911}
10912
10913
10914static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
10915{
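   /* Compute the 128-bit PSADBW as two MMX-style 64-bit PSADBWs, one
      per half, using the existing clean helper, and glue the two
      results back together. */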
10916   IRTemp s1, s0, d1, d0;
10917   s1 = s0 = d1 = d0 = IRTemp_INVALID;
10918
10919   breakupV128to64s( sV, &s1, &s0 );
10920   breakupV128to64s( dV, &d1, &d0 );
10921
10922   IRTemp res = newTemp(Ity_V128);
10923   assign( res,
10924           binop(Iop_64HLtoV128,
10925                 mkIRExprCCall(Ity_I64, 0/*regparms*/,
10926                               "amd64g_calculate_mmx_psadbw",
10927                               &amd64g_calculate_mmx_psadbw,
10928                               mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
10929                 mkIRExprCCall(Ity_I64, 0/*regparms*/,
10930                               "amd64g_calculate_mmx_psadbw",
10931                               &amd64g_calculate_mmx_psadbw,
10932                               mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
10933   return res;
10934}
10935
10936
10937static Long dis_MASKMOVDQU ( VexAbiInfo* vbi, Prefix pfx,
10938                             Long delta, Bool isAvx )
10939{
10940   IRTemp regD    = newTemp(Ity_V128);
10941   IRTemp mask    = newTemp(Ity_V128);
10942   IRTemp olddata = newTemp(Ity_V128);
10943   IRTemp newdata = newTemp(Ity_V128);
10944   IRTemp addr    = newTemp(Ity_I64);
10945   UChar  modrm   = getUChar(delta);
10946   UInt   rG      = gregOfRexRM(pfx,modrm);
10947   UInt   rE      = eregOfRexRM(pfx,modrm);
10948
10949   assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
10950   assign( regD, getXMMReg( rG ));
10951
10952   /* Unfortunately can't do the obvious thing with SarN8x16
10953      here since that can't be re-emitted as SSE2 code - no such
10954      insn. */
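   /* Instead, arithmetically shift each byte of the two 64-bit halves
      right by 7, so every byte of 'mask' becomes 0x00 or 0xFF
      according to the top bit of the corresponding byte of E -- a
      per-byte select mask. */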
10955   assign( mask,
10956           binop(Iop_64HLtoV128,
10957                 binop(Iop_SarN8x8,
10958                       getXMMRegLane64( rE, 1 ),
10959                       mkU8(7) ),
10960                 binop(Iop_SarN8x8,
10961                       getXMMRegLane64( rE, 0 ),
10962                       mkU8(7) ) ));
10963   assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
10964   assign( newdata, binop(Iop_OrV128,
10965                          binop(Iop_AndV128,
10966                                mkexpr(regD),
10967                                mkexpr(mask) ),
10968                          binop(Iop_AndV128,
10969                                mkexpr(olddata),
10970                                unop(Iop_NotV128, mkexpr(mask)))) );
10971   storeLE( mkexpr(addr), mkexpr(newdata) );
10972
10973   delta += 1;
10974   DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
10975       nameXMMReg(rE), nameXMMReg(rG) );
10976   return delta;
10977}
10978
10979
10980static Long dis_MOVMSKPS_128 ( VexAbiInfo* vbi, Prefix pfx,
10981                               Long delta, Bool isAvx )
10982{
10983   UChar modrm = getUChar(delta);
10984   UInt   rG   = gregOfRexRM(pfx,modrm);
10985   UInt   rE   = eregOfRexRM(pfx,modrm);
10986   IRTemp t0   = newTemp(Ity_I32);
10987   IRTemp t1   = newTemp(Ity_I32);
10988   IRTemp t2   = newTemp(Ity_I32);
10989   IRTemp t3   = newTemp(Ity_I32);
10990   delta += 1;
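   /* Shift lane i's sign bit down to bit position i and isolate it;
      OR-ing t0..t3 then assembles the 4-bit sign mask. */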
10991   assign( t0, binop( Iop_And32,
10992                      binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
10993                      mkU32(1) ));
10994   assign( t1, binop( Iop_And32,
10995                      binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
10996                      mkU32(2) ));
10997   assign( t2, binop( Iop_And32,
10998                      binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
10999                      mkU32(4) ));
11000   assign( t3, binop( Iop_And32,
11001                      binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
11002                      mkU32(8) ));
11003   putIReg32( rG, binop(Iop_Or32,
11004                        binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
11005                        binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
11006   DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
11007       nameXMMReg(rE), nameIReg32(rG));
11008   return delta;
11009}
11010
11011
11012static Long dis_MOVMSKPS_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
11013{
11014   UChar modrm = getUChar(delta);
11015   UInt   rG   = gregOfRexRM(pfx,modrm);
11016   UInt   rE   = eregOfRexRM(pfx,modrm);
11017   IRTemp t0   = newTemp(Ity_I32);
11018   IRTemp t1   = newTemp(Ity_I32);
11019   IRTemp t2   = newTemp(Ity_I32);
11020   IRTemp t3   = newTemp(Ity_I32);
11021   IRTemp t4   = newTemp(Ity_I32);
11022   IRTemp t5   = newTemp(Ity_I32);
11023   IRTemp t6   = newTemp(Ity_I32);
11024   IRTemp t7   = newTemp(Ity_I32);
11025   delta += 1;
11026   assign( t0, binop( Iop_And32,
11027                      binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
11028                      mkU32(1) ));
11029   assign( t1, binop( Iop_And32,
11030                      binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
11031                      mkU32(2) ));
11032   assign( t2, binop( Iop_And32,
11033                      binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
11034                      mkU32(4) ));
11035   assign( t3, binop( Iop_And32,
11036                      binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
11037                      mkU32(8) ));
11038   assign( t4, binop( Iop_And32,
11039                      binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
11040                      mkU32(16) ));
11041   assign( t5, binop( Iop_And32,
11042                      binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
11043                      mkU32(32) ));
11044   assign( t6, binop( Iop_And32,
11045                      binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
11046                      mkU32(64) ));
11047   assign( t7, binop( Iop_And32,
11048                      binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
11049                      mkU32(128) ));
11050   putIReg32( rG, binop(Iop_Or32,
11051                        binop(Iop_Or32,
11052                              binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
11053                              binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
11054                        binop(Iop_Or32,
11055                              binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
11056                              binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
11057   DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
11058   return delta;
11059}
11060
11061
11062static Long dis_MOVMSKPD_128 ( VexAbiInfo* vbi, Prefix pfx,
11063                               Long delta, Bool isAvx )
11064{
11065   UChar modrm = getUChar(delta);
11066   UInt   rG   = gregOfRexRM(pfx,modrm);
11067   UInt   rE   = eregOfRexRM(pfx,modrm);
11068   IRTemp t0   = newTemp(Ity_I32);
11069   IRTemp t1   = newTemp(Ity_I32);
11070   delta += 1;
11071   assign( t0, binop( Iop_And32,
11072                      binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
11073                      mkU32(1) ));
11074   assign( t1, binop( Iop_And32,
11075                      binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
11076                      mkU32(2) ));
11077   putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
11078   DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
11079       nameXMMReg(rE), nameIReg32(rG));
11080   return delta;
11081}
11082
11083
11084static Long dis_MOVMSKPD_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
11085{
11086   UChar modrm = getUChar(delta);
11087   UInt   rG   = gregOfRexRM(pfx,modrm);
11088   UInt   rE   = eregOfRexRM(pfx,modrm);
11089   IRTemp t0   = newTemp(Ity_I32);
11090   IRTemp t1   = newTemp(Ity_I32);
11091   IRTemp t2   = newTemp(Ity_I32);
11092   IRTemp t3   = newTemp(Ity_I32);
11093   delta += 1;
11094   assign( t0, binop( Iop_And32,
11095                      binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
11096                      mkU32(1) ));
11097   assign( t1, binop( Iop_And32,
11098                      binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
11099                      mkU32(2) ));
11100   assign( t2, binop( Iop_And32,
11101                      binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
11102                      mkU32(4) ));
11103   assign( t3, binop( Iop_And32,
11104                      binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
11105                      mkU32(8) ));
11106   putIReg32( rG, binop(Iop_Or32,
11107                        binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
11108                        binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
11109   DIP("vmovmskpd %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
11110   return delta;
11111}
11112
11113
11114/* Note, this also handles SSE(1) insns. */
11115__attribute__((noinline))
11116static
11117Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
11118                        VexAbiInfo* vbi,
11119                        Prefix pfx, Int sz, Long deltaIN,
11120                        DisResult* dres )
11121{
11122   IRTemp addr  = IRTemp_INVALID;
11123   IRTemp t0    = IRTemp_INVALID;
11124   IRTemp t1    = IRTemp_INVALID;
11125   IRTemp t2    = IRTemp_INVALID;
11126   IRTemp t3    = IRTemp_INVALID;
11127   IRTemp t4    = IRTemp_INVALID;
11128   IRTemp t5    = IRTemp_INVALID;
11129   IRTemp t6    = IRTemp_INVALID;
11130   UChar  modrm = 0;
11131   Int    alen  = 0;
11132   HChar  dis_buf[50];
11133
11134   *decode_OK = False;
11135
11136   Long   delta = deltaIN;
11137   UChar  opc   = getUChar(delta);
11138   delta++;
11139   switch (opc) {
11140
11141   case 0x10:
11142      if (have66noF2noF3(pfx)
11143          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11144         /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
11145         modrm = getUChar(delta);
11146         if (epartIsReg(modrm)) {
11147            putXMMReg( gregOfRexRM(pfx,modrm),
11148                       getXMMReg( eregOfRexRM(pfx,modrm) ));
11149            DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11150                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11151            delta += 1;
11152         } else {
11153            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11154            putXMMReg( gregOfRexRM(pfx,modrm),
11155                       loadLE(Ity_V128, mkexpr(addr)) );
11156            DIP("movupd %s,%s\n", dis_buf,
11157                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11158            delta += alen;
11159         }
11160         goto decode_success;
11161      }
11162      /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
11163         G (lo half xmm).  If E is mem, upper half of G is zeroed out.
11164         If E is reg, upper half of G is unchanged. */
11165      if (haveF2no66noF3(pfx)
11166          && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
11167         modrm = getUChar(delta);
11168         if (epartIsReg(modrm)) {
11169            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
11170                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
11171            DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11172                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
11173            delta += 1;
11174         } else {
11175            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11176            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
11177            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
11178                             loadLE(Ity_I64, mkexpr(addr)) );
11179            DIP("movsd %s,%s\n", dis_buf,
11180                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
11181            delta += alen;
11182         }
11183         goto decode_success;
11184      }
11185      /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
11186         (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
11187      if (haveF3no66noF2(pfx)
11188          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11189         modrm = getUChar(delta);
11190         if (epartIsReg(modrm)) {
11191            putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
11192                             getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
11193            DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11194                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
11195            delta += 1;
11196         } else {
11197            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11198            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
11199            putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
11200                             loadLE(Ity_I32, mkexpr(addr)) );
11201            DIP("movss %s,%s\n", dis_buf,
11202                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
11203            delta += alen;
11204         }
11205         goto decode_success;
11206      }
11207      /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
11208      if (haveNo66noF2noF3(pfx)
11209          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11210         modrm = getUChar(delta);
11211         if (epartIsReg(modrm)) {
11212            putXMMReg( gregOfRexRM(pfx,modrm),
11213                       getXMMReg( eregOfRexRM(pfx,modrm) ));
11214            DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11215                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11216            delta += 1;
11217         } else {
11218            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11219            putXMMReg( gregOfRexRM(pfx,modrm),
11220                       loadLE(Ity_V128, mkexpr(addr)) );
11221            DIP("movups %s,%s\n", dis_buf,
11222                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11223            delta += alen;
11224         }
11225         goto decode_success;
11226      }
11227      break;
11228
11229   case 0x11:
11230      /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
11231         or lo half xmm). */
11232      if (haveF2no66noF3(pfx)
11233          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11234         modrm = getUChar(delta);
11235         if (epartIsReg(modrm)) {
11236            putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
11237                             getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
11238            DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11239                                 nameXMMReg(eregOfRexRM(pfx,modrm)));
11240            delta += 1;
11241         } else {
11242            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11243            storeLE( mkexpr(addr),
11244                     getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
11245            DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11246                                 dis_buf);
11247            delta += alen;
11248         }
11249         goto decode_success;
11250      }
11251      /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
11252         or lo 1/4 xmm). */
11253      if (haveF3no66noF2(pfx) && sz == 4) {
11254         modrm = getUChar(delta);
11255         if (epartIsReg(modrm)) {
11256            /* fall through, we don't yet have a test case */
11257         } else {
11258            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11259            storeLE( mkexpr(addr),
11260                     getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
11261            DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11262                                 dis_buf);
11263            delta += alen;
11264            goto decode_success;
11265         }
11266      }
11267      /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
11268      if (have66noF2noF3(pfx)
11269          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11270         modrm = getUChar(delta);
11271         if (epartIsReg(modrm)) {
11272            putXMMReg( eregOfRexRM(pfx,modrm),
11273                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
11274            DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11275                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
11276            delta += 1;
11277         } else {
11278            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11279            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
11280            DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11281                                  dis_buf );
11282            delta += alen;
11283         }
11284         goto decode_success;
11285      }
11286      /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
11287      if (haveNo66noF2noF3(pfx)
11288          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11289         modrm = getUChar(delta);
11290         if (epartIsReg(modrm)) {
11291            /* fall through; awaiting test case */
11292         } else {
11293            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11294            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
11295            DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11296                                  dis_buf );
11297            delta += alen;
11298            goto decode_success;
11299         }
11300      }
11301      break;
11302
11303   case 0x12:
11304      /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
11305      /* Identical to MOVLPS ? */
11306      if (have66noF2noF3(pfx)
11307          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11308         modrm = getUChar(delta);
11309         if (epartIsReg(modrm)) {
11310            /* fall through; apparently reg-reg is not possible */
11311         } else {
11312            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11313            delta += alen;
11314            putXMMRegLane64( gregOfRexRM(pfx,modrm),
11315                             0/*lower lane*/,
11316                             loadLE(Ity_I64, mkexpr(addr)) );
11317            DIP("movlpd %s, %s\n",
11318                dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
11319            goto decode_success;
11320         }
11321      }
11322      /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
11323      /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
11324      if (haveNo66noF2noF3(pfx)
11325          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11326         modrm = getUChar(delta);
11327         if (epartIsReg(modrm)) {
11328            delta += 1;
11329            putXMMRegLane64( gregOfRexRM(pfx,modrm),
11330                             0/*lower lane*/,
11331                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
11332            DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11333                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
11334         } else {
11335            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11336            delta += alen;
11337            putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
11338                             loadLE(Ity_I64, mkexpr(addr)) );
11339            DIP("movlps %s, %s\n",
11340                dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
11341         }
11342         goto decode_success;
11343      }
11344      break;
11345
11346   case 0x13:
11347      /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
11348      if (haveNo66noF2noF3(pfx)
11349          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11350         modrm = getUChar(delta);
11351         if (!epartIsReg(modrm)) {
11352            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11353            delta += alen;
11354            storeLE( mkexpr(addr),
11355                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
11356                                      0/*lower lane*/ ) );
11357            DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
11358                                   dis_buf);
11359            goto decode_success;
11360         }
11361         /* else fall through */
11362      }
11363      /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
11364      /* Identical to MOVLPS ? */
11365      if (have66noF2noF3(pfx)
11366          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11367         modrm = getUChar(delta);
11368         if (!epartIsReg(modrm)) {
11369            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11370            delta += alen;
11371            storeLE( mkexpr(addr),
11372                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
11373                                      0/*lower lane*/ ) );
11374            DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
11375                                   dis_buf);
11376            goto decode_success;
11377         }
11378         /* else fall through */
11379      }
11380      break;
11381
11382   case 0x14:
11383   case 0x15:
11384      /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
11385      /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
11386      /* These just appear to be special cases of SHUFPS */
11387      if (haveNo66noF2noF3(pfx) && sz == 4) {
11388         Bool   hi = toBool(opc == 0x15);
11389         IRTemp sV = newTemp(Ity_V128);
11390         IRTemp dV = newTemp(Ity_V128);
11391         modrm = getUChar(delta);
11392         UInt   rG = gregOfRexRM(pfx,modrm);
11393         assign( dV, getXMMReg(rG) );
11394         if (epartIsReg(modrm)) {
11395            UInt rE = eregOfRexRM(pfx,modrm);
11396            assign( sV, getXMMReg(rE) );
11397            delta += 1;
11398            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
11399                nameXMMReg(rE), nameXMMReg(rG));
11400         } else {
11401            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11402            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11403            delta += alen;
11404            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
11405                dis_buf, nameXMMReg(rG));
11406         }
11407         IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
11408         putXMMReg( rG, mkexpr(res) );
11409         goto decode_success;
11410      }
11411      /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
11412      /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
11413      /* These just appear to be special cases of SHUFPD */
11414      if (have66noF2noF3(pfx)
11415          && sz == 2 /* could be 8 if rex also present */) {
11416         Bool   hi = toBool(opc == 0x15);
11417         IRTemp sV = newTemp(Ity_V128);
11418         IRTemp dV = newTemp(Ity_V128);
11419         modrm = getUChar(delta);
11420         UInt   rG = gregOfRexRM(pfx,modrm);
11421         assign( dV, getXMMReg(rG) );
11422         if (epartIsReg(modrm)) {
11423            UInt rE = eregOfRexRM(pfx,modrm);
11424            assign( sV, getXMMReg(rE) );
11425            delta += 1;
11426            DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
11427                nameXMMReg(rE), nameXMMReg(rG));
11428         } else {
11429            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11430            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11431            delta += alen;
11432            DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
11433                dis_buf, nameXMMReg(rG));
11434         }
11435         IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
11436         putXMMReg( rG, mkexpr(res) );
11437         goto decode_success;
11438      }
11439      break;
11440
11441   case 0x16:
11442      /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
11443      /* This seems identical to MOVHPS.  This instruction encoding is
11444         completely crazy. */
11445      if (have66noF2noF3(pfx)
11446          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11447         modrm = getUChar(delta);
11448         if (epartIsReg(modrm)) {
11449            /* fall through; apparently reg-reg is not possible */
11450         } else {
11451            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11452            delta += alen;
11453            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
11454                             loadLE(Ity_I64, mkexpr(addr)) );
11455            DIP("movhpd %s,%s\n", dis_buf,
11456                                  nameXMMReg( gregOfRexRM(pfx,modrm) ));
11457            goto decode_success;
11458         }
11459      }
11460      /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
11461      /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
11462      if (haveNo66noF2noF3(pfx)
11463          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11464         modrm = getUChar(delta);
11465         if (epartIsReg(modrm)) {
11466            delta += 1;
11467            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
11468                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
11469            DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11470                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11471         } else {
11472            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11473            delta += alen;
11474            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
11475                             loadLE(Ity_I64, mkexpr(addr)) );
11476            DIP("movhps %s,%s\n", dis_buf,
11477                                  nameXMMReg( gregOfRexRM(pfx,modrm) ));
11478         }
11479         goto decode_success;
11480      }
11481      break;
11482
11483   case 0x17:
11484      /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
11485      if (haveNo66noF2noF3(pfx)
11486          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11487         modrm = getUChar(delta);
11488         if (!epartIsReg(modrm)) {
11489            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11490            delta += alen;
11491            storeLE( mkexpr(addr),
11492                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
11493                                      1/*upper lane*/ ) );
11494            DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
11495                                  dis_buf);
11496            goto decode_success;
11497         }
11498         /* else fall through */
11499      }
11500      /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
11501      /* Again, this seems identical to MOVHPS. */
11502      if (have66noF2noF3(pfx)
11503          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11504         modrm = getUChar(delta);
11505         if (!epartIsReg(modrm)) {
11506            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11507            delta += alen;
11508            storeLE( mkexpr(addr),
11509                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
11510                                      1/*upper lane*/ ) );
11511            DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
11512                                  dis_buf);
11513            goto decode_success;
11514         }
11515         /* else fall through */
11516      }
11517      break;
11518
11519   case 0x18:
11520      /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
11521      /* 0F 18 /1 = PREFETCHT0  -- with various hints */
11522      /* 0F 18 /2 = PREFETCHT1 */
11523      /* 0F 18 /3 = PREFETCHT2 */
11524      if (haveNo66noF2noF3(pfx)
11525          && !epartIsReg(getUChar(delta))
11526          && gregLO3ofRM(getUChar(delta)) >= 0
11527          && gregLO3ofRM(getUChar(delta)) <= 3) {
11528         HChar* hintstr = "??";
11529
11530         modrm = getUChar(delta);
11531         vassert(!epartIsReg(modrm));
11532
11533         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11534         delta += alen;
11535
11536         switch (gregLO3ofRM(modrm)) {
11537            case 0: hintstr = "nta"; break;
11538            case 1: hintstr = "t0"; break;
11539            case 2: hintstr = "t1"; break;
11540            case 3: hintstr = "t2"; break;
11541            default: vassert(0);
11542         }
11543
11544         DIP("prefetch%s %s\n", hintstr, dis_buf);
11545         goto decode_success;
11546      }
11547      break;
11548
11549   case 0x28:
11550      /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
11551      if (have66noF2noF3(pfx)
11552          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11553         modrm = getUChar(delta);
11554         if (epartIsReg(modrm)) {
11555            putXMMReg( gregOfRexRM(pfx,modrm),
11556                       getXMMReg( eregOfRexRM(pfx,modrm) ));
11557            DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11558                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11559            delta += 1;
11560         } else {
11561            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11562            gen_SEGV_if_not_16_aligned( addr );
11563            putXMMReg( gregOfRexRM(pfx,modrm),
11564                       loadLE(Ity_V128, mkexpr(addr)) );
11565            DIP("movapd %s,%s\n", dis_buf,
11566                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11567            delta += alen;
11568         }
11569         goto decode_success;
11570      }
11571      /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
11572      if (haveNo66noF2noF3(pfx)
11573          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11574         modrm = getUChar(delta);
11575         if (epartIsReg(modrm)) {
11576            putXMMReg( gregOfRexRM(pfx,modrm),
11577                       getXMMReg( eregOfRexRM(pfx,modrm) ));
11578            DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11579                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11580            delta += 1;
11581         } else {
11582            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11583            gen_SEGV_if_not_16_aligned( addr );
11584            putXMMReg( gregOfRexRM(pfx,modrm),
11585                       loadLE(Ity_V128, mkexpr(addr)) );
11586            DIP("movaps %s,%s\n", dis_buf,
11587                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11588            delta += alen;
11589         }
11590         goto decode_success;
11591      }
11592      break;
11593
11594   case 0x29:
11595      /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
11596      if (haveNo66noF2noF3(pfx)
11597          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11598         modrm = getUChar(delta);
11599         if (epartIsReg(modrm)) {
11601            putXMMReg( eregOfRexRM(pfx,modrm),
11602                       getXMMReg( gregOfRexRM(pfx,modrm) ));
11603            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11604                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
11605            delta += 1;
11606         } else {
11607            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11608            gen_SEGV_if_not_16_aligned( addr );
11609            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
11610            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11611                                  dis_buf );
11612            delta += alen;
11613         }
11614         goto decode_success;
11615      }
11616      /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
11617      if (have66noF2noF3(pfx)
11618          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11619         modrm = getUChar(delta);
11620         if (epartIsReg(modrm)) {
11621            putXMMReg( eregOfRexRM(pfx,modrm),
11622                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
11623            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11624                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
11625            delta += 1;
11626         } else {
11627            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11628            gen_SEGV_if_not_16_aligned( addr );
11629            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
11630            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11631                                  dis_buf );
11632            delta += alen;
11633         }
11634         goto decode_success;
11635      }
11636      break;
11637
11638   case 0x2A:
11639      /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
11640         half xmm */
11641      if (haveNo66noF2noF3(pfx) && sz == 4) {
11642         IRTemp arg64 = newTemp(Ity_I64);
11643         IRTemp rmode = newTemp(Ity_I32);
11644
11645         modrm = getUChar(delta);
11646         do_MMX_preamble();
11647         if (epartIsReg(modrm)) {
11648            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
11649            delta += 1;
11650            DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
11651                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
11652         } else {
11653            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11654            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
11655            delta += alen;
11656            DIP("cvtpi2ps %s,%s\n", dis_buf,
11657                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
11658         }
11659
11660         assign( rmode, get_sse_roundingmode() );
11661
11662         putXMMRegLane32F(
11663            gregOfRexRM(pfx,modrm), 0,
11664            binop(Iop_F64toF32,
11665                  mkexpr(rmode),
11666                  unop(Iop_I32StoF64,
11667                       unop(Iop_64to32, mkexpr(arg64)) )) );
11668
11669         putXMMRegLane32F(
11670            gregOfRexRM(pfx,modrm), 1,
11671            binop(Iop_F64toF32,
11672                  mkexpr(rmode),
11673                  unop(Iop_I32StoF64,
11674                       unop(Iop_64HIto32, mkexpr(arg64)) )) );
11675
11676         goto decode_success;
11677      }
11678      /* F3 0F 2A = CVTSI2SS
11679         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
11680         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
11681      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
11682         IRTemp rmode = newTemp(Ity_I32);
11683         assign( rmode, get_sse_roundingmode() );
11684         modrm = getUChar(delta);
11685         if (sz == 4) {
11686            IRTemp arg32 = newTemp(Ity_I32);
11687            if (epartIsReg(modrm)) {
11688               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
11689               delta += 1;
11690               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
11691                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
11692            } else {
11693               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11694               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
11695               delta += alen;
11696               DIP("cvtsi2ss %s,%s\n", dis_buf,
11697                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
11698            }
11699            putXMMRegLane32F(
11700               gregOfRexRM(pfx,modrm), 0,
11701               binop(Iop_F64toF32,
11702                     mkexpr(rmode),
11703                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
11704         } else {
11705            /* sz == 8 */
11706            IRTemp arg64 = newTemp(Ity_I64);
11707            if (epartIsReg(modrm)) {
11708               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
11709               delta += 1;
11710               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
11711                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
11712            } else {
11713               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11714               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
11715               delta += alen;
11716               DIP("cvtsi2ssq %s,%s\n", dis_buf,
11717                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
11718            }
11719            putXMMRegLane32F(
11720               gregOfRexRM(pfx,modrm), 0,
11721               binop(Iop_F64toF32,
11722                     mkexpr(rmode),
11723                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
11724         }
11725         goto decode_success;
11726      }
11727      /* F2 0F 2A = CVTSI2SD
11728         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
11729         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
11730      */
11731      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
11732         modrm = getUChar(delta);
11733         if (sz == 4) {
11734            IRTemp arg32 = newTemp(Ity_I32);
11735            if (epartIsReg(modrm)) {
11736               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
11737               delta += 1;
11738               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
11739                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
11740            } else {
11741               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11742               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
11743               delta += alen;
11744               DIP("cvtsi2sdl %s,%s\n", dis_buf,
11745                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
11746            }
11747            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
11748                              unop(Iop_I32StoF64, mkexpr(arg32))
11749            );
11750         } else {
11751            /* sz == 8 */
11752            IRTemp arg64 = newTemp(Ity_I64);
11753            if (epartIsReg(modrm)) {
11754               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
11755               delta += 1;
11756               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
11757                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
11758            } else {
11759               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11760               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
11761               delta += alen;
11762               DIP("cvtsi2sdq %s,%s\n", dis_buf,
11763                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
11764            }
11765            putXMMRegLane64F(
11766               gregOfRexRM(pfx,modrm),
11767               0,
11768               binop( Iop_I64StoF64,
11769                      get_sse_roundingmode(),
11770                      mkexpr(arg64)
11771               )
11772            );
11773         }
11774         goto decode_success;
11775      }
11776      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
11777         xmm(G) */
11778      if (have66noF2noF3(pfx) && sz == 2) {
11779         IRTemp arg64 = newTemp(Ity_I64);
11780
11781         modrm = getUChar(delta);
11782         if (epartIsReg(modrm)) {
11783            /* Only switch to MMX mode if the source is a MMX register.
11784               This is inconsistent with all other instructions which
11785               convert between XMM and (M64 or MMX), which always switch
11786               to MMX mode even if the 64-bit operand is M64 and not MMX.  At
11787               least, that's what the Intel docs seem to me to say.
11788               Fixes #210264. */
11789            do_MMX_preamble();
11790            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
11791            delta += 1;
11792            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
11793                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
11794         } else {
11795            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11796            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
11797            delta += alen;
11798            DIP("cvtpi2pd %s,%s\n", dis_buf,
11799                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
11800         }
11801
11802         putXMMRegLane64F(
11803            gregOfRexRM(pfx,modrm), 0,
11804            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
11805         );
11806
11807         putXMMRegLane64F(
11808            gregOfRexRM(pfx,modrm), 1,
11809            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
11810         );
11811
11812         goto decode_success;
11813      }
11814      break;
11815
11816   case 0x2B:
11817      /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
11818      /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
11819      if ( (haveNo66noF2noF3(pfx) && sz == 4)
11820           || (have66noF2noF3(pfx) && sz == 2) ) {
11821         modrm = getUChar(delta);
11822         if (!epartIsReg(modrm)) {
11823            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11824            gen_SEGV_if_not_16_aligned( addr );
11825            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
11826            DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
11827                                    nameXMMReg(gregOfRexRM(pfx,modrm)),
11828                                    dis_buf);
11829            delta += alen;
11830            goto decode_success;
11831         }
11832         /* else fall through */
11833      }
11834      break;
11835
11836   case 0x2C:
11837   case 0x2D:
11838      /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
11839         I32 in mmx, according to prevailing SSE rounding mode */
11840      /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
11841         I32 in mmx, rounding towards zero */
11842      if (haveNo66noF2noF3(pfx) && sz == 4) {
11843         IRTemp dst64  = newTemp(Ity_I64);
11844         IRTemp rmode  = newTemp(Ity_I32);
11845         IRTemp f32lo  = newTemp(Ity_F32);
11846         IRTemp f32hi  = newTemp(Ity_F32);
11847         Bool   r2zero = toBool(opc == 0x2C);
11848
11849         do_MMX_preamble();
11850         modrm = getUChar(delta);
11851
11852         if (epartIsReg(modrm)) {
11853            delta += 1;
11854            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
11855            assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
11856            DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
11857                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
11858                                      nameMMXReg(gregLO3ofRM(modrm)));
11859         } else {
11860            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11861            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
11862            assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
11863                                                 mkexpr(addr),
11864                                                 mkU64(4) )));
11865            delta += alen;
11866            DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
11867                                      dis_buf,
11868                                      nameMMXReg(gregLO3ofRM(modrm)));
11869         }
11870
11871         if (r2zero) {
11872            assign(rmode, mkU32((UInt)Irrm_ZERO) );
11873         } else {
11874            assign( rmode, get_sse_roundingmode() );
11875         }
11876
11877         assign(
11878            dst64,
11879            binop( Iop_32HLto64,
11880                   binop( Iop_F64toI32S,
11881                          mkexpr(rmode),
11882                          unop( Iop_F32toF64, mkexpr(f32hi) ) ),
11883                   binop( Iop_F64toI32S,
11884                          mkexpr(rmode),
11885                          unop( Iop_F32toF64, mkexpr(f32lo) ) )
11886                 )
11887         );
11888
11889         putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
11890         goto decode_success;
11891      }
11892      /* F3 0F 2D = CVTSS2SI
11893         when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
11894                       according to prevailing SSE rounding mode
11895         when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
11896                       according to prevailing SSE rounding mode
11897      */
11898      /* F3 0F 2C = CVTTSS2SI
11899         when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
11900                       truncating towards zero
11901         when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
11902                       truncating towards zero
11903      */
11904      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
11905         delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
11906         goto decode_success;
11907      }
11908      /* F2 0F 2D = CVTSD2SI
11909         when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
11910                       according to prevailing SSE rounding mode
11911         when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
11912                       according to prevailing SSE rounding mode
11913      */
11914      /* F2 0F 2C = CVTTSD2SI
11915         when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
11916                       truncating towards zero
11917         when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
11918                       truncating towards zero
11919      */
11920      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
11921         delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
11922         goto decode_success;
11923      }
11924      /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
11925         I32 in mmx, according to prevailing SSE rounding mode */
11926      /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
11927         I32 in mmx, rounding towards zero */
11928      if (have66noF2noF3(pfx) && sz == 2) {
11929         IRTemp dst64  = newTemp(Ity_I64);
11930         IRTemp rmode  = newTemp(Ity_I32);
11931         IRTemp f64lo  = newTemp(Ity_F64);
11932         IRTemp f64hi  = newTemp(Ity_F64);
11933         Bool   r2zero = toBool(opc == 0x2C);
11934
11935         do_MMX_preamble();
11936         modrm = getUChar(delta);
11937
11938         if (epartIsReg(modrm)) {
11939            delta += 1;
11940            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
11941            assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
11942            DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
11943                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
11944                                      nameMMXReg(gregLO3ofRM(modrm)));
11945         } else {
11946            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11947            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
11948            assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
11949                                                 mkexpr(addr),
11950                                                 mkU64(8) )));
11951            delta += alen;
11952            DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
11953                                      dis_buf,
11954                                      nameMMXReg(gregLO3ofRM(modrm)));
11955         }
11956
11957         if (r2zero) {
11958            assign(rmode, mkU32((UInt)Irrm_ZERO) );
11959         } else {
11960            assign( rmode, get_sse_roundingmode() );
11961         }
11962
11963         assign(
11964            dst64,
11965            binop( Iop_32HLto64,
11966                   binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
11967                   binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
11968                 )
11969         );
11970
11971         putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
11972         goto decode_success;
11973      }
11974      break;
11975
11976   case 0x2E:
11977   case 0x2F:
11978      /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
11979      /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
11980      if (have66noF2noF3(pfx) && sz == 2) {
11981         delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
11982         goto decode_success;
11983      }
11984      /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
11985      /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
11986      if (haveNo66noF2noF3(pfx) && sz == 4) {
11987         delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
11988         goto decode_success;
11989      }
11990      break;
11991
11992   case 0x50:
11993      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
11994         to 4 lowest bits of ireg(G) */
11995      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
11996          && epartIsReg(getUChar(delta))) {
11997         /* sz == 8 is a kludge to handle insns with REX.W redundantly
11998            set to 1, which has been known to happen:
11999
12000            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d
12001
12002            20071106: Intel docs say that REX.W isn't redundant: when
12003            present, a 64-bit register is written; when not present, only
12004            the 32-bit half is written.  However, testing on a Core2
12005            machine suggests the entire 64 bit register is written
12006            irrespective of the status of REX.W.  That could be because
12007            of the default rule that says "if the lower half of a 32-bit
12008            register is written, the upper half is zeroed".  By using
12009            putIReg32 here we inadvertently produce the same behaviour as
12010            the Core2, for the same reason -- putIReg32 implements said
12011            rule.
12012
12013            AMD docs give no indication that REX.W is even valid for this
12014            insn. */
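         /* Semantics sketch: bit i of the result is the sign bit of
            F32 lane i of xmm(E).  E.g. lanes {-1.0, 2.0, -3.0, 4.0}
            yield 0b0101 = 5 in the low 4 bits of ireg(G). */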
12015         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
12016         goto decode_success;
12017      }
12018      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
12019         2 lowest bits of ireg(G) */
12020      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
12021         /* sz == 8 is a kludge to handle insns with REX.W redundantly
12022            set to 1, which has been known to happen:
12023            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
12024            20071106: see further comments on MOVMSKPS implementation above.
12025         */
12026         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
12027         goto decode_success;
12028      }
12029      break;
12030
12031   case 0x51:
12032      /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
12033      if (haveF3no66noF2(pfx) && sz == 4) {
12034         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
12035                                            "sqrtss", Iop_Sqrt32F0x4 );
12036         goto decode_success;
12037      }
12038      /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
12039      if (haveNo66noF2noF3(pfx) && sz == 4) {
12040         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
12041                                           "sqrtps", Iop_Sqrt32Fx4 );
12042         goto decode_success;
12043      }
12044      /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
12045      if (haveF2no66noF3(pfx) && sz == 4) {
12046         delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
12047                                            "sqrtsd", Iop_Sqrt64F0x2 );
12048         goto decode_success;
12049      }
12050      /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
12051      if (have66noF2noF3(pfx) && sz == 2) {
12052         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
12053                                           "sqrtpd", Iop_Sqrt64Fx2 );
12054         goto decode_success;
12055      }
12056      break;
12057
12058   case 0x52:
12059      /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
12060      if (haveF3no66noF2(pfx) && sz == 4) {
12061         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
12062                                            "rsqrtss", Iop_RSqrt32F0x4 );
12063         goto decode_success;
12064      }
12065      /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
12066      if (haveNo66noF2noF3(pfx) && sz == 4) {
12067         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
12068                                           "rsqrtps", Iop_RSqrt32Fx4 );
12069         goto decode_success;
12070      }
12071      break;
12072
12073   case 0x53:
12074      /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
12075      if (haveF3no66noF2(pfx) && sz == 4) {
12076         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
12077                                            "rcpss", Iop_Recip32F0x4 );
12078         goto decode_success;
12079      }
12080      /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
12081      if (haveNo66noF2noF3(pfx) && sz == 4) {
12082         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
12083                                           "rcpps", Iop_Recip32Fx4 );
12084         goto decode_success;
12085      }
12086      break;
12087
12088   case 0x54:
12089      /* 0F 54 = ANDPS -- G = G and E */
12090      if (haveNo66noF2noF3(pfx) && sz == 4) {
12091         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
12092         goto decode_success;
12093      }
12094      /* 66 0F 54 = ANDPD -- G = G and E */
12095      if (have66noF2noF3(pfx) && sz == 2) {
12096         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
12097         goto decode_success;
12098      }
12099      break;
12100
12101   case 0x55:
12102      /* 0F 55 = ANDNPS -- G = (not G) and E */
12103      if (haveNo66noF2noF3(pfx) && sz == 4) {
12104         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
12105                                                           Iop_AndV128 );
12106         goto decode_success;
12107      }
12108      /* 66 0F 55 = ANDNPD -- G = (not G) and E */
12109      if (have66noF2noF3(pfx) && sz == 2) {
12110         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
12111                                                           Iop_AndV128 );
12112         goto decode_success;
12113      }
12114      break;
12115
12116   case 0x56:
12117      /* 0F 56 = ORPS -- G = G or E */
12118      if (haveNo66noF2noF3(pfx) && sz == 4) {
12119         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
12120         goto decode_success;
12121      }
12122      /* 66 0F 56 = ORPD -- G = G or E */
12123      if (have66noF2noF3(pfx) && sz == 2) {
12124         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
12125         goto decode_success;
12126      }
12127      break;
12128
12129   case 0x57:
12130      /* 66 0F 57 = XORPD -- G = G xor E */
12131      if (have66noF2noF3(pfx) && sz == 2) {
12132         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
12133         goto decode_success;
12134      }
12135      /* 0F 57 = XORPS -- G = G xor E */
12136      if (haveNo66noF2noF3(pfx) && sz == 4) {
12137         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
12138         goto decode_success;
12139      }
12140      break;
12141
12142   case 0x58:
12143      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
12144      if (haveNo66noF2noF3(pfx) && sz == 4) {
12145         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
12146         goto decode_success;
12147      }
12148      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
12149      if (haveF3no66noF2(pfx) && sz == 4) {
12150         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
12151         goto decode_success;
12152      }
12153      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
12154      if (haveF2no66noF3(pfx)
12155          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12156         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
12157         goto decode_success;
12158      }
12159      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
12160      if (have66noF2noF3(pfx)
12161          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12162         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
12163         goto decode_success;
12164      }
12165      break;
12166
12167   case 0x59:
12168      /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
12169      if (haveF2no66noF3(pfx)
12170          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12171         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
12172         goto decode_success;
12173      }
12174      /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
12175      if (haveF3no66noF2(pfx) && sz == 4) {
12176         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
12177         goto decode_success;
12178      }
12179      /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
12180      if (haveNo66noF2noF3(pfx) && sz == 4) {
12181         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
12182         goto decode_success;
12183      }
12184      /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
12185      if (have66noF2noF3(pfx)
12186          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12187         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
12188         goto decode_success;
12189      }
12190      break;
12191
12192   case 0x5A:
12193      /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
12194         F64 in xmm(G). */
12195      if (haveNo66noF2noF3(pfx) && sz == 4) {
12196         delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
12197         goto decode_success;
12198      }
12199      /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
12200         low half xmm(G) */
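      /* Every F32 is exactly representable as an F64, so this
         widening needs no rounding mode -- hence the plain unop
         below, in contrast to CVTSD2SS, which must round. */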
12201      if (haveF3no66noF2(pfx) && sz == 4) {
12202         IRTemp f32lo = newTemp(Ity_F32);
12203
12204         modrm = getUChar(delta);
12205         if (epartIsReg(modrm)) {
12206            delta += 1;
12207            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
12208            DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12209                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12210         } else {
12211            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12212            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
12213            delta += alen;
12214            DIP("cvtss2sd %s,%s\n", dis_buf,
12215                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12216         }
12217
12218         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
12219                           unop( Iop_F32toF64, mkexpr(f32lo) ) );
12220
12221         goto decode_success;
12222      }
12223      /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
12224         low 1/4 xmm(G), according to prevailing SSE rounding mode */
12225      if (haveF2no66noF3(pfx) && sz == 4) {
12226         IRTemp rmode = newTemp(Ity_I32);
12227         IRTemp f64lo = newTemp(Ity_F64);
12228
12229         modrm = getUChar(delta);
12230         if (epartIsReg(modrm)) {
12231            delta += 1;
12232            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
12233            DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12234                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12235         } else {
12236            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12237            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
12238            delta += alen;
12239            DIP("cvtsd2ss %s,%s\n", dis_buf,
12240                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12241         }
12242
12243         assign( rmode, get_sse_roundingmode() );
12244         putXMMRegLane32F(
12245            gregOfRexRM(pfx,modrm), 0,
12246            binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
12247         );
12248
12249         goto decode_success;
12250      }
12251      /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
12252         lo half xmm(G), rounding according to prevailing SSE rounding
12253         mode, and zero upper half */
12254      /* Note, this is practically identical to CVTPD2DQ.  It would
12255         be nice to merge them together. */
12256      if (have66noF2noF3(pfx) && sz == 2) {
12257         delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
12258         goto decode_success;
12259      }
12260      break;
12261
12262   case 0x5B:
12263      /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
12264         xmm(G), rounding towards zero */
12265      /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
12266         xmm(G), as per the prevailing rounding mode */
12267      if ( (have66noF2noF3(pfx) && sz == 2)
12268           || (haveF3no66noF2(pfx) && sz == 4) ) {
12269         Bool r2zero = toBool(sz == 4); // FIXME: fragile -- infers F3 (truncating) from sz alone
12270         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
12271         goto decode_success;
12272      }
12273      /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
12274         xmm(G) */
12275      if (haveNo66noF2noF3(pfx) && sz == 4) {
12276         delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
12277         goto decode_success;
12278      }
12279      break;
12280
12281   case 0x5C:
12282      /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
12283      if (haveF3no66noF2(pfx) && sz == 4) {
12284         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
12285         goto decode_success;
12286      }
12287      /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
12288      if (haveF2no66noF3(pfx)
12289          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12290         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
12291         goto decode_success;
12292      }
12293      /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
12294      if (haveNo66noF2noF3(pfx) && sz == 4) {
12295         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
12296         goto decode_success;
12297      }
12298      /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
12299      if (have66noF2noF3(pfx) && sz == 2) {
12300         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
12301         goto decode_success;
12302      }
12303      break;
12304
12305   case 0x5D:
12306      /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
12307      if (haveNo66noF2noF3(pfx) && sz == 4) {
12308         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
12309         goto decode_success;
12310      }
12311      /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
12312      if (haveF3no66noF2(pfx) && sz == 4) {
12313         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
12314         goto decode_success;
12315      }
12316      /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
12317      if (haveF2no66noF3(pfx) && sz == 4) {
12318         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
12319         goto decode_success;
12320      }
12321      /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
12322      if (have66noF2noF3(pfx) && sz == 2) {
12323         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
12324         goto decode_success;
12325      }
12326      break;
12327
12328   case 0x5E:
12329      /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
12330      if (haveF2no66noF3(pfx) && sz == 4) {
12331         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
12332         goto decode_success;
12333      }
12334      /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
12335      if (haveNo66noF2noF3(pfx) && sz == 4) {
12336         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
12337         goto decode_success;
12338      }
12339      /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
12340      if (haveF3no66noF2(pfx) && sz == 4) {
12341         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
12342         goto decode_success;
12343      }
12344      /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
12345      if (have66noF2noF3(pfx) && sz == 2) {
12346         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
12347         goto decode_success;
12348      }
12349      break;
12350
12351   case 0x5F:
12352      /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
12353      if (haveNo66noF2noF3(pfx) && sz == 4) {
12354         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
12355         goto decode_success;
12356      }
12357      /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
12358      if (haveF3no66noF2(pfx) && sz == 4) {
12359         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
12360         goto decode_success;
12361      }
12362      /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
12363      if (haveF2no66noF3(pfx) && sz == 4) {
12364         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
12365         goto decode_success;
12366      }
12367      /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
12368      if (have66noF2noF3(pfx) && sz == 2) {
12369         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
12370         goto decode_success;
12371      }
12372      break;
12373
12374   case 0x60:
12375      /* 66 0F 60 = PUNPCKLBW */
12376      if (have66noF2noF3(pfx) && sz == 2) {
12377         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12378                                    "punpcklbw",
12379                                    Iop_InterleaveLO8x16, True );
12380         goto decode_success;
12381      }
12382      break;
12383
12384   case 0x61:
12385      /* 66 0F 61 = PUNPCKLWD */
12386      if (have66noF2noF3(pfx) && sz == 2) {
12387         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12388                                    "punpcklwd",
12389                                    Iop_InterleaveLO16x8, True );
12390         goto decode_success;
12391      }
12392      break;
12393
12394   case 0x62:
12395      /* 66 0F 62 = PUNPCKLDQ */
12396      if (have66noF2noF3(pfx) && sz == 2) {
12397         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12398                                    "punpckldq",
12399                                    Iop_InterleaveLO32x4, True );
12400         goto decode_success;
12401      }
12402      break;
12403
12404   case 0x63:
12405      /* 66 0F 63 = PACKSSWB */
12406      if (have66noF2noF3(pfx) && sz == 2) {
12407         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12408                                    "packsswb",
12409                                    Iop_QNarrowBin16Sto8Sx16, True );
12410         goto decode_success;
12411      }
12412      break;
12413
12414   case 0x64:
12415      /* 66 0F 64 = PCMPGTB */
12416      if (have66noF2noF3(pfx) && sz == 2) {
12417         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12418                                    "pcmpgtb", Iop_CmpGT8Sx16, False );
12419         goto decode_success;
12420      }
12421      break;
12422
12423   case 0x65:
12424      /* 66 0F 65 = PCMPGTW */
12425      if (have66noF2noF3(pfx) && sz == 2) {
12426         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12427                                    "pcmpgtw", Iop_CmpGT16Sx8, False );
12428         goto decode_success;
12429      }
12430      break;
12431
12432   case 0x66:
12433      /* 66 0F 66 = PCMPGTD */
12434      if (have66noF2noF3(pfx) && sz == 2) {
12435         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12436                                    "pcmpgtd", Iop_CmpGT32Sx4, False );
12437         goto decode_success;
12438      }
12439      break;
12440
12441   case 0x67:
12442      /* 66 0F 67 = PACKUSWB */
12443      if (have66noF2noF3(pfx) && sz == 2) {
12444         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12445                                    "packuswb",
12446                                    Iop_QNarrowBin16Sto8Ux16, True );
12447         goto decode_success;
12448      }
12449      break;
12450
12451   case 0x68:
12452      /* 66 0F 68 = PUNPCKHBW */
12453      if (have66noF2noF3(pfx) && sz == 2) {
12454         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12455                                    "punpckhbw",
12456                                    Iop_InterleaveHI8x16, True );
12457         goto decode_success;
12458      }
12459      break;
12460
12461   case 0x69:
12462      /* 66 0F 69 = PUNPCKHWD */
12463      if (have66noF2noF3(pfx) && sz == 2) {
12464         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12465                                    "punpckhwd",
12466                                    Iop_InterleaveHI16x8, True );
12467         goto decode_success;
12468      }
12469      break;
12470
12471   case 0x6A:
12472      /* 66 0F 6A = PUNPCKHDQ */
12473      if (have66noF2noF3(pfx) && sz == 2) {
12474         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12475                                    "punpckhdq",
12476                                    Iop_InterleaveHI32x4, True );
12477         goto decode_success;
12478      }
12479      break;
12480
12481   case 0x6B:
12482      /* 66 0F 6B = PACKSSDW */
12483      if (have66noF2noF3(pfx) && sz == 2) {
12484         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12485                                    "packssdw",
12486                                    Iop_QNarrowBin32Sto16Sx8, True );
12487         goto decode_success;
12488      }
12489      break;
12490
12491   case 0x6C:
12492      /* 66 0F 6C = PUNPCKLQDQ */
12493      if (have66noF2noF3(pfx) && sz == 2) {
12494         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12495                                    "punpcklqdq",
12496                                    Iop_InterleaveLO64x2, True );
12497         goto decode_success;
12498      }
12499      break;
12500
12501   case 0x6D:
12502      /* 66 0F 6D = PUNPCKHQDQ */
12503      if (have66noF2noF3(pfx) && sz == 2) {
12504         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12505                                    "punpckhqdq",
12506                                    Iop_InterleaveHI64x2, True );
12507         goto decode_success;
12508      }
12509      break;
12510
12511   case 0x6E:
12512      /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
12513                    zeroing high 3/4 of xmm. */
12514      /*              or from ireg64/m64 to xmm lo 1/2,
12515                    zeroing high 1/2 of xmm. */
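      /* E.g. "movd %eax,%xmm3" sets xmm3[31:0] = %eax and zeroes
         xmm3[127:32]; "movq %rax,%xmm3" sets xmm3[63:0] = %rax and
         zeroes xmm3[127:64]. */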
12516      if (have66noF2noF3(pfx)) {
12517         vassert(sz == 2 || sz == 8);
12518         if (sz == 2) sz = 4;
12519         modrm = getUChar(delta);
12520         if (epartIsReg(modrm)) {
12521            delta += 1;
12522            if (sz == 4) {
12523               putXMMReg(
12524                  gregOfRexRM(pfx,modrm),
12525                  unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
12526               );
12527               DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
12528                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12529            } else {
12530               putXMMReg(
12531                  gregOfRexRM(pfx,modrm),
12532                  unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
12533               );
12534               DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
12535                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12536            }
12537         } else {
12538            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
12539            delta += alen;
12540            putXMMReg(
12541               gregOfRexRM(pfx,modrm),
12542               sz == 4
12543                  ?  unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)) )
12544                  :  unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)) )
12545            );
12546            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
12547                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12548         }
12549         goto decode_success;
12550      }
12551      break;
12552
12553   case 0x6F:
12554      if (have66noF2noF3(pfx)
12555          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12556         /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
12557         modrm = getUChar(delta);
12558         if (epartIsReg(modrm)) {
12559            putXMMReg( gregOfRexRM(pfx,modrm),
12560                       getXMMReg( eregOfRexRM(pfx,modrm) ));
12561            DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12562                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12563            delta += 1;
12564         } else {
12565            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12566            gen_SEGV_if_not_16_aligned( addr );
12567            putXMMReg( gregOfRexRM(pfx,modrm),
12568                       loadLE(Ity_V128, mkexpr(addr)) );
12569            DIP("movdqa %s,%s\n", dis_buf,
12570                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12571            delta += alen;
12572         }
12573         goto decode_success;
12574      }
12575      if (haveF3no66noF2(pfx) && sz == 4) {
12576         /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
12577         modrm = getUChar(delta);
12578         if (epartIsReg(modrm)) {
12579            putXMMReg( gregOfRexRM(pfx,modrm),
12580                       getXMMReg( eregOfRexRM(pfx,modrm) ));
12581            DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12582                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12583            delta += 1;
12584         } else {
12585            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12586            putXMMReg( gregOfRexRM(pfx,modrm),
12587                       loadLE(Ity_V128, mkexpr(addr)) );
12588            DIP("movdqu %s,%s\n", dis_buf,
12589                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12590            delta += alen;
12591         }
12592         goto decode_success;
12593      }
12594      break;
12595
12596   case 0x70:
12597      /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
12598      if (have66noF2noF3(pfx) && sz == 2) {
12599         delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
12600         goto decode_success;
12601      }
12602      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
12603      /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
12604      if (haveNo66noF2noF3(pfx) && sz == 4) {
12605         Int order;
12606         IRTemp sV, dV, s3, s2, s1, s0;
12607         s3 = s2 = s1 = s0 = IRTemp_INVALID;
12608         sV = newTemp(Ity_I64);
12609         dV = newTemp(Ity_I64);
12610         do_MMX_preamble();
12611         modrm = getUChar(delta);
12612         if (epartIsReg(modrm)) {
12613            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
12614            order = (Int)getUChar(delta+1);
12615            delta += 1+1;
12616            DIP("pshufw $%d,%s,%s\n", order,
12617                                      nameMMXReg(eregLO3ofRM(modrm)),
12618                                      nameMMXReg(gregLO3ofRM(modrm)));
12619         } else {
12620            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
12621                              1/*extra byte after amode*/ );
12622            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12623            order = (Int)getUChar(delta+alen);
12624            delta += 1+alen;
12625            DIP("pshufw $%d,%s,%s\n", order,
12626                                      dis_buf,
12627                                      nameMMXReg(gregLO3ofRM(modrm)));
12628         }
12629         breakup64to16s( sV, &s3, &s2, &s1, &s0 );
12630#        define SEL(n) \
12631                   ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
12632         assign(dV,
12633                mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
12634                             SEL((order>>2)&3), SEL((order>>0)&3) )
12635         );
12636         putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
12637#        undef SEL
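         /* E.g. order 0xE4 (binary 11 10 01 00) selects lanes 3,2,1,0
            and so is the identity; order 0x1B (00 01 10 11) reverses
            the four 16-bit lanes. */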
12638         goto decode_success;
12639      }
12640      /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
12641         mem) to G(xmm), and copy upper half */
12642      if (haveF2no66noF3(pfx) && sz == 4) {
12643         delta = dis_PSHUFxW_128( vbi, pfx, delta,
12644                                  False/*!isAvx*/, False/*!xIsH*/ );
12645         goto decode_success;
12646      }
12647      /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
12648         mem) to G(xmm), and copy lower half */
12649      if (haveF3no66noF2(pfx) && sz == 4) {
12650         delta = dis_PSHUFxW_128( vbi, pfx, delta,
12651                                  False/*!isAvx*/, True/*xIsH*/ );
12652         goto decode_success;
12653      }
12654      break;
12655
12656   case 0x71:
12657      /* 66 0F 71 /2 ib = PSRLW by immediate */
12658      if (have66noF2noF3(pfx) && sz == 2
12659          && epartIsReg(getUChar(delta))
12660          && gregLO3ofRM(getUChar(delta)) == 2) {
12661         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
12662         goto decode_success;
12663      }
12664      /* 66 0F 71 /4 ib = PSRAW by immediate */
12665      if (have66noF2noF3(pfx) && sz == 2
12666          && epartIsReg(getUChar(delta))
12667          && gregLO3ofRM(getUChar(delta)) == 4) {
12668         delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
12669         goto decode_success;
12670      }
12671      /* 66 0F 71 /6 ib = PSLLW by immediate */
12672      if (have66noF2noF3(pfx) && sz == 2
12673          && epartIsReg(getUChar(delta))
12674          && gregLO3ofRM(getUChar(delta)) == 6) {
12675         delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
12676         goto decode_success;
12677      }
12678      break;
12679
12680   case 0x72:
12681      /* 66 0F 72 /2 ib = PSRLD by immediate */
12682      if (have66noF2noF3(pfx) && sz == 2
12683          && epartIsReg(getUChar(delta))
12684          && gregLO3ofRM(getUChar(delta)) == 2) {
12685         delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
12686         goto decode_success;
12687      }
12688      /* 66 0F 72 /4 ib = PSRAD by immediate */
12689      if (have66noF2noF3(pfx) && sz == 2
12690          && epartIsReg(getUChar(delta))
12691          && gregLO3ofRM(getUChar(delta)) == 4) {
12692         delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
12693         goto decode_success;
12694      }
12695      /* 66 0F 72 /6 ib = PSLLD by immediate */
12696      if (have66noF2noF3(pfx) && sz == 2
12697          && epartIsReg(getUChar(delta))
12698          && gregLO3ofRM(getUChar(delta)) == 6) {
12699         delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
12700         goto decode_success;
12701      }
12702      break;
12703
12704   case 0x73:
12705      /* 66 0F 73 /3 ib = PSRLDQ by immediate */
12706      /* note, if mem case ever filled in, 1 byte after amode */
12707      if (have66noF2noF3(pfx) && sz == 2
12708          && epartIsReg(getUChar(delta))
12709          && gregLO3ofRM(getUChar(delta)) == 3) {
12710         Int imm = (Int)getUChar(delta+1);
12711         Int reg = eregOfRexRM(pfx,getUChar(delta));
12712         DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
12713         delta += 2;
12714         IRTemp sV = newTemp(Ity_V128);
12715         assign( sV, getXMMReg(reg) );
12716         putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
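         /* Note that math_PSRLDQ shifts by imm *bytes*, not bits:
            e.g. psrldq $8 moves the upper qword into the lower qword
            and zeroes the upper qword. */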
12717         goto decode_success;
12718      }
12719      /* 66 0F 73 /7 ib = PSLLDQ by immediate */
12720      /* note, if mem case ever filled in, 1 byte after amode */
12721      if (have66noF2noF3(pfx) && sz == 2
12722          && epartIsReg(getUChar(delta))
12723          && gregLO3ofRM(getUChar(delta)) == 7) {
12724         Int imm = (Int)getUChar(delta+1);
12725         Int reg = eregOfRexRM(pfx,getUChar(delta));
12726         DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
12727         vassert(imm >= 0 && imm <= 255);
12728         delta += 2;
12729         IRTemp sV = newTemp(Ity_V128);
12730         assign( sV, getXMMReg(reg) );
12731         putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
12732         goto decode_success;
12733      }
12734      /* 66 0F 73 /2 ib = PSRLQ by immediate */
12735      if (have66noF2noF3(pfx) && sz == 2
12736          && epartIsReg(getUChar(delta))
12737          && gregLO3ofRM(getUChar(delta)) == 2) {
12738         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
12739         goto decode_success;
12740      }
12741      /* 66 0F 73 /6 ib = PSLLQ by immediate */
12742      if (have66noF2noF3(pfx) && sz == 2
12743          && epartIsReg(getUChar(delta))
12744          && gregLO3ofRM(getUChar(delta)) == 6) {
12745         delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
12746         goto decode_success;
12747      }
12748      break;
12749
12750   case 0x74:
12751      /* 66 0F 74 = PCMPEQB */
12752      if (have66noF2noF3(pfx) && sz == 2) {
12753         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12754                                    "pcmpeqb", Iop_CmpEQ8x16, False );
12755         goto decode_success;
12756      }
12757      break;
12758
12759   case 0x75:
12760      /* 66 0F 75 = PCMPEQW */
12761      if (have66noF2noF3(pfx) && sz == 2) {
12762         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12763                                    "pcmpeqw", Iop_CmpEQ16x8, False );
12764         goto decode_success;
12765      }
12766      break;
12767
12768   case 0x76:
12769      /* 66 0F 76 = PCMPEQD */
12770      if (have66noF2noF3(pfx) && sz == 2) {
12771         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12772                                    "pcmpeqd", Iop_CmpEQ32x4, False );
12773         goto decode_success;
12774      }
12775      break;
12776
12777   case 0x7E:
12778      /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
12779         G (lo half xmm).  Upper half of G is zeroed out. */
12780      if (haveF3no66noF2(pfx)
12781          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12782         modrm = getUChar(delta);
12783         if (epartIsReg(modrm)) {
12784            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
12785                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
12786            /* zero bits 127:64 */
12787            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
12788            DIP("movq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12789                                nameXMMReg(gregOfRexRM(pfx,modrm)));
12790            delta += 1;
12791         } else {
12792            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12793            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
12794            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
12795                             loadLE(Ity_I64, mkexpr(addr)) );
12796            DIP("movsd %s,%s\n", dis_buf,
12797                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
12798            delta += alen;
12799         }
12800         goto decode_success;
12801      }
12802      /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
12803      /*              or from xmm low 1/2 to ireg64 or m64. */
12804      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
12805         if (sz == 2) sz = 4;
12806         modrm = getUChar(delta);
12807         if (epartIsReg(modrm)) {
12808            delta += 1;
12809            if (sz == 4) {
12810               putIReg32( eregOfRexRM(pfx,modrm),
12811                          getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
12812               DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12813                                    nameIReg32(eregOfRexRM(pfx,modrm)));
12814            } else {
12815               putIReg64( eregOfRexRM(pfx,modrm),
12816                          getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
12817               DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12818                                    nameIReg64(eregOfRexRM(pfx,modrm)));
12819            }
12820         } else {
12821            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
12822            delta += alen;
12823            storeLE( mkexpr(addr),
12824                     sz == 4
12825                        ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
12826                        : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
12827            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
12828                                  nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
12829         }
12830         goto decode_success;
12831      }
12832      break;
12833
12834   case 0x7F:
12835      /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
12836      if (haveF3no66noF2(pfx) && sz == 4) {
12837         modrm = getUChar(delta);
12838         if (epartIsReg(modrm)) {
12839            goto decode_failure; /* awaiting test case */
12840            delta += 1;
12841            putXMMReg( eregOfRexRM(pfx,modrm),
12842                       getXMMReg(gregOfRexRM(pfx,modrm)) );
12843            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12844                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
12845         } else {
12846            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
12847            delta += alen;
12848            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12849            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
12850         }
12851         goto decode_success;
12852      }
12853      /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
12854      if (have66noF2noF3(pfx) && sz == 2) {
12855         modrm = getUChar(delta);
12856         if (epartIsReg(modrm)) {
12857            delta += 1;
12858            putXMMReg( eregOfRexRM(pfx,modrm),
12859                       getXMMReg(gregOfRexRM(pfx,modrm)) );
12860            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12861                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
12862         } else {
12863            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
12864            gen_SEGV_if_not_16_aligned( addr );
12865            delta += alen;
12866            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12867            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
12868         }
12869         goto decode_success;
12870      }
12871      break;
12872
12873   case 0xAE:
12874      /* 0F AE /7 (reg) = SFENCE -- flush pending operations to memory */
12875      if (haveNo66noF2noF3(pfx)
12876          && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
12877          && sz == 4) {
12878         delta += 1;
12879         /* Insert a memory fence.  It's sometimes important that these
12880            are carried through to the generated code. */
12881         stmt( IRStmt_MBE(Imbe_Fence) );
12882         DIP("sfence\n");
12883         goto decode_success;
12884      }
12885      /* mindless duplication follows .. */
12886      /* 0F AE /5 = LFENCE -- flush pending operations to memory */
12887      /* 0F AE /6 = MFENCE -- flush pending operations to memory */
12888      if (haveNo66noF2noF3(pfx)
12889          && epartIsReg(getUChar(delta))
12890          && (gregLO3ofRM(getUChar(delta)) == 5
12891              || gregLO3ofRM(getUChar(delta)) == 6)
12892          && sz == 4) {
12893         delta += 1;
12894         /* Insert a memory fence.  It's sometimes important that these
12895            are carried through to the generated code. */
12896         stmt( IRStmt_MBE(Imbe_Fence) );
12897         DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
12898         goto decode_success;
12899      }
12900
12901      /* 0F AE /7 (mem) = CLFLUSH -- flush cache line */
12902      if (haveNo66noF2noF3(pfx)
12903          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
12904          && sz == 4) {
12905
12906         /* This is something of a hack.  We need to know the size of
12907            the cache line containing addr.  Since we can't easily tell,
12908            assume 256 on the basis that no real cache would have a
12909            line that big.  It's safe to invalidate more stuff than we
12910            need, just inefficient. */
12911         ULong lineszB = 256ULL;
12912
12913         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12914         delta += alen;
12915
12916         /* Round addr down to the start of the containing block. */
12917         stmt( IRStmt_Put(
12918                  OFFB_TISTART,
12919                  binop( Iop_And64,
12920                         mkexpr(addr),
12921                         mkU64( ~(lineszB-1) ))) );
12922
12923         stmt( IRStmt_Put(OFFB_TILEN, mkU64(lineszB) ) );
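         /* E.g. addr 0x1234 gives TISTART 0x1200 and TILEN 256, so
            the range [0x1200, 0x1300) is invalidated -- a superset of
            any real cache line containing 0x1234. */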
12924
12925         jmp_lit(dres, Ijk_TInval, (Addr64)(guest_RIP_bbstart+delta));
12926
12927         DIP("clflush %s\n", dis_buf);
12928         goto decode_success;
12929      }
12930
12931      /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
12932      if (haveNo66noF2noF3(pfx)
12933          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
12934          && sz == 4) {
12935         delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
12936         goto decode_success;
12937      }
12938      /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
12939      if (haveNo66noF2noF3(pfx)
12940          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
12941          && sz == 4) {
12942         delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
12943         goto decode_success;
12944      }
12945      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
12946         Note that the presence or absence of REX.W slightly affects the
12947         written format: whether the saved FPU IP and DP pointers are 64
12948         or 32 bits.  But the helper function we call simply writes zero
12949         bits in the relevant fields (which are 64 bits regardless of
12950         what REX.W is) and so it's good enough (iow, equally broken) in
12951         both cases. */
12952      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
12953          && !epartIsReg(getUChar(delta))
12954          && gregOfRexRM(pfx,getUChar(delta)) == 0) {
12955         IRDirty* d;
12956         modrm = getUChar(delta);
12957         vassert(!epartIsReg(modrm));
12958
12959         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12960         delta += alen;
12961         gen_SEGV_if_not_16_aligned(addr);
12962
12963         DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
12964
12965         /* Uses dirty helper:
12966               void amd64g_do_FXSAVE ( VexGuestAMD64State*, ULong ) */
12967         d = unsafeIRDirty_0_N (
12968                0/*regparms*/,
12969                "amd64g_dirtyhelper_FXSAVE",
12970                &amd64g_dirtyhelper_FXSAVE,
12971                mkIRExprVec_1( mkexpr(addr) )
12972             );
12973         d->needsBBP = True;
12974
12975         /* declare we're writing memory */
12976         d->mFx   = Ifx_Write;
12977         d->mAddr = mkexpr(addr);
12978         d->mSize = 464; /* according to recent Intel docs */
12979
12980         /* declare we're reading guest state */
12981         d->nFxState = 7;
12982         vex_bzero(&d->fxState, sizeof(d->fxState));
12983
12984         d->fxState[0].fx     = Ifx_Read;
12985         d->fxState[0].offset = OFFB_FTOP;
12986         d->fxState[0].size   = sizeof(UInt);
12987
12988         d->fxState[1].fx     = Ifx_Read;
12989         d->fxState[1].offset = OFFB_FPREGS;
12990         d->fxState[1].size   = 8 * sizeof(ULong);
12991
12992         d->fxState[2].fx     = Ifx_Read;
12993         d->fxState[2].offset = OFFB_FPTAGS;
12994         d->fxState[2].size   = 8 * sizeof(UChar);
12995
12996         d->fxState[3].fx     = Ifx_Read;
12997         d->fxState[3].offset = OFFB_FPROUND;
12998         d->fxState[3].size   = sizeof(ULong);
12999
13000         d->fxState[4].fx     = Ifx_Read;
13001         d->fxState[4].offset = OFFB_FC3210;
13002         d->fxState[4].size   = sizeof(ULong);
13003
13004         d->fxState[5].fx     = Ifx_Read;
13005         d->fxState[5].offset = OFFB_YMM0;
13006         d->fxState[5].size   = sizeof(U128);
13007         /* plus 15 more of the above, spaced out in YMM sized steps */
13008         d->fxState[5].nRepeats  = 15;
13009         d->fxState[5].repeatLen = sizeof(U256);
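         /* So fxState[5] describes 16 slots in all: the low 16 bytes
            of each of YMM0..YMM15, at offsets OFFB_YMM0 + k*32 for
            k = 0 .. 15. */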
13010
13011         d->fxState[6].fx     = Ifx_Read;
13012         d->fxState[6].offset = OFFB_SSEROUND;
13013         d->fxState[6].size   = sizeof(ULong);
13014
13015         /* Be paranoid ... this assertion tries to ensure the 16 %ymm
13016            images are packed back-to-back.  If not, the settings for
13017            d->fxState[5] are wrong. */
13018         vassert(32 == sizeof(U256));
13019         vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));
13020
13021         stmt( IRStmt_Dirty(d) );
13022
13023         goto decode_success;
13024      }
13025      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
13026         As with FXSAVE above we ignore the value of REX.W since we're
13027         not bothering with the FPU DP and IP fields. */
13028      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
13029          && !epartIsReg(getUChar(delta))
13030          && gregOfRexRM(pfx,getUChar(delta)) == 1) {
13031         IRDirty* d;
13032         modrm = getUChar(delta);
13033         vassert(!epartIsReg(modrm));
13034
13035         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13036         delta += alen;
13037         gen_SEGV_if_not_16_aligned(addr);
13038
13039         DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
13040
13041         /* Uses dirty helper:
13042               VexEmWarn amd64g_do_FXRSTOR ( VexGuestAMD64State*, ULong )
13043            NOTE:
13044               the VexEmWarn value is simply ignored
13045         */
13046         d = unsafeIRDirty_0_N (
13047                0/*regparms*/,
13048                "amd64g_dirtyhelper_FXRSTOR",
13049                &amd64g_dirtyhelper_FXRSTOR,
13050                mkIRExprVec_1( mkexpr(addr) )
13051             );
13052         d->needsBBP = True;
13053
13054         /* declare we're reading memory */
13055         d->mFx   = Ifx_Read;
13056         d->mAddr = mkexpr(addr);
13057         d->mSize = 464; /* according to recent Intel docs */
13058
13059         /* declare we're writing guest state */
13060         d->nFxState = 7;
13061         vex_bzero(&d->fxState, sizeof(d->fxState));
13062
13063         d->fxState[0].fx     = Ifx_Write;
13064         d->fxState[0].offset = OFFB_FTOP;
13065         d->fxState[0].size   = sizeof(UInt);
13066
13067         d->fxState[1].fx     = Ifx_Write;
13068         d->fxState[1].offset = OFFB_FPREGS;
13069         d->fxState[1].size   = 8 * sizeof(ULong);
13070
13071         d->fxState[2].fx     = Ifx_Write;
13072         d->fxState[2].offset = OFFB_FPTAGS;
13073         d->fxState[2].size   = 8 * sizeof(UChar);
13074
13075         d->fxState[3].fx     = Ifx_Write;
13076         d->fxState[3].offset = OFFB_FPROUND;
13077         d->fxState[3].size   = sizeof(ULong);
13078
13079         d->fxState[4].fx     = Ifx_Write;
13080         d->fxState[4].offset = OFFB_FC3210;
13081         d->fxState[4].size   = sizeof(ULong);
13082
13083         d->fxState[5].fx     = Ifx_Write;
13084         d->fxState[5].offset = OFFB_YMM0;
13085         d->fxState[5].size   = sizeof(U128);
13086         /* plus 15 more of the above, spaced out in YMM sized steps */
13087         d->fxState[5].nRepeats  = 15;
13088         d->fxState[5].repeatLen = sizeof(U256);
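         /* As for FXSAVE: 16 slots of 16 bytes each, 32 bytes apart,
            covering YMM0..YMM15. */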
13089
13090         d->fxState[6].fx     = Ifx_Write;
13091         d->fxState[6].offset = OFFB_SSEROUND;
13092         d->fxState[6].size   = sizeof(ULong);
13093
13094         /* Be paranoid ... this assertion tries to ensure the 16 %ymm
13095            images are packed back-to-back.  If not, the settings for
13096            d->fxState[5] are wrong. */
13097         vassert(32 == sizeof(U256));
13098         vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));
13099
13100         stmt( IRStmt_Dirty(d) );
13101
13102         goto decode_success;
13103      }
13104      break;
13105
13106   case 0xC2:
13107      /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
13108      if (haveNo66noF2noF3(pfx) && sz == 4) {
13109         Long delta0 = delta;
13110         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
13111         if (delta > delta0) goto decode_success;
13112      }
13113      /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
13114      if (haveF3no66noF2(pfx) && sz == 4) {
13115         Long delta0 = delta;
13116         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
13117         if (delta > delta0) goto decode_success;
13118      }
13119      /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
13120      if (haveF2no66noF3(pfx) && sz == 4) {
13121         Long delta0 = delta;
13122         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
13123         if (delta > delta0) goto decode_success;
13124      }
13125      /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
13126      if (have66noF2noF3(pfx) && sz == 2) {
13127         Long delta0 = delta;
13128         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
13129         if (delta > delta0) goto decode_success;
13130      }
13131      break;
13132
13133   case 0xC3:
13134      /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
13135      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
13136         modrm = getUChar(delta);
13137         if (!epartIsReg(modrm)) {
13138            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13139            storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
13140            DIP("movnti %s,%s\n", dis_buf,
13141                                  nameIRegG(sz, pfx, modrm));
13142            delta += alen;
13143            goto decode_success;
13144         }
13145         /* else fall through */
13146      }
13147      break;
13148
13149   case 0xC4:
13150      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13151      /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
13152         put it into the specified lane of mmx(G). */
13153      if (haveNo66noF2noF3(pfx)
13154          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13155         /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
13156            mmx reg.  t4 is the new lane value.  t5 is the original
13157            mmx value. t6 is the new mmx value. */
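         /* E.g. lane==2 with t5 = 0xDDDDCCCCBBBBAAAA and t4 = 0x1234
            yields t6 = 0xDDDD1234BBBBAAAA. */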
13158         Int lane;
13159         t4 = newTemp(Ity_I16);
13160         t5 = newTemp(Ity_I64);
13161         t6 = newTemp(Ity_I64);
13162         modrm = getUChar(delta);
13163         do_MMX_preamble();
13164
13165         assign(t5, getMMXReg(gregLO3ofRM(modrm)));
13166         breakup64to16s( t5, &t3, &t2, &t1, &t0 );
13167
13168         if (epartIsReg(modrm)) {
13169            assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
13170            delta += 1+1;
13171            lane = getUChar(delta-1);
13172            DIP("pinsrw $%d,%s,%s\n", (Int)lane,
13173                                      nameIReg16(eregOfRexRM(pfx,modrm)),
13174                                      nameMMXReg(gregLO3ofRM(modrm)));
13175         } else {
13176            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
13177            delta += 1+alen;
13178            lane = getUChar(delta-1);
13179            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
13180            DIP("pinsrw $%d,%s,%s\n", (Int)lane,
13181                                      dis_buf,
13182                                      nameMMXReg(gregLO3ofRM(modrm)));
13183         }
13184
13185         switch (lane & 3) {
13186            case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
13187            case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
13188            case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
13189            case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
13190            default: vassert(0);
13191         }
13192         putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
13193         goto decode_success;
13194      }
13195      /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
13196         put it into the specified lane of xmm(G). */
13197      if (have66noF2noF3(pfx)
13198          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13199         Int lane;
13200         t4 = newTemp(Ity_I16);
13201         modrm = getUChar(delta);
13202         UInt rG = gregOfRexRM(pfx,modrm);
13203         if (epartIsReg(modrm)) {
13204            UInt rE = eregOfRexRM(pfx,modrm);
13205            assign(t4, getIReg16(rE));
13206            delta += 1+1;
13207            lane = getUChar(delta-1);
13208            DIP("pinsrw $%d,%s,%s\n",
13209                (Int)lane, nameIReg16(rE), nameXMMReg(rG));
13210         } else {
13211            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
13212                              1/*byte after the amode*/ );
13213            delta += 1+alen;
13214            lane = getUChar(delta-1);
13215            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
13216            DIP("pinsrw $%d,%s,%s\n",
13217                (Int)lane, dis_buf, nameXMMReg(rG));
13218         }
13219         IRTemp src_vec = newTemp(Ity_V128);
13220         assign(src_vec, getXMMReg(rG));
13221         IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
13222         putXMMReg(rG, mkexpr(res_vec));
13223         goto decode_success;
13224      }
13225      break;
13226
13227   case 0xC5:
13228      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13229      /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
13230         zero-extend of it in ireg(G). */
13231      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
13232         modrm = getUChar(delta);
13233         if (epartIsReg(modrm)) {
13234            IRTemp sV = newTemp(Ity_I64);
13235            t5 = newTemp(Ity_I16);
13236            do_MMX_preamble();
13237            assign(sV, getMMXReg(eregLO3ofRM(modrm)));
13238            breakup64to16s( sV, &t3, &t2, &t1, &t0 );
13239            switch (getUChar(delta+1) & 3) {
13240               case 0:  assign(t5, mkexpr(t0)); break;
13241               case 1:  assign(t5, mkexpr(t1)); break;
13242               case 2:  assign(t5, mkexpr(t2)); break;
13243               case 3:  assign(t5, mkexpr(t3)); break;
13244               default: vassert(0);
13245            }
13246            if (sz == 8)
13247               putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
13248            else
13249               putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
13250            DIP("pextrw $%d,%s,%s\n",
13251                (Int)getUChar(delta+1),
13252                nameMMXReg(eregLO3ofRM(modrm)),
13253                sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
13254                      : nameIReg32(gregOfRexRM(pfx,modrm))
13255            );
13256            delta += 2;
13257            goto decode_success;
13258         }
13259         /* else fall through */
13260         /* note, for anyone filling in the mem case: this insn has one
13261            byte after the amode and therefore you must pass 1 as the
13262            last arg to disAMode */
13263      }
13264      /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
13265         zero-extend of it in ireg(G). */
13266      if (have66noF2noF3(pfx)
13267          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13268         Long delta0 = delta;
13269         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
13270                                              False/*!isAvx*/ );
13271         if (delta > delta0) goto decode_success;
13272         /* else fall through -- decoding has failed */
13273      }
13274      break;
13275
13276   case 0xC6:
13277      /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
13278      if (haveNo66noF2noF3(pfx) && sz == 4) {
13279         Int    imm8 = 0;
13280         IRTemp sV   = newTemp(Ity_V128);
13281         IRTemp dV   = newTemp(Ity_V128);
13282         modrm = getUChar(delta);
13283         UInt rG = gregOfRexRM(pfx,modrm);
13284         assign( dV, getXMMReg(rG) );
13285         if (epartIsReg(modrm)) {
13286            UInt rE = eregOfRexRM(pfx,modrm);
13287            assign( sV, getXMMReg(rE) );
13288            imm8 = (Int)getUChar(delta+1);
13289            delta += 1+1;
13290            DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
13291         } else {
13292            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
13293            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13294            imm8 = (Int)getUChar(delta+alen);
13295            delta += 1+alen;
13296            DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
13297         }
13298         IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
13299         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
13300         goto decode_success;
13301      }
13302      /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
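      /* Only imm8 bits 1:0 are used: bit 0 selects the low result
         lane from dV, bit 1 the high result lane from sV.
         Illustratively, select = 2 = 10b gives lo = d0, hi = s1. */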
13303      if (have66noF2noF3(pfx) && sz == 2) {
13304         Int    select;
13305         IRTemp sV = newTemp(Ity_V128);
13306         IRTemp dV = newTemp(Ity_V128);
13307
13308         modrm = getUChar(delta);
13309         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
13310
13311         if (epartIsReg(modrm)) {
13312            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
13313            select = (Int)getUChar(delta+1);
13314            delta += 1+1;
13315            DIP("shufpd $%d,%s,%s\n", select,
13316                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
13317                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
13318         } else {
13319            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
13320            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13321            select = getUChar(delta+alen);
13322            delta += 1+alen;
13323            DIP("shufpd $%d,%s,%s\n", select,
13324                                      dis_buf,
13325                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
13326         }
13327
13328         IRTemp res = math_SHUFPD_128( sV, dV, select );
13329         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
13330         goto decode_success;
13331      }
13332      break;
13333
13334   case 0xD1:
13335      /* 66 0F D1 = PSRLW by E */
13336      if (have66noF2noF3(pfx) && sz == 2) {
13337         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
13338         goto decode_success;
13339      }
13340      break;
13341
13342   case 0xD2:
13343      /* 66 0F D2 = PSRLD by E */
13344      if (have66noF2noF3(pfx) && sz == 2) {
13345         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
13346         goto decode_success;
13347      }
13348      break;
13349
13350   case 0xD3:
13351      /* 66 0F D3 = PSRLQ by E */
13352      if (have66noF2noF3(pfx) && sz == 2) {
13353         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
13354         goto decode_success;
13355      }
13356      break;
13357
13358   case 0xD4:
13359      /* 66 0F D4 = PADDQ */
13360      if (have66noF2noF3(pfx) && sz == 2) {
13361         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13362                                    "paddq", Iop_Add64x2, False );
13363         goto decode_success;
13364      }
13365      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
13366      /* 0F D4 = PADDQ -- add 64x1 */
13367      if (haveNo66noF2noF3(pfx) && sz == 4) {
13368         do_MMX_preamble();
13369         delta = dis_MMXop_regmem_to_reg (
13370                   vbi, pfx, delta, opc, "paddq", False );
13371         goto decode_success;
13372      }
13373      break;
13374
13375   case 0xD5:
13376      /* 66 0F D5 = PMULLW -- 16x8 multiply */
13377      if (have66noF2noF3(pfx) && sz == 2) {
13378         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13379                                    "pmullw", Iop_Mul16x8, False );
13380         goto decode_success;
13381      }
13382      break;
13383
13384   case 0xD6:
13385      /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
13386         hi half). */
13387      if (haveF3no66noF2(pfx) && sz == 4) {
13388         modrm = getUChar(delta);
13389         if (epartIsReg(modrm)) {
13390            do_MMX_preamble();
13391            putXMMReg( gregOfRexRM(pfx,modrm),
13392                       unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
13393            DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
13394                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
13395            delta += 1;
13396            goto decode_success;
13397         }
13398         /* apparently no mem case for this insn */
13399      }
13400      /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
13401         or lo half xmm).  */
13402      if (have66noF2noF3(pfx)
13403          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13404         modrm = getUChar(delta);
13405         if (epartIsReg(modrm)) {
13406            /* fall through, awaiting test case */
13407            /* dst: lo half copied, hi half zeroed */
13408         } else {
13409            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13410            storeLE( mkexpr(addr),
13411                     getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
13412            DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
13413            delta += alen;
13414            goto decode_success;
13415         }
13416      }
13417      /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
13418      if (haveF2no66noF3(pfx) && sz == 4) {
13419         modrm = getUChar(delta);
13420         if (epartIsReg(modrm)) {
13421            do_MMX_preamble();
13422            putMMXReg( gregLO3ofRM(modrm),
13423                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
13424            DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13425                                   nameMMXReg(gregLO3ofRM(modrm)));
13426            delta += 1;
13427            goto decode_success;
13428         }
13429         /* apparently no mem case for this insn */
13430      }
13431      break;
13432
13433   case 0xD7:
13434      /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
         lanes in xmm(E), turn them into a 16-bit value, and put
13436         zero-extend of it in ireg(G).  Doing this directly is just
13437         too cumbersome; give up therefore and call a helper. */
13438      if (have66noF2noF3(pfx)
13439          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
13440          && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
13441         delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
13442         goto decode_success;
13443      }
13444      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13445      /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
         mmx(E), turn them into a byte, and put zero-extend of it in
13447         ireg(G). */
13448      if (haveNo66noF2noF3(pfx)
13449          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13450         modrm = getUChar(delta);
13451         if (epartIsReg(modrm)) {
13452            do_MMX_preamble();
13453            t0 = newTemp(Ity_I64);
13454            t1 = newTemp(Ity_I64);
13455            assign(t0, getMMXReg(eregLO3ofRM(modrm)));
13456            assign(t1, mkIRExprCCall(
13457                          Ity_I64, 0/*regparms*/,
13458                          "amd64g_calculate_mmx_pmovmskb",
13459                          &amd64g_calculate_mmx_pmovmskb,
13460                          mkIRExprVec_1(mkexpr(t0))));
13461            putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_64to32,mkexpr(t1)));
13462            DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
13463                                    nameIReg32(gregOfRexRM(pfx,modrm)));
13464            delta += 1;
13465            goto decode_success;
13466         }
13467         /* else fall through */
13468      }
13469      break;
13470
13471   case 0xD8:
13472      /* 66 0F D8 = PSUBUSB */
13473      if (have66noF2noF3(pfx) && sz == 2) {
13474         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13475                                    "psubusb", Iop_QSub8Ux16, False );
13476         goto decode_success;
13477      }
13478      break;
13479
13480   case 0xD9:
13481      /* 66 0F D9 = PSUBUSW */
13482      if (have66noF2noF3(pfx) && sz == 2) {
13483         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13484                                    "psubusw", Iop_QSub16Ux8, False );
13485         goto decode_success;
13486      }
13487      break;
13488
13489   case 0xDA:
13490      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13491      /* 0F DA = PMINUB -- 8x8 unsigned min */
13492      if (haveNo66noF2noF3(pfx) && sz == 4) {
13493         do_MMX_preamble();
13494         delta = dis_MMXop_regmem_to_reg (
13495                    vbi, pfx, delta, opc, "pminub", False );
13496         goto decode_success;
13497      }
13498      /* 66 0F DA = PMINUB -- 8x16 unsigned min */
13499      if (have66noF2noF3(pfx) && sz == 2) {
13500         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13501                                    "pminub", Iop_Min8Ux16, False );
13502         goto decode_success;
13503      }
13504      break;
13505
13506   case 0xDB:
13507      /* 66 0F DB = PAND */
13508      if (have66noF2noF3(pfx) && sz == 2) {
13509         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
13510         goto decode_success;
13511      }
13512      break;
13513
13514   case 0xDC:
13515      /* 66 0F DC = PADDUSB */
13516      if (have66noF2noF3(pfx) && sz == 2) {
13517         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13518                                    "paddusb", Iop_QAdd8Ux16, False );
13519         goto decode_success;
13520      }
13521      break;
13522
13523   case 0xDD:
13524      /* 66 0F DD = PADDUSW */
13525      if (have66noF2noF3(pfx) && sz == 2) {
13526         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13527                                    "paddusw", Iop_QAdd16Ux8, False );
13528         goto decode_success;
13529      }
13530      break;
13531
13532   case 0xDE:
13533      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13534      /* 0F DE = PMAXUB -- 8x8 unsigned max */
13535      if (haveNo66noF2noF3(pfx) && sz == 4) {
13536         do_MMX_preamble();
13537         delta = dis_MMXop_regmem_to_reg (
13538                    vbi, pfx, delta, opc, "pmaxub", False );
13539         goto decode_success;
13540      }
13541      /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
13542      if (have66noF2noF3(pfx) && sz == 2) {
13543         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13544                                    "pmaxub", Iop_Max8Ux16, False );
13545         goto decode_success;
13546      }
13547      break;
13548
13549   case 0xDF:
13550      /* 66 0F DF = PANDN */
13551      if (have66noF2noF3(pfx) && sz == 2) {
13552         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
13553         goto decode_success;
13554      }
13555      break;
13556
13557   case 0xE0:
13558      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13559      /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
13560      if (haveNo66noF2noF3(pfx) && sz == 4) {
13561         do_MMX_preamble();
13562         delta = dis_MMXop_regmem_to_reg (
13563                    vbi, pfx, delta, opc, "pavgb", False );
13564         goto decode_success;
13565      }
13566      /* 66 0F E0 = PAVGB */
13567      if (have66noF2noF3(pfx) && sz == 2) {
13568         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13569                                    "pavgb", Iop_Avg8Ux16, False );
13570         goto decode_success;
13571      }
13572      break;
13573
13574   case 0xE1:
13575      /* 66 0F E1 = PSRAW by E */
13576      if (have66noF2noF3(pfx) && sz == 2) {
13577         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
13578         goto decode_success;
13579      }
13580      break;
13581
13582   case 0xE2:
13583      /* 66 0F E2 = PSRAD by E */
13584      if (have66noF2noF3(pfx) && sz == 2) {
13585         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
13586         goto decode_success;
13587      }
13588      break;
13589
13590   case 0xE3:
13591      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13592      /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
13593      if (haveNo66noF2noF3(pfx) && sz == 4) {
13594         do_MMX_preamble();
13595         delta = dis_MMXop_regmem_to_reg (
13596                    vbi, pfx, delta, opc, "pavgw", False );
13597         goto decode_success;
13598      }
13599      /* 66 0F E3 = PAVGW */
13600      if (have66noF2noF3(pfx) && sz == 2) {
13601         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13602                                    "pavgw", Iop_Avg16Ux8, False );
13603         goto decode_success;
13604      }
13605      break;
13606
13607   case 0xE4:
13608      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
13610      if (haveNo66noF2noF3(pfx) && sz == 4) {
13611         do_MMX_preamble();
13612         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pmulhuw", False );
13614         goto decode_success;
13615      }
13616      /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
13617      if (have66noF2noF3(pfx) && sz == 2) {
13618         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13619                                    "pmulhuw", Iop_MulHi16Ux8, False );
13620         goto decode_success;
13621      }
13622      break;
13623
13624   case 0xE5:
13625      /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
13626      if (have66noF2noF3(pfx) && sz == 2) {
13627         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13628                                    "pmulhw", Iop_MulHi16Sx8, False );
13629         goto decode_success;
13630      }
13631      break;
13632
13633   case 0xE6:
13634      /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
13635         lo half xmm(G), and zero upper half, rounding towards zero */
13636      /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
13637         lo half xmm(G), according to prevailing rounding mode, and zero
13638         upper half */
13639      if ( (haveF2no66noF3(pfx) && sz == 4)
13640           || (have66noF2noF3(pfx) && sz == 2) ) {
13641         delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
13642                                    toBool(sz == 2)/*r2zero*/);
13643         goto decode_success;
13644      }
13645      /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
13646         F64 in xmm(G) */
13647      if (haveF3no66noF2(pfx) && sz == 4) {
13648         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
13649         goto decode_success;
13650      }
13651      break;
13652
13653   case 0xE7:
13654      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13655      /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
13656         Intel manual does not say anything about the usual business of
13657         the FP reg tags getting trashed whenever an MMX insn happens.
13658         So we just leave them alone.
13659      */
13660      if (haveNo66noF2noF3(pfx) && sz == 4) {
13661         modrm = getUChar(delta);
13662         if (!epartIsReg(modrm)) {
13663            /* do_MMX_preamble(); Intel docs don't specify this */
13664            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13665            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
13666            DIP("movntq %s,%s\n", dis_buf,
13667                                  nameMMXReg(gregLO3ofRM(modrm)));
13668            delta += alen;
13669            goto decode_success;
13670         }
13671         /* else fall through */
13672      }
13673      /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
13674      if (have66noF2noF3(pfx) && sz == 2) {
13675         modrm = getUChar(delta);
13676         if (!epartIsReg(modrm)) {
13677            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13678            gen_SEGV_if_not_16_aligned( addr );
13679            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
13680            DIP("movntdq %s,%s\n", dis_buf,
13681                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
13682            delta += alen;
13683            goto decode_success;
13684         }
13685         /* else fall through */
13686      }
13687      break;
13688
13689   case 0xE8:
13690      /* 66 0F E8 = PSUBSB */
13691      if (have66noF2noF3(pfx) && sz == 2) {
13692         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13693                                    "psubsb", Iop_QSub8Sx16, False );
13694         goto decode_success;
13695      }
13696      break;
13697
13698   case 0xE9:
13699      /* 66 0F E9 = PSUBSW */
13700      if (have66noF2noF3(pfx) && sz == 2) {
13701         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13702                                    "psubsw", Iop_QSub16Sx8, False );
13703         goto decode_success;
13704      }
13705      break;
13706
13707   case 0xEA:
13708      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13709      /* 0F EA = PMINSW -- 16x4 signed min */
13710      if (haveNo66noF2noF3(pfx) && sz == 4) {
13711         do_MMX_preamble();
13712         delta = dis_MMXop_regmem_to_reg (
13713                    vbi, pfx, delta, opc, "pminsw", False );
13714         goto decode_success;
13715      }
13716      /* 66 0F EA = PMINSW -- 16x8 signed min */
13717      if (have66noF2noF3(pfx) && sz == 2) {
13718         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13719                                    "pminsw", Iop_Min16Sx8, False );
13720         goto decode_success;
13721      }
13722      break;
13723
13724   case 0xEB:
13725      /* 66 0F EB = POR */
13726      if (have66noF2noF3(pfx) && sz == 2) {
13727         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
13728         goto decode_success;
13729      }
13730      break;
13731
13732   case 0xEC:
13733      /* 66 0F EC = PADDSB */
13734      if (have66noF2noF3(pfx) && sz == 2) {
13735         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13736                                    "paddsb", Iop_QAdd8Sx16, False );
13737         goto decode_success;
13738      }
13739      break;
13740
13741   case 0xED:
13742      /* 66 0F ED = PADDSW */
13743      if (have66noF2noF3(pfx) && sz == 2) {
13744         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13745                                    "paddsw", Iop_QAdd16Sx8, False );
13746         goto decode_success;
13747      }
13748      break;
13749
13750   case 0xEE:
13751      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13752      /* 0F EE = PMAXSW -- 16x4 signed max */
13753      if (haveNo66noF2noF3(pfx) && sz == 4) {
13754         do_MMX_preamble();
13755         delta = dis_MMXop_regmem_to_reg (
13756                    vbi, pfx, delta, opc, "pmaxsw", False );
13757         goto decode_success;
13758      }
13759      /* 66 0F EE = PMAXSW -- 16x8 signed max */
13760      if (have66noF2noF3(pfx) && sz == 2) {
13761         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13762                                    "pmaxsw", Iop_Max16Sx8, False );
13763         goto decode_success;
13764      }
13765      break;
13766
13767   case 0xEF:
13768      /* 66 0F EF = PXOR */
13769      if (have66noF2noF3(pfx) && sz == 2) {
13770         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
13771         goto decode_success;
13772      }
13773      break;
13774
13775   case 0xF1:
13776      /* 66 0F F1 = PSLLW by E */
13777      if (have66noF2noF3(pfx) && sz == 2) {
13778         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
13779         goto decode_success;
13780      }
13781      break;
13782
13783   case 0xF2:
13784      /* 66 0F F2 = PSLLD by E */
13785      if (have66noF2noF3(pfx) && sz == 2) {
13786         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
13787         goto decode_success;
13788      }
13789      break;
13790
13791   case 0xF3:
13792      /* 66 0F F3 = PSLLQ by E */
13793      if (have66noF2noF3(pfx) && sz == 2) {
13794         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
13795         goto decode_success;
13796      }
13797      break;
13798
13799   case 0xF4:
      /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-bit
         lanes 0 x 0 to form the lower 64-bit half and lanes 2 x 2 to
         form the upper 64-bit half */
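      /* Lane arithmetic, informally:
            res[63:0]   = (ULong)dV[31:0]  * (ULong)sV[31:0]
            res[127:64] = (ULong)dV[95:64] * (ULong)sV[95:64] */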
13803      if (have66noF2noF3(pfx) && sz == 2) {
13804         IRTemp sV = newTemp(Ity_V128);
13805         IRTemp dV = newTemp(Ity_V128);
13806         modrm = getUChar(delta);
13807         UInt rG = gregOfRexRM(pfx,modrm);
13808         assign( dV, getXMMReg(rG) );
13809         if (epartIsReg(modrm)) {
13810            UInt rE = eregOfRexRM(pfx,modrm);
13811            assign( sV, getXMMReg(rE) );
13812            delta += 1;
13813            DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
13814         } else {
13815            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13816            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13817            delta += alen;
13818            DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
13819         }
13820         putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
13821         goto decode_success;
13822      }
13823      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
      /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-bit lanes
         0 x 0 to form the 64-bit result */
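      /* i.e., informally: res[63:0] = (ULong)d[31:0] * (ULong)s[31:0];
         the Iop_MullU32 below is exactly this widening multiply. */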
13826      if (haveNo66noF2noF3(pfx) && sz == 4) {
13827         IRTemp sV = newTemp(Ity_I64);
13828         IRTemp dV = newTemp(Ity_I64);
13829         t1 = newTemp(Ity_I32);
13830         t0 = newTemp(Ity_I32);
13831         modrm = getUChar(delta);
13832
13833         do_MMX_preamble();
13834         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
13835
13836         if (epartIsReg(modrm)) {
13837            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
13838            delta += 1;
13839            DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
13840                                   nameMMXReg(gregLO3ofRM(modrm)));
13841         } else {
13842            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13843            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
13844            delta += alen;
13845            DIP("pmuludq %s,%s\n", dis_buf,
13846                                   nameMMXReg(gregLO3ofRM(modrm)));
13847         }
13848
13849         assign( t0, unop(Iop_64to32, mkexpr(dV)) );
13850         assign( t1, unop(Iop_64to32, mkexpr(sV)) );
13851         putMMXReg( gregLO3ofRM(modrm),
13852                    binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
13853         goto decode_success;
13854      }
13855      break;
13856
13857   case 0xF5:
13858      /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
13859         E(xmm or mem) to G(xmm) */
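      /* Per 32-bit result lane i, informally:
            res[i] = (Int)d16[2i]   * (Int)s16[2i]
                   + (Int)d16[2i+1] * (Int)s16[2i+1]
         with all 16-bit lanes treated as signed. */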
13860      if (have66noF2noF3(pfx) && sz == 2) {
13861         IRTemp sV = newTemp(Ity_V128);
13862         IRTemp dV = newTemp(Ity_V128);
13863         modrm     = getUChar(delta);
13864         UInt   rG = gregOfRexRM(pfx,modrm);
13865         if (epartIsReg(modrm)) {
13866            UInt rE = eregOfRexRM(pfx,modrm);
13867            assign( sV, getXMMReg(rE) );
13868            delta += 1;
13869            DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
13870         } else {
13871            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13872            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13873            delta += alen;
13874            DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
13875         }
13876         assign( dV, getXMMReg(rG) );
13877         putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
13878         goto decode_success;
13879      }
13880      break;
13881
13882   case 0xF6:
13883      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13884      /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
13885      if (haveNo66noF2noF3(pfx) && sz == 4) {
13886         do_MMX_preamble();
13887         delta = dis_MMXop_regmem_to_reg (
13888                    vbi, pfx, delta, opc, "psadbw", False );
13889         goto decode_success;
13890      }
13891      /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
13892         from E(xmm or mem) to G(xmm) */
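      /* Per 64-bit half, informally:
            lo16 = SUM_{i=0..7} abs( (UInt)d8[i] - (UInt)s8[i] )
         with the remaining 48 bits of that half zeroed -- hence the
         "48 zeroes ++ u16" shorthand above. */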
13893      if (have66noF2noF3(pfx) && sz == 2) {
13894         IRTemp sV  = newTemp(Ity_V128);
13895         IRTemp dV  = newTemp(Ity_V128);
13896         modrm = getUChar(delta);
13897         UInt   rG   = gregOfRexRM(pfx,modrm);
13898         if (epartIsReg(modrm)) {
13899            UInt rE = eregOfRexRM(pfx,modrm);
13900            assign( sV, getXMMReg(rE) );
13901            delta += 1;
13902            DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
13903         } else {
13904            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13905            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13906            delta += alen;
13907            DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
13908         }
13909         assign( dV, getXMMReg(rG) );
13910         putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
13911
13912         goto decode_success;
13913      }
13914      break;
13915
13916   case 0xF7:
13917      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13918      /* 0F F7 = MASKMOVQ -- 8x8 masked store */
13919      if (haveNo66noF2noF3(pfx) && sz == 4) {
13920         Bool ok = False;
13921         delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
13922         if (ok) goto decode_success;
13923      }
13924      /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
13925      if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
13926         delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
13927         goto decode_success;
13928      }
13929      break;
13930
13931   case 0xF8:
13932      /* 66 0F F8 = PSUBB */
13933      if (have66noF2noF3(pfx) && sz == 2) {
13934         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13935                                    "psubb", Iop_Sub8x16, False );
13936         goto decode_success;
13937      }
13938      break;
13939
13940   case 0xF9:
13941      /* 66 0F F9 = PSUBW */
13942      if (have66noF2noF3(pfx) && sz == 2) {
13943         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13944                                    "psubw", Iop_Sub16x8, False );
13945         goto decode_success;
13946      }
13947      break;
13948
13949   case 0xFA:
13950      /* 66 0F FA = PSUBD */
13951      if (have66noF2noF3(pfx) && sz == 2) {
13952         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13953                                    "psubd", Iop_Sub32x4, False );
13954         goto decode_success;
13955      }
13956      break;
13957
13958   case 0xFB:
13959      /* 66 0F FB = PSUBQ */
13960      if (have66noF2noF3(pfx) && sz == 2) {
13961         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13962                                    "psubq", Iop_Sub64x2, False );
13963         goto decode_success;
13964      }
13965      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
13966      /* 0F FB = PSUBQ -- sub 64x1 */
13967      if (haveNo66noF2noF3(pfx) && sz == 4) {
13968         do_MMX_preamble();
13969         delta = dis_MMXop_regmem_to_reg (
13970                   vbi, pfx, delta, opc, "psubq", False );
13971         goto decode_success;
13972      }
13973      break;
13974
13975   case 0xFC:
13976      /* 66 0F FC = PADDB */
13977      if (have66noF2noF3(pfx) && sz == 2) {
13978         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13979                                    "paddb", Iop_Add8x16, False );
13980         goto decode_success;
13981      }
13982      break;
13983
13984   case 0xFD:
13985      /* 66 0F FD = PADDW */
13986      if (have66noF2noF3(pfx) && sz == 2) {
13987         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13988                                    "paddw", Iop_Add16x8, False );
13989         goto decode_success;
13990      }
13991      break;
13992
13993   case 0xFE:
13994      /* 66 0F FE = PADDD */
13995      if (have66noF2noF3(pfx) && sz == 2) {
13996         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13997                                    "paddd", Iop_Add32x4, False );
13998         goto decode_success;
13999      }
14000      break;
14001
14002   default:
14003      goto decode_failure;
14004
14005   }
14006
14007  decode_failure:
14008   *decode_OK = False;
14009   return deltaIN;
14010
14011  decode_success:
14012   *decode_OK = True;
14013   return delta;
14014}
14015
14016
14017/*------------------------------------------------------------*/
14018/*---                                                      ---*/
14019/*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
14020/*---                                                      ---*/
14021/*------------------------------------------------------------*/
14022
14023static Long dis_MOVDDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
14024                              Long delta, Bool isAvx )
14025{
14026   IRTemp addr   = IRTemp_INVALID;
14027   Int    alen   = 0;
14028   HChar  dis_buf[50];
14029   IRTemp sV    = newTemp(Ity_V128);
14030   IRTemp d0    = newTemp(Ity_I64);
14031   UChar  modrm = getUChar(delta);
14032   UInt   rG    = gregOfRexRM(pfx,modrm);
14033   if (epartIsReg(modrm)) {
14034      UInt rE = eregOfRexRM(pfx,modrm);
14035      assign( sV, getXMMReg(rE) );
14036      DIP("%smovddup %s,%s\n",
14037          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
14038      delta += 1;
14039      assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
14040   } else {
14041      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14042      assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
14043      DIP("%smovddup %s,%s\n",
14044          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
14045      delta += alen;
14046   }
14047   (isAvx ? putYMMRegLoAndZU : putXMMReg)
14048      ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
14049   return delta;
14050}
14051
14052
14053static Long dis_MOVDDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
14054                              Long delta )
14055{
14056   IRTemp addr   = IRTemp_INVALID;
14057   Int    alen   = 0;
14058   HChar  dis_buf[50];
14059   IRTemp d0    = newTemp(Ity_I64);
14060   IRTemp d1    = newTemp(Ity_I64);
14061   UChar  modrm = getUChar(delta);
14062   UInt   rG    = gregOfRexRM(pfx,modrm);
14063   if (epartIsReg(modrm)) {
14064      UInt rE = eregOfRexRM(pfx,modrm);
14065      DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
14066      delta += 1;
14067      assign ( d0, getYMMRegLane64(rE, 0) );
14068      assign ( d1, getYMMRegLane64(rE, 2) );
14069   } else {
14070      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14071      assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
14072      assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
14073                                        mkexpr(addr), mkU64(16))) );
14074      DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
14075      delta += alen;
14076   }
14077   putYMMRegLane64( rG, 0, mkexpr(d0) );
14078   putYMMRegLane64( rG, 1, mkexpr(d0) );
14079   putYMMRegLane64( rG, 2, mkexpr(d1) );
14080   putYMMRegLane64( rG, 3, mkexpr(d1) );
14081   return delta;
14082}
14083
14084
14085static Long dis_MOVSxDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
14086                               Long delta, Bool isAvx, Bool isL )
14087{
14088   IRTemp addr  = IRTemp_INVALID;
14089   Int    alen  = 0;
14090   HChar  dis_buf[50];
14091   IRTemp sV    = newTemp(Ity_V128);
14092   UChar  modrm = getUChar(delta);
14093   UInt   rG    = gregOfRexRM(pfx,modrm);
14094   IRTemp s3, s2, s1, s0;
14095   s3 = s2 = s1 = s0 = IRTemp_INVALID;
14096   if (epartIsReg(modrm)) {
14097      UInt rE = eregOfRexRM(pfx,modrm);
14098      assign( sV, getXMMReg(rE) );
14099      DIP("%smovs%cdup %s,%s\n",
14100          isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
14101      delta += 1;
14102   } else {
14103      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14104      if (!isAvx)
14105         gen_SEGV_if_not_16_aligned( addr );
14106      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14107      DIP("%smovs%cdup %s,%s\n",
14108          isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
14109      delta += alen;
14110   }
14111   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
14112   (isAvx ? putYMMRegLoAndZU : putXMMReg)
14113      ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
14114                : mkV128from32s( s3, s3, s1, s1 ) );
14115   return delta;
14116}
14117
14118
14119static Long dis_MOVSxDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
14120                               Long delta, Bool isL )
14121{
14122   IRTemp addr  = IRTemp_INVALID;
14123   Int    alen  = 0;
14124   HChar  dis_buf[50];
14125   IRTemp sV    = newTemp(Ity_V256);
14126   UChar  modrm = getUChar(delta);
14127   UInt   rG    = gregOfRexRM(pfx,modrm);
14128   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
14129   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
14130   if (epartIsReg(modrm)) {
14131      UInt rE = eregOfRexRM(pfx,modrm);
14132      assign( sV, getYMMReg(rE) );
14133      DIP("vmovs%cdup %s,%s\n",
14134          isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
14135      delta += 1;
14136   } else {
14137      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14138      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
14139      DIP("vmovs%cdup %s,%s\n",
14140          isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
14141      delta += alen;
14142   }
14143   breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
14144   putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
14145                                : mkV128from32s( s7, s7, s5, s5 ) );
14146   putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
14147                                : mkV128from32s( s3, s3, s1, s1 ) );
14148   return delta;
14149}
14150
14151
14152static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
14153{
14154   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
14155   IRTemp leftV  = newTemp(Ity_V128);
14156   IRTemp rightV = newTemp(Ity_V128);
14157   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
14158
14159   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
14160   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
14161
14162   assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
14163   assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
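
   /* Lanes written hi..lo: leftV = {s2,s0,d2,d0} and
      rightV = {s3,s1,d3,d1}, so
         res = { s2 op s3, s0 op s1, d2 op d3, d0 op d1 }
      where op is + or -, which is exactly the HADDPS/HSUBPS
      horizontal pairing. */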
14164
14165   IRTemp res = newTemp(Ity_V128);
14166   assign( res, binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
14167                              mkexpr(leftV), mkexpr(rightV) ) );
14168   return res;
14169}
14170
14171
14172static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
14173{
14174   IRTemp s1, s0, d1, d0;
14175   IRTemp leftV  = newTemp(Ity_V128);
14176   IRTemp rightV = newTemp(Ity_V128);
14177   s1 = s0 = d1 = d0 = IRTemp_INVALID;
14178
14179   breakupV128to64s( sV, &s1, &s0 );
14180   breakupV128to64s( dV, &d1, &d0 );
14181
14182   assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
14183   assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
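
   /* Lanes hi..lo: leftV = {s0,d0} and rightV = {s1,d1}, so
      res = { s0 op s1, d0 op d1 } -- the HADDPD/HSUBPD pairing. */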
14184
14185   IRTemp res = newTemp(Ity_V128);
14186   assign( res, binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
14187                              mkexpr(leftV), mkexpr(rightV) ) );
14188   return res;
14189}
14190
14191
14192__attribute__((noinline))
14193static
14194Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
14195                        VexAbiInfo* vbi,
14196                        Prefix pfx, Int sz, Long deltaIN )
14197{
14198   IRTemp addr  = IRTemp_INVALID;
14199   UChar  modrm = 0;
14200   Int    alen  = 0;
14201   HChar  dis_buf[50];
14202
14203   *decode_OK = False;
14204
14205   Long   delta = deltaIN;
14206   UChar  opc   = getUChar(delta);
14207   delta++;
14208   switch (opc) {
14209
14210   case 0x12:
14211      /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
14212         duplicating some lanes (2:2:0:0). */
14213      if (haveF3no66noF2(pfx) && sz == 4) {
14214         delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
14215                                   True/*isL*/ );
14216         goto decode_success;
14217      }
14218      /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
         duplicating some lanes (1:0:1:0). */
14220      if (haveF2no66noF3(pfx)
14221          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14222         delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
14223         goto decode_success;
14224      }
14225      break;
14226
14227   case 0x16:
14228      /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
14229         duplicating some lanes (3:3:1:1). */
14230      if (haveF3no66noF2(pfx) && sz == 4) {
14231         delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
14232                                   False/*!isL*/ );
14233         goto decode_success;
14234      }
14235      break;
14236
14237   case 0x7C:
14238   case 0x7D:
14239      /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
14240      /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
14241      if (haveF2no66noF3(pfx) && sz == 4) {
14242         IRTemp eV     = newTemp(Ity_V128);
14243         IRTemp gV     = newTemp(Ity_V128);
14244         Bool   isAdd  = opc == 0x7C;
14245         HChar* str    = isAdd ? "add" : "sub";
14246         modrm         = getUChar(delta);
14247         UInt   rG     = gregOfRexRM(pfx,modrm);
14248         if (epartIsReg(modrm)) {
14249            UInt rE = eregOfRexRM(pfx,modrm);
14250            assign( eV, getXMMReg(rE) );
14251            DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
14252            delta += 1;
14253         } else {
14254            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14255            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
14256            DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
14257            delta += alen;
14258         }
14259
14260         assign( gV, getXMMReg(rG) );
14261         putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
14262         goto decode_success;
14263      }
14264      /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
14265      /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
14266      if (have66noF2noF3(pfx) && sz == 2) {
14267         IRTemp eV     = newTemp(Ity_V128);
14268         IRTemp gV     = newTemp(Ity_V128);
14269         Bool   isAdd  = opc == 0x7C;
14270         HChar* str    = isAdd ? "add" : "sub";
14271         modrm         = getUChar(delta);
14272         UInt   rG     = gregOfRexRM(pfx,modrm);
14273         if (epartIsReg(modrm)) {
14274            UInt rE = eregOfRexRM(pfx,modrm);
14275            assign( eV, getXMMReg(rE) );
14276            DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
14277            delta += 1;
14278         } else {
14279            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14280            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
14281            DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
14282            delta += alen;
14283         }
14284
14285         assign( gV, getXMMReg(rG) );
14286         putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
14287         goto decode_success;
14288      }
14289      break;
14290
14291   case 0xD0:
      /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
14293      if (have66noF2noF3(pfx) && sz == 2) {
14294         IRTemp eV   = newTemp(Ity_V128);
14295         IRTemp gV   = newTemp(Ity_V128);
14296         modrm       = getUChar(delta);
14297         UInt   rG   = gregOfRexRM(pfx,modrm);
14298         if (epartIsReg(modrm)) {
14299            UInt rE = eregOfRexRM(pfx,modrm);
14300            assign( eV, getXMMReg(rE) );
14301            DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
14302            delta += 1;
14303         } else {
14304            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14305            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
14306            DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
14307            delta += alen;
14308         }
14309
14310         assign( gV, getXMMReg(rG) );
14311         putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
14312         goto decode_success;
14313      }
14314      /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
14315      if (haveF2no66noF3(pfx) && sz == 4) {
14316         IRTemp eV   = newTemp(Ity_V128);
14317         IRTemp gV   = newTemp(Ity_V128);
14318         modrm       = getUChar(delta);
14319         UInt   rG   = gregOfRexRM(pfx,modrm);
14322         if (epartIsReg(modrm)) {
14323            UInt rE = eregOfRexRM(pfx,modrm);
14324            assign( eV, getXMMReg(rE) );
14325            DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
14326            delta += 1;
14327         } else {
14328            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14329            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
14330            DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
14331            delta += alen;
14332         }
14333
14334         assign( gV, getXMMReg(rG) );
14335         putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
14336         goto decode_success;
14337      }
14338      break;
14339
14340   case 0xF0:
14341      /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
14342      if (haveF2no66noF3(pfx) && sz == 4) {
14343         modrm = getUChar(delta);
14344         if (epartIsReg(modrm)) {
14345            goto decode_failure;
14346         } else {
14347            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14348            putXMMReg( gregOfRexRM(pfx,modrm),
14349                       loadLE(Ity_V128, mkexpr(addr)) );
14350            DIP("lddqu %s,%s\n", dis_buf,
14351                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
14352            delta += alen;
14353         }
14354         goto decode_success;
14355      }
14356      break;
14357
14358   default:
14359      goto decode_failure;
14360
14361   }
14362
14363  decode_failure:
14364   *decode_OK = False;
14365   return deltaIN;
14366
14367  decode_success:
14368   *decode_OK = True;
14369   return delta;
14370}
14371
14372
14373/*------------------------------------------------------------*/
14374/*---                                                      ---*/
14375/*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3               ---*/
14376/*---                                                      ---*/
14377/*------------------------------------------------------------*/
14378
14379static
14380IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
14381{
14382   IRTemp sHi        = newTemp(Ity_I64);
14383   IRTemp sLo        = newTemp(Ity_I64);
14384   IRTemp dHi        = newTemp(Ity_I64);
14385   IRTemp dLo        = newTemp(Ity_I64);
14386   IRTemp rHi        = newTemp(Ity_I64);
14387   IRTemp rLo        = newTemp(Ity_I64);
14388   IRTemp sevens     = newTemp(Ity_I64);
14389   IRTemp mask0x80hi = newTemp(Ity_I64);
14390   IRTemp mask0x80lo = newTemp(Ity_I64);
14391   IRTemp maskBit3hi = newTemp(Ity_I64);
14392   IRTemp maskBit3lo = newTemp(Ity_I64);
14393   IRTemp sAnd7hi    = newTemp(Ity_I64);
14394   IRTemp sAnd7lo    = newTemp(Ity_I64);
14395   IRTemp permdHi    = newTemp(Ity_I64);
14396   IRTemp permdLo    = newTemp(Ity_I64);
14397   IRTemp res        = newTemp(Ity_V128);
14398
14399   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
14400   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
14401   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
14402   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
14403
14404   assign( sevens, mkU64(0x0707070707070707ULL) );
14405
14406   /* mask0x80hi = Not(SarN8x8(sHi,7))
14407      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
14408      sAnd7hi    = And(sHi,sevens)
14409      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
14410      And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
14411      rHi        = And(permdHi,mask0x80hi)
14412   */
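   /* Net effect, per result byte i (informal):
         r[i] = (s[i] & 0x80) ? 0 : d[ s[i] & 0xF ]
      Bit 3 of each index (via maskBit3{hi,lo}) selects between the
      dHi and dLo halves; bits 2:0 select the byte within that half. */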
14413   assign(
14414      mask0x80hi,
14415      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
14416
14417   assign(
14418      maskBit3hi,
14419      binop(Iop_SarN8x8,
14420            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
14421            mkU8(7)));
14422
14423   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
14424
14425   assign(
14426      permdHi,
14427      binop(
14428         Iop_Or64,
14429         binop(Iop_And64,
14430               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
14431               mkexpr(maskBit3hi)),
14432         binop(Iop_And64,
14433               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
14434               unop(Iop_Not64,mkexpr(maskBit3hi))) ));
14435
14436   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
14437
14438   /* And the same for the lower half of the result.  What fun. */
14439
14440   assign(
14441      mask0x80lo,
14442      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
14443
14444   assign(
14445      maskBit3lo,
14446      binop(Iop_SarN8x8,
14447            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
14448            mkU8(7)));
14449
14450   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
14451
14452   assign(
14453      permdLo,
14454      binop(
14455         Iop_Or64,
14456         binop(Iop_And64,
14457               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
14458               mkexpr(maskBit3lo)),
14459         binop(Iop_And64,
14460               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
14461               unop(Iop_Not64,mkexpr(maskBit3lo))) ));
14462
14463   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
14464
14465   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
14466   return res;
14467}
14468
14469
14470static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
14471                            Bool isAvx, UChar opc )
14472{
14473   IRTemp addr   = IRTemp_INVALID;
14474   Int    alen   = 0;
14475   HChar  dis_buf[50];
14476   HChar* str    = "???";
14477   IROp   opV64  = Iop_INVALID;
14478   IROp   opCatO = Iop_CatOddLanes16x4;
14479   IROp   opCatE = Iop_CatEvenLanes16x4;
14480   IRTemp sV     = newTemp(Ity_V128);
14481   IRTemp dV     = newTemp(Ity_V128);
14482   IRTemp sHi    = newTemp(Ity_I64);
14483   IRTemp sLo    = newTemp(Ity_I64);
14484   IRTemp dHi    = newTemp(Ity_I64);
14485   IRTemp dLo    = newTemp(Ity_I64);
14486   UChar  modrm  = getUChar(delta);
14487   UInt   rG     = gregOfRexRM(pfx,modrm);
14488   UInt   rV     = isAvx ? getVexNvvvv(pfx) : rG;
14489
14490   switch (opc) {
14491      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
14492      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
14493      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
14494      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
14495      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
14496      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
14497      default: vassert(0);
14498   }
14499   if (opc == 0x02 || opc == 0x06) {
14500      opCatO = Iop_InterleaveHI32x2;
14501      opCatE = Iop_InterleaveLO32x2;
14502   }
14503
14504   assign( dV, getXMMReg(rV) );
14505
14506   if (epartIsReg(modrm)) {
14507      UInt rE = eregOfRexRM(pfx,modrm);
14508      assign( sV, getXMMReg(rE) );
14509      DIP("ph%s %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
14510      delta += 1;
14511   } else {
14512      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14513      if (!isAvx)
14514         gen_SEGV_if_not_16_aligned( addr );
14515      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14516      DIP("ph%s %s,%s\n", str, dis_buf, nameXMMReg(rG));
14517      delta += alen;
14518   }
14519
14520   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
14521   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
14522   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
14523   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
14524
   /* This isn't a particularly efficient way to compute the
      result, but at least it avoids a proliferation of IROps,
      hence avoids complicating all the backends. */
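   /* E.g. for PHADDW: CatEvenLanes16x4(hi,lo) collects 16-bit lanes
      {6,4,2,0} of a source and CatOddLanes16x4 collects {7,5,3,1},
      so a single 64-bit add of the two yields all four adjacent-pair
      sums {s7+s6, s5+s4, s3+s2, s1+s0} for that source. */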
14528
14529   (isAvx ? putYMMRegLoAndZU : putXMMReg)
14530      ( rG,
14531        binop(Iop_64HLtoV128,
14532              binop(opV64,
14533                    binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
14534                    binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
14535              binop(opV64,
14536                    binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
14537                    binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
14538   return delta;
14539}
14540
14541
14542static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
14543{
14544   IRTemp sVoddsSX  = newTemp(Ity_V128);
14545   IRTemp sVevensSX = newTemp(Ity_V128);
14546   IRTemp dVoddsZX  = newTemp(Ity_V128);
14547   IRTemp dVevensZX = newTemp(Ity_V128);
14548   /* compute dV unsigned x sV signed */
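   /* Per 16-bit result lane i, informally:
         res[i] = SatS16( (UInt)d8[2i]   * (Int)s8[2i]
                        + (UInt)d8[2i+1] * (Int)s8[2i+1] )
      The odd/even shifts below form the two products 16x8 at a time,
      and Iop_QAdd16Sx8 supplies the signed saturation. */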
14549   assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
14550   assign( sVevensSX, binop(Iop_SarN16x8,
14551                            binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
14552                            mkU8(8)) );
14553   assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
14554   assign( dVevensZX, binop(Iop_ShrN16x8,
14555                            binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
14556                            mkU8(8)) );
14557
14558   IRTemp res = newTemp(Ity_V128);
14559   assign( res, binop(Iop_QAdd16Sx8,
14560                      binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
14561                      binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
14562                     )
14563         );
14564   return res;
14565}
14566
14567
14568__attribute__((noinline))
14569static
14570Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
14571                             VexAbiInfo* vbi,
14572                             Prefix pfx, Int sz, Long deltaIN )
14573{
14574   IRTemp addr  = IRTemp_INVALID;
14575   UChar  modrm = 0;
14576   Int    alen  = 0;
14577   HChar  dis_buf[50];
14578
14579   *decode_OK = False;
14580
14581   Long   delta = deltaIN;
14582   UChar  opc   = getUChar(delta);
14583   delta++;
14584   switch (opc) {
14585
14586   case 0x00:
14587      /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
14588      if (have66noF2noF3(pfx)
14589          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
14590         IRTemp sV = newTemp(Ity_V128);
14591         IRTemp dV = newTemp(Ity_V128);
14592
14593         modrm = getUChar(delta);
14594         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
14595
14596         if (epartIsReg(modrm)) {
14597            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
14598            delta += 1;
14599            DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
14600                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
14601         } else {
14602            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14603            gen_SEGV_if_not_16_aligned( addr );
14604            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14605            delta += alen;
14606            DIP("pshufb %s,%s\n", dis_buf,
14607                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
14608         }
14609
14610         IRTemp res = math_PSHUFB_XMM( dV, sV );
14611         putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
14612         goto decode_success;
14613      }
14614      /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
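      /* Per byte, informally: r[i] = (s[i] & 0x80) ? 0 : d[ s[i] & 7 ];
         one Perm8x8 plus masking (below) implements this directly. */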
14615      if (haveNo66noF2noF3(pfx) && sz == 4) {
14616         IRTemp sV      = newTemp(Ity_I64);
14617         IRTemp dV      = newTemp(Ity_I64);
14618
14619         modrm = getUChar(delta);
14620         do_MMX_preamble();
14621         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
14622
14623         if (epartIsReg(modrm)) {
14624            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
14625            delta += 1;
14626            DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14627                                  nameMMXReg(gregLO3ofRM(modrm)));
14628         } else {
14629            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14630            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
14631            delta += alen;
14632            DIP("pshufb %s,%s\n", dis_buf,
14633                                  nameMMXReg(gregLO3ofRM(modrm)));
14634         }
14635
14636         putMMXReg(
14637            gregLO3ofRM(modrm),
14638            binop(
14639               Iop_And64,
14640               /* permute the lanes */
14641               binop(
14642                  Iop_Perm8x8,
14643                  mkexpr(dV),
14644                  binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
14645               ),
14646               /* mask off lanes which have (index & 0x80) == 0x80 */
14647               unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
14648            )
14649         );
14650         goto decode_success;
14651      }
14652      break;
14653
14654   case 0x01:
14655   case 0x02:
14656   case 0x03:
14657   case 0x05:
14658   case 0x06:
14659   case 0x07:
14660      /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
14661         G to G (xmm). */
14662      /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
14663         G to G (xmm). */
14664      /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
14665         xmm) and G to G (xmm). */
14666      /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
14667         G to G (xmm). */
14668      /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
14669         G to G (xmm). */
14670      /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
14671         xmm) and G to G (xmm). */
14672      if (have66noF2noF3(pfx)
14673          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
14674         delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
14675         goto decode_success;
14676      }
14677      /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
14678      /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
14679         to G (mmx). */
14680      /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
14681         to G (mmx). */
14682      /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
14683         mmx) and G to G (mmx). */
14684      /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
14685         to G (mmx). */
14686      /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
14687         to G (mmx). */
14688      /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
14689         mmx) and G to G (mmx). */
14690      if (haveNo66noF2noF3(pfx) && sz == 4) {
14691         HChar* str    = "???";
14692         IROp   opV64  = Iop_INVALID;
14693         IROp   opCatO = Iop_CatOddLanes16x4;
14694         IROp   opCatE = Iop_CatEvenLanes16x4;
14695         IRTemp sV     = newTemp(Ity_I64);
14696         IRTemp dV     = newTemp(Ity_I64);
14697
14698         modrm = getUChar(delta);
14699
14700         switch (opc) {
14701            case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
14702            case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
14703            case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
14704            case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
14705            case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
14706            case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
14707            default: vassert(0);
14708         }
14709         if (opc == 0x02 || opc == 0x06) {
14710            opCatO = Iop_InterleaveHI32x2;
14711            opCatE = Iop_InterleaveLO32x2;
14712         }
14713
14714         do_MMX_preamble();
14715         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
14716
14717         if (epartIsReg(modrm)) {
14718            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
14719            delta += 1;
14720            DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
14721                                     nameMMXReg(gregLO3ofRM(modrm)));
14722         } else {
14723            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14724            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
14725            delta += alen;
14726            DIP("ph%s %s,%s\n", str, dis_buf,
14727                                     nameMMXReg(gregLO3ofRM(modrm)));
14728         }
14729
14730         putMMXReg(
14731            gregLO3ofRM(modrm),
14732            binop(opV64,
14733                  binop(opCatE,mkexpr(sV),mkexpr(dV)),
14734                  binop(opCatO,mkexpr(sV),mkexpr(dV))
14735            )
14736         );
14737         goto decode_success;
14738      }
14739      break;
14740
14741   case 0x04:
14742      /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
14743         Unsigned Bytes (XMM) */
14744      if (have66noF2noF3(pfx)
14745          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
14746         IRTemp sV = newTemp(Ity_V128);
14747         IRTemp dV = newTemp(Ity_V128);
14748         modrm     = getUChar(delta);
14749         UInt   rG = gregOfRexRM(pfx,modrm);
14750
14751         assign( dV, getXMMReg(rG) );
14752
14753         if (epartIsReg(modrm)) {
14754            UInt rE = eregOfRexRM(pfx,modrm);
14755            assign( sV, getXMMReg(rE) );
14756            delta += 1;
14757            DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
14758         } else {
14759            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14760            gen_SEGV_if_not_16_aligned( addr );
14761            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14762            delta += alen;
14763            DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
14764         }
14765
14766         putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
14767         goto decode_success;
14768      }
14769      /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
14770         Unsigned Bytes (MMX) */
14771      if (haveNo66noF2noF3(pfx) && sz == 4) {
14772         IRTemp sV        = newTemp(Ity_I64);
14773         IRTemp dV        = newTemp(Ity_I64);
14774         IRTemp sVoddsSX  = newTemp(Ity_I64);
14775         IRTemp sVevensSX = newTemp(Ity_I64);
14776         IRTemp dVoddsZX  = newTemp(Ity_I64);
14777         IRTemp dVevensZX = newTemp(Ity_I64);
14778
14779         modrm = getUChar(delta);
14780         do_MMX_preamble();
14781         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
14782
14783         if (epartIsReg(modrm)) {
14784            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
14785            delta += 1;
14786            DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14787                                     nameMMXReg(gregLO3ofRM(modrm)));
14788         } else {
14789            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14790            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
14791            delta += alen;
14792            DIP("pmaddubsw %s,%s\n", dis_buf,
14793                                     nameMMXReg(gregLO3ofRM(modrm)));
14794         }
14795
14796         /* compute dV unsigned x sV signed */
14797         assign( sVoddsSX,
14798                 binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
14799         assign( sVevensSX,
14800                 binop(Iop_SarN16x4,
14801                       binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
14802                       mkU8(8)) );
14803         assign( dVoddsZX,
14804                 binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
14805         assign( dVevensZX,
14806                 binop(Iop_ShrN16x4,
14807                       binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
14808                       mkU8(8)) );
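         /* The shifts above are the standard lane-extension trick
            (illustrative note): per 16-bit lane x, "x >>s 8" extracts
            the odd byte sign-extended, "(x << 8) >>s 8" the even byte
            sign-extended, and the >>u forms give the zero-extended
            equivalents.  The net per-lane result below is
               SatS16( s[2i+1]*u[2i+1] + s[2i]*u[2i] )
            with s[] the signed bytes of sV and u[] the unsigned bytes
            of dV, as PMADDUBSW requires. */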
14809
14810         putMMXReg(
14811            gregLO3ofRM(modrm),
14812            binop(Iop_QAdd16Sx4,
14813                  binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
14814                  binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
14815            )
14816         );
14817         goto decode_success;
14818      }
14819      break;
14820
14821   case 0x08:
14822   case 0x09:
14823   case 0x0A:
14824      /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
14825      /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
14826      /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
14827      if (have66noF2noF3(pfx)
14828          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
14829         IRTemp sV      = newTemp(Ity_V128);
14830         IRTemp dV      = newTemp(Ity_V128);
14831         IRTemp sHi     = newTemp(Ity_I64);
14832         IRTemp sLo     = newTemp(Ity_I64);
14833         IRTemp dHi     = newTemp(Ity_I64);
14834         IRTemp dLo     = newTemp(Ity_I64);
14835         HChar* str     = "???";
14836         Int    laneszB = 0;
14837
14838         switch (opc) {
14839            case 0x08: laneszB = 1; str = "b"; break;
14840            case 0x09: laneszB = 2; str = "w"; break;
14841            case 0x0A: laneszB = 4; str = "d"; break;
14842            default: vassert(0);
14843         }
14844
14845         modrm = getUChar(delta);
14846         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
14847
14848         if (epartIsReg(modrm)) {
14849            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
14850            delta += 1;
14851            DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
14852                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
14853         } else {
14854            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14855            gen_SEGV_if_not_16_aligned( addr );
14856            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14857            delta += alen;
14858            DIP("psign%s %s,%s\n", str, dis_buf,
14859                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
14860         }
14861
14862         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
14863         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
14864         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
14865         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
14866
14867         putXMMReg(
14868            gregOfRexRM(pfx,modrm),
14869            binop(Iop_64HLtoV128,
14870                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
14871                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
14872            )
14873         );
14874         goto decode_success;
14875      }
14876      /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
14877      /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
14878      /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
14879      if (haveNo66noF2noF3(pfx) && sz == 4) {
14880         IRTemp sV      = newTemp(Ity_I64);
14881         IRTemp dV      = newTemp(Ity_I64);
14882         HChar* str     = "???";
14883         Int    laneszB = 0;
14884
14885         switch (opc) {
14886            case 0x08: laneszB = 1; str = "b"; break;
14887            case 0x09: laneszB = 2; str = "w"; break;
14888            case 0x0A: laneszB = 4; str = "d"; break;
14889            default: vassert(0);
14890         }
14891
14892         modrm = getUChar(delta);
14893         do_MMX_preamble();
14894         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
14895
14896         if (epartIsReg(modrm)) {
14897            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
14898            delta += 1;
14899            DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
14900                                        nameMMXReg(gregLO3ofRM(modrm)));
14901         } else {
14902            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14903            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
14904            delta += alen;
14905            DIP("psign%s %s,%s\n", str, dis_buf,
14906                                        nameMMXReg(gregLO3ofRM(modrm)));
14907         }
14908
14909         putMMXReg(
14910            gregLO3ofRM(modrm),
14911            dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
14912         );
14913         goto decode_success;
14914      }
14915      break;
14916
14917   case 0x0B:
14918      /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
14919         Scale (XMM) */
14920      if (have66noF2noF3(pfx)
14921          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
14922         IRTemp sV  = newTemp(Ity_V128);
14923         IRTemp dV  = newTemp(Ity_V128);
14924         IRTemp sHi = newTemp(Ity_I64);
14925         IRTemp sLo = newTemp(Ity_I64);
14926         IRTemp dHi = newTemp(Ity_I64);
14927         IRTemp dLo = newTemp(Ity_I64);
14928
14929         modrm = getUChar(delta);
14930         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
14931
14932         if (epartIsReg(modrm)) {
14933            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
14934            delta += 1;
14935            DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
14936                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
14937         } else {
14938            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14939            gen_SEGV_if_not_16_aligned( addr );
14940            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14941            delta += alen;
14942            DIP("pmulhrsw %s,%s\n", dis_buf,
14943                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
14944         }
14945
14946         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
14947         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
14948         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
14949         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
14950
14951         putXMMReg(
14952            gregOfRexRM(pfx,modrm),
14953            binop(Iop_64HLtoV128,
14954                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
14955                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
14956            )
14957         );
14958         goto decode_success;
14959      }
14960      /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
14961         (MMX) */
14962      if (haveNo66noF2noF3(pfx) && sz == 4) {
14963         IRTemp sV = newTemp(Ity_I64);
14964         IRTemp dV = newTemp(Ity_I64);
14965
14966         modrm = getUChar(delta);
14967         do_MMX_preamble();
14968         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
14969
14970         if (epartIsReg(modrm)) {
14971            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
14972            delta += 1;
14973            DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14974                                    nameMMXReg(gregLO3ofRM(modrm)));
14975         } else {
14976            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14977            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
14978            delta += alen;
14979            DIP("pmulhrsw %s,%s\n", dis_buf,
14980                                    nameMMXReg(gregLO3ofRM(modrm)));
14981         }
14982
14983         putMMXReg(
14984            gregLO3ofRM(modrm),
14985            dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
14986         );
14987         goto decode_success;
14988      }
14989      break;
14990
14991   case 0x1C:
14992   case 0x1D:
14993   case 0x1E:
14994      /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
14995      /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
14996      /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
14997      if (have66noF2noF3(pfx)
14998          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
         IRTemp sV      = newTemp(Ity_V128);
         HChar* str     = "???";
         Int    laneszB = 0;
15002
15003         switch (opc) {
15004            case 0x1C: laneszB = 1; str = "b"; break;
15005            case 0x1D: laneszB = 2; str = "w"; break;
15006            case 0x1E: laneszB = 4; str = "d"; break;
15007            default: vassert(0);
15008         }
15009
15010         modrm = getUChar(delta);
15011         if (epartIsReg(modrm)) {
15012            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15013            delta += 1;
15014            DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
15015                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
15016         } else {
15017            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15018            gen_SEGV_if_not_16_aligned( addr );
15019            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15020            delta += alen;
15021            DIP("pabs%s %s,%s\n", str, dis_buf,
15022                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
15023         }
15024
15025         putXMMReg( gregOfRexRM(pfx,modrm),
15026                    mkexpr(math_PABS_XMM(sV, laneszB)) );
15027         goto decode_success;
15028      }
15029      /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
15030      /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
15031      /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
15032      if (haveNo66noF2noF3(pfx) && sz == 4) {
15033         IRTemp sV      = newTemp(Ity_I64);
15034         HChar* str     = "???";
15035         Int    laneszB = 0;
15036
15037         switch (opc) {
15038            case 0x1C: laneszB = 1; str = "b"; break;
15039            case 0x1D: laneszB = 2; str = "w"; break;
15040            case 0x1E: laneszB = 4; str = "d"; break;
15041            default: vassert(0);
15042         }
15043
15044         modrm = getUChar(delta);
15045         do_MMX_preamble();
15046
15047         if (epartIsReg(modrm)) {
15048            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15049            delta += 1;
15050            DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
15051                                       nameMMXReg(gregLO3ofRM(modrm)));
15052         } else {
15053            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15054            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15055            delta += alen;
15056            DIP("pabs%s %s,%s\n", str, dis_buf,
15057                                       nameMMXReg(gregLO3ofRM(modrm)));
15058         }
15059
15060         putMMXReg( gregLO3ofRM(modrm),
15061                    mkexpr(math_PABS_MMX( sV, laneszB )) );
15062         goto decode_success;
15063      }
15064      break;
15065
15066   default:
15067      break;
15068
15069   }
15070
15071  //decode_failure:
15072   *decode_OK = False;
15073   return deltaIN;
15074
15075  decode_success:
15076   *decode_OK = True;
15077   return delta;
15078}
15079
15080
15081/*------------------------------------------------------------*/
15082/*---                                                      ---*/
15083/*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3               ---*/
15084/*---                                                      ---*/
15085/*------------------------------------------------------------*/
15086
15087__attribute__((noinline))
15088static
15089Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
15090                             VexAbiInfo* vbi,
15091                             Prefix pfx, Int sz, Long deltaIN )
15092{
15093   Long   d64   = 0;
15094   IRTemp addr  = IRTemp_INVALID;
15095   UChar  modrm = 0;
15096   Int    alen  = 0;
15097   HChar  dis_buf[50];
15098
15099   *decode_OK = False;
15100
15101   Long   delta = deltaIN;
15102   UChar  opc   = getUChar(delta);
15103   delta++;
15104   switch (opc) {
15105
15106   case 0x0F:
15107      /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
15108      if (have66noF2noF3(pfx)
15109          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15110         IRTemp sV  = newTemp(Ity_V128);
15111         IRTemp dV  = newTemp(Ity_V128);
15112
15113         modrm = getUChar(delta);
15114         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
15115
15116         if (epartIsReg(modrm)) {
15117            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15118            d64 = (Long)getUChar(delta+1);
15119            delta += 1+1;
15120            DIP("palignr $%d,%s,%s\n", (Int)d64,
15121                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
15122                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
15123         } else {
15124            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
15125            gen_SEGV_if_not_16_aligned( addr );
15126            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15127            d64 = (Long)getUChar(delta+alen);
15128            delta += alen+1;
15129            DIP("palignr $%d,%s,%s\n", (Int)d64,
15130                                       dis_buf,
15131                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
15132         }
15133
15134         IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
15135         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
15136         goto decode_success;
15137      }
15138      /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
15139      if (haveNo66noF2noF3(pfx) && sz == 4) {
15140         IRTemp sV  = newTemp(Ity_I64);
15141         IRTemp dV  = newTemp(Ity_I64);
15142         IRTemp res = newTemp(Ity_I64);
15143
15144         modrm = getUChar(delta);
15145         do_MMX_preamble();
15146         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15147
15148         if (epartIsReg(modrm)) {
15149            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15150            d64 = (Long)getUChar(delta+1);
15151            delta += 1+1;
15152            DIP("palignr $%d,%s,%s\n",  (Int)d64,
15153                                        nameMMXReg(eregLO3ofRM(modrm)),
15154                                        nameMMXReg(gregLO3ofRM(modrm)));
15155         } else {
15156            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
15157            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15158            d64 = (Long)getUChar(delta+alen);
15159            delta += alen+1;
15160            DIP("palignr $%d%s,%s\n", (Int)d64,
15161                                      dis_buf,
15162                                      nameMMXReg(gregLO3ofRM(modrm)));
15163         }
15164
15165         if (d64 == 0) {
15166            assign( res, mkexpr(sV) );
15167         }
         else if (d64 >= 1 && d64 <= 7) {
            assign(res,
                   binop(Iop_Or64,
                         binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
                         binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64)))));
         }
         else if (d64 == 8) {
            assign( res, mkexpr(dV) );
         }
15178         else if (d64 >= 9 && d64 <= 15) {
15179            assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
15180         }
15181         else if (d64 >= 16 && d64 <= 255) {
15182            assign( res, mkU64(0) );
15183         }
15184         else
15185            vassert(0);
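         /* Worked example (illustrative only): the chain above models
            PALIGNR as a byte-wise right shift of the 16-byte
            concatenation dV:sV.  For d64 == 3,
               res = (sV >>u 24) | (dV << 40)
            puts bytes s7..s3 in the low five byte positions and
            d2..d0 in the top three -- bytes 3..10 of dV:sV. */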
15186
15187         putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
15188         goto decode_success;
15189      }
15190      break;
15191
15192   default:
15193      break;
15194
15195   }
15196
15197  //decode_failure:
15198   *decode_OK = False;
15199   return deltaIN;
15200
15201  decode_success:
15202   *decode_OK = True;
15203   return delta;
15204}
15205
15206
15207/*------------------------------------------------------------*/
15208/*---                                                      ---*/
15209/*--- Top-level SSE4: dis_ESC_0F__SSE4                     ---*/
15210/*---                                                      ---*/
15211/*------------------------------------------------------------*/
15212
15213__attribute__((noinline))
15214static
15215Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
15216                        VexArchInfo* archinfo,
15217                        VexAbiInfo* vbi,
15218                        Prefix pfx, Int sz, Long deltaIN )
15219{
15220   IRTemp addr  = IRTemp_INVALID;
15221   IRType ty    = Ity_INVALID;
15222   UChar  modrm = 0;
15223   Int    alen  = 0;
15224   HChar  dis_buf[50];
15225
15226   *decode_OK = False;
15227
15228   Long   delta = deltaIN;
15229   UChar  opc   = getUChar(delta);
15230   delta++;
15231   switch (opc) {
15232
15233   case 0xB8:
15234      /* F3 0F B8  = POPCNT{W,L,Q}
15235         Count the number of 1 bits in a register
15236      */
15237      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
15238          && (sz == 2 || sz == 4 || sz == 8)) {
15239         /*IRType*/ ty  = szToITy(sz);
15240         IRTemp     src = newTemp(ty);
15241         modrm = getUChar(delta);
15242         if (epartIsReg(modrm)) {
15243            assign(src, getIRegE(sz, pfx, modrm));
15244            delta += 1;
15245            DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
15246                nameIRegG(sz, pfx, modrm));
15247         } else {
15248            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
15249            assign(src, loadLE(ty, mkexpr(addr)));
15250            delta += alen;
15251            DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
15252                nameIRegG(sz, pfx, modrm));
15253         }
15254
15255         IRTemp result = gen_POPCOUNT(ty, src);
15256         putIRegG(sz, pfx, modrm, mkexpr(result));
15257
15258         // Update flags.  This is pretty lame .. perhaps can do better
15259         // if this turns out to be performance critical.
15260         // O S A C P are cleared.  Z is set if SRC == 0.
15261         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
15262         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
15263         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
15264         stmt( IRStmt_Put( OFFB_CC_DEP1,
15265               binop(Iop_Shl64,
15266                     unop(Iop_1Uto64,
15267                          binop(Iop_CmpEQ64,
15268                                widenUto64(mkexpr(src)),
15269                                mkU64(0))),
15270                     mkU8(AMD64G_CC_SHIFT_Z))));
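         /* Scalar model of the thunk update above (illustrative
            only): with CC_OP_COPY the OSZACP bits are taken verbatim
            from CC_DEP1, so this amounts to
               Z = (src == 0);  O = S = A = C = P = 0;
            with the Z bit placed at position AMD64G_CC_SHIFT_Z. */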
15271
15272         goto decode_success;
15273      }
15274      break;
15275
15276   case 0xBD:
15277      /* F3 0F BD -- LZCNT (count leading zeroes.  An AMD extension,
15278         which we can only decode if we're sure this is an AMD cpu
15279         that supports LZCNT, since otherwise it's BSR, which behaves
15280         differently.  Bizarrely, my Sandy Bridge also accepts these
15281         instructions but produces different results. */
      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
15283          && (sz == 2 || sz == 4 || sz == 8)
15284          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
15285         /*IRType*/ ty  = szToITy(sz);
15286         IRTemp     src = newTemp(ty);
15287         modrm = getUChar(delta);
15288         if (epartIsReg(modrm)) {
15289            assign(src, getIRegE(sz, pfx, modrm));
15290            delta += 1;
15291            DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
15292                nameIRegG(sz, pfx, modrm));
15293         } else {
15294            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
15295            assign(src, loadLE(ty, mkexpr(addr)));
15296            delta += alen;
15297            DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
15298                nameIRegG(sz, pfx, modrm));
15299         }
15300
15301         IRTemp res = gen_LZCNT(ty, src);
15302         putIRegG(sz, pfx, modrm, mkexpr(res));
15303
15304         // Update flags.  This is pretty lame .. perhaps can do better
15305         // if this turns out to be performance critical.
15306         // O S A P are cleared.  Z is set if RESULT == 0.
15307         // C is set if SRC is zero.
15308         IRTemp src64 = newTemp(Ity_I64);
15309         IRTemp res64 = newTemp(Ity_I64);
15310         assign(src64, widenUto64(mkexpr(src)));
15311         assign(res64, widenUto64(mkexpr(res)));
15312
15313         IRTemp oszacp = newTemp(Ity_I64);
15314         assign(
15315            oszacp,
15316            binop(Iop_Or64,
15317                  binop(Iop_Shl64,
15318                        unop(Iop_1Uto64,
15319                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
15320                        mkU8(AMD64G_CC_SHIFT_Z)),
15321                  binop(Iop_Shl64,
15322                        unop(Iop_1Uto64,
15323                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
15324                        mkU8(AMD64G_CC_SHIFT_C))
15325            )
15326         );
15327
15328         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
15329         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
15330         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
15331         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
15332
15333         goto decode_success;
15334      }
15335      break;
15336
15337   default:
15338      break;
15339
15340   }
15341
15342  //decode_failure:
15343   *decode_OK = False;
15344   return deltaIN;
15345
15346  decode_success:
15347   *decode_OK = True;
15348   return delta;
15349}
15350
15351
15352/*------------------------------------------------------------*/
15353/*---                                                      ---*/
15354/*--- Top-level SSE4: dis_ESC_0F38__SSE4                   ---*/
15355/*---                                                      ---*/
15356/*------------------------------------------------------------*/
15357
15358static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
15359                                  IRTemp vec0/*controlling mask*/,
15360                                  UInt gran, IROp opSAR )
15361{
15362   /* The tricky bit is to convert vec0 into a suitable mask, by
15363      copying the most significant bit of each lane into all positions
15364      in the lane. */
15365   IRTemp sh = newTemp(Ity_I8);
15366   assign(sh, mkU8(8 * gran - 1));
15367
15368   IRTemp mask = newTemp(Ity_V128);
15369   assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
15370
15371   IRTemp notmask = newTemp(Ity_V128);
15372   assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
15373
15374   IRTemp res = newTemp(Ity_V128);
15375   assign(res,  binop(Iop_OrV128,
15376                      binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
15377                      binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
15378   return res;
15379}
15380
15381static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
15382                                  IRTemp vec0/*controlling mask*/,
15383                                  UInt gran, IROp opSAR128 )
15384{
15385   /* The tricky bit is to convert vec0 into a suitable mask, by
15386      copying the most significant bit of each lane into all positions
15387      in the lane. */
15388   IRTemp sh = newTemp(Ity_I8);
15389   assign(sh, mkU8(8 * gran - 1));
15390
15391   IRTemp vec0Hi = IRTemp_INVALID;
15392   IRTemp vec0Lo = IRTemp_INVALID;
15393   breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
15394
15395   IRTemp mask = newTemp(Ity_V256);
15396   assign(mask, binop(Iop_V128HLtoV256,
15397                      binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
15398                      binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
15399
15400   IRTemp notmask = newTemp(Ity_V256);
15401   assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
15402
15403   IRTemp res = newTemp(Ity_V256);
15404   assign(res,  binop(Iop_OrV256,
15405                      binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
15406                      binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
15407   return res;
15408}
15409
15410static Long dis_VBLENDV_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
15411                              const HChar *name, UInt gran, IROp opSAR )
15412{
15413   IRTemp addr   = IRTemp_INVALID;
15414   Int    alen   = 0;
15415   HChar  dis_buf[50];
15416   UChar  modrm  = getUChar(delta);
15417   UInt   rG     = gregOfRexRM(pfx, modrm);
15418   UInt   rV     = getVexNvvvv(pfx);
15419   UInt   rIS4   = 0xFF; /* invalid */
15420   IRTemp vecE   = newTemp(Ity_V128);
15421   IRTemp vecV   = newTemp(Ity_V128);
15422   IRTemp vecIS4 = newTemp(Ity_V128);
15423   if (epartIsReg(modrm)) {
15424      delta++;
15425      UInt rE = eregOfRexRM(pfx, modrm);
15426      assign(vecE, getXMMReg(rE));
15427      UChar ib = getUChar(delta);
15428      rIS4 = (ib >> 4) & 0xF;
15429      DIP("%s %s,%s,%s,%s\n",
15430          name, nameXMMReg(rIS4), nameXMMReg(rE),
15431          nameXMMReg(rV), nameXMMReg(rG));
15432   } else {
15433      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
15434      delta += alen;
15435      assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
15436      UChar ib = getUChar(delta);
15437      rIS4 = (ib >> 4) & 0xF;
15438      DIP("%s %s,%s,%s,%s\n",
15439          name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
15440   }
15441   delta++;
15442   assign(vecV,   getXMMReg(rV));
15443   assign(vecIS4, getXMMReg(rIS4));
15444   IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
15445   putYMMRegLoAndZU( rG, mkexpr(res) );
15446   return delta;
15447}
15448
15449static Long dis_VBLENDV_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
15450                              const HChar *name, UInt gran, IROp opSAR128 )
15451{
15452   IRTemp addr   = IRTemp_INVALID;
15453   Int    alen   = 0;
15454   HChar  dis_buf[50];
15455   UChar  modrm  = getUChar(delta);
15456   UInt   rG     = gregOfRexRM(pfx, modrm);
15457   UInt   rV     = getVexNvvvv(pfx);
15458   UInt   rIS4   = 0xFF; /* invalid */
15459   IRTemp vecE   = newTemp(Ity_V256);
15460   IRTemp vecV   = newTemp(Ity_V256);
15461   IRTemp vecIS4 = newTemp(Ity_V256);
15462   if (epartIsReg(modrm)) {
15463      delta++;
15464      UInt rE = eregOfRexRM(pfx, modrm);
15465      assign(vecE, getYMMReg(rE));
15466      UChar ib = getUChar(delta);
15467      rIS4 = (ib >> 4) & 0xF;
15468      DIP("%s %s,%s,%s,%s\n",
15469          name, nameYMMReg(rIS4), nameYMMReg(rE),
15470          nameYMMReg(rV), nameYMMReg(rG));
15471   } else {
15472      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
15473      delta += alen;
15474      assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
15475      UChar ib = getUChar(delta);
15476      rIS4 = (ib >> 4) & 0xF;
15477      DIP("%s %s,%s,%s,%s\n",
15478          name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
15479   }
15480   delta++;
15481   assign(vecV,   getYMMReg(rV));
15482   assign(vecIS4, getYMMReg(rIS4));
15483   IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
15484   putYMMReg( rG, mkexpr(res) );
15485   return delta;
15486}
15487
15488static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
15489{
15490   /* Set Z=1 iff (vecE & vecG) == 0
15491      Set C=1 iff (vecE & not vecG) == 0
15492   */
15493
15494   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
15495
   /* Reduce andV resp. andnV to 64-bit values, by or-ing the top
      and bottom 64-bits together.  This relies on the trick:
15498
15499      InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence
15500
15501      InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
15502      InterleaveHI64x2([a,b],[a,b]) == [a,a]
15503
15504      and so the OR of the above 2 exprs produces
15505      [a OR b, a OR b], from which we simply take the lower half.
15506   */
15507   IRTemp and64  = newTemp(Ity_I64);
15508   IRTemp andn64 = newTemp(Ity_I64);
15509
15510   assign(and64,
15511          unop(Iop_V128to64,
15512               binop(Iop_OrV128,
15513                     binop(Iop_InterleaveLO64x2,
15514                           mkexpr(andV), mkexpr(andV)),
15515                     binop(Iop_InterleaveHI64x2,
15516                           mkexpr(andV), mkexpr(andV)))));
15517
15518   assign(andn64,
15519          unop(Iop_V128to64,
15520               binop(Iop_OrV128,
15521                     binop(Iop_InterleaveLO64x2,
15522                           mkexpr(andnV), mkexpr(andnV)),
15523                     binop(Iop_InterleaveHI64x2,
15524                           mkexpr(andnV), mkexpr(andnV)))));
15525
15526   IRTemp z64 = newTemp(Ity_I64);
15527   IRTemp c64 = newTemp(Ity_I64);
15528   if (sign == 64) {
      /* When only interested in the most significant bit, just shift
         arithmetically right and take the bitwise complement.  */
15531      assign(z64,
15532             unop(Iop_Not64,
15533                  binop(Iop_Sar64, mkexpr(and64), mkU8(63))));
15534
15535      assign(c64,
15536             unop(Iop_Not64,
15537                  binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
15538   } else {
15539      if (sign == 32) {
15540         /* When interested in bit 31 and bit 63, mask those bits and
15541            fallthrough into the PTEST handling.  */
15542         IRTemp t0 = newTemp(Ity_I64);
15543         IRTemp t1 = newTemp(Ity_I64);
15544         IRTemp t2 = newTemp(Ity_I64);
15545         assign(t0, mkU64(0x8000000080000000ULL));
15546         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
15547         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
15548         and64 = t1;
15549         andn64 = t2;
15550      }
15551      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
15552         slice out the Z and C bits conveniently.  We use the standard
15553         trick all-zeroes -> all-zeroes, anything-else -> all-ones
15554         done by "(x | -x) >>s (word-size - 1)".
15555      */
15556      assign(z64,
15557             unop(Iop_Not64,
15558                  binop(Iop_Sar64,
15559                        binop(Iop_Or64,
15560                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
15561                                    mkexpr(and64)), mkU8(63))));
15562
15563      assign(c64,
15564             unop(Iop_Not64,
15565                  binop(Iop_Sar64,
15566                        binop(Iop_Or64,
15567                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
15568                                    mkexpr(andn64)), mkU8(63))));
15569   }
15570
15571   /* And finally, slice out the Z and C flags and set the flags
15572      thunk to COPY for them.  OSAP are set to zero. */
15573   IRTemp newOSZACP = newTemp(Ity_I64);
15574   assign(newOSZACP,
15575          binop(Iop_Or64,
15576                binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
15577                binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));
15578
15579   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
15580   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
15581   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
15582   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
15583}
15584
15585
15586/* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
15587   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
15588static Long dis_xTESTy_128 ( VexAbiInfo* vbi, Prefix pfx,
15589                             Long delta, Bool isAvx, Int sign )
15590{
15591   IRTemp addr   = IRTemp_INVALID;
15592   Int    alen   = 0;
15593   HChar  dis_buf[50];
15594   UChar  modrm  = getUChar(delta);
15595   UInt   rG     = gregOfRexRM(pfx, modrm);
15596   IRTemp vecE = newTemp(Ity_V128);
15597   IRTemp vecG = newTemp(Ity_V128);
15598
15599   if ( epartIsReg(modrm) ) {
15600      UInt rE = eregOfRexRM(pfx, modrm);
15601      assign(vecE, getXMMReg(rE));
15602      delta += 1;
15603      DIP( "%s%stest%s %s,%s\n",
15604           isAvx ? "v" : "", sign == 0 ? "p" : "",
15605           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
15606           nameXMMReg(rE), nameXMMReg(rG) );
15607   } else {
15608      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15609      if (!isAvx)
15610         gen_SEGV_if_not_16_aligned( addr );
15611      assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
15612      delta += alen;
15613      DIP( "%s%stest%s %s,%s\n",
15614           isAvx ? "v" : "", sign == 0 ? "p" : "",
15615           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
15616           dis_buf, nameXMMReg(rG) );
15617   }
15618
15619   assign(vecG, getXMMReg(rG));
15620
15621   /* Set Z=1 iff (vecE & vecG) == 0
15622      Set C=1 iff (vecE & not vecG) == 0
15623   */
15624
15625   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
15626   IRTemp andV  = newTemp(Ity_V128);
15627   IRTemp andnV = newTemp(Ity_V128);
15628   assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
15629   assign(andnV, binop(Iop_AndV128,
15630                       mkexpr(vecE),
15631                       binop(Iop_XorV128, mkexpr(vecG),
15632                                          mkV128(0xFFFF))));
15633
15634   finish_xTESTy ( andV, andnV, sign );
15635   return delta;
15636}
15637
15638
15639/* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
15640   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
15641static Long dis_xTESTy_256 ( VexAbiInfo* vbi, Prefix pfx,
15642                             Long delta, Int sign )
15643{
15644   IRTemp addr   = IRTemp_INVALID;
15645   Int    alen   = 0;
15646   HChar  dis_buf[50];
15647   UChar  modrm  = getUChar(delta);
15648   UInt   rG     = gregOfRexRM(pfx, modrm);
15649   IRTemp vecE   = newTemp(Ity_V256);
15650   IRTemp vecG   = newTemp(Ity_V256);
15651
15652   if ( epartIsReg(modrm) ) {
15653      UInt rE = eregOfRexRM(pfx, modrm);
15654      assign(vecE, getYMMReg(rE));
15655      delta += 1;
15656      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
15657           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
15658           nameYMMReg(rE), nameYMMReg(rG) );
15659   } else {
15660      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15661      assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
15662      delta += alen;
15663      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
15664           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
15665           dis_buf, nameYMMReg(rG) );
15666   }
15667
15668   assign(vecG, getYMMReg(rG));
15669
15670   /* Set Z=1 iff (vecE & vecG) == 0
15671      Set C=1 iff (vecE & not vecG) == 0
15672   */
15673
15674   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
15675   IRTemp andV  = newTemp(Ity_V256);
15676   IRTemp andnV = newTemp(Ity_V256);
15677   assign(andV,  binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
15678   assign(andnV, binop(Iop_AndV256,
15679                       mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));
15680
15681   IRTemp andVhi  = IRTemp_INVALID;
15682   IRTemp andVlo  = IRTemp_INVALID;
15683   IRTemp andnVhi = IRTemp_INVALID;
15684   IRTemp andnVlo = IRTemp_INVALID;
15685   breakupV256toV128s( andV, &andVhi, &andVlo );
15686   breakupV256toV128s( andnV, &andnVhi, &andnVlo );
15687
15688   IRTemp andV128  = newTemp(Ity_V128);
15689   IRTemp andnV128 = newTemp(Ity_V128);
15690   assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
15691   assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );
15692
15693   finish_xTESTy ( andV128, andnV128, sign );
15694   return delta;
15695}
15696
15697
15698/* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */
15699static Long dis_PMOVxXBW_128 ( VexAbiInfo* vbi, Prefix pfx,
15700                               Long delta, Bool isAvx, Bool xIsZ )
15701{
15702   IRTemp addr   = IRTemp_INVALID;
15703   Int    alen   = 0;
15704   HChar  dis_buf[50];
15705   IRTemp srcVec = newTemp(Ity_V128);
15706   UChar  modrm  = getUChar(delta);
   HChar* mbV    = isAvx ? "v" : "";
15708   UChar  how    = xIsZ ? 'z' : 's';
15709   UInt   rG     = gregOfRexRM(pfx, modrm);
15710   if ( epartIsReg(modrm) ) {
15711      UInt rE = eregOfRexRM(pfx, modrm);
15712      assign( srcVec, getXMMReg(rE) );
15713      delta += 1;
15714      DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
15715   } else {
15716      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15717      assign( srcVec,
15718              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
15719      delta += alen;
15720      DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
15721   }
15722
15723   IRExpr* res
15724      = xIsZ /* do math for either zero or sign extend */
15725        ? binop( Iop_InterleaveLO8x16,
15726                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
15727        : binop( Iop_SarN16x8,
15728                 binop( Iop_ShlN16x8,
15729                        binop( Iop_InterleaveLO8x16,
15730                               IRExpr_Const( IRConst_V128(0) ),
15731                               mkexpr(srcVec) ),
15732                        mkU8(8) ),
15733                 mkU8(8) );
15734
15735   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
15736
15737   return delta;
15738}
15739
15740
15741static Long dis_PMOVxXWD_128 ( VexAbiInfo* vbi, Prefix pfx,
15742                               Long delta, Bool isAvx, Bool xIsZ )
15743{
15744   IRTemp addr   = IRTemp_INVALID;
15745   Int    alen   = 0;
15746   HChar  dis_buf[50];
15747   IRTemp srcVec = newTemp(Ity_V128);
15748   UChar  modrm  = getUChar(delta);
   HChar* mbV    = isAvx ? "v" : "";
15750   UChar  how    = xIsZ ? 'z' : 's';
15751   UInt   rG     = gregOfRexRM(pfx, modrm);
15752
15753   if ( epartIsReg(modrm) ) {
15754      UInt rE = eregOfRexRM(pfx, modrm);
15755      assign( srcVec, getXMMReg(rE) );
15756      delta += 1;
15757      DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
15758   } else {
15759      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15760      assign( srcVec,
15761              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
15762      delta += alen;
15763      DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
15764   }
15765
15766   IRExpr* res
15767      = binop( Iop_InterleaveLO16x8,
15768               IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
15769   if (!xIsZ)
15770      res = binop(Iop_SarN32x4,
15771                  binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
15772
15773   (isAvx ? putYMMRegLoAndZU : putXMMReg)
15774      ( gregOfRexRM(pfx, modrm), res );
15775
15776   return delta;
15777}
15778
15779
15780static Long dis_PMOVSXWQ_128 ( VexAbiInfo* vbi, Prefix pfx,
15781                               Long delta, Bool isAvx )
15782{
15783   IRTemp addr     = IRTemp_INVALID;
15784   Int    alen     = 0;
15785   HChar  dis_buf[50];
15786   IRTemp srcBytes = newTemp(Ity_I32);
15787   UChar  modrm    = getUChar(delta);
   HChar* mbV      = isAvx ? "v" : "";
15789   UInt   rG       = gregOfRexRM(pfx, modrm);
15790
15791   if ( epartIsReg( modrm ) ) {
15792      UInt rE = eregOfRexRM(pfx, modrm);
15793      assign( srcBytes, getXMMRegLane32( rE, 0 ) );
15794      delta += 1;
15795      DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
15796   } else {
15797      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15798      assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
15799      delta += alen;
15800      DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
15801   }
15802
15803   (isAvx ? putYMMRegLoAndZU : putXMMReg)
15804      ( rG, binop( Iop_64HLtoV128,
15805                   unop( Iop_16Sto64,
15806                         unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
15807                   unop( Iop_16Sto64,
15808                         unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
15809   return delta;
15810}
15811
15812
15813static Long dis_PMOVZXWQ_128 ( VexAbiInfo* vbi, Prefix pfx,
15814                               Long delta, Bool isAvx )
15815{
15816   IRTemp addr     = IRTemp_INVALID;
15817   Int    alen     = 0;
15818   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
15820   UChar  modrm    = getUChar(delta);
   HChar* mbV      = isAvx ? "v" : "";
15822   UInt   rG       = gregOfRexRM(pfx, modrm);
15823
15824   if ( epartIsReg( modrm ) ) {
15825      UInt rE = eregOfRexRM(pfx, modrm);
15826      assign( srcVec, getXMMReg(rE) );
15827      delta += 1;
15828      DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
15829   } else {
15830      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15831      assign( srcVec,
15832              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
15833      delta += alen;
15834      DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
15835   }
15836
15837   IRTemp zeroVec = newTemp( Ity_V128 );
15838   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
15839
15840   (isAvx ? putYMMRegLoAndZU : putXMMReg)
15841      ( rG, binop( Iop_InterleaveLO16x8,
15842                   mkexpr(zeroVec),
15843                   binop( Iop_InterleaveLO16x8,
15844                          mkexpr(zeroVec), mkexpr(srcVec) ) ) );
15845   return delta;
15846}
15847
15848
15849/* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ. */
15850static Long dis_PMOVxXDQ_128 ( VexAbiInfo* vbi, Prefix pfx,
15851                               Long delta, Bool isAvx, Bool xIsZ )
15852{
15853   IRTemp addr   = IRTemp_INVALID;
15854   Int    alen   = 0;
15855   HChar  dis_buf[50];
15856   IRTemp srcI64 = newTemp(Ity_I64);
15857   IRTemp srcVec = newTemp(Ity_V128);
15858   UChar  modrm  = getUChar(delta);
   HChar* mbV    = isAvx ? "v" : "";
15860   UChar  how    = xIsZ ? 'z' : 's';
15861   UInt   rG     = gregOfRexRM(pfx, modrm);
15862   /* Compute both srcI64 -- the value to expand -- and srcVec -- same
15863      thing in a V128, with arbitrary junk in the top 64 bits.  Use
15864      one or both of them and let iropt clean up afterwards (as
15865      usual). */
15866   if ( epartIsReg(modrm) ) {
15867      UInt rE = eregOfRexRM(pfx, modrm);
15868      assign( srcVec, getXMMReg(rE) );
15869      assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
15870      delta += 1;
15871      DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
15872   } else {
15873      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15874      assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
15875      assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
15876      delta += alen;
15877      DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
15878   }
15879
15880   IRExpr* res
15881      = xIsZ /* do math for either zero or sign extend */
15882        ? binop( Iop_InterleaveLO32x4,
15883                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
15884        : binop( Iop_64HLtoV128,
15885                 unop( Iop_32Sto64,
15886                       unop( Iop_64HIto32, mkexpr(srcI64) ) ),
15887                 unop( Iop_32Sto64,
15888                       unop( Iop_64to32, mkexpr(srcI64) ) ) );
15889
15890   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
15891
15892   return delta;
15893}
15894
15895
15896/* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */
15897static Long dis_PMOVxXBD_128 ( VexAbiInfo* vbi, Prefix pfx,
15898                               Long delta, Bool isAvx, Bool xIsZ )
15899{
15900   IRTemp addr   = IRTemp_INVALID;
15901   Int    alen   = 0;
15902   HChar  dis_buf[50];
15903   IRTemp srcVec = newTemp(Ity_V128);
15904   UChar  modrm  = getUChar(delta);
   HChar* mbV    = isAvx ? "v" : "";
15906   UChar  how    = xIsZ ? 'z' : 's';
15907   UInt   rG     = gregOfRexRM(pfx, modrm);
15908   if ( epartIsReg(modrm) ) {
15909      UInt rE = eregOfRexRM(pfx, modrm);
15910      assign( srcVec, getXMMReg(rE) );
15911      delta += 1;
15912      DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
15913   } else {
15914      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15915      assign( srcVec,
15916              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
15917      delta += alen;
15918      DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
15919   }
15920
15921   IRTemp zeroVec = newTemp(Ity_V128);
15922   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
15923
15924   IRExpr* res
15925      = binop(Iop_InterleaveLO8x16,
15926              mkexpr(zeroVec),
15927              binop(Iop_InterleaveLO8x16,
15928                    mkexpr(zeroVec), mkexpr(srcVec)));
15929   if (!xIsZ)
15930      res = binop(Iop_SarN32x4,
15931                  binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));
15932
15933   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
15934
15935   return delta;
15936}
15937
15938
15939/* Handles 128 bit versions of PMOVSXBQ. */
15940static Long dis_PMOVSXBQ_128 ( VexAbiInfo* vbi, Prefix pfx,
15941                               Long delta, Bool isAvx )
15942{
15943   IRTemp addr     = IRTemp_INVALID;
15944   Int    alen     = 0;
15945   HChar  dis_buf[50];
15946   IRTemp srcBytes = newTemp(Ity_I16);
15947   UChar  modrm    = getUChar(delta);
   HChar* mbV      = isAvx ? "v" : "";
15949   UInt   rG       = gregOfRexRM(pfx, modrm);
15950   if ( epartIsReg(modrm) ) {
15951      UInt rE = eregOfRexRM(pfx, modrm);
15952      assign( srcBytes, getXMMRegLane16( rE, 0 ) );
15953      delta += 1;
15954      DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
15955   } else {
15956      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15957      assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
15958      delta += alen;
15959      DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
15960   }
15961
15962   (isAvx ? putYMMRegLoAndZU : putXMMReg)
15963      ( rG, binop( Iop_64HLtoV128,
15964                   unop( Iop_8Sto64,
15965                         unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
15966                   unop( Iop_8Sto64,
15967                         unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
15968   return delta;
15969}
15970
15971
15972/* Handles 128 bit versions of PMOVZXBQ. */
15973static Long dis_PMOVZXBQ_128 ( VexAbiInfo* vbi, Prefix pfx,
15974                               Long delta, Bool isAvx )
15975{
15976   IRTemp addr     = IRTemp_INVALID;
15977   Int    alen     = 0;
15978   HChar  dis_buf[50];
15979   IRTemp srcVec   = newTemp(Ity_V128);
15980   UChar  modrm    = getUChar(delta);
   HChar* mbV      = isAvx ? "v" : "";
15982   UInt   rG       = gregOfRexRM(pfx, modrm);
15983   if ( epartIsReg(modrm) ) {
15984      UInt rE = eregOfRexRM(pfx, modrm);
15985      assign( srcVec, getXMMReg(rE) );
15986      delta += 1;
15987      DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
15988   } else {
15989      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
15990      assign( srcVec,
15991              unop( Iop_32UtoV128,
15992                    unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
15993      delta += alen;
15994      DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
15995   }
15996
15997   IRTemp zeroVec = newTemp(Ity_V128);
15998   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
15999
16000   (isAvx ? putYMMRegLoAndZU : putXMMReg)
16001      ( rG, binop( Iop_InterleaveLO8x16,
16002                   mkexpr(zeroVec),
16003                   binop( Iop_InterleaveLO8x16,
16004                          mkexpr(zeroVec),
16005                          binop( Iop_InterleaveLO8x16,
16006                                 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
16007   return delta;
16008}
16009
16010
16011static Long dis_PHMINPOSUW_128 ( VexAbiInfo* vbi, Prefix pfx,
16012                                 Long delta, Bool isAvx )
16013{
16014   IRTemp addr   = IRTemp_INVALID;
16015   Int    alen   = 0;
16016   HChar  dis_buf[50];
16017   UChar  modrm  = getUChar(delta);
   HChar* mbV    = isAvx ? "v" : "";
16019   IRTemp sV     = newTemp(Ity_V128);
16020   IRTemp sHi    = newTemp(Ity_I64);
16021   IRTemp sLo    = newTemp(Ity_I64);
16022   IRTemp dLo    = newTemp(Ity_I64);
16023   UInt   rG     = gregOfRexRM(pfx,modrm);
16024   if (epartIsReg(modrm)) {
16025      UInt rE = eregOfRexRM(pfx,modrm);
16026      assign( sV, getXMMReg(rE) );
16027      delta += 1;
16028      DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
16029   } else {
16030      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16031      if (!isAvx)
16032         gen_SEGV_if_not_16_aligned(addr);
16033      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16034      delta += alen;
16035      DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
16036   }
16037   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
16038   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
16039   assign( dLo, mkIRExprCCall(
16040                   Ity_I64, 0/*regparms*/,
16041                   "amd64g_calculate_sse_phminposuw",
16042                   &amd64g_calculate_sse_phminposuw,
16043                   mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
16044         ));
16045   (isAvx ? putYMMRegLoAndZU : putXMMReg)
16046      (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
16047   return delta;
16048}
16049
16050
16051static Long dis_AESx ( VexAbiInfo* vbi, Prefix pfx,
16052                       Long delta, Bool isAvx, UChar opc )
16053{
16054   IRTemp addr   = IRTemp_INVALID;
16055   Int    alen   = 0;
16056   HChar  dis_buf[50];
16057   UChar  modrm  = getUChar(delta);
16058   UInt   rG     = gregOfRexRM(pfx, modrm);
16059   UInt   regNoL = 0;
16060   UInt   regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;
16061
16062   /* This is a nasty kludge.  We need to pass 2 x V128 to the
16063      helper.  Since we can't do that, use a dirty
16064      helper to compute the results directly from the XMM regs in
16065      the guest state.  That means for the memory case, we need to
16066      move the left operand into a pseudo-register (XMM16, let's
16067      call it). */
16068   if (epartIsReg(modrm)) {
16069      regNoL = eregOfRexRM(pfx, modrm);
16070      delta += 1;
16071   } else {
16072      regNoL = 16; /* use XMM16 as an intermediary */
16073      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16074      /* alignment check needed ???? */
16075      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
16076      delta += alen;
16077   }
16078
16079   void*  fn = &amd64g_dirtyhelper_AES;
16080   HChar* nm = "amd64g_dirtyhelper_AES";
16081
16082   /* Round up the arguments.  Note that this is a kludge -- the
16083      use of mkU64 rather than mkIRExpr_HWord implies the
16084      assumption that the host's word size is 64-bit. */
16085   UInt gstOffD = ymmGuestRegOffset(rG);
16086   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
16087   UInt gstOffR = ymmGuestRegOffset(regNoR);
16088   IRExpr*  opc4         = mkU64(opc);
16089   IRExpr*  gstOffDe     = mkU64(gstOffD);
16090   IRExpr*  gstOffLe     = mkU64(gstOffL);
16091   IRExpr*  gstOffRe     = mkU64(gstOffR);
16092   IRExpr** args
16093      = mkIRExprVec_4( opc4, gstOffDe, gstOffLe, gstOffRe );
16094
16095   IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
16096   /* It's not really a dirty call, but we can't use the clean
16097      helper mechanism here for the very lame reason that we can't
16098      pass 2 x V128s by value to a helper, nor get one back.  Hence
16099      this roundabout scheme. */
16100   d->needsBBP = True;
16101   d->nFxState = 2;
16102   vex_bzero(&d->fxState, sizeof(d->fxState));
   /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and write
      the second for !isAvx or the third for isAvx.
      AESIMC (0xDB) reads the first register, and writes the second. */
16106   d->fxState[0].fx     = Ifx_Read;
16107   d->fxState[0].offset = gstOffL;
16108   d->fxState[0].size   = sizeof(U128);
16109   d->fxState[1].offset = gstOffR;
16110   d->fxState[1].size   = sizeof(U128);
16111   if (opc == 0xDB)
16112      d->fxState[1].fx   = Ifx_Write;
16113   else if (!isAvx || rG == regNoR)
16114      d->fxState[1].fx   = Ifx_Modify;
16115   else {
16116      d->fxState[1].fx     = Ifx_Read;
16117      d->nFxState++;
16118      d->fxState[2].fx     = Ifx_Write;
16119      d->fxState[2].offset = gstOffD;
16120      d->fxState[2].size   = sizeof(U128);
16121   }
16122
16123   stmt( IRStmt_Dirty(d) );
16124   {
16125      HChar* opsuf;
16126      switch (opc) {
16127         case 0xDC: opsuf = "enc"; break;
         case 0xDD: opsuf = "enclast"; break;
16129         case 0xDE: opsuf = "dec"; break;
16130         case 0xDF: opsuf = "declast"; break;
16131         case 0xDB: opsuf = "imc"; break;
16132         default: vassert(0);
16133      }
16134      DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
16135          (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
16136          nameXMMReg(regNoR),
16137          (isAvx && opc != 0xDB) ? "," : "",
16138          (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
16139   }
16140   if (isAvx)
16141      putYMMRegLane128( rG, 1, mkV128(0) );
16142   return delta;
16143}
16144
16145static Long dis_AESKEYGENASSIST ( VexAbiInfo* vbi, Prefix pfx,
16146                                  Long delta, Bool isAvx )
16147{
16148   IRTemp addr   = IRTemp_INVALID;
16149   Int    alen   = 0;
16150   HChar  dis_buf[50];
16151   UChar  modrm  = getUChar(delta);
16152   UInt   regNoL = 0;
16153   UInt   regNoR = gregOfRexRM(pfx, modrm);
16154   UChar  imm    = 0;
16155
   /* This is a nasty kludge.  See AESENC et al. instructions. */
16158   if (epartIsReg(modrm)) {
16159      regNoL = eregOfRexRM(pfx, modrm);
16160      imm = getUChar(delta+1);
16161      delta += 1+1;
16162   } else {
16163      regNoL = 16; /* use XMM16 as an intermediary */
16164      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* alignment check needed ???? */
16166      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
16167      imm = getUChar(delta+alen);
16168      delta += alen+1;
16169   }
16170
16171   /* Who ya gonna call?  Presumably not Ghostbusters. */
16172   void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
16173   HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
16174
16175   /* Round up the arguments.  Note that this is a kludge -- the
16176      use of mkU64 rather than mkIRExpr_HWord implies the
16177      assumption that the host's word size is 64-bit. */
16178   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
16179   UInt gstOffR = ymmGuestRegOffset(regNoR);
16180
   IRExpr*  imme         = mkU64(imm & 0xFF);
16182   IRExpr*  gstOffLe     = mkU64(gstOffL);
16183   IRExpr*  gstOffRe     = mkU64(gstOffR);
16184   IRExpr** args
16185      = mkIRExprVec_3( imme, gstOffLe, gstOffRe );
16186
16187   IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
16188   /* It's not really a dirty call, but we can't use the clean
16189      helper mechanism here for the very lame reason that we can't
16190      pass 2 x V128s by value to a helper, nor get one back.  Hence
16191      this roundabout scheme. */
16192   d->needsBBP = True;
16193   d->nFxState = 2;
16194   vex_bzero(&d->fxState, sizeof(d->fxState));
16195   d->fxState[0].fx     = Ifx_Read;
16196   d->fxState[0].offset = gstOffL;
16197   d->fxState[0].size   = sizeof(U128);
16198   d->fxState[1].fx     = Ifx_Write;
16199   d->fxState[1].offset = gstOffR;
16200   d->fxState[1].size   = sizeof(U128);
16201   stmt( IRStmt_Dirty(d) );
16202
16203   DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
16204       (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
16205       nameXMMReg(regNoR));
16206   if (isAvx)
16207      putYMMRegLane128( regNoR, 1, mkV128(0) );
16208   return delta;
16209}
16210
16211
16212__attribute__((noinline))
16213static
16214Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
16215                          VexAbiInfo* vbi,
16216                          Prefix pfx, Int sz, Long deltaIN )
16217{
16218   IRTemp addr  = IRTemp_INVALID;
16219   UChar  modrm = 0;
16220   Int    alen  = 0;
16221   HChar  dis_buf[50];
16222
16223   *decode_OK = False;
16224
16225   Long   delta = deltaIN;
16226   UChar  opc   = getUChar(delta);
16227   delta++;
16228   switch (opc) {
16229
16230   case 0x10:
16231   case 0x14:
16232   case 0x15:
16233      /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
16234         66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
16235         66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
16236         Blend at various granularities, with XMM0 (implicit operand)
16237         providing the controlling mask.
16238      */
16239      if (have66noF2noF3(pfx) && sz == 2) {
16240         modrm = getUChar(delta);
16241
16242         HChar* nm    = NULL;
16243         UInt   gran  = 0;
16244         IROp   opSAR = Iop_INVALID;
16245         switch (opc) {
16246            case 0x10:
16247               nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
16248               break;
16249            case 0x14:
16250               nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
16251               break;
16252            case 0x15:
16253               nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
16254               break;
16255         }
16256         vassert(nm);
16257
16258         IRTemp vecE = newTemp(Ity_V128);
16259         IRTemp vecG = newTemp(Ity_V128);
16260         IRTemp vec0 = newTemp(Ity_V128);
16261
16262         if ( epartIsReg(modrm) ) {
16263            assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
16264            delta += 1;
16265            DIP( "%s %s,%s\n", nm,
16266                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
16267                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
16268         } else {
16269            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16270            gen_SEGV_if_not_16_aligned( addr );
16271            assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
16272            delta += alen;
16273            DIP( "%s %s,%s\n", nm,
16274                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
16275         }
16276
16277         assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
16278         assign(vec0, getXMMReg(0));
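         /* The control mask is derived (in math_PBLENDVB_128) by
            arithmetically shifting each lane of XMM0 right so that its
            sign bit fills the whole lane -- hence the per-granularity
            opSAR -- giving an all-0s/all-1s selector for each lane. */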
16279
16280         IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
16281         putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));
16282
16283         goto decode_success;
16284      }
16285      break;
16286
16287   case 0x17:
16288      /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
16289         Logical compare (set ZF and CF from AND/ANDN of the operands) */
16290      if (have66noF2noF3(pfx)
16291          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
16292         delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
16293         goto decode_success;
16294      }
16295      break;
16296
16297   case 0x20:
16298      /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
16299         Packed Move with Sign Extend from Byte to Word (XMM) */
16300      if (have66noF2noF3(pfx) && sz == 2) {
16301         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
16302                                   False/*!isAvx*/, False/*!xIsZ*/ );
16303         goto decode_success;
16304      }
16305      break;
16306
16307   case 0x21:
16308      /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
16309         Packed Move with Sign Extend from Byte to DWord (XMM) */
16310      if (have66noF2noF3(pfx) && sz == 2) {
16311         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
16312                                   False/*!isAvx*/, False/*!xIsZ*/ );
16313         goto decode_success;
16314      }
16315      break;
16316
16317   case 0x22:
16318      /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
16319         Packed Move with Sign Extend from Byte to QWord (XMM) */
16320      if (have66noF2noF3(pfx) && sz == 2) {
16321         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
16322         goto decode_success;
16323      }
16324      break;
16325
16326   case 0x23:
16327      /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
16328         Packed Move with Sign Extend from Word to DWord (XMM) */
16329      if (have66noF2noF3(pfx) && sz == 2) {
16330         delta = dis_PMOVxXWD_128(vbi, pfx, delta,
16331                                  False/*!isAvx*/, False/*!xIsZ*/);
16332         goto decode_success;
16333      }
16334      break;
16335
16336   case 0x24:
16337      /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
16338         Packed Move with Sign Extend from Word to QWord (XMM) */
16339      if (have66noF2noF3(pfx) && sz == 2) {
16340         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
16341         goto decode_success;
16342      }
16343      break;
16344
16345   case 0x25:
16346      /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
16347         Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
16348      if (have66noF2noF3(pfx) && sz == 2) {
16349         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
16350                                   False/*!isAvx*/, False/*!xIsZ*/ );
16351         goto decode_success;
16352      }
16353      break;
16354
16355   case 0x28:
      /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-bit
         lanes 0 x 0 to form the lower 64-bit half and lanes 2 x 2 to
         form the upper 64-bit half */
16359      /* This is a really poor translation -- could be improved if
16360         performance critical.  It's a copy-paste of PMULUDQ, too. */
16361      if (have66noF2noF3(pfx) && sz == 2) {
16362         IRTemp sV = newTemp(Ity_V128);
16363         IRTemp dV = newTemp(Ity_V128);
16364         modrm = getUChar(delta);
16365         UInt rG = gregOfRexRM(pfx,modrm);
16366         assign( dV, getXMMReg(rG) );
16367         if (epartIsReg(modrm)) {
16368            UInt rE = eregOfRexRM(pfx,modrm);
16369            assign( sV, getXMMReg(rE) );
16370            delta += 1;
16371            DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
16372         } else {
16373            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16374            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16375            delta += alen;
16376            DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
16377         }
16378
16379         putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
16380         goto decode_success;
16381      }
16382      break;
16383
16384   case 0x29:
16385      /* 66 0F 38 29 = PCMPEQQ
16386         64x2 equality comparison */
16387      if (have66noF2noF3(pfx) && sz == 2) {
16388         /* FIXME: this needs an alignment check */
16389         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
16390                                    "pcmpeqq", Iop_CmpEQ64x2, False );
16391         goto decode_success;
16392      }
16393      break;
16394
16395   case 0x2B:
      /* 66 0F 38 2B /r = PACKUSDW xmm1, xmm2/m128
16397         2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
16398      if (have66noF2noF3(pfx) && sz == 2) {
16399
16400         modrm = getUChar(delta);
16401
16402         IRTemp argL = newTemp(Ity_V128);
16403         IRTemp argR = newTemp(Ity_V128);
16404
16405         if ( epartIsReg(modrm) ) {
16406            assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
16407            delta += 1;
16408            DIP( "packusdw %s,%s\n",
16409                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
16410                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
16411         } else {
16412            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16413            gen_SEGV_if_not_16_aligned( addr );
16414            assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
16415            delta += alen;
16416            DIP( "packusdw %s,%s\n",
16417                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
16418         }
16419
16420         assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
16421
16422         putXMMReg( gregOfRexRM(pfx, modrm),
16423                    binop( Iop_QNarrowBin32Sto16Ux8,
16424                           mkexpr(argL), mkexpr(argR)) );
16425
16426         goto decode_success;
16427      }
16428      break;
16429
16430   case 0x30:
16431      /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
16432         Packed Move with Zero Extend from Byte to Word (XMM) */
16433      if (have66noF2noF3(pfx) && sz == 2) {
16434         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
16435                                   False/*!isAvx*/, True/*xIsZ*/ );
16436         goto decode_success;
16437      }
16438      break;
16439
16440   case 0x31:
16441      /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
16442         Packed Move with Zero Extend from Byte to DWord (XMM) */
16443      if (have66noF2noF3(pfx) && sz == 2) {
16444         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
16445                                   False/*!isAvx*/, True/*xIsZ*/ );
16446         goto decode_success;
16447      }
16448      break;
16449
16450   case 0x32:
16451      /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
16452         Packed Move with Zero Extend from Byte to QWord (XMM) */
16453      if (have66noF2noF3(pfx) && sz == 2) {
16454         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
16455         goto decode_success;
16456      }
16457      break;
16458
16459   case 0x33:
16460      /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
16461         Packed Move with Zero Extend from Word to DWord (XMM) */
16462      if (have66noF2noF3(pfx) && sz == 2) {
16463         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
16464                                   False/*!isAvx*/, True/*xIsZ*/ );
16465         goto decode_success;
16466      }
16467      break;
16468
16469   case 0x34:
16470      /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
16471         Packed Move with Zero Extend from Word to QWord (XMM) */
16472      if (have66noF2noF3(pfx) && sz == 2) {
16473         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
16474         goto decode_success;
16475      }
16476      break;
16477
16478   case 0x35:
16479      /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
16480         Packed Move with Zero Extend from DWord to QWord (XMM) */
16481      if (have66noF2noF3(pfx) && sz == 2) {
16482         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
16483                                   False/*!isAvx*/, True/*xIsZ*/ );
16484         goto decode_success;
16485      }
16486      break;
16487
16488   case 0x37:
16489      /* 66 0F 38 37 = PCMPGTQ
16490         64x2 comparison (signed, presumably; the Intel docs don't say :-)
16491      */
16492      if (have66noF2noF3(pfx) && sz == 2) {
16493         /* FIXME: this needs an alignment check */
16494         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
16495                                    "pcmpgtq", Iop_CmpGT64Sx2, False );
16496         goto decode_success;
16497      }
16498      break;
16499
16500   case 0x38:
16501   case 0x3C:
16502      /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128    8Sx16 (signed) min
16503         66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128    8Sx16 (signed) max
16504      */
16505      if (have66noF2noF3(pfx) && sz == 2) {
16506         /* FIXME: this needs an alignment check */
16507         Bool isMAX = opc == 0x3C;
16508         delta = dis_SSEint_E_to_G(
16509                    vbi, pfx, delta,
16510                    isMAX ? "pmaxsb" : "pminsb",
16511                    isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
16512                    False
16513                 );
16514         goto decode_success;
16515      }
16516      break;
16517
16518   case 0x39:
16519   case 0x3D:
16520      /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
16521         Minimum of Packed Signed Double Word Integers (XMM)
16522         66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
16523         Maximum of Packed Signed Double Word Integers (XMM)
16524      */
16525      if (have66noF2noF3(pfx) && sz == 2) {
16526         /* FIXME: this needs an alignment check */
16527         Bool isMAX = opc == 0x3D;
16528         delta = dis_SSEint_E_to_G(
16529                    vbi, pfx, delta,
16530                    isMAX ? "pmaxsd" : "pminsd",
16531                    isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
16532                    False
16533                 );
16534         goto decode_success;
16535      }
16536      break;
16537
16538   case 0x3A:
16539   case 0x3E:
16540      /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
16541         Minimum of Packed Unsigned Word Integers (XMM)
16542         66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
16543         Maximum of Packed Unsigned Word Integers (XMM)
16544      */
16545      if (have66noF2noF3(pfx) && sz == 2) {
16546         /* FIXME: this needs an alignment check */
16547         Bool isMAX = opc == 0x3E;
16548         delta = dis_SSEint_E_to_G(
16549                    vbi, pfx, delta,
16550                    isMAX ? "pmaxuw" : "pminuw",
16551                    isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
16552                    False
16553                 );
16554         goto decode_success;
16555      }
16556      break;
16557
16558   case 0x3B:
16559   case 0x3F:
16560      /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
16561         Minimum of Packed Unsigned Doubleword Integers (XMM)
16562         66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
16563         Maximum of Packed Unsigned Doubleword Integers (XMM)
16564      */
16565      if (have66noF2noF3(pfx) && sz == 2) {
16566         /* FIXME: this needs an alignment check */
16567         Bool isMAX = opc == 0x3F;
16568         delta = dis_SSEint_E_to_G(
16569                    vbi, pfx, delta,
16570                    isMAX ? "pmaxud" : "pminud",
16571                    isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
16572                    False
16573                 );
16574         goto decode_success;
16575      }
16576      break;
16577
16578   case 0x40:
16579      /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
16580         32x4 integer multiply from xmm2/m128 to xmm1 */
16581      if (have66noF2noF3(pfx) && sz == 2) {
16582
16583         modrm = getUChar(delta);
16584
16585         IRTemp argL = newTemp(Ity_V128);
16586         IRTemp argR = newTemp(Ity_V128);
16587
16588         if ( epartIsReg(modrm) ) {
16589            assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
16590            delta += 1;
16591            DIP( "pmulld %s,%s\n",
16592                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
16593                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
16594         } else {
16595            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16596            gen_SEGV_if_not_16_aligned( addr );
16597            assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
16598            delta += alen;
16599            DIP( "pmulld %s,%s\n",
16600                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
16601         }
16602
16603         assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
16604
16605         putXMMReg( gregOfRexRM(pfx, modrm),
16606                    binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );
16607
16608         goto decode_success;
16609      }
16610      break;
16611
16612   case 0x41:
16613      /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
16614         Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
16615      if (have66noF2noF3(pfx) && sz == 2) {
16616         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
16617         goto decode_success;
16618      }
16619      break;
16620
16621   case 0xDC:
16622   case 0xDD:
16623   case 0xDE:
16624   case 0xDF:
16625   case 0xDB:
16626      /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
16627                  DD /r = AESENCLAST xmm1, xmm2/m128
16628                  DE /r = AESDEC xmm1, xmm2/m128
16629                  DF /r = AESDECLAST xmm1, xmm2/m128
16630
16631                  DB /r = AESIMC xmm1, xmm2/m128 */
16632      if (have66noF2noF3(pfx) && sz == 2) {
16633         delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
16634         goto decode_success;
16635      }
16636      break;
16637
16638   case 0xF0:
16639   case 0xF1:
16640      /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
16641         F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
16642         The decoding on this is a bit unusual.
16643      */
16644      if (haveF2noF3(pfx)
16645          && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
16646         modrm = getUChar(delta);
16647
16648         if (opc == 0xF0)
16649            sz = 1;
16650         else
16651            vassert(sz == 2 || sz == 4 || sz == 8);
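         /* Hence: 0xF0 always takes a byte source, whilst for 0xF1 the
            66 prefix gives a 16-bit source, REX.W a 64-bit one, and
            otherwise it is 32 bits.  The result is always written back
            as 32 bits (the putIRegG(4, ..) below). */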
16652
         IRType tyE = szToITy(sz);
         IRTemp valE = newTemp(tyE);
         /* Mnemonic suffix depends on the source operand size. */
         HChar sfx = sz==1 ? 'b' : sz==2 ? 'w' : sz==4 ? 'l' : 'q';

         if (epartIsReg(modrm)) {
            assign(valE, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("crc32%c %s,%s\n", sfx, nameIRegE(sz, pfx, modrm),
                nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(valE, loadLE(tyE, mkexpr(addr)));
            delta += alen;
            DIP("crc32%c %s,%s\n", sfx, dis_buf,
                nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
         }
16668
16669         /* Somewhat funny getting/putting of the crc32 value, in order
16670            to ensure that it turns into 64-bit gets and puts.  However,
16671            mask off the upper 32 bits so as to not get memcheck false
16672            +ves around the helper call. */
16673         IRTemp valG0 = newTemp(Ity_I64);
16674         assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
16675                             mkU64(0xFFFFFFFF)));
16676
16677         HChar* nm = NULL;
16678         void*  fn = NULL;
16679         switch (sz) {
16680            case 1: nm = "amd64g_calc_crc32b";
16681                    fn = &amd64g_calc_crc32b; break;
16682            case 2: nm = "amd64g_calc_crc32w";
16683                    fn = &amd64g_calc_crc32w; break;
16684            case 4: nm = "amd64g_calc_crc32l";
16685                    fn = &amd64g_calc_crc32l; break;
16686            case 8: nm = "amd64g_calc_crc32q";
16687                    fn = &amd64g_calc_crc32q; break;
16688         }
16689         vassert(nm && fn);
16690         IRTemp valG1 = newTemp(Ity_I64);
16691         assign(valG1,
16692                mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
16693                              mkIRExprVec_2(mkexpr(valG0),
16694                                            widenUto64(mkexpr(valE)))));
16695
16696         putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
16697         goto decode_success;
16698      }
16699      break;
16700
16701   default:
16702      break;
16703
16704   }
16705
16706  //decode_failure:
16707   *decode_OK = False;
16708   return deltaIN;
16709
16710  decode_success:
16711   *decode_OK = True;
16712   return delta;
16713}
16714
16715
16716/*------------------------------------------------------------*/
16717/*---                                                      ---*/
16718/*--- Top-level SSE4: dis_ESC_0F3A__SSE4                   ---*/
16719/*---                                                      ---*/
16720/*------------------------------------------------------------*/
16721
16722static Long dis_PEXTRW ( VexAbiInfo* vbi, Prefix pfx,
16723                         Long delta, Bool isAvx )
16724{
16725   IRTemp addr  = IRTemp_INVALID;
16726   IRTemp t0    = IRTemp_INVALID;
16727   IRTemp t1    = IRTemp_INVALID;
16728   IRTemp t2    = IRTemp_INVALID;
16729   IRTemp t3    = IRTemp_INVALID;
16730   UChar  modrm = getUChar(delta);
16731   Int    alen  = 0;
16732   HChar  dis_buf[50];
16733   UInt   rG    = gregOfRexRM(pfx,modrm);
16734   Int    imm8_20;
16735   IRTemp xmm_vec = newTemp(Ity_V128);
16736   IRTemp d16   = newTemp(Ity_I16);
16737   HChar* mbV   = isAvx ? "v" : "";
16738
16739   vassert(0==getRexW(pfx)); /* ensured by caller */
16740   assign( xmm_vec, getXMMReg(rG) );
16741   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
16742
16743   if ( epartIsReg( modrm ) ) {
16744      imm8_20 = (Int)(getUChar(delta+1) & 7);
16745   } else {
16746      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16747      imm8_20 = (Int)(getUChar(delta+alen) & 7);
16748   }
16749
16750   switch (imm8_20) {
16751      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(t0))); break;
16752      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
16753      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(t1))); break;
16754      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
16755      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(t2))); break;
16756      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
16757      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(t3))); break;
16758      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
16759      default: vassert(0);
16760   }
16761
16762   if ( epartIsReg( modrm ) ) {
16763      UInt rE = eregOfRexRM(pfx,modrm);
16764      putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
16765      delta += 1+1;
16766      DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
16767           nameXMMReg( rG ), nameIReg32( rE ) );
16768   } else {
16769      storeLE( mkexpr(addr), mkexpr(d16) );
16770      delta += alen+1;
16771      DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
16772   }
16773   return delta;
16774}
16775
16776
16777static Long dis_PEXTRD ( VexAbiInfo* vbi, Prefix pfx,
16778                         Long delta, Bool isAvx )
16779{
16780   IRTemp addr  = IRTemp_INVALID;
16781   IRTemp t0    = IRTemp_INVALID;
16782   IRTemp t1    = IRTemp_INVALID;
16783   IRTemp t2    = IRTemp_INVALID;
16784   IRTemp t3    = IRTemp_INVALID;
16785   UChar  modrm = 0;
16786   Int    alen  = 0;
16787   HChar  dis_buf[50];
16788
16789   Int    imm8_10;
16790   IRTemp xmm_vec   = newTemp(Ity_V128);
16791   IRTemp src_dword = newTemp(Ity_I32);
16792   HChar* mbV = isAvx ? "v" : "";
16793
16794   vassert(0==getRexW(pfx)); /* ensured by caller */
16795   modrm = getUChar(delta);
16796   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
16797   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
16798
16799   if ( epartIsReg( modrm ) ) {
16800      imm8_10 = (Int)(getUChar(delta+1) & 3);
16801   } else {
16802      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16803      imm8_10 = (Int)(getUChar(delta+alen) & 3);
16804   }
16805
16806   switch ( imm8_10 ) {
16807      case 0:  assign( src_dword, mkexpr(t0) ); break;
16808      case 1:  assign( src_dword, mkexpr(t1) ); break;
16809      case 2:  assign( src_dword, mkexpr(t2) ); break;
16810      case 3:  assign( src_dword, mkexpr(t3) ); break;
16811      default: vassert(0);
16812   }
16813
16814   if ( epartIsReg( modrm ) ) {
16815      putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
16816      delta += 1+1;
16817      DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
16818           nameXMMReg( gregOfRexRM(pfx, modrm) ),
16819           nameIReg32( eregOfRexRM(pfx, modrm) ) );
16820   } else {
16821      storeLE( mkexpr(addr), mkexpr(src_dword) );
16822      delta += alen+1;
16823      DIP( "%spextrd $%d, %s,%s\n", mbV,
16824           imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
16825   }
16826   return delta;
16827}
16828
16829
16830static Long dis_PEXTRQ ( VexAbiInfo* vbi, Prefix pfx,
16831                         Long delta, Bool isAvx )
16832{
16833   IRTemp addr  = IRTemp_INVALID;
16834   UChar  modrm = 0;
16835   Int    alen  = 0;
16836   HChar  dis_buf[50];
16837
16838   Int imm8_0;
16839   IRTemp xmm_vec   = newTemp(Ity_V128);
16840   IRTemp src_qword = newTemp(Ity_I64);
16841   HChar* mbV = isAvx ? "v" : "";
16842
16843   vassert(1==getRexW(pfx)); /* ensured by caller */
16844   modrm = getUChar(delta);
16845   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
16846
16847   if ( epartIsReg( modrm ) ) {
16848      imm8_0 = (Int)(getUChar(delta+1) & 1);
16849   } else {
16850      addr   = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16851      imm8_0 = (Int)(getUChar(delta+alen) & 1);
16852   }
16853
16854   switch ( imm8_0 ) {
16855      case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) );
16856               break;
16857      case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
16858               break;
16859      default: vassert(0);
16860   }
16861
16862   if ( epartIsReg( modrm ) ) {
16863      putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
16864      delta += 1+1;
16865      DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
16866           nameXMMReg( gregOfRexRM(pfx, modrm) ),
16867           nameIReg64( eregOfRexRM(pfx, modrm) ) );
16868   } else {
16869      storeLE( mkexpr(addr), mkexpr(src_qword) );
16870      delta += alen+1;
16871      DIP( "%spextrq $%d, %s,%s\n", mbV,
16872           imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
16873   }
16874   return delta;
16875}
16876
16877
16878/* This can fail, in which case it returns the original (unchanged)
16879   delta. */
16880static Long dis_PCMPxSTRx ( VexAbiInfo* vbi, Prefix pfx,
16881                            Long delta, Bool isAvx, UChar opc )
16882{
16883   Long   delta0  = delta;
16884   UInt   isISTRx = opc & 2;
16885   UInt   isxSTRM = (opc & 1) ^ 1;
16886   UInt   regNoL  = 0;
16887   UInt   regNoR  = 0;
16888   UChar  imm     = 0;
16889   IRTemp addr    = IRTemp_INVALID;
16890   Int    alen    = 0;
16891   HChar  dis_buf[50];
16892
16893   /* This is a nasty kludge.  We need to pass 2 x V128 to the helper
16894      (which is clean).  Since we can't do that, use a dirty helper to
16895      compute the results directly from the XMM regs in the guest
16896      state.  That means for the memory case, we need to move the left
16897      operand into a pseudo-register (XMM16, let's call it). */
16898   UChar modrm = getUChar(delta);
16899   if (epartIsReg(modrm)) {
16900      regNoL = eregOfRexRM(pfx, modrm);
16901      regNoR = gregOfRexRM(pfx, modrm);
16902      imm = getUChar(delta+1);
16903      delta += 1+1;
16904   } else {
16905      regNoL = 16; /* use XMM16 as an intermediary */
16906      regNoR = gregOfRexRM(pfx, modrm);
16907      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16908      /* No alignment check; I guess that makes sense, given that
16909         these insns are for dealing with C style strings. */
16910      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
16911      imm = getUChar(delta+alen);
16912      delta += alen+1;
16913   }
16914
16915   /* Now we know the XMM reg numbers for the operands, and the
16916      immediate byte.  Is it one we can actually handle? Throw out any
16917      cases for which the helper function has not been verified. */
16918   switch (imm) {
      case 0x00:
      case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:
      case 0x1A: case 0x38: case 0x3A: case 0x44: case 0x46:
      case 0x4A:
         break;
16924      case 0x01: // the 16-bit character versions of the above
16925      case 0x03: case 0x09: case 0x0B: case 0x0D: case 0x13:
16926      case 0x1B: case 0x39: case 0x3B: case 0x45: case 0x4B:
16927         break;
16928      default:
16929         return delta0; /*FAIL*/
16930   }
16931
16932   /* Who ya gonna call?  Presumably not Ghostbusters. */
16933   void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
16934   HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";
16935
16936   /* Round up the arguments.  Note that this is a kludge -- the use
16937      of mkU64 rather than mkIRExpr_HWord implies the assumption that
16938      the host's word size is 64-bit. */
16939   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
16940   UInt gstOffR = ymmGuestRegOffset(regNoR);
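   /* The opcode byte and the immediate are packed into a single word
      below (opc in bits 15:8, imm in bits 7:0), so that the helper can
      recover both. */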
16941
16942   IRExpr*  opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
16943   IRExpr*  gstOffLe     = mkU64(gstOffL);
16944   IRExpr*  gstOffRe     = mkU64(gstOffR);
16945   IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
16946   IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
16947   IRExpr** args
16948      = mkIRExprVec_5( opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );
16949
16950   IRTemp   resT = newTemp(Ity_I64);
16951   IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
16952   /* It's not really a dirty call, but we can't use the clean helper
16953      mechanism here for the very lame reason that we can't pass 2 x
16954      V128s by value to a helper, nor get one back.  Hence this
16955      roundabout scheme. */
16956   d->needsBBP = True;
16957   d->nFxState = 2;
16958   vex_bzero(&d->fxState, sizeof(d->fxState));
16959   d->fxState[0].fx     = Ifx_Read;
16960   d->fxState[0].offset = gstOffL;
16961   d->fxState[0].size   = sizeof(U128);
16962   d->fxState[1].fx     = Ifx_Read;
16963   d->fxState[1].offset = gstOffR;
16964   d->fxState[1].size   = sizeof(U128);
16965   if (isxSTRM) {
16966      /* Declare that the helper writes XMM0. */
16967      d->nFxState = 3;
16968      d->fxState[2].fx     = Ifx_Write;
16969      d->fxState[2].offset = ymmGuestRegOffset(0);
16970      d->fxState[2].size   = sizeof(U128);
16971   }
16972
16973   stmt( IRStmt_Dirty(d) );
16974
   /* Now resT[15:0] holds the new OSZACP values, so the condition
      codes must be updated.  And for an xSTRI case, resT[31:16] holds
      the new ECX value, so stash that too. */
16978   if (!isxSTRM) {
16979      putIReg64(R_RCX, binop(Iop_And64,
16980                             binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
16981                             mkU64(0xFFFF)));
16982   }
16983
16984   /* Zap the upper half of the dest reg as per AVX conventions. */
16985   if (isxSTRM && isAvx)
16986      putYMMRegLane128(/*YMM*/0, 1, mkV128(0));
16987
16988   stmt( IRStmt_Put(
16989            OFFB_CC_DEP1,
16990            binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
16991   ));
16992   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
16993   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16994   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16995
16996   if (regNoL == 16) {
16997      DIP("%spcmp%cstr%c $%x,%s,%s\n",
16998          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
16999          (UInt)imm, dis_buf, nameXMMReg(regNoR));
17000   } else {
17001      DIP("%spcmp%cstr%c $%x,%s,%s\n",
17002          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
17003          (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
17004   }
17005
17006   return delta;
17007}
17008
17009
17010static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
17011{
   vassert(imm8 <= 15);
17013
17014   // Create a V128 value which has the selected byte in the
17015   // specified lane, and zeroes everywhere else.
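   // Worked example: for imm8 == 5, halfshift is u8 << 40, placing the
   // byte in lane 5 of the low 64-bit half, and the mkV128 mask
   // ~(1 << 5) == 0xFFDF keeps every byte of v128 except byte 5.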
17016   IRTemp tmp128    = newTemp(Ity_V128);
17017   IRTemp halfshift = newTemp(Ity_I64);
17018   assign(halfshift, binop(Iop_Shl64,
17019                           unop(Iop_8Uto64, mkexpr(u8)),
17020                           mkU8(8 * (imm8 & 7))));
17021   if (imm8 < 8) {
17022      assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
17023   } else {
17024      assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
17025   }
17026
17027   UShort mask = ~(1 << imm8);
17028   IRTemp res  = newTemp(Ity_V128);
17029   assign( res, binop(Iop_OrV128,
17030                      mkexpr(tmp128),
17031                      binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
17032   return res;
17033}
17034
17035
17036static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
17037{
17038   IRTemp z32 = newTemp(Ity_I32);
17039   assign(z32, mkU32(0));
17040
   /* Surround u32 with zeroes as per imm, giving us something we can
      OR into a suitably masked-out v128. */
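   /* Each hex digit of the 16-bit mkV128 mask covers one 32-bit lane
      (one mask bit per byte), so e.g. 0xFF0F keeps lanes 3, 2 and 0
      of v128 and zeroes lane 1, into which u32 is then OR'd. */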
17043   IRTemp withZs = newTemp(Ity_V128);
17044   UShort mask = 0;
17045   switch (imm8) {
17046      case 3:  mask = 0x0FFF;
17047               assign(withZs, mkV128from32s(u32, z32, z32, z32));
17048               break;
17049      case 2:  mask = 0xF0FF;
17050               assign(withZs, mkV128from32s(z32, u32, z32, z32));
17051               break;
17052      case 1:  mask = 0xFF0F;
17053               assign(withZs, mkV128from32s(z32, z32, u32, z32));
17054               break;
17055      case 0:  mask = 0xFFF0;
17056               assign(withZs, mkV128from32s(z32, z32, z32, u32));
17057               break;
17058      default: vassert(0);
17059   }
17060
17061   IRTemp res = newTemp(Ity_V128);
17062   assign(res, binop( Iop_OrV128,
17063                      mkexpr(withZs),
17064                      binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
17065   return res;
17066}
17067
17068
17069static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
17070{
   /* Surround u64 with zeroes as per imm, giving us something we can
      OR into a suitably masked-out v128. */
17073   IRTemp withZs = newTemp(Ity_V128);
17074   UShort mask = 0;
17075   if (imm8 == 0) {
17076      mask = 0xFF00;
17077      assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
17078   } else {
17079      vassert(imm8 == 1);
17080      mask = 0x00FF;
17081      assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
17082   }
17083
17084   IRTemp res = newTemp(Ity_V128);
17085   assign( res, binop( Iop_OrV128,
17086                       mkexpr(withZs),
17087                       binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
17088   return res;
17089}
17090
17091
17092static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
17093{
17094   const IRTemp inval = IRTemp_INVALID;
17095   IRTemp dstDs[4] = { inval, inval, inval, inval };
17096   breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
17097
17098   vassert(imm8 <= 255);
17099   dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
17100
17101   UInt imm8_zmask = (imm8 & 15);
17102   IRTemp zero_32 = newTemp(Ity_I32);
17103   assign( zero_32, mkU32(0) );
17104   IRTemp resV = newTemp(Ity_V128);
17105   assign( resV, mkV128from32s(
17106                    ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
17107                    ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
17108                    ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
17109                    ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
17110   return resV;
17111}
17112
17113
17114static Long dis_PEXTRB_128_GtoE ( VexAbiInfo* vbi, Prefix pfx,
17115                                  Long delta, Bool isAvx )
17116{
17117   IRTemp addr     = IRTemp_INVALID;
17118   Int    alen     = 0;
17119   HChar  dis_buf[50];
17120   IRTemp xmm_vec  = newTemp(Ity_V128);
17121   IRTemp sel_lane = newTemp(Ity_I32);
17122   IRTemp shr_lane = newTemp(Ity_I32);
   HChar* mbV      = isAvx ? "v" : "";
17124   UChar  modrm    = getUChar(delta);
17125   IRTemp t3, t2, t1, t0;
17126   Int    imm8;
17127   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
17128   t3 = t2 = t1 = t0 = IRTemp_INVALID;
17129   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
17130
17131   if ( epartIsReg( modrm ) ) {
17132      imm8 = (Int)getUChar(delta+1);
17133   } else {
17134      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17135      imm8 = (Int)getUChar(delta+alen);
17136   }
17137   switch ( (imm8 >> 2) & 3 ) {
17138      case 0:  assign( sel_lane, mkexpr(t0) ); break;
17139      case 1:  assign( sel_lane, mkexpr(t1) ); break;
17140      case 2:  assign( sel_lane, mkexpr(t2) ); break;
17141      case 3:  assign( sel_lane, mkexpr(t3) ); break;
17142      default: vassert(0);
17143   }
17144   assign( shr_lane,
17145           binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
17146
17147   if ( epartIsReg( modrm ) ) {
17148      putIReg64( eregOfRexRM(pfx,modrm),
17149                 unop( Iop_32Uto64,
17150                       binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
17151      delta += 1+1;
17152      DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
17153           nameXMMReg( gregOfRexRM(pfx, modrm) ),
17154           nameIReg64( eregOfRexRM(pfx, modrm) ) );
17155   } else {
17156      storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
17157      delta += alen+1;
17158      DIP( "%spextrb $%d,%s,%s\n", mbV,
17159           imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
17160   }
17161
17162   return delta;
17163}
17164
17165
17166static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
17167{
17168   vassert(imm8 < 256);
17169   UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
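   /* imm8 bits 5:4 choose which of the two products contribute to the
      sum -- imm8_perms maps those 2 bits to a 64-bit-lane mask -- and
      bits 1:0 choose which destination lanes receive the result. */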
17170   IRTemp and_vec = newTemp(Ity_V128);
17171   IRTemp sum_vec = newTemp(Ity_V128);
17172   assign( and_vec, binop( Iop_AndV128,
17173                           binop( Iop_Mul64Fx2,
17174                                  mkexpr(dst_vec), mkexpr(src_vec) ),
17175                           mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );
17176
17177   assign( sum_vec, binop( Iop_Add64F0x2,
17178                           binop( Iop_InterleaveHI64x2,
17179                                  mkexpr(and_vec), mkexpr(and_vec) ),
17180                           binop( Iop_InterleaveLO64x2,
17181                                  mkexpr(and_vec), mkexpr(and_vec) ) ) );
17182   IRTemp res = newTemp(Ity_V128);
17183   assign(res, binop( Iop_AndV128,
17184                      binop( Iop_InterleaveLO64x2,
17185                             mkexpr(sum_vec), mkexpr(sum_vec) ),
17186                      mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
17187   return res;
17188}
17189
17190
17191static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
17192{
17193   vassert(imm8 < 256);
17194   IRTemp tmp_prod_vec = newTemp(Ity_V128);
17195   IRTemp prod_vec     = newTemp(Ity_V128);
17196   IRTemp sum_vec      = newTemp(Ity_V128);
17197   IRTemp v3, v2, v1, v0;
17198   v3 = v2 = v1 = v0   = IRTemp_INVALID;
17199   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
17200                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
17201                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
17202                             0xFFFF };
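   /* imm8 bits 7:4 choose which products contribute to the sum, and
      bits 3:0 choose which destination lanes receive it; imm8_perms
      expands each 4-bit selector into a 32-bit-lane mask.  The two
      interleave-and-add rounds below form the horizontal sum of the
      four (masked) products. */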
17203
17204   assign( tmp_prod_vec,
17205           binop( Iop_AndV128,
17206                  binop( Iop_Mul32Fx4, mkexpr(dst_vec),
17207                                       mkexpr(src_vec) ),
17208                  mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
17209   breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
17210   assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );
17211
17212   assign( sum_vec, binop( Iop_Add32Fx4,
17213                           binop( Iop_InterleaveHI32x4,
17214                                  mkexpr(prod_vec), mkexpr(prod_vec) ),
17215                           binop( Iop_InterleaveLO32x4,
17216                                  mkexpr(prod_vec), mkexpr(prod_vec) ) ) );
17217
17218   IRTemp res = newTemp(Ity_V128);
17219   assign( res, binop( Iop_AndV128,
17220                       binop( Iop_Add32Fx4,
17221                              binop( Iop_InterleaveHI32x4,
17222                                     mkexpr(sum_vec), mkexpr(sum_vec) ),
17223                              binop( Iop_InterleaveLO32x4,
17224                                     mkexpr(sum_vec), mkexpr(sum_vec) ) ),
17225                       mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
17226   return res;
17227}
17228
17229
17230static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
17231{
17232   /* Mask out bits of the operands we don't need.  This isn't
17233      strictly necessary, but it does ensure Memcheck doesn't
17234      give us any false uninitialised value errors as a
17235      result. */
17236   UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
17237   UShort dst_mask[2] = { 0x07FF, 0x7FF0 };
17238
17239   IRTemp src_maskV = newTemp(Ity_V128);
17240   IRTemp dst_maskV = newTemp(Ity_V128);
17241   assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
17242   assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));
17243
17244   IRTemp src_masked = newTemp(Ity_V128);
17245   IRTemp dst_masked = newTemp(Ity_V128);
17246   assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
17247   assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));
17248
   /* Generate four 64-bit values that we can hand to a clean helper */
17250   IRTemp sHi = newTemp(Ity_I64);
17251   IRTemp sLo = newTemp(Ity_I64);
17252   assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
17253   assign( sLo, unop(Iop_V128to64,   mkexpr(src_masked)) );
17254
17255   IRTemp dHi = newTemp(Ity_I64);
17256   IRTemp dLo = newTemp(Ity_I64);
17257   assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
17258   assign( dLo, unop(Iop_V128to64,   mkexpr(dst_masked)) );
17259
17260   /* Compute halves of the result separately */
17261   IRTemp resHi = newTemp(Ity_I64);
17262   IRTemp resLo = newTemp(Ity_I64);
17263
17264   IRExpr** argsHi
17265      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
17266                       mkU64( 0x80 | (imm8 & 7) ));
17267   IRExpr** argsLo
17268      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
17269                       mkU64( 0x00 | (imm8 & 7) ));
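   /* The fifth argument carries imm8's block-selection bits (2:0);
      bit 7 is set for argsHi, asking the helper for the upper 64 bits
      of the result rather than the lower. */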
17270
17271   assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
17272                                "amd64g_calc_mpsadbw",
17273                                &amd64g_calc_mpsadbw, argsHi ));
17274   assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
17275                                "amd64g_calc_mpsadbw",
17276                                &amd64g_calc_mpsadbw, argsLo ));
17277
17278   IRTemp res = newTemp(Ity_V128);
17279   assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
17280   return res;
17281}
17282
17283static Long dis_EXTRACTPS ( VexAbiInfo* vbi, Prefix pfx,
17284                            Long delta, Bool isAvx )
17285{
17286   IRTemp addr       = IRTemp_INVALID;
17287   Int    alen       = 0;
17288   HChar  dis_buf[50];
17289   UChar  modrm      = getUChar(delta);
17290   Int imm8_10;
17291   IRTemp xmm_vec    = newTemp(Ity_V128);
17292   IRTemp src_dword  = newTemp(Ity_I32);
17293   UInt   rG         = gregOfRexRM(pfx,modrm);
17294   IRTemp t3, t2, t1, t0;
17295   t3 = t2 = t1 = t0 = IRTemp_INVALID;
17296
17297   assign( xmm_vec, getXMMReg( rG ) );
17298   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
17299
17300   if ( epartIsReg( modrm ) ) {
17301      imm8_10 = (Int)(getUChar(delta+1) & 3);
17302   } else {
17303      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17304      imm8_10 = (Int)(getUChar(delta+alen) & 3);
17305   }
17306
17307   switch ( imm8_10 ) {
17308      case 0:  assign( src_dword, mkexpr(t0) ); break;
17309      case 1:  assign( src_dword, mkexpr(t1) ); break;
17310      case 2:  assign( src_dword, mkexpr(t2) ); break;
17311      case 3:  assign( src_dword, mkexpr(t3) ); break;
17312      default: vassert(0);
17313   }
17314
17315   if ( epartIsReg( modrm ) ) {
17316      UInt rE = eregOfRexRM(pfx,modrm);
17317      putIReg32( rE, mkexpr(src_dword) );
17318      delta += 1+1;
17319      DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
17320           nameXMMReg( rG ), nameIReg32( rE ) );
17321   } else {
17322      storeLE( mkexpr(addr), mkexpr(src_dword) );
17323      delta += alen+1;
17324      DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
17325           nameXMMReg( rG ), dis_buf );
17326   }
17327
17328   return delta;
17329}
17330
17331
17332static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
17333{
17334   IRTemp t0 = newTemp(Ity_I64);
17335   IRTemp t1 = newTemp(Ity_I64);
17336   assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
17337              mkexpr(dV)));
17338   assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
17339              mkexpr(sV)));
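   /* imm8 bit 0 selects which half of dV is multiplied, bit 4 which
      half of sV.  The helper's third argument (0 or 1) requests the
      low or the high 64 bits of the 128-bit carryless product. */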
17340
17341   IRTemp t2 = newTemp(Ity_I64);
17342   IRTemp t3 = newTemp(Ity_I64);
17343
17344   IRExpr** args;
17345
17346   args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
17347   assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
17348                            &amd64g_calculate_pclmul, args));
17349   args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
17350   assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
17351                            &amd64g_calculate_pclmul, args));
17352
17353   IRTemp res     = newTemp(Ity_V128);
17354   assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
17355   return res;
17356}
17357
17358
17359__attribute__((noinline))
17360static
17361Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
17362                          VexAbiInfo* vbi,
17363                          Prefix pfx, Int sz, Long deltaIN )
17364{
17365   IRTemp addr  = IRTemp_INVALID;
17366   UChar  modrm = 0;
17367   Int    alen  = 0;
17368   HChar  dis_buf[50];
17369
17370   *decode_OK = False;
17371
17372   Long   delta = deltaIN;
17373   UChar  opc   = getUChar(delta);
17374   delta++;
17375   switch (opc) {
17376
17377   case 0x08:
17378      /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
17379      if (have66noF2noF3(pfx) && sz == 2) {
17380
17381         IRTemp src0 = newTemp(Ity_F32);
17382         IRTemp src1 = newTemp(Ity_F32);
17383         IRTemp src2 = newTemp(Ity_F32);
17384         IRTemp src3 = newTemp(Ity_F32);
17385         IRTemp res0 = newTemp(Ity_F32);
17386         IRTemp res1 = newTemp(Ity_F32);
17387         IRTemp res2 = newTemp(Ity_F32);
17388         IRTemp res3 = newTemp(Ity_F32);
17389         IRTemp rm   = newTemp(Ity_I32);
17390         Int    imm  = 0;
17391
17392         modrm = getUChar(delta);
17393
17394         if (epartIsReg(modrm)) {
17395            assign( src0,
17396                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
17397            assign( src1,
17398                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
17399            assign( src2,
17400                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
17401            assign( src3,
17402                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
17403            imm = getUChar(delta+1);
17404            if (imm & ~15) goto decode_failure;
17405            delta += 1+1;
17406            DIP( "roundps $%d,%s,%s\n",
17407                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
17408                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17409         } else {
17410            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17411            gen_SEGV_if_not_16_aligned(addr);
17412            assign( src0, loadLE(Ity_F32,
17413                                 binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
17414            assign( src1, loadLE(Ity_F32,
17415                                 binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
17416            assign( src2, loadLE(Ity_F32,
17417                                 binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
17418            assign( src3, loadLE(Ity_F32,
17419                                 binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
17420            imm = getUChar(delta+alen);
17421            if (imm & ~15) goto decode_failure;
17422            delta += alen+1;
17423            DIP( "roundps $%d,%s,%s\n",
17424                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17425         }
17426
17427         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
17428            that encoding is the same as the encoding for IRRoundingMode,
17429            we can use that value directly in the IR as a rounding
17430            mode. */
17431         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
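         /* Specifically: 0 = to nearest (even), 1 = towards -infinity,
            2 = towards +infinity, 3 = towards zero; and if imm bit 2
            is set, the mode comes from MXCSR.RC instead, via
            get_sse_roundingmode(). */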
17432
17433         assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
17434         assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
17435         assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
17436         assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );
17437
17438         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
17439         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
17440         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
17441         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );
17442
17443         goto decode_success;
17444      }
17445      break;
17446
17447   case 0x09:
17448      /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
17449      if (have66noF2noF3(pfx) && sz == 2) {
17450
17451         IRTemp src0 = newTemp(Ity_F64);
17452         IRTemp src1 = newTemp(Ity_F64);
17453         IRTemp res0 = newTemp(Ity_F64);
17454         IRTemp res1 = newTemp(Ity_F64);
17455         IRTemp rm   = newTemp(Ity_I32);
17456         Int    imm  = 0;
17457
17458         modrm = getUChar(delta);
17459
17460         if (epartIsReg(modrm)) {
17461            assign( src0,
17462                    getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
17463            assign( src1,
17464                    getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
17465            imm = getUChar(delta+1);
17466            if (imm & ~15) goto decode_failure;
17467            delta += 1+1;
17468            DIP( "roundpd $%d,%s,%s\n",
17469                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
17470                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17471         } else {
17472            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17473            gen_SEGV_if_not_16_aligned(addr);
17474            assign( src0, loadLE(Ity_F64,
17475                                 binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
17476            assign( src1, loadLE(Ity_F64,
17477                                 binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
17478            imm = getUChar(delta+alen);
17479            if (imm & ~15) goto decode_failure;
17480            delta += alen+1;
17481            DIP( "roundpd $%d,%s,%s\n",
17482                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17483         }
17484
17485         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
17486            that encoding is the same as the encoding for IRRoundingMode,
17487            we can use that value directly in the IR as a rounding
17488            mode. */
17489         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
17490
17491         assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
17492         assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );
17493
17494         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
17495         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
17496
17497         goto decode_success;
17498      }
17499      break;
17500
17501   case 0x0A:
17502   case 0x0B:
17503      /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
17504         66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
17505      */
17506      if (have66noF2noF3(pfx) && sz == 2) {
17507
17508         Bool   isD = opc == 0x0B;
17509         IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
17510         IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
17511         Int    imm = 0;
17512
17513         modrm = getUChar(delta);
17514
17515         if (epartIsReg(modrm)) {
17516            assign( src,
17517                    isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
17518                        : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
17519            imm = getUChar(delta+1);
17520            if (imm & ~15) goto decode_failure;
17521            delta += 1+1;
17522            DIP( "rounds%c $%d,%s,%s\n",
17523                 isD ? 'd' : 's',
17524                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
17525                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17526         } else {
17527            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17528            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
17529            imm = getUChar(delta+alen);
17530            if (imm & ~15) goto decode_failure;
17531            delta += alen+1;
17532            DIP( "rounds%c $%d,%s,%s\n",
17533                 isD ? 'd' : 's',
17534                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17535         }
17536
17537         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
17538            that encoding is the same as the encoding for IRRoundingMode,
17539            we can use that value directly in the IR as a rounding
17540            mode. */
17541         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
17542                           (imm & 4) ? get_sse_roundingmode()
17543                                     : mkU32(imm & 3),
17544                           mkexpr(src)) );
17545
17546         if (isD)
17547            putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
17548         else
17549            putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
17550
17551         goto decode_success;
17552      }
17553      break;
17554
17555   case 0x0C:
17556      /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
17557         Blend Packed Single Precision Floating-Point Values (XMM) */
17558      if (have66noF2noF3(pfx) && sz == 2) {
17559
17560         Int imm8;
17561         IRTemp dst_vec = newTemp(Ity_V128);
17562         IRTemp src_vec = newTemp(Ity_V128);
17563
17564         modrm = getUChar(delta);
17565
17566         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
17567
17568         if ( epartIsReg( modrm ) ) {
17569            imm8 = (Int)getUChar(delta+1);
17570            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
17571            delta += 1+1;
17572            DIP( "blendps $%d, %s,%s\n", imm8,
17573                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
17574                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17575         } else {
17576            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
17577                             1/* imm8 is 1 byte after the amode */ );
17578            gen_SEGV_if_not_16_aligned( addr );
17579            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
17580            imm8 = (Int)getUChar(delta+alen);
17581            delta += alen+1;
17582            DIP( "blendpd $%d, %s,%s\n",
17583                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17584         }
17585
17586         putXMMReg( gregOfRexRM(pfx, modrm),
17587                    mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
17588         goto decode_success;
17589      }
17590      break;
17591
17592   case 0x0D:
17593      /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
17594         Blend Packed Double Precision Floating-Point Values (XMM) */
17595      if (have66noF2noF3(pfx) && sz == 2) {
17596
17597         Int imm8;
17598         IRTemp dst_vec = newTemp(Ity_V128);
17599         IRTemp src_vec = newTemp(Ity_V128);
17600
17601         modrm = getUChar(delta);
17602         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
17603
17604         if ( epartIsReg( modrm ) ) {
17605            imm8 = (Int)getUChar(delta+1);
17606            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
17607            delta += 1+1;
17608            DIP( "blendpd $%d, %s,%s\n", imm8,
17609                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
17610                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17611         } else {
17612            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
17613                             1/* imm8 is 1 byte after the amode */ );
17614            gen_SEGV_if_not_16_aligned( addr );
17615            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
17616            imm8 = (Int)getUChar(delta+alen);
17617            delta += alen+1;
17618            DIP( "blendpd $%d, %s,%s\n",
17619                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17620         }
17621
17622         putXMMReg( gregOfRexRM(pfx, modrm),
17623                    mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
17624         goto decode_success;
17625      }
17626      break;
17627
17628   case 0x0E:
17629      /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
17630         Blend Packed Words (XMM) */
17631      if (have66noF2noF3(pfx) && sz == 2) {
17632
17633         Int imm8;
17634         IRTemp dst_vec = newTemp(Ity_V128);
17635         IRTemp src_vec = newTemp(Ity_V128);
17636
17637         modrm = getUChar(delta);
17638
17639         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
17640
17641         if ( epartIsReg( modrm ) ) {
17642            imm8 = (Int)getUChar(delta+1);
17643            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
17644            delta += 1+1;
17645            DIP( "pblendw $%d, %s,%s\n", imm8,
17646                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
17647                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17648         } else {
17649            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
17650                             1/* imm8 is 1 byte after the amode */ );
17651            gen_SEGV_if_not_16_aligned( addr );
17652            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
17653            imm8 = (Int)getUChar(delta+alen);
17654            delta += alen+1;
17655            DIP( "pblendw $%d, %s,%s\n",
17656                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17657         }
17658
17659         putXMMReg( gregOfRexRM(pfx, modrm),
17660                    mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
17661         goto decode_success;
17662      }
17663      break;
17664
17665   case 0x14:
      /* 66 0F 3A 14 /r ib = PEXTRB r/m8, xmm, imm8
17667         Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
17668         (XMM) */
17669      if (have66noF2noF3(pfx) && sz == 2) {
17670         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
17671         goto decode_success;
17672      }
17673      break;
17674
17675   case 0x15:
17676      /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
17677         Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
17678         (XMM) */
17679      if (have66noF2noF3(pfx) && sz == 2) {
17680         delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
17681         goto decode_success;
17682      }
17683      break;
17684
17685   case 0x16:
17686      /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
17687         Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
17688         Note that this insn has the same opcodes as PEXTRQ, but
17689         here the REX.W bit is _not_ present */
17690      if (have66noF2noF3(pfx)
17691          && sz == 2 /* REX.W is _not_ present */) {
17692         delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
17693         goto decode_success;
17694      }
17695      /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
17696         Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
17697         Note that this insn has the same opcodes as PEXTRD, but
17698         here the REX.W bit is present */
17699      if (have66noF2noF3(pfx)
17700          && sz == 8 /* REX.W is present */) {
17701         delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
17702         goto decode_success;
17703      }
17704      break;
17705
17706   case 0x17:
17707      /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
17708         float from xmm reg and store in gen.reg or mem.  This is
17709         identical to PEXTRD, except that REX.W appears to be ignored.
17710      */
17711      if (have66noF2noF3(pfx)
17712          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
17713         delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
17714         goto decode_success;
17715      }
17716      break;
17717
17718   case 0x20:
17719      /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
17720         Extract byte from r32/m8 and insert into xmm1 */
17721      if (have66noF2noF3(pfx) && sz == 2) {
17722         Int    imm8;
17723         IRTemp new8 = newTemp(Ity_I8);
17724         modrm = getUChar(delta);
17725         UInt rG = gregOfRexRM(pfx, modrm);
17726         if ( epartIsReg( modrm ) ) {
17727            UInt rE = eregOfRexRM(pfx,modrm);
17728            imm8 = (Int)(getUChar(delta+1) & 0xF);
17729            assign( new8, unop(Iop_32to8, getIReg32(rE)) );
17730            delta += 1+1;
17731            DIP( "pinsrb $%d,%s,%s\n", imm8,
17732                 nameIReg32(rE), nameXMMReg(rG) );
17733         } else {
17734            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17735            imm8 = (Int)(getUChar(delta+alen) & 0xF);
17736            assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
17737            delta += alen+1;
17738            DIP( "pinsrb $%d,%s,%s\n",
17739                 imm8, dis_buf, nameXMMReg(rG) );
17740         }
17741         IRTemp src_vec = newTemp(Ity_V128);
17742         assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
17743         IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
17744         putXMMReg( rG, mkexpr(res) );
17745         goto decode_success;
17746      }
17747      break;
17748
17749   case 0x21:
17750      /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
17751         Insert Packed Single Precision Floating-Point Value (XMM) */
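      /* imm8 layout, a sketch of what math_INSERTPS consumes:
         bits 7:6 = count_s, the source lane when the E part is a
                    register (a memory source supplies a single F32);
         bits 5:4 = count_d, the destination lane to overwrite;
         bits 3:0 = zmask, result lanes to be forced to zero. */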
17752      if (have66noF2noF3(pfx) && sz == 2) {
17753         UInt   imm8;
17754         IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
17755         const IRTemp inval = IRTemp_INVALID;
17756
17757         modrm = getUChar(delta);
17758         UInt rG = gregOfRexRM(pfx, modrm);
17759
17760         if ( epartIsReg( modrm ) ) {
17761            UInt   rE = eregOfRexRM(pfx, modrm);
17762            IRTemp vE = newTemp(Ity_V128);
17763            assign( vE, getXMMReg(rE) );
17764            IRTemp dsE[4] = { inval, inval, inval, inval };
17765            breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
17766            imm8 = getUChar(delta+1);
17767            d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
17768            delta += 1+1;
17769            DIP( "insertps $%u, %s,%s\n",
17770                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
17771         } else {
17772            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17773            assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
17774            imm8 = getUChar(delta+alen);
17775            delta += alen+1;
17776            DIP( "insertps $%u, %s,%s\n",
17777                 imm8, dis_buf, nameXMMReg(rG) );
17778         }
17779
17780         IRTemp vG = newTemp(Ity_V128);
17781         assign( vG, getXMMReg(rG) );
17782
17783         putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
17784         goto decode_success;
17785      }
17786      break;
17787
17788   case 0x22:
17789      /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
17790         Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
17791      if (have66noF2noF3(pfx)
17792          && sz == 2 /* REX.W is NOT present */) {
17793         Int    imm8_10;
17794         IRTemp src_u32 = newTemp(Ity_I32);
17795         modrm = getUChar(delta);
17796         UInt rG = gregOfRexRM(pfx, modrm);
17797
17798         if ( epartIsReg( modrm ) ) {
17799            UInt rE = eregOfRexRM(pfx,modrm);
17800            imm8_10 = (Int)(getUChar(delta+1) & 3);
17801            assign( src_u32, getIReg32( rE ) );
17802            delta += 1+1;
17803            DIP( "pinsrd $%d, %s,%s\n",
17804                 imm8_10, nameIReg32(rE), nameXMMReg(rG) );
17805         } else {
17806            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17807            imm8_10 = (Int)(getUChar(delta+alen) & 3);
17808            assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
17809            delta += alen+1;
17810            DIP( "pinsrd $%d, %s,%s\n",
17811                 imm8_10, dis_buf, nameXMMReg(rG) );
17812         }
17813
17814         IRTemp src_vec = newTemp(Ity_V128);
17815         assign(src_vec, getXMMReg( rG ));
17816         IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
17817         putXMMReg( rG, mkexpr(res_vec) );
17818         goto decode_success;
17819      }
17820      /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
17821         Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
17822      if (have66noF2noF3(pfx)
17823          && sz == 8 /* REX.W is present */) {
17824         Int imm8_0;
17825         IRTemp src_u64 = newTemp(Ity_I64);
17826         modrm = getUChar(delta);
17827         UInt rG = gregOfRexRM(pfx, modrm);
17828
17829         if ( epartIsReg( modrm ) ) {
17830            UInt rE = eregOfRexRM(pfx,modrm);
17831            imm8_0 = (Int)(getUChar(delta+1) & 1);
17832            assign( src_u64, getIReg64( rE ) );
17833            delta += 1+1;
17834            DIP( "pinsrq $%d, %s,%s\n",
17835                 imm8_0, nameIReg64(rE), nameXMMReg(rG) );
17836         } else {
17837            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
17838            imm8_0 = (Int)(getUChar(delta+alen) & 1);
17839            assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
17840            delta += alen+1;
17841            DIP( "pinsrq $%d, %s,%s\n",
17842                 imm8_0, dis_buf, nameXMMReg(rG) );
17843         }
17844
17845         IRTemp src_vec = newTemp(Ity_V128);
17846         assign(src_vec, getXMMReg( rG ));
17847         IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
17848         putXMMReg( rG, mkexpr(res_vec) );
17849         goto decode_success;
17850      }
17851      break;
17852
17853   case 0x40:
17854      /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
17855         Dot Product of Packed Single Precision Floating-Point Values (XMM) */
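      /* imm8 semantics, as a sketch: the high nibble masks which of
         the four products dst[i] * src[i] contribute to the sum, and
         the low nibble selects which result lanes receive that sum
         (the others become zero).  E.g. imm8 == 0xF1 sums all four
         products into lane 0 only. */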
17856      if (have66noF2noF3(pfx) && sz == 2) {
17857         modrm = getUChar(delta);
17858         Int    imm8;
17859         IRTemp src_vec = newTemp(Ity_V128);
17860         IRTemp dst_vec = newTemp(Ity_V128);
17861         UInt   rG      = gregOfRexRM(pfx, modrm);
17862         assign( dst_vec, getXMMReg( rG ) );
17863         if ( epartIsReg( modrm ) ) {
17864            UInt rE = eregOfRexRM(pfx, modrm);
17865            imm8 = (Int)getUChar(delta+1);
17866            assign( src_vec, getXMMReg(rE) );
17867            delta += 1+1;
17868            DIP( "dpps $%d, %s,%s\n",
17869                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
17870         } else {
17871            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
17872                             1/* imm8 is 1 byte after the amode */ );
17873            gen_SEGV_if_not_16_aligned( addr );
17874            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
17875            imm8 = (Int)getUChar(delta+alen);
17876            delta += alen+1;
17877            DIP( "dpps $%d, %s,%s\n",
17878                 imm8, dis_buf, nameXMMReg(rG) );
17879         }
17880         IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
17881         putXMMReg( rG, mkexpr(res) );
17882         goto decode_success;
17883      }
17884      break;
17885
17886   case 0x41:
17887      /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
17888         Dot Product of Packed Double Precision Floating-Point Values (XMM) */
17889      if (have66noF2noF3(pfx) && sz == 2) {
17890         modrm = getUChar(delta);
17891         Int    imm8;
17892         IRTemp src_vec = newTemp(Ity_V128);
17893         IRTemp dst_vec = newTemp(Ity_V128);
17894         UInt   rG      = gregOfRexRM(pfx, modrm);
17895         assign( dst_vec, getXMMReg( rG ) );
17896         if ( epartIsReg( modrm ) ) {
17897            UInt rE = eregOfRexRM(pfx, modrm);
17898            imm8 = (Int)getUChar(delta+1);
17899            assign( src_vec, getXMMReg(rE) );
17900            delta += 1+1;
17901            DIP( "dppd $%d, %s,%s\n",
17902                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
17903         } else {
17904            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
17905                             1/* imm8 is 1 byte after the amode */ );
17906            gen_SEGV_if_not_16_aligned( addr );
17907            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
17908            imm8 = (Int)getUChar(delta+alen);
17909            delta += alen+1;
17910            DIP( "dppd $%d, %s,%s\n",
17911                 imm8, dis_buf, nameXMMReg(rG) );
17912         }
17913         IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
17914         putXMMReg( rG, mkexpr(res) );
17915         goto decode_success;
17916      }
17917      break;
17918
17919   case 0x42:
17920      /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
         Multiple Packed Sums of Absolute Difference (XMM) */
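      /* Operation, as a sketch: imm8[1:0] selects the fixed 4-byte
         block of the source; imm8[2] selects whether the sliding
         11-byte window of the destination starts at byte 0 or byte
         4.  Each of the 8 result words is the sum of absolute
         differences between the fixed block and the 4 window bytes
         at the corresponding offset. */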
17922      if (have66noF2noF3(pfx) && sz == 2) {
17923         Int    imm8;
17924         IRTemp src_vec = newTemp(Ity_V128);
17925         IRTemp dst_vec = newTemp(Ity_V128);
17926         modrm          = getUChar(delta);
17927         UInt   rG      = gregOfRexRM(pfx, modrm);
17928
17929         assign( dst_vec, getXMMReg(rG) );
17930
17931         if ( epartIsReg( modrm ) ) {
17932            UInt rE = eregOfRexRM(pfx, modrm);
17933
17934            imm8 = (Int)getUChar(delta+1);
17935            assign( src_vec, getXMMReg(rE) );
17936            delta += 1+1;
17937            DIP( "mpsadbw $%d, %s,%s\n", imm8,
17938                 nameXMMReg(rE), nameXMMReg(rG) );
17939         } else {
17940            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
17941                             1/* imm8 is 1 byte after the amode */ );
17942            gen_SEGV_if_not_16_aligned( addr );
17943            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
17944            imm8 = (Int)getUChar(delta+alen);
17945            delta += alen+1;
17946            DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
17947         }
17948
17949         putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
17950         goto decode_success;
17951      }
17952      break;
17953
17954   case 0x44:
17955      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
17956       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a. multiplication of polynomials over GF(2))
17958       */
17959      if (have66noF2noF3(pfx) && sz == 2) {
17960
17961         Int imm8;
17962         IRTemp svec = newTemp(Ity_V128);
17963         IRTemp dvec = newTemp(Ity_V128);
17964         modrm       = getUChar(delta);
17965         UInt   rG   = gregOfRexRM(pfx, modrm);
17966
17967         assign( dvec, getXMMReg(rG) );
17968
17969         if ( epartIsReg( modrm ) ) {
17970            UInt rE = eregOfRexRM(pfx, modrm);
17971            imm8 = (Int)getUChar(delta+1);
17972            assign( svec, getXMMReg(rE) );
17973            delta += 1+1;
17974            DIP( "pclmulqdq $%d, %s,%s\n", imm8,
17975                 nameXMMReg(rE), nameXMMReg(rG) );
17976         } else {
17977            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
17978                             1/* imm8 is 1 byte after the amode */ );
17979            gen_SEGV_if_not_16_aligned( addr );
17980            assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
17981            imm8 = (Int)getUChar(delta+alen);
17982            delta += alen+1;
17983            DIP( "pclmulqdq $%d, %s,%s\n",
17984                 imm8, dis_buf, nameXMMReg(rG) );
17985         }
17986
17987         putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
17988         goto decode_success;
17989      }
17990      break;
17991
17992   case 0x60:
17993   case 0x61:
17994   case 0x62:
17995   case 0x63:
17996      /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
17997         66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
17998         66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
17999         66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
18000         (selected special cases that actually occur in glibc,
18001          not by any means a complete implementation.)
18002      */
18003      if (have66noF2noF3(pfx) && sz == 2) {
18004         Long delta0 = delta;
18005         delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
18006         if (delta > delta0) goto decode_success;
         /* else fall through; dis_PCMPxSTRx failed to decode it */
18008      }
18009      break;
18010
18011   case 0xDF:
18012      /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
18013      if (have66noF2noF3(pfx) && sz == 2) {
18014         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
18015         goto decode_success;
18016      }
18017      break;
18018
18019   default:
18020      break;
18021
18022   }
18023
18024  decode_failure:
18025   *decode_OK = False;
18026   return deltaIN;
18027
18028  decode_success:
18029   *decode_OK = True;
18030   return delta;
18031}
18032
18033
18034/*------------------------------------------------------------*/
18035/*---                                                      ---*/
18036/*--- Top-level post-escape decoders: dis_ESC_NONE         ---*/
18037/*---                                                      ---*/
18038/*------------------------------------------------------------*/
18039
18040__attribute__((noinline))
18041static
18042Long dis_ESC_NONE (
18043        /*MB_OUT*/DisResult* dres,
18044        /*MB_OUT*/Bool*      expect_CAS,
18045        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
18046        Bool         resteerCisOk,
18047        void*        callback_opaque,
18048        VexArchInfo* archinfo,
18049        VexAbiInfo*  vbi,
18050        Prefix pfx, Int sz, Long deltaIN
18051     )
18052{
18053   Long   d64   = 0;
18054   UChar  abyte = 0;
18055   IRTemp addr  = IRTemp_INVALID;
18056   IRTemp t1    = IRTemp_INVALID;
18057   IRTemp t2    = IRTemp_INVALID;
18058   IRTemp t3    = IRTemp_INVALID;
18059   IRTemp t4    = IRTemp_INVALID;
18060   IRTemp t5    = IRTemp_INVALID;
18061   IRType ty    = Ity_INVALID;
18062   UChar  modrm = 0;
18063   Int    am_sz = 0;
18064   Int    d_sz  = 0;
18065   Int    alen  = 0;
18066   HChar  dis_buf[50];
18067
18068   Long   delta = deltaIN;
18069   UChar  opc   = getUChar(delta);
18070   delta++;
18071   switch (opc) {
18072
18073   case 0x00: /* ADD Gb,Eb */
18074      if (haveF2orF3(pfx)) goto decode_failure;
18075      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
18076      return delta;
18077   case 0x01: /* ADD Gv,Ev */
18078      if (haveF2orF3(pfx)) goto decode_failure;
18079      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
18080      return delta;
18081
18082   case 0x02: /* ADD Eb,Gb */
18083      if (haveF2orF3(pfx)) goto decode_failure;
18084      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
18085      return delta;
18086   case 0x03: /* ADD Ev,Gv */
18087      if (haveF2orF3(pfx)) goto decode_failure;
18088      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
18089      return delta;
18090
18091   case 0x04: /* ADD Ib, AL */
18092      if (haveF2orF3(pfx)) goto decode_failure;
18093      delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
18094      return delta;
18095   case 0x05: /* ADD Iv, eAX */
18096      if (haveF2orF3(pfx)) goto decode_failure;
18097      delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
18098      return delta;
18099
18100   case 0x08: /* OR Gb,Eb */
18101      if (haveF2orF3(pfx)) goto decode_failure;
18102      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
18103      return delta;
18104   case 0x09: /* OR Gv,Ev */
18105      if (haveF2orF3(pfx)) goto decode_failure;
18106      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
18107      return delta;
18108
18109   case 0x0A: /* OR Eb,Gb */
18110      if (haveF2orF3(pfx)) goto decode_failure;
18111      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
18112      return delta;
18113   case 0x0B: /* OR Ev,Gv */
18114      if (haveF2orF3(pfx)) goto decode_failure;
18115      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
18116      return delta;
18117
18118   case 0x0C: /* OR Ib, AL */
18119      if (haveF2orF3(pfx)) goto decode_failure;
18120      delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
18121      return delta;
18122   case 0x0D: /* OR Iv, eAX */
18123      if (haveF2orF3(pfx)) goto decode_failure;
18124      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
18125      return delta;
18126
18127   case 0x10: /* ADC Gb,Eb */
18128      if (haveF2orF3(pfx)) goto decode_failure;
18129      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
18130      return delta;
18131   case 0x11: /* ADC Gv,Ev */
18132      if (haveF2orF3(pfx)) goto decode_failure;
18133      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
18134      return delta;
18135
18136   case 0x12: /* ADC Eb,Gb */
18137      if (haveF2orF3(pfx)) goto decode_failure;
18138      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
18139      return delta;
18140   case 0x13: /* ADC Ev,Gv */
18141      if (haveF2orF3(pfx)) goto decode_failure;
18142      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
18143      return delta;
18144
18145   case 0x14: /* ADC Ib, AL */
18146      if (haveF2orF3(pfx)) goto decode_failure;
18147      delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
18148      return delta;
18149   case 0x15: /* ADC Iv, eAX */
18150      if (haveF2orF3(pfx)) goto decode_failure;
18151      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
18152      return delta;
18153
18154   case 0x18: /* SBB Gb,Eb */
18155      if (haveF2orF3(pfx)) goto decode_failure;
18156      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
18157      return delta;
18158   case 0x19: /* SBB Gv,Ev */
18159      if (haveF2orF3(pfx)) goto decode_failure;
18160      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
18161      return delta;
18162
18163   case 0x1A: /* SBB Eb,Gb */
18164      if (haveF2orF3(pfx)) goto decode_failure;
18165      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
18166      return delta;
18167   case 0x1B: /* SBB Ev,Gv */
18168      if (haveF2orF3(pfx)) goto decode_failure;
18169      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
18170      return delta;
18171
18172   case 0x1C: /* SBB Ib, AL */
18173      if (haveF2orF3(pfx)) goto decode_failure;
18174      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
18175      return delta;
18176   case 0x1D: /* SBB Iv, eAX */
18177      if (haveF2orF3(pfx)) goto decode_failure;
18178      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
18179      return delta;
18180
18181   case 0x20: /* AND Gb,Eb */
18182      if (haveF2orF3(pfx)) goto decode_failure;
18183      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
18184      return delta;
18185   case 0x21: /* AND Gv,Ev */
18186      if (haveF2orF3(pfx)) goto decode_failure;
18187      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
18188      return delta;
18189
18190   case 0x22: /* AND Eb,Gb */
18191      if (haveF2orF3(pfx)) goto decode_failure;
18192      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
18193      return delta;
18194   case 0x23: /* AND Ev,Gv */
18195      if (haveF2orF3(pfx)) goto decode_failure;
18196      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
18197      return delta;
18198
18199   case 0x24: /* AND Ib, AL */
18200      if (haveF2orF3(pfx)) goto decode_failure;
18201      delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
18202      return delta;
18203   case 0x25: /* AND Iv, eAX */
18204      if (haveF2orF3(pfx)) goto decode_failure;
18205      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
18206      return delta;
18207
18208   case 0x28: /* SUB Gb,Eb */
18209      if (haveF2orF3(pfx)) goto decode_failure;
18210      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
18211      return delta;
18212   case 0x29: /* SUB Gv,Ev */
18213      if (haveF2orF3(pfx)) goto decode_failure;
18214      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
18215      return delta;
18216
18217   case 0x2A: /* SUB Eb,Gb */
18218      if (haveF2orF3(pfx)) goto decode_failure;
18219      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
18220      return delta;
18221   case 0x2B: /* SUB Ev,Gv */
18222      if (haveF2orF3(pfx)) goto decode_failure;
18223      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
18224      return delta;
18225
18226   case 0x2C: /* SUB Ib, AL */
18227      if (haveF2orF3(pfx)) goto decode_failure;
18228      delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
18229      return delta;
18230
18231   case 0x2D: /* SUB Iv, eAX */
18232      if (haveF2orF3(pfx)) goto decode_failure;
18233      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
18234      return delta;
18235
18236   case 0x30: /* XOR Gb,Eb */
18237      if (haveF2orF3(pfx)) goto decode_failure;
18238      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
18239      return delta;
18240   case 0x31: /* XOR Gv,Ev */
18241      if (haveF2orF3(pfx)) goto decode_failure;
18242      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
18243      return delta;
18244
18245   case 0x32: /* XOR Eb,Gb */
18246      if (haveF2orF3(pfx)) goto decode_failure;
18247      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
18248      return delta;
18249   case 0x33: /* XOR Ev,Gv */
18250      if (haveF2orF3(pfx)) goto decode_failure;
18251      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
18252      return delta;
18253
18254   case 0x34: /* XOR Ib, AL */
18255      if (haveF2orF3(pfx)) goto decode_failure;
18256      delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
18257      return delta;
18258   case 0x35: /* XOR Iv, eAX */
18259      if (haveF2orF3(pfx)) goto decode_failure;
18260      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
18261      return delta;
18262
18263   case 0x38: /* CMP Gb,Eb */
18264      if (haveF2orF3(pfx)) goto decode_failure;
18265      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
18266      return delta;
18267   case 0x39: /* CMP Gv,Ev */
18268      if (haveF2orF3(pfx)) goto decode_failure;
18269      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
18270      return delta;
18271
18272   case 0x3A: /* CMP Eb,Gb */
18273      if (haveF2orF3(pfx)) goto decode_failure;
18274      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
18275      return delta;
18276   case 0x3B: /* CMP Ev,Gv */
18277      if (haveF2orF3(pfx)) goto decode_failure;
18278      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
18279      return delta;
18280
18281   case 0x3C: /* CMP Ib, AL */
18282      if (haveF2orF3(pfx)) goto decode_failure;
18283      delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
18284      return delta;
18285   case 0x3D: /* CMP Iv, eAX */
18286      if (haveF2orF3(pfx)) goto decode_failure;
18287      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
18288      return delta;
18289
18290   case 0x50: /* PUSH eAX */
18291   case 0x51: /* PUSH eCX */
18292   case 0x52: /* PUSH eDX */
18293   case 0x53: /* PUSH eBX */
18294   case 0x55: /* PUSH eBP */
18295   case 0x56: /* PUSH eSI */
18296   case 0x57: /* PUSH eDI */
18297   case 0x54: /* PUSH eSP */
18298      /* This is the Right Way, in that the value to be pushed is
18299         established before %rsp is changed, so that pushq %rsp
18300         correctly pushes the old value. */
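      /* Worked example: with RSP == 0x1000 and sz == 8, pushq %rsp
         stores the old value 0x1000 at address 0xFF8 and leaves
         RSP == 0xFF8, rather than storing the already-decremented
         0xFF8. */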
18301      if (haveF2orF3(pfx)) goto decode_failure;
18302      vassert(sz == 2 || sz == 4 || sz == 8);
18303      if (sz == 4)
18304         sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
18305      ty = sz==2 ? Ity_I16 : Ity_I64;
18306      t1 = newTemp(ty);
18307      t2 = newTemp(Ity_I64);
18308      assign(t1, getIRegRexB(sz, pfx, opc-0x50));
18309      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
18310      putIReg64(R_RSP, mkexpr(t2) );
18311      storeLE(mkexpr(t2),mkexpr(t1));
18312      DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
18313      return delta;
18314
18315   case 0x58: /* POP eAX */
18316   case 0x59: /* POP eCX */
18317   case 0x5A: /* POP eDX */
18318   case 0x5B: /* POP eBX */
18319   case 0x5D: /* POP eBP */
18320   case 0x5E: /* POP eSI */
18321   case 0x5F: /* POP eDI */
18322   case 0x5C: /* POP eSP */
18323      if (haveF2orF3(pfx)) goto decode_failure;
18324      vassert(sz == 2 || sz == 4 || sz == 8);
18325      if (sz == 4)
18326         sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
18327      t1 = newTemp(szToITy(sz));
18328      t2 = newTemp(Ity_I64);
18329      assign(t2, getIReg64(R_RSP));
18330      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
18331      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
18332      putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
18333      DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
18334      return delta;
18335
   case 0x63: /* MOVSXD */
18337      if (haveF2orF3(pfx)) goto decode_failure;
18338      if (haveREX(pfx) && 1==getRexW(pfx)) {
18339         vassert(sz == 8);
18340         /* movsx r/m32 to r64 */
18341         modrm = getUChar(delta);
18342         if (epartIsReg(modrm)) {
18343            delta++;
18344            putIRegG(8, pfx, modrm,
18345                             unop(Iop_32Sto64,
18346                                  getIRegE(4, pfx, modrm)));
18347            DIP("movslq %s,%s\n",
18348                nameIRegE(4, pfx, modrm),
18349                nameIRegG(8, pfx, modrm));
18350            return delta;
18351         } else {
18352            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
18353            delta += alen;
18354            putIRegG(8, pfx, modrm,
18355                             unop(Iop_32Sto64,
18356                                  loadLE(Ity_I32, mkexpr(addr))));
18357            DIP("movslq %s,%s\n", dis_buf,
18358                nameIRegG(8, pfx, modrm));
18359            return delta;
18360         }
18361      } else {
18362         goto decode_failure;
18363      }
18364
18365   case 0x68: /* PUSH Iv */
18366      if (haveF2orF3(pfx)) goto decode_failure;
18367      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
18368      if (sz == 4) sz = 8;
18369      d64 = getSDisp(imin(4,sz),delta);
18370      delta += imin(4,sz);
18371      goto do_push_I;
18372
18373   case 0x69: /* IMUL Iv, Ev, Gv */
18374      if (haveF2orF3(pfx)) goto decode_failure;
18375      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
18376      return delta;
18377
18378   case 0x6A: /* PUSH Ib, sign-extended to sz */
18379      if (haveF2orF3(pfx)) goto decode_failure;
18380      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
18381      if (sz == 4) sz = 8;
18382      d64 = getSDisp8(delta); delta += 1;
18383      goto do_push_I;
18384   do_push_I:
18385      ty = szToITy(sz);
18386      t1 = newTemp(Ity_I64);
18387      t2 = newTemp(ty);
18388      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
18389      putIReg64(R_RSP, mkexpr(t1) );
      /* stop mkU16 asserting if d64 is a negative 16-bit number
18391         (bug #132813) */
18392      if (ty == Ity_I16)
18393         d64 &= 0xFFFF;
18394      storeLE( mkexpr(t1), mkU(ty,d64) );
18395      DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
18396      return delta;
18397
18398   case 0x6B: /* IMUL Ib, Ev, Gv */
18399      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
18400      return delta;
18401
   case 0x70:   /* JOb (jump overflow) */
   case 0x71:   /* JNOb (jump not overflow) */
18404   case 0x72:   /* JBb/JNAEb (jump below) */
18405   case 0x73:   /* JNBb/JAEb (jump not below) */
18406   case 0x74:   /* JZb/JEb (jump zero) */
18407   case 0x75:   /* JNZb/JNEb (jump not zero) */
18408   case 0x76:   /* JBEb/JNAb (jump below or equal) */
18409   case 0x77:   /* JNBEb/JAb (jump not below or equal) */
18410   case 0x78:   /* JSb (jump negative) */
   case 0x79:   /* JNSb (jump not negative) */
   case 0x7A:   /* JPb/JPEb (jump parity even) */
18413   case 0x7B:   /* JNP/JPO (jump parity odd) */
18414   case 0x7C:   /* JLb/JNGEb (jump less) */
18415   case 0x7D:   /* JGEb/JNLb (jump greater or equal) */
18416   case 0x7E:   /* JLEb/JNGb (jump less or equal) */
18417   case 0x7F: { /* JGb/JNLEb (jump greater) */
18418      Long   jmpDelta;
18419      HChar* comment  = "";
18420      if (haveF2orF3(pfx)) goto decode_failure;
18421      jmpDelta = getSDisp8(delta);
18422      vassert(-128 <= jmpDelta && jmpDelta < 128);
18423      d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
18424      delta++;
18425      if (resteerCisOk
18426          && vex_control.guest_chase_cond
18427          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
18428          && jmpDelta < 0
18429          && resteerOkFn( callback_opaque, d64) ) {
18430         /* Speculation: assume this backward branch is taken.  So we
18431            need to emit a side-exit to the insn following this one,
18432            on the negation of the condition, and continue at the
18433            branch target address (d64).  If we wind up back at the
18434            first instruction of the trace, just stop; it's better to
18435            let the IR loop unroller handle that case. */
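         /* Sketch of the result, e.g. for a backward `jz lbl':
               if (!Z) goto (address of the next insn)  -- side exit
               ... continue translating at lbl (d64) ...
            that is, the fall-through becomes the side exit and the
            branch target becomes the straight-line path. */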
18436         stmt( IRStmt_Exit(
18437                  mk_amd64g_calculate_condition(
18438                     (AMD64Condcode)(1 ^ (opc - 0x70))),
18439                  Ijk_Boring,
18440                  IRConst_U64(guest_RIP_bbstart+delta),
18441                  OFFB_RIP ) );
18442         dres->whatNext   = Dis_ResteerC;
18443         dres->continueAt = d64;
18444         comment = "(assumed taken)";
18445      }
18446      else
18447      if (resteerCisOk
18448          && vex_control.guest_chase_cond
18449          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
18450          && jmpDelta >= 0
18451          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
18452         /* Speculation: assume this forward branch is not taken.  So
18453            we need to emit a side-exit to d64 (the dest) and continue
18454            disassembling at the insn immediately following this
18455            one. */
18456         stmt( IRStmt_Exit(
18457                  mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
18458                  Ijk_Boring,
18459                  IRConst_U64(d64),
18460                  OFFB_RIP ) );
18461         dres->whatNext   = Dis_ResteerC;
18462         dres->continueAt = guest_RIP_bbstart+delta;
18463         comment = "(assumed not taken)";
18464      }
18465      else {
18466         /* Conservative default translation - end the block at this
18467            point. */
18468         jcc_01( dres, (AMD64Condcode)(opc - 0x70),
18469                 guest_RIP_bbstart+delta, d64 );
18470         vassert(dres->whatNext == Dis_StopHere);
18471      }
18472      DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
18473      return delta;
18474   }
18475
18476   case 0x80: /* Grp1 Ib,Eb */
18477      if (haveF2orF3(pfx)) goto decode_failure;
18478      modrm = getUChar(delta);
18479      am_sz = lengthAMode(pfx,delta);
18480      sz    = 1;
18481      d_sz  = 1;
18482      d64   = getSDisp8(delta + am_sz);
18483      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
18484      return delta;
18485
18486   case 0x81: /* Grp1 Iv,Ev */
18487      if (haveF2orF3(pfx)) goto decode_failure;
18488      modrm = getUChar(delta);
18489      am_sz = lengthAMode(pfx,delta);
18490      d_sz  = imin(sz,4);
18491      d64   = getSDisp(d_sz, delta + am_sz);
18492      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
18493      return delta;
18494
18495   case 0x83: /* Grp1 Ib,Ev */
18496      if (haveF2orF3(pfx)) goto decode_failure;
18497      modrm = getUChar(delta);
18498      am_sz = lengthAMode(pfx,delta);
18499      d_sz  = 1;
18500      d64   = getSDisp8(delta + am_sz);
18501      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
18502      return delta;
18503
18504   case 0x84: /* TEST Eb,Gb */
18505      if (haveF2orF3(pfx)) goto decode_failure;
18506      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
18507      return delta;
18508
18509   case 0x85: /* TEST Ev,Gv */
18510      if (haveF2orF3(pfx)) goto decode_failure;
18511      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
18512      return delta;
18513
   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
      prefix.  Therefore the memory-operand case below is expressed
      as a compare-and-swap (casLE), making the access atomic whether
      or not an explicit LOCK prefix is also present, and *expect_CAS
      is set to True so the generic LOCK logic at the top of disInstr
      knows a CAS is intended here. */
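   /* As a sketch (not the exact helper output), the memory case
      below boils down to:
         t1 = LDle(addr)           -- old memory value
         CASle(addr :: t1 -> t2)   -- restart the insn if memory
                                   -- changed between load and swap
         putIRegG(t1)              -- reg receives the old value
      so the swap appears atomic to other threads. */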
18520   case 0x86: /* XCHG Gb,Eb */
18521      sz = 1;
18522      /* Fall through ... */
18523   case 0x87: /* XCHG Gv,Ev */
18524      if (haveF2orF3(pfx)) goto decode_failure;
18525      modrm = getUChar(delta);
18526      ty = szToITy(sz);
18527      t1 = newTemp(ty); t2 = newTemp(ty);
18528      if (epartIsReg(modrm)) {
18529         assign(t1, getIRegE(sz, pfx, modrm));
18530         assign(t2, getIRegG(sz, pfx, modrm));
18531         putIRegG(sz, pfx, modrm, mkexpr(t1));
18532         putIRegE(sz, pfx, modrm, mkexpr(t2));
18533         delta++;
18534         DIP("xchg%c %s, %s\n",
18535             nameISize(sz), nameIRegG(sz, pfx, modrm),
18536                            nameIRegE(sz, pfx, modrm));
18537      } else {
18538         *expect_CAS = True;
18539         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
18540         assign( t1, loadLE(ty, mkexpr(addr)) );
18541         assign( t2, getIRegG(sz, pfx, modrm) );
18542         casLE( mkexpr(addr),
18543                mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
18544         putIRegG( sz, pfx, modrm, mkexpr(t1) );
18545         delta += alen;
18546         DIP("xchg%c %s, %s\n", nameISize(sz),
18547                                nameIRegG(sz, pfx, modrm), dis_buf);
18548      }
18549      return delta;
18550
18551   case 0x88: /* MOV Gb,Eb */
18552      if (haveF2orF3(pfx)) goto decode_failure;
18553      delta = dis_mov_G_E(vbi, pfx, 1, delta);
18554      return delta;
18555
18556   case 0x89: /* MOV Gv,Ev */
18557      if (haveF2orF3(pfx)) goto decode_failure;
18558      delta = dis_mov_G_E(vbi, pfx, sz, delta);
18559      return delta;
18560
18561   case 0x8A: /* MOV Eb,Gb */
18562      if (haveF2orF3(pfx)) goto decode_failure;
18563      delta = dis_mov_E_G(vbi, pfx, 1, delta);
18564      return delta;
18565
18566   case 0x8B: /* MOV Ev,Gv */
18567      if (haveF2orF3(pfx)) goto decode_failure;
18568      delta = dis_mov_E_G(vbi, pfx, sz, delta);
18569      return delta;
18570
18571   case 0x8D: /* LEA M,Gv */
18572      if (haveF2orF3(pfx)) goto decode_failure;
18573      if (sz != 4 && sz != 8)
18574         goto decode_failure;
18575      modrm = getUChar(delta);
18576      if (epartIsReg(modrm))
18577         goto decode_failure;
18578      /* NOTE!  this is the one place where a segment override prefix
18579         has no effect on the address calculation.  Therefore we clear
18580         any segment override bits in pfx. */
18581      addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
18582      delta += alen;
      /* This is a hack.  But it isn't clear that doing the
         calculation at 32 bits is really worth it.  Hence for leal,
         do the full 64-bit calculation and then truncate it. */
18586      putIRegG( sz, pfx, modrm,
18587                         sz == 4
18588                            ? unop(Iop_64to32, mkexpr(addr))
18589                            : mkexpr(addr)
18590              );
18591      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
18592                            nameIRegG(sz,pfx,modrm));
18593      return delta;
18594
18595   case 0x8F: { /* POPQ m64 / POPW m16 */
18596      Int   len;
18597      UChar rm;
18598      /* There is no encoding for 32-bit pop in 64-bit mode.
18599         So sz==4 actually means sz==8. */
18600      if (haveF2orF3(pfx)) goto decode_failure;
18601      vassert(sz == 2 || sz == 4
18602              || /* tolerate redundant REX.W, see #210481 */ sz == 8);
18603      if (sz == 4) sz = 8;
18604      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
18605
18606      rm = getUChar(delta);
18607
      /* make sure this instruction is a valid POP */
18609      if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
18610         goto decode_failure;
18611      /* and has correct size */
18612      vassert(sz == 8);
18613
18614      t1 = newTemp(Ity_I64);
18615      t3 = newTemp(Ity_I64);
18616      assign( t1, getIReg64(R_RSP) );
18617      assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
18618
18619      /* Increase RSP; must be done before the STORE.  Intel manual
18620         says: If the RSP register is used as a base register for
18621         addressing a destination operand in memory, the POP
18622         instruction computes the effective address of the operand
18623         after it increments the RSP register.  */
18624      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
18625
18626      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
18627      storeLE( mkexpr(addr), mkexpr(t3) );
18628
18629      DIP("popl %s\n", dis_buf);
18630
18631      delta += len;
18632      return delta;
18633   }
18634
18635   case 0x90: /* XCHG eAX,eAX */
18636      /* detect and handle F3 90 (rep nop) specially */
18637      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
18638         DIP("rep nop (P4 pause)\n");
18639         /* "observe" the hint.  The Vex client needs to be careful not
18640            to cause very long delays as a result, though. */
18641         jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
18642         vassert(dres->whatNext == Dis_StopHere);
18643         return delta;
18644      }
18645      /* detect and handle NOPs specially */
18646      if (/* F2/F3 probably change meaning completely */
18647          !haveF2orF3(pfx)
18648          /* If REX.B is 1, we're not exchanging rAX with itself */
18649          && getRexB(pfx)==0 ) {
18650         DIP("nop\n");
18651         return delta;
18652      }
18653      /* else fall through to normal case. */
18654   case 0x91: /* XCHG rAX,rCX */
18655   case 0x92: /* XCHG rAX,rDX */
18656   case 0x93: /* XCHG rAX,rBX */
18657   case 0x94: /* XCHG rAX,rSP */
18658   case 0x95: /* XCHG rAX,rBP */
18659   case 0x96: /* XCHG rAX,rSI */
18660   case 0x97: /* XCHG rAX,rDI */
18661      /* guard against mutancy */
18662      if (haveF2orF3(pfx)) goto decode_failure;
18663      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
18664      return delta;
18665
18666   case 0x98: /* CBW */
18667      if (haveF2orF3(pfx)) goto decode_failure;
18668      if (sz == 8) {
18669         putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
18670         DIP(/*"cdqe\n"*/"cltq");
18671         return delta;
18672      }
18673      if (sz == 4) {
18674         putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
18675         DIP("cwtl\n");
18676         return delta;
18677      }
18678      if (sz == 2) {
18679         putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
18680         DIP("cbw\n");
18681         return delta;
18682      }
18683      goto decode_failure;
18684
18685   case 0x99: /* CWD/CDQ/CQO */
18686      if (haveF2orF3(pfx)) goto decode_failure;
18687      vassert(sz == 2 || sz == 4 || sz == 8);
18688      ty = szToITy(sz);
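      /* Sign-extend rAX into rDX: an arithmetic right shift of rAX
         by (width-1) replicates the sign bit into every bit, so
         e.g. cqo sets RDX to 0 when RAX >= 0 and to
         0xFFFFFFFFFFFFFFFF otherwise. */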
18689      putIRegRDX( sz,
18690                  binop(mkSizedOp(ty,Iop_Sar8),
18691                        getIRegRAX(sz),
18692                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
18693      DIP(sz == 2 ? "cwd\n"
18694                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
18695                             : "cqo\n"));
18696      return delta;
18697
18698   case 0x9B: /* FWAIT (X87 insn) */
18699      /* ignore? */
18700      DIP("fwait\n");
18701      return delta;
18702
18703   case 0x9C: /* PUSHF */ {
18704      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
18705         mode.  So sz==4 actually means sz==8. */
18706      /* 24 July 06: has also been seen with a redundant REX prefix,
18707         so must also allow sz==8. */
18708      if (haveF2orF3(pfx)) goto decode_failure;
18709      vassert(sz == 2 || sz == 4 || sz == 8);
18710      if (sz == 4) sz = 8;
18711      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
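      /* Reminder of the rflags layout relied on below: the OSZACP
         bits come from the flags thunk; DF is bit 10, AC bit 18 and
         ID bit 21 of the architected flags word, each held in its
         own guest-state slot and patched in individually. */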
18712
18713      t1 = newTemp(Ity_I64);
18714      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
18715      putIReg64(R_RSP, mkexpr(t1) );
18716
18717      t2 = newTemp(Ity_I64);
18718      assign( t2, mk_amd64g_calculate_rflags_all() );
18719
18720      /* Patch in the D flag.  This can simply be a copy of bit 10 of
18721         baseBlock[OFFB_DFLAG]. */
18722      t3 = newTemp(Ity_I64);
18723      assign( t3, binop(Iop_Or64,
18724                        mkexpr(t2),
18725                        binop(Iop_And64,
18726                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
18727                              mkU64(1<<10)))
18728            );
18729
18730      /* And patch in the ID flag. */
18731      t4 = newTemp(Ity_I64);
18732      assign( t4, binop(Iop_Or64,
18733                        mkexpr(t3),
18734                        binop(Iop_And64,
18735                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
18736                                               mkU8(21)),
18737                              mkU64(1<<21)))
18738            );
18739
18740      /* And patch in the AC flag too. */
18741      t5 = newTemp(Ity_I64);
18742      assign( t5, binop(Iop_Or64,
18743                        mkexpr(t4),
18744                        binop(Iop_And64,
18745                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
18746                                               mkU8(18)),
18747                              mkU64(1<<18)))
18748            );
18749
18750      /* if sz==2, the stored value needs to be narrowed. */
18751      if (sz == 2)
18752        storeLE( mkexpr(t1), unop(Iop_32to16,
18753                             unop(Iop_64to32,mkexpr(t5))) );
18754      else
18755        storeLE( mkexpr(t1), mkexpr(t5) );
18756
18757      DIP("pushf%c\n", nameISize(sz));
18758      return delta;
18759   }
18760
18761   case 0x9D: /* POPF */
18762      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
18763         So sz==4 actually means sz==8. */
18764      if (haveF2orF3(pfx)) goto decode_failure;
18765      vassert(sz == 2 || sz == 4);
18766      if (sz == 4) sz = 8;
18767      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
18768      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
18769      assign(t2, getIReg64(R_RSP));
18770      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
18771      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
18772      /* t1 is the flag word.  Mask out everything except OSZACP and
18773         set the flags thunk to AMD64G_CC_OP_COPY. */
18774      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
18775      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
18776      stmt( IRStmt_Put( OFFB_CC_DEP1,
18777                        binop(Iop_And64,
18778                              mkexpr(t1),
18779                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
18780                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
18781                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
18782                             )
18783                       )
18784          );
18785
18786      /* Also need to set the D flag, which is held in bit 10 of t1.
18787         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
18788      stmt( IRStmt_Put(
18789               OFFB_DFLAG,
18790               IRExpr_Mux0X(
18791                  unop(Iop_32to8,
18792                  unop(Iop_64to32,
18793                       binop(Iop_And64,
18794                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
18795                             mkU64(1)))),
18796                  mkU64(1),
18797                  mkU64(0xFFFFFFFFFFFFFFFFULL)))
18798          );
18799
18800      /* And set the ID flag */
18801      stmt( IRStmt_Put(
18802               OFFB_IDFLAG,
18803               IRExpr_Mux0X(
18804                  unop(Iop_32to8,
18805                  unop(Iop_64to32,
18806                       binop(Iop_And64,
18807                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
18808                             mkU64(1)))),
18809                  mkU64(0),
18810                  mkU64(1)))
18811          );
18812
18813      /* And set the AC flag too */
18814      stmt( IRStmt_Put(
18815               OFFB_ACFLAG,
18816               IRExpr_Mux0X(
18817                  unop(Iop_32to8,
18818                  unop(Iop_64to32,
18819                       binop(Iop_And64,
18820                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
18821                             mkU64(1)))),
18822                  mkU64(0),
18823                  mkU64(1)))
18824          );
18825
18826      DIP("popf%c\n", nameISize(sz));
18827      return delta;
18828
18829   case 0x9E: /* SAHF */
18830      codegen_SAHF();
18831      DIP("sahf\n");
18832      return delta;
18833
18834   case 0x9F: /* LAHF */
18835      codegen_LAHF();
18836      DIP("lahf\n");
18837      return delta;
18838
18839   case 0xA0: /* MOV Ob,AL */
18840      if (have66orF2orF3(pfx)) goto decode_failure;
18841      sz = 1;
18842      /* Fall through ... */
18843   case 0xA1: /* MOV Ov,eAX */
18844      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
18845         goto decode_failure;
18846      d64 = getDisp64(delta);
18847      delta += 8;
18848      ty = szToITy(sz);
18849      addr = newTemp(Ity_I64);
18850      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
18851      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
18852      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
18853                                  segRegTxt(pfx), d64,
18854                                  nameIRegRAX(sz));
18855      return delta;
18856
18857   case 0xA2: /* MOV AL,Ob */
18858      if (have66orF2orF3(pfx)) goto decode_failure;
18859      sz = 1;
18860      /* Fall through ... */
18861   case 0xA3: /* MOV eAX,Ov */
18862      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
18863         goto decode_failure;
18864      d64 = getDisp64(delta);
18865      delta += 8;
18866      ty = szToITy(sz);
18867      addr = newTemp(Ity_I64);
18868      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
18869      storeLE( mkexpr(addr), getIRegRAX(sz) );
18870      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
18871                                  segRegTxt(pfx), d64);
18872      return delta;
18873
18874   case 0xA4:
18875   case 0xA5:
      /* F3 A4/A5: rep movsb/rep movs{w,l,q} */
18877      if (haveF3(pfx) && !haveF2(pfx)) {
18878         if (opc == 0xA4)
18879            sz = 1;
18880         dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
18881                      guest_RIP_curr_instr,
18882                      guest_RIP_bbstart+delta, "rep movs", pfx );
         dres->whatNext = Dis_StopHere;
         return delta;
18885      }
      /* A4/A5: movsb/movs{w,l,q} */
18887      if (!haveF3(pfx) && !haveF2(pfx)) {
18888         if (opc == 0xA4)
18889            sz = 1;
18890         dis_string_op( dis_MOVS, sz, "movs", pfx );
18891         return delta;
18892      }
18893      goto decode_failure;
18894
18895   case 0xA6:
18896   case 0xA7:
18897      /* F3 A6/A7: repe cmps/rep cmps{w,l,q} */
18898      if (haveF3(pfx) && !haveF2(pfx)) {
18899         if (opc == 0xA6)
18900            sz = 1;
18901         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
18902                      guest_RIP_curr_instr,
18903                      guest_RIP_bbstart+delta, "repe cmps", pfx );
18904         dres->whatNext = Dis_StopHere;
18905         return delta;
18906      }
18907      goto decode_failure;
18908
18909   case 0xAA:
18910   case 0xAB:
18911      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
18912      if (haveF3(pfx) && !haveF2(pfx)) {
18913         if (opc == 0xAA)
18914            sz = 1;
18915         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
18916                      guest_RIP_curr_instr,
18917                      guest_RIP_bbstart+delta, "rep stos", pfx );
18918         vassert(dres->whatNext == Dis_StopHere);
18919         return delta;
18920      }
18921      /* AA/AB: stosb/stos{w,l,q} */
18922      if (!haveF3(pfx) && !haveF2(pfx)) {
18923         if (opc == 0xAA)
18924            sz = 1;
18925         dis_string_op( dis_STOS, sz, "stos", pfx );
18926         return delta;
18927      }
18928      goto decode_failure;
18929
18930   case 0xA8: /* TEST Ib, AL */
18931      if (haveF2orF3(pfx)) goto decode_failure;
18932      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
18933      return delta;
18934   case 0xA9: /* TEST Iv, eAX */
18935      if (haveF2orF3(pfx)) goto decode_failure;
18936      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
18937      return delta;
18938
18939   case 0xAC: /* LODS, no REP prefix */
18940   case 0xAD:
18941      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
18942      return delta;
18943
18944   case 0xAE:
18945   case 0xAF:
18946      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
18947      if (haveF2(pfx) && !haveF3(pfx)) {
18948         if (opc == 0xAE)
18949            sz = 1;
18950         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
18951                      guest_RIP_curr_instr,
18952                      guest_RIP_bbstart+delta, "repne scas", pfx );
18953         vassert(dres->whatNext == Dis_StopHere);
18954         return delta;
18955      }
18956      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
18957      if (!haveF2(pfx) && haveF3(pfx)) {
18958         if (opc == 0xAE)
18959            sz = 1;
18960         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
18961                      guest_RIP_curr_instr,
18962                      guest_RIP_bbstart+delta, "repe scas", pfx );
18963         vassert(dres->whatNext == Dis_StopHere);
18964         return delta;
18965      }
18966      /* AE/AF: scasb/scas{w,l,q} */
18967      if (!haveF2(pfx) && !haveF3(pfx)) {
18968         if (opc == 0xAE)
18969            sz = 1;
18970         dis_string_op( dis_SCAS, sz, "scas", pfx );
18971         return delta;
18972      }
18973      goto decode_failure;
18974
18975   /* XXXX be careful here with moves to AH/BH/CH/DH */
18976   case 0xB0: /* MOV imm,AL */
18977   case 0xB1: /* MOV imm,CL */
18978   case 0xB2: /* MOV imm,DL */
18979   case 0xB3: /* MOV imm,BL */
18980   case 0xB4: /* MOV imm,AH */
18981   case 0xB5: /* MOV imm,CH */
18982   case 0xB6: /* MOV imm,DH */
18983   case 0xB7: /* MOV imm,BH */
18984      if (haveF2orF3(pfx)) goto decode_failure;
18985      d64 = getUChar(delta);
18986      delta += 1;
18987      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
18988      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
18989      return delta;
18990
18991   case 0xB8: /* MOV imm,eAX */
18992   case 0xB9: /* MOV imm,eCX */
18993   case 0xBA: /* MOV imm,eDX */
18994   case 0xBB: /* MOV imm,eBX */
18995   case 0xBC: /* MOV imm,eSP */
18996   case 0xBD: /* MOV imm,eBP */
18997   case 0xBE: /* MOV imm,eSI */
18998   case 0xBF: /* MOV imm,eDI */
18999      /* This is the one-and-only place where 64-bit literals are
19000         allowed in the instruction stream. */
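      /* For example, 48 B8 <imm64> is movabsq $imm64,%rax, and a
         REX.B prefix extends the destination to r8..r15. */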
19001      if (haveF2orF3(pfx)) goto decode_failure;
19002      if (sz == 8) {
19003         d64 = getDisp64(delta);
19004         delta += 8;
19005         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
19006         DIP("movabsq $%lld,%s\n", (Long)d64,
19007                                   nameIRegRexB(8,pfx,opc-0xB8));
19008      } else {
19009         d64 = getSDisp(imin(4,sz),delta);
19010         delta += imin(4,sz);
19011         putIRegRexB(sz, pfx, opc-0xB8,
19012                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
19013         DIP("mov%c $%lld,%s\n", nameISize(sz),
19014                                 (Long)d64,
19015                                 nameIRegRexB(sz,pfx,opc-0xB8));
19016      }
19017      return delta;
19018
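   /* Grp2 covers the rotate/shift family (ROL, ROR, RCL, RCR,
      SHL/SAL, SHR, SAR), with the particular operation selected by
      the reg field of the ModRM byte; dis_Grp2 does the per-op
      dispatch. */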
19019   case 0xC0: { /* Grp2 Ib,Eb */
19020      Bool decode_OK = True;
19021      if (haveF2orF3(pfx)) goto decode_failure;
19022      modrm = getUChar(delta);
19023      am_sz = lengthAMode(pfx,delta);
19024      d_sz  = 1;
19025      d64   = getUChar(delta + am_sz);
19026      sz    = 1;
19027      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
19028                         mkU8(d64 & 0xFF), NULL, &decode_OK );
19029      if (!decode_OK) goto decode_failure;
19030      return delta;
19031   }
19032
19033   case 0xC1: { /* Grp2 Ib,Ev */
19034      Bool decode_OK = True;
19035      if (haveF2orF3(pfx)) goto decode_failure;
19036      modrm = getUChar(delta);
19037      am_sz = lengthAMode(pfx,delta);
19038      d_sz  = 1;
19039      d64   = getUChar(delta + am_sz);
19040      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
19041                         mkU8(d64 & 0xFF), NULL, &decode_OK );
19042      if (!decode_OK) goto decode_failure;
19043      return delta;
19044   }
19045
19046   case 0xC2: /* RET imm16 */
19047      if (have66orF2orF3(pfx)) goto decode_failure;
19048      d64 = getUDisp16(delta);
19049      delta += 2;
19050      dis_ret(dres, vbi, d64);
19051      DIP("ret $%lld\n", d64);
19052      return delta;
19053
19054   case 0xC3: /* RET */
19055      if (have66orF2(pfx)) goto decode_failure;
19056      /* F3 is acceptable on AMD. */
19057      dis_ret(dres, vbi, 0);
19058      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
19059      return delta;
19060
19061   case 0xC6: /* MOV Ib,Eb */
19062      sz = 1;
19063      goto do_Mov_I_E;
19064   case 0xC7: /* MOV Iv,Ev */
19065      goto do_Mov_I_E;
19066   do_Mov_I_E:
19067      if (haveF2orF3(pfx)) goto decode_failure;
19068      modrm = getUChar(delta);
19069      if (epartIsReg(modrm)) {
19070         delta++; /* mod/rm byte */
19071         d64 = getSDisp(imin(4,sz),delta);
19072         delta += imin(4,sz);
19073         putIRegE(sz, pfx, modrm,
19074                      mkU(szToITy(sz), d64 & mkSizeMask(sz)));
19075         DIP("mov%c $%lld, %s\n", nameISize(sz),
19076                                  (Long)d64,
19077                                  nameIRegE(sz,pfx,modrm));
19078      } else {
19079         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
19080                           /*xtra*/imin(4,sz) );
19081         delta += alen;
19082         d64 = getSDisp(imin(4,sz),delta);
19083         delta += imin(4,sz);
19084         storeLE(mkexpr(addr),
19085                 mkU(szToITy(sz), d64 & mkSizeMask(sz)));
19086         DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
19087      }
19088      return delta;
19089
19090   case 0xC8: /* ENTER */
19091      /* Same comments re operand size as for LEAVE below apply.
19092         Also, only handles the case "enter $imm16, $0"; other cases
19093         for the second operand (nesting depth) are not handled. */
19094      if (sz != 4)
19095         goto decode_failure;
19096      d64 = getUDisp16(delta);
19097      delta += 2;
19098      vassert(d64 >= 0 && d64 <= 0xFFFF);
19099      if (getUChar(delta) != 0)
19100         goto decode_failure;
19101      delta++;
19102      /* Intel docs seem to suggest:
19103           push rbp
19104           temp = rsp
19105           rbp = temp
19106           rsp = rsp - imm16
19107      */
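      /* So, for example, "enter $0x10, $0" comes out as: RSP -= 8;
         [RSP] = old RBP; RBP = RSP; RSP -= 0x10. */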
19108      t1 = newTemp(Ity_I64);
19109      assign(t1, getIReg64(R_RBP));
19110      t2 = newTemp(Ity_I64);
19111      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
19112      putIReg64(R_RSP, mkexpr(t2));
19113      storeLE(mkexpr(t2), mkexpr(t1));
19114      putIReg64(R_RBP, mkexpr(t2));
19115      if (d64 > 0) {
19116         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
19117      }
19118      DIP("enter $%u, $0\n", (UInt)d64);
19119      return delta;
19120
19121   case 0xC9: /* LEAVE */
19122      /* In 64-bit mode this defaults to a 64-bit operand size.  There
19123         is no way to encode a 32-bit variant.  Hence sz==4 but we do
19124         it as if sz=8. */
19125      if (sz != 4)
19126         goto decode_failure;
19127      t1 = newTemp(Ity_I64);
19128      t2 = newTemp(Ity_I64);
19129      assign(t1, getIReg64(R_RBP));
      /* The first PUT of RSP looks redundant, but we need it because
         RSP must always be up-to-date for Memcheck to work... */
19132      putIReg64(R_RSP, mkexpr(t1));
19133      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
19134      putIReg64(R_RBP, mkexpr(t2));
19135      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
19136      DIP("leave\n");
19137      return delta;
19138
19139   case 0xCC: /* INT 3 */
19140      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
19141      vassert(dres->whatNext == Dis_StopHere);
19142      DIP("int $0x3\n");
19143      return delta;
19144
19145   case 0xD0: { /* Grp2 1,Eb */
19146      Bool decode_OK = True;
19147      if (haveF2orF3(pfx)) goto decode_failure;
19148      modrm = getUChar(delta);
19149      am_sz = lengthAMode(pfx,delta);
19150      d_sz  = 0;
19151      d64   = 1;
19152      sz    = 1;
19153      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
19154                         mkU8(d64), NULL, &decode_OK );
19155      if (!decode_OK) goto decode_failure;
19156      return delta;
19157   }
19158
19159   case 0xD1: { /* Grp2 1,Ev */
19160      Bool decode_OK = True;
19161      if (haveF2orF3(pfx)) goto decode_failure;
19162      modrm = getUChar(delta);
19163      am_sz = lengthAMode(pfx,delta);
19164      d_sz  = 0;
19165      d64   = 1;
19166      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
19167                         mkU8(d64), NULL, &decode_OK );
19168      if (!decode_OK) goto decode_failure;
19169      return delta;
19170   }
19171
19172   case 0xD2: { /* Grp2 CL,Eb */
19173      Bool decode_OK = True;
19174      if (haveF2orF3(pfx)) goto decode_failure;
19175      modrm = getUChar(delta);
19176      am_sz = lengthAMode(pfx,delta);
19177      d_sz  = 0;
19178      sz    = 1;
19179      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
19180                         getIRegCL(), "%cl", &decode_OK );
19181      if (!decode_OK) goto decode_failure;
19182      return delta;
19183   }
19184
19185   case 0xD3: { /* Grp2 CL,Ev */
19186      Bool decode_OK = True;
19187      if (haveF2orF3(pfx)) goto decode_failure;
19188      modrm = getUChar(delta);
19189      am_sz = lengthAMode(pfx,delta);
19190      d_sz  = 0;
19191      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
19192                         getIRegCL(), "%cl", &decode_OK );
19193      if (!decode_OK) goto decode_failure;
19194      return delta;
19195   }
19196
19197   case 0xD8: /* X87 instructions */
19198   case 0xD9:
19199   case 0xDA:
19200   case 0xDB:
19201   case 0xDC:
19202   case 0xDD:
19203   case 0xDE:
19204   case 0xDF: {
19205      Bool redundantREXWok = False;
19206
19207      if (haveF2orF3(pfx))
19208         goto decode_failure;
19209
19210      /* kludge to tolerate redundant rex.w prefixes (should do this
19211         properly one day) */
19212      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
19213      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
19214         redundantREXWok = True;
19215
19216      Bool size_OK = False;
19217      if ( sz == 4 )
19218         size_OK = True;
19219      else if ( sz == 8 )
19220         size_OK = redundantREXWok;
19221      else if ( sz == 2 ) {
         Int mod_rm = getUChar(delta+0);
         Int reg = gregLO3ofRM(mod_rm);
         /* The HotSpot JVM uses these */
         if ( (opc == 0xDD) && (reg == 0 /* FLDL   */ ||
                                reg == 4 /* FRSTOR */ ||
                                reg == 6 /* FNSAVE */ ) )
19228            size_OK = True;
19229      }
19230      /* AMD manual says 0x66 size override is ignored, except where
19231         it is meaningful */
19232      if (!size_OK)
19233         goto decode_failure;
19234
19235      Bool decode_OK = False;
19236      delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
19237      if (!decode_OK)
19238         goto decode_failure;
19239
19240      return delta;
19241   }
19242
19243   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
19244   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
19245   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say the count register is rCX, where the choice of
         RCX vs ECX is governed by the address-size override, not the
         operand-size one. */
19248      IRExpr* zbit  = NULL;
19249      IRExpr* count = NULL;
19250      IRExpr* cond  = NULL;
19251      HChar*  xtra  = NULL;
19252
19253      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
19254      /* So at this point we've rejected any variants which appear to
19255         be governed by the usual operand-size modifiers.  Hence only
19256         the address size prefix can have an effect.  It changes the
19257         size from 64 (default) to 32. */
19258      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
19259      delta++;
19260      if (haveASO(pfx)) {
19261         /* 64to32 of 64-bit get is merely a get-put improvement
19262            trick. */
19263         putIReg32(R_RCX, binop(Iop_Sub32,
19264                                unop(Iop_64to32, getIReg64(R_RCX)),
19265                                mkU32(1)));
19266      } else {
19267         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
19268      }
19269
19270      /* This is correct, both for 32- and 64-bit versions.  If we're
19271         doing a 32-bit dec and the result is zero then the default
19272         zero extension rule will cause the upper 32 bits to be zero
19273         too.  Hence a 64-bit check against zero is OK. */
19274      count = getIReg64(R_RCX);
19275      cond = binop(Iop_CmpNE64, count, mkU64(0));
19276      switch (opc) {
19277         case 0xE2:
19278            xtra = "";
19279            break;
19280         case 0xE1:
19281            xtra = "e";
19282            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
19283            cond = mkAnd1(cond, zbit);
19284            break;
19285         case 0xE0:
19286            xtra = "ne";
19287            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
19288            cond = mkAnd1(cond, zbit);
19289            break;
19290         default:
            vassert(0);
19292      }
19293      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );
19294
19295      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
19296      return delta;
19297    }
19298
19299   case 0xE3:
      /* JRCXZ or JECXZ, depending on the address-size override. */
19301      if (have66orF2orF3(pfx)) goto decode_failure;
19302      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
19303      delta++;
19304      if (haveASO(pfx)) {
19305         /* 32-bit */
19306         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
19307                                  unop(Iop_32Uto64, getIReg32(R_RCX)),
19308                                  mkU64(0)),
19309                            Ijk_Boring,
19310                            IRConst_U64(d64),
19311                            OFFB_RIP
19312             ));
19313         DIP("jecxz 0x%llx\n", d64);
19314      } else {
19315         /* 64-bit */
19316         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
19317                                  getIReg64(R_RCX),
19318                                  mkU64(0)),
19319                            Ijk_Boring,
19320                            IRConst_U64(d64),
19321                            OFFB_RIP
19322               ));
19323         DIP("jrcxz 0x%llx\n", d64);
19324      }
19325      return delta;
19326
19327   case 0xE4: /* IN imm8, AL */
19328      sz = 1;
19329      t1 = newTemp(Ity_I64);
19330      abyte = getUChar(delta); delta++;
19331      assign(t1, mkU64( abyte & 0xFF ));
19332      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
19333      goto do_IN;
19334   case 0xE5: /* IN imm8, eAX */
19335      if (!(sz == 2 || sz == 4)) goto decode_failure;
19336      t1 = newTemp(Ity_I64);
19337      abyte = getUChar(delta); delta++;
19338      assign(t1, mkU64( abyte & 0xFF ));
19339      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
19340      goto do_IN;
19341   case 0xEC: /* IN %DX, AL */
19342      sz = 1;
19343      t1 = newTemp(Ity_I64);
19344      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
19345      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
19346                                         nameIRegRAX(sz));
19347      goto do_IN;
19348   case 0xED: /* IN %DX, eAX */
19349      if (!(sz == 2 || sz == 4)) goto decode_failure;
19350      t1 = newTemp(Ity_I64);
19351      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
19352      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
19353                                         nameIRegRAX(sz));
19354      goto do_IN;
19355   do_IN: {
19356      /* At this point, sz indicates the width, and t1 is a 64-bit
19357         value giving port number. */
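      /* Port I/O has effects outside the guest state and so can't be
         expressed as pure IR; hence it is routed through a dirty
         helper which performs the actual IN. */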
19358      IRDirty* d;
19359      if (haveF2orF3(pfx)) goto decode_failure;
19360      vassert(sz == 1 || sz == 2 || sz == 4);
19361      ty = szToITy(sz);
19362      t2 = newTemp(Ity_I64);
19363      d = unsafeIRDirty_1_N(
19364             t2,
19365             0/*regparms*/,
19366             "amd64g_dirtyhelper_IN",
19367             &amd64g_dirtyhelper_IN,
19368             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
19369          );
19370      /* do the call, dumping the result in t2. */
19371      stmt( IRStmt_Dirty(d) );
19372      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
19373      return delta;
19374   }
19375
19376   case 0xE6: /* OUT AL, imm8 */
19377      sz = 1;
19378      t1 = newTemp(Ity_I64);
19379      abyte = getUChar(delta); delta++;
19380      assign( t1, mkU64( abyte & 0xFF ) );
19381      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
19382      goto do_OUT;
19383   case 0xE7: /* OUT eAX, imm8 */
19384      if (!(sz == 2 || sz == 4)) goto decode_failure;
19385      t1 = newTemp(Ity_I64);
19386      abyte = getUChar(delta); delta++;
19387      assign( t1, mkU64( abyte & 0xFF ) );
19388      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
19389      goto do_OUT;
19390   case 0xEE: /* OUT AL, %DX */
19391      sz = 1;
19392      t1 = newTemp(Ity_I64);
19393      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
19394      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
19395                                          nameIRegRDX(2));
19396      goto do_OUT;
19397   case 0xEF: /* OUT eAX, %DX */
19398      if (!(sz == 2 || sz == 4)) goto decode_failure;
19399      t1 = newTemp(Ity_I64);
19400      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
19401      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
19402                                          nameIRegRDX(2));
19403      goto do_OUT;
19404   do_OUT: {
19405      /* At this point, sz indicates the width, and t1 is a 64-bit
19406         value giving port number. */
19407      IRDirty* d;
19408      if (haveF2orF3(pfx)) goto decode_failure;
19409      vassert(sz == 1 || sz == 2 || sz == 4);
19410      ty = szToITy(sz);
19411      d = unsafeIRDirty_0_N(
19412             0/*regparms*/,
19413             "amd64g_dirtyhelper_OUT",
19414             &amd64g_dirtyhelper_OUT,
19415             mkIRExprVec_3( mkexpr(t1),
19416                            widenUto64( getIRegRAX(sz) ),
19417                            mkU64(sz) )
19418          );
19419      stmt( IRStmt_Dirty(d) );
19420      return delta;
19421   }
19422
19423   case 0xE8: /* CALL J4 */
19424      if (haveF2orF3(pfx)) goto decode_failure;
19425      d64 = getSDisp32(delta); delta += 4;
19426      d64 += (guest_RIP_bbstart+delta);
19427      /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
19428      t1 = newTemp(Ity_I64);
19429      assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
19430      putIReg64(R_RSP, mkexpr(t1));
19431      storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
19432      t2 = newTemp(Ity_I64);
19433      assign(t2, mkU64((Addr64)d64));
19434      make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
19435      if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
19436         /* follow into the call target. */
19437         dres->whatNext   = Dis_ResteerU;
19438         dres->continueAt = d64;
19439      } else {
19440         jmp_lit(dres, Ijk_Call, d64);
19441         vassert(dres->whatNext == Dis_StopHere);
19442      }
19443      DIP("call 0x%llx\n",d64);
19444      return delta;
19445
19446   case 0xE9: /* Jv (jump, 16/32 offset) */
19447      if (haveF2orF3(pfx)) goto decode_failure;
19448      if (sz != 4)
19449         goto decode_failure; /* JRS added 2004 July 11 */
19450      d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
19451      delta += sz;
19452      if (resteerOkFn(callback_opaque,d64)) {
19453         dres->whatNext   = Dis_ResteerU;
19454         dres->continueAt = d64;
19455      } else {
19456         jmp_lit(dres, Ijk_Boring, d64);
19457         vassert(dres->whatNext == Dis_StopHere);
19458      }
19459      DIP("jmp 0x%llx\n", d64);
19460      return delta;
19461
19462   case 0xEB: /* Jb (jump, byte offset) */
19463      if (haveF2orF3(pfx)) goto decode_failure;
19464      if (sz != 4)
19465         goto decode_failure; /* JRS added 2004 July 11 */
19466      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
19467      delta++;
19468      if (resteerOkFn(callback_opaque,d64)) {
19469         dres->whatNext   = Dis_ResteerU;
19470         dres->continueAt = d64;
19471      } else {
19472         jmp_lit(dres, Ijk_Boring, d64);
19473         vassert(dres->whatNext == Dis_StopHere);
19474      }
19475      DIP("jmp-8 0x%llx\n", d64);
19476      return delta;
19477
19478   case 0xF5: /* CMC */
19479   case 0xF8: /* CLC */
19480   case 0xF9: /* STC */
19481      t1 = newTemp(Ity_I64);
19482      t2 = newTemp(Ity_I64);
19483      assign( t1, mk_amd64g_calculate_rflags_all() );
19484      switch (opc) {
19485         case 0xF5:
19486            assign( t2, binop(Iop_Xor64, mkexpr(t1),
19487                                         mkU64(AMD64G_CC_MASK_C)));
19488            DIP("cmc\n");
19489            break;
19490         case 0xF8:
19491            assign( t2, binop(Iop_And64, mkexpr(t1),
19492                                         mkU64(~AMD64G_CC_MASK_C)));
19493            DIP("clc\n");
19494            break;
19495         case 0xF9:
19496            assign( t2, binop(Iop_Or64, mkexpr(t1),
19497                                        mkU64(AMD64G_CC_MASK_C)));
19498            DIP("stc\n");
19499            break;
19500         default:
19501            vpanic("disInstr(x64)(cmc/clc/stc)");
19502      }
19503      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
19504      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
19505      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
19506      /* Set NDEP even though it isn't used.  This makes redundant-PUT
19507         elimination of previous stores to this field work better. */
19508      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
19509      return delta;
19510
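   /* Grp3 covers TEST, NOT, NEG, MUL, IMUL, DIV and IDIV, selected
      by the reg field of the ModRM byte; dis_Grp3 dispatches on
      that. */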
19511   case 0xF6: { /* Grp3 Eb */
19512      Bool decode_OK = True;
19513      if (haveF2orF3(pfx)) goto decode_failure;
19514      delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
19515      if (!decode_OK) goto decode_failure;
19516      return delta;
19517   }
19518
19519   case 0xF7: { /* Grp3 Ev */
19520      Bool decode_OK = True;
19521      if (haveF2orF3(pfx)) goto decode_failure;
19522      delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
19523      if (!decode_OK) goto decode_failure;
19524      return delta;
19525   }
19526
19527   case 0xFC: /* CLD */
19528      if (haveF2orF3(pfx)) goto decode_failure;
19529      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
19530      DIP("cld\n");
19531      return delta;
19532
19533   case 0xFD: /* STD */
19534      if (haveF2orF3(pfx)) goto decode_failure;
19535      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
19536      DIP("std\n");
19537      return delta;
19538
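   /* Grp4 is INC/DEC on a byte operand; Grp5 (below) additionally
      provides INC, DEC, (near) CALL, (near) JMP and PUSH on
      word/dword/qword operands.  Again the reg field of the ModRM
      byte selects the operation. */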
19539   case 0xFE: { /* Grp4 Eb */
19540      Bool decode_OK = True;
19541      if (haveF2orF3(pfx)) goto decode_failure;
19542      delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
19543      if (!decode_OK) goto decode_failure;
19544      return delta;
19545   }
19546
19547   case 0xFF: { /* Grp5 Ev */
19548      Bool decode_OK = True;
19549      if (haveF2orF3(pfx)) goto decode_failure;
19550      delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
19551      if (!decode_OK) goto decode_failure;
19552      return delta;
19553   }
19554
19555   default:
19556      break;
19557
19558   }
19559
19560  decode_failure:
19561   return deltaIN; /* fail */
19562}
19563
19564
19565/*------------------------------------------------------------*/
19566/*---                                                      ---*/
19567/*--- Top-level post-escape decoders: dis_ESC_0F           ---*/
19568/*---                                                      ---*/
19569/*------------------------------------------------------------*/
19570
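/* Reverse the byte order of a 16-, 32- or 64-bit value.  The 64-bit
   case swaps adjacent bytes, then adjacent 16-bit units, then the
   two 32-bit halves, so that e.g. 0x0102030405060708 becomes
   0x0201040306050807, then 0x0403020108070605, and finally
   0x0807060504030201. */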
19571static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
19572{
19573   IRTemp t2 = newTemp(ty);
19574   if (ty == Ity_I64) {
19575      IRTemp m8  = newTemp(Ity_I64);
19576      IRTemp s8  = newTemp(Ity_I64);
19577      IRTemp m16 = newTemp(Ity_I64);
19578      IRTemp s16 = newTemp(Ity_I64);
19579      IRTemp m32 = newTemp(Ity_I64);
19580      assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
19581      assign( s8,
19582              binop(Iop_Or64,
19583                    binop(Iop_Shr64,
19584                          binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
19585                          mkU8(8)),
19586                    binop(Iop_And64,
19587                          binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
19588                          mkexpr(m8))
19589                   )
19590            );
19591
19592      assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
19593      assign( s16,
19594              binop(Iop_Or64,
19595                    binop(Iop_Shr64,
19596                          binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
19597                          mkU8(16)),
19598                    binop(Iop_And64,
19599                          binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
19600                          mkexpr(m16))
19601                   )
19602            );
19603
19604      assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
19605      assign( t2,
19606              binop(Iop_Or64,
19607                    binop(Iop_Shr64,
19608                          binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
19609                          mkU8(32)),
19610                    binop(Iop_And64,
19611                          binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
19612                          mkexpr(m32))
19613                   )
19614            );
19615      return t2;
19616   }
19617   if (ty == Ity_I32) {
19618      assign( t2,
19619         binop(
19620            Iop_Or32,
19621            binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
19622            binop(
19623               Iop_Or32,
19624               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
19625                                mkU32(0x00FF0000)),
19626               binop(Iop_Or32,
19627                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
19628                                      mkU32(0x0000FF00)),
19629                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
19630                                      mkU32(0x000000FF) )
19631            )))
19632      );
19633      return t2;
19634   }
19635   if (ty == Ity_I16) {
19636      assign(t2,
19637             binop(Iop_Or16,
19638                   binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
19639                   binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
19640      return t2;
19641   }
19642   vassert(0);
19643   /*NOTREACHED*/
19644   return IRTemp_INVALID;
19645}
19646
19647
19648__attribute__((noinline))
19649static
19650Long dis_ESC_0F (
19651        /*MB_OUT*/DisResult* dres,
19652        /*MB_OUT*/Bool*      expect_CAS,
19653        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
19654        Bool         resteerCisOk,
19655        void*        callback_opaque,
19656        VexArchInfo* archinfo,
19657        VexAbiInfo*  vbi,
19658        Prefix pfx, Int sz, Long deltaIN
19659     )
19660{
19661   Long   d64   = 0;
19662   IRTemp addr  = IRTemp_INVALID;
19663   IRTemp t1    = IRTemp_INVALID;
19664   IRTemp t2    = IRTemp_INVALID;
19665   UChar  modrm = 0;
19666   Int    am_sz = 0;
19667   Int    alen  = 0;
19668   HChar  dis_buf[50];
19669
19670   /* In the first switch, look for ordinary integer insns. */
19671   Long   delta = deltaIN;
19672   UChar  opc   = getUChar(delta);
19673   delta++;
19674   switch (opc) { /* first switch */
19675
19676   case 0x01:
19677   {
19678      modrm = getUChar(delta);
19679      /* 0F 01 /0 -- SGDT */
19680      /* 0F 01 /1 -- SIDT */
19681      if (!epartIsReg(modrm)
19682          && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
19683         /* This is really revolting, but ... since each processor
19684            (core) only has one IDT and one GDT, just let the guest
19685            see it (pass-through semantics).  I can't see any way to
19686            construct a faked-up value, so don't bother to try. */
19687         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
19688         delta += alen;
19689         switch (gregLO3ofRM(modrm)) {
19690            case 0: DIP("sgdt %s\n", dis_buf); break;
19691            case 1: DIP("sidt %s\n", dis_buf); break;
19692            default: vassert(0); /*NOTREACHED*/
19693         }
19694         IRDirty* d = unsafeIRDirty_0_N (
19695                          0/*regparms*/,
19696                          "amd64g_dirtyhelper_SxDT",
19697                          &amd64g_dirtyhelper_SxDT,
19698                          mkIRExprVec_2( mkexpr(addr),
19699                                         mkU64(gregLO3ofRM(modrm)) )
19700                      );
19701         /* declare we're writing memory */
19702         d->mFx   = Ifx_Write;
19703         d->mAddr = mkexpr(addr);
19704         d->mSize = 6;
19705         stmt( IRStmt_Dirty(d) );
19706         return delta;
19707      }
19708      /* 0F 01 D0 = XGETBV */
19709      if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
19710         delta += 1;
19711         DIP("xgetbv\n");
         /* Fault (SEGV) if ECX isn't zero.  Intel docs say #GP and I
            am not sure whether that translates into SEGV or something
            else, in user space. */
19715         t1 = newTemp(Ity_I32);
19716         assign( t1, getIReg32(R_RCX) );
19717         stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
19718                           Ijk_SigSEGV,
19719                           IRConst_U64(guest_RIP_curr_instr),
19720                           OFFB_RIP
19721         ));
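         /* Return a constant XCR0 of 7: bit 0 = x87, bit 1 = SSE,
            bit 2 = AVX state enabled. */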
19722         putIRegRAX(4, mkU32(7));
19723         putIRegRDX(4, mkU32(0));
19724         return delta;
19725      }
19726      /* else decode failed */
19727      break;
19728   }
19729
19730   case 0x05: /* SYSCALL */
19731      guest_RIP_next_mustcheck = True;
19732      guest_RIP_next_assumed = guest_RIP_bbstart + delta;
19733      putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
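      /* Real hardware also copies RFLAGS into R11 at this point;
         that isn't modelled here. */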
19734      /* It's important that all guest state is up-to-date
19735         at this point.  So we declare an end-of-block here, which
19736         forces any cached guest state to be flushed. */
19737      jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
19738      vassert(dres->whatNext == Dis_StopHere);
19739      DIP("syscall\n");
19740      return delta;
19741
19742   case 0x0B: /* UD2 */
19743      stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
19744      jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
19745      vassert(dres->whatNext == Dis_StopHere);
19746      DIP("ud2\n");
19747      return delta;
19748
19749   case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
19750              /* 0F 0D /1 -- prefetchw mem8 */
19751      if (have66orF2orF3(pfx)) goto decode_failure;
19752      modrm = getUChar(delta);
19753      if (epartIsReg(modrm)) goto decode_failure;
19754      if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
19755         goto decode_failure;
19756      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
19757      delta += alen;
19758      switch (gregLO3ofRM(modrm)) {
19759         case 0: DIP("prefetch %s\n", dis_buf); break;
19760         case 1: DIP("prefetchw %s\n", dis_buf); break;
19761         default: vassert(0); /*NOTREACHED*/
19762      }
19763      return delta;
19764
19765   case 0x1F:
19766      if (haveF2orF3(pfx)) goto decode_failure;
19767      modrm = getUChar(delta);
19768      if (epartIsReg(modrm)) goto decode_failure;
19769      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
19770      delta += alen;
19771      DIP("nop%c %s\n", nameISize(sz), dis_buf);
19772      return delta;
19773
19774   case 0x31: { /* RDTSC */
19775      IRTemp   val  = newTemp(Ity_I64);
19776      IRExpr** args = mkIRExprVec_0();
19777      IRDirty* d    = unsafeIRDirty_1_N (
19778                         val,
19779                         0/*regparms*/,
19780                         "amd64g_dirtyhelper_RDTSC",
19781                         &amd64g_dirtyhelper_RDTSC,
19782                         args
19783                      );
19784      if (have66orF2orF3(pfx)) goto decode_failure;
19785      /* execute the dirty call, dumping the result in val. */
19786      stmt( IRStmt_Dirty(d) );
19787      putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
19788      putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
19789      DIP("rdtsc\n");
19790      return delta;
19791   }
19792
   case 0x40: /* CMOVOb (cmov overflow) */
   case 0x41: /* CMOVNOb (cmov no overflow) */
19795   case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
19796   case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
19797   case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
19798   case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
19799   case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
19800   case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
19801   case 0x48: /* CMOVSb (cmov negative) */
   case 0x49: /* CMOVNSb (cmov not negative) */
19803   case 0x4A: /* CMOVP (cmov parity even) */
19804   case 0x4B: /* CMOVNP (cmov parity odd) */
19805   case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
19806   case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
19807   case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
19808   case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
19809      if (haveF2orF3(pfx)) goto decode_failure;
19810      delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
19811      return delta;
19812
   case 0x80:   /* JOb (jump overflow) */
   case 0x81:   /* JNOb (jump no overflow) */
19815   case 0x82:   /* JBb/JNAEb (jump below) */
19816   case 0x83:   /* JNBb/JAEb (jump not below) */
19817   case 0x84:   /* JZb/JEb (jump zero) */
19818   case 0x85:   /* JNZb/JNEb (jump not zero) */
19819   case 0x86:   /* JBEb/JNAb (jump below or equal) */
19820   case 0x87:   /* JNBEb/JAb (jump not below or equal) */
19821   case 0x88:   /* JSb (jump negative) */
   case 0x89:   /* JNSb (jump not negative) */
19823   case 0x8A:   /* JP (jump parity even) */
19824   case 0x8B:   /* JNP/JPO (jump parity odd) */
19825   case 0x8C:   /* JLb/JNGEb (jump less) */
19826   case 0x8D:   /* JGEb/JNLb (jump greater or equal) */
19827   case 0x8E:   /* JLEb/JNGb (jump less or equal) */
19828   case 0x8F: { /* JGb/JNLEb (jump greater) */
19829      Long   jmpDelta;
19830      HChar* comment  = "";
19831      if (haveF2orF3(pfx)) goto decode_failure;
19832      jmpDelta = getSDisp32(delta);
19833      d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
19834      delta += 4;
19835      if (resteerCisOk
19836          && vex_control.guest_chase_cond
19837          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
19838          && jmpDelta < 0
19839          && resteerOkFn( callback_opaque, d64) ) {
19840         /* Speculation: assume this backward branch is taken.  So
19841            we need to emit a side-exit to the insn following this
19842            one, on the negation of the condition, and continue at
19843            the branch target address (d64).  If we wind up back at
19844            the first instruction of the trace, just stop; it's
19845            better to let the IR loop unroller handle that case. */
19846         stmt( IRStmt_Exit(
19847                  mk_amd64g_calculate_condition(
19848                     (AMD64Condcode)(1 ^ (opc - 0x80))),
19849                  Ijk_Boring,
19850                  IRConst_U64(guest_RIP_bbstart+delta),
19851                  OFFB_RIP
19852             ));
19853         dres->whatNext   = Dis_ResteerC;
19854         dres->continueAt = d64;
19855         comment = "(assumed taken)";
19856      }
19857      else
19858      if (resteerCisOk
19859          && vex_control.guest_chase_cond
19860          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
19861          && jmpDelta >= 0
19862          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
19863         /* Speculation: assume this forward branch is not taken.
19864            So we need to emit a side-exit to d64 (the dest) and
19865            continue disassembling at the insn immediately
19866            following this one. */
19867         stmt( IRStmt_Exit(
19868                  mk_amd64g_calculate_condition((AMD64Condcode)
19869                                                (opc - 0x80)),
19870                  Ijk_Boring,
19871                  IRConst_U64(d64),
19872                  OFFB_RIP
19873             ));
19874         dres->whatNext   = Dis_ResteerC;
19875         dres->continueAt = guest_RIP_bbstart+delta;
19876         comment = "(assumed not taken)";
19877      }
19878      else {
19879         /* Conservative default translation - end the block at
19880            this point. */
19881         jcc_01( dres, (AMD64Condcode)(opc - 0x80),
19882                 guest_RIP_bbstart+delta, d64 );
19883         vassert(dres->whatNext == Dis_StopHere);
19884      }
19885      DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
19886      return delta;
19887   }
19888
   case 0x90: /* set-Ob (set if overflow) */
   case 0x91: /* set-NOb (set if no overflow) */
19891   case 0x92: /* set-Bb/set-NAEb (set if below) */
19892   case 0x93: /* set-NBb/set-AEb (set if not below) */
19893   case 0x94: /* set-Zb/set-Eb (set if zero) */
19894   case 0x95: /* set-NZb/set-NEb (set if not zero) */
19895   case 0x96: /* set-BEb/set-NAb (set if below or equal) */
19896   case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
19897   case 0x98: /* set-Sb (set if negative) */
   case 0x99: /* set-NSb (set if not negative) */
19899   case 0x9A: /* set-P (set if parity even) */
19900   case 0x9B: /* set-NP (set if parity odd) */
19901   case 0x9C: /* set-Lb/set-NGEb (set if less) */
19902   case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
19903   case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
19904   case 0x9F: /* set-Gb/set-NLEb (set if greater) */
19905      if (haveF2orF3(pfx)) goto decode_failure;
19906      t1 = newTemp(Ity_I8);
19907      assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
19908      modrm = getUChar(delta);
19909      if (epartIsReg(modrm)) {
19910         delta++;
19911         putIRegE(1, pfx, modrm, mkexpr(t1));
19912         DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
19913                           nameIRegE(1,pfx,modrm));
19914      } else {
19915         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
19916         delta += alen;
19917         storeLE( mkexpr(addr), mkexpr(t1) );
19918         DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
19919      }
19920      return delta;
19921
19922   case 0xA2: { /* CPUID */
19923      /* Uses dirty helper:
19924            void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
19925         declared to mod rax, wr rbx, rcx, rdx
19926      */
19927      IRDirty* d     = NULL;
19928      HChar*   fName = NULL;
19929      void*    fAddr = NULL;
19930      if (haveF2orF3(pfx)) goto decode_failure;
19931      if (archinfo->hwcaps == (VEX_HWCAPS_AMD64_SSE3
19932                               |VEX_HWCAPS_AMD64_CX16
19933                               |VEX_HWCAPS_AMD64_AVX)) {
19934         fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
19935         fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
19936         /* This is a Core-i5-2300-like machine */
19937      }
19938      else if (archinfo->hwcaps == (VEX_HWCAPS_AMD64_SSE3
19939                                    |VEX_HWCAPS_AMD64_CX16)) {
19940         fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
19941         fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
19942         /* This is a Core-i5-670-like machine */
19943      }
19944      else {
19945         /* Give a CPUID for at least a baseline machine, SSE2
19946            only, and no CX16 */
19947         fName = "amd64g_dirtyhelper_CPUID_baseline";
19948         fAddr = &amd64g_dirtyhelper_CPUID_baseline;
19949      }
19950
19951      vassert(fName); vassert(fAddr);
19952      d = unsafeIRDirty_0_N ( 0/*regparms*/,
19953                              fName, fAddr, mkIRExprVec_0() );
19954      /* declare guest state effects */
19955      d->needsBBP = True;
19956      d->nFxState = 4;
19957      vex_bzero(&d->fxState, sizeof(d->fxState));
19958      d->fxState[0].fx     = Ifx_Modify;
19959      d->fxState[0].offset = OFFB_RAX;
19960      d->fxState[0].size   = 8;
19961      d->fxState[1].fx     = Ifx_Write;
19962      d->fxState[1].offset = OFFB_RBX;
19963      d->fxState[1].size   = 8;
19964      d->fxState[2].fx     = Ifx_Modify;
19965      d->fxState[2].offset = OFFB_RCX;
19966      d->fxState[2].size   = 8;
19967      d->fxState[3].fx     = Ifx_Write;
19968      d->fxState[3].offset = OFFB_RDX;
19969      d->fxState[3].size   = 8;
19970      /* execute the dirty call, side-effecting guest state */
19971      stmt( IRStmt_Dirty(d) );
19972      /* CPUID is a serialising insn.  So, just in case someone is
19973         using it as a memory fence ... */
19974      stmt( IRStmt_MBE(Imbe_Fence) );
19975      DIP("cpuid\n");
19976      return delta;
19977   }
19978
19979   case 0xA3: /* BT Gv,Ev */
19980      if (haveF2orF3(pfx)) goto decode_failure;
19981      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
19982      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone );
19983      return delta;
19984
19985   case 0xA4: /* SHLDv imm8,Gv,Ev */
19986      modrm = getUChar(delta);
19987      d64   = delta + lengthAMode(pfx, delta);
19988      vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
19989      delta = dis_SHLRD_Gv_Ev (
19990                 vbi, pfx, delta, modrm, sz,
19991                 mkU8(getUChar(d64)), True, /* literal */
19992                 dis_buf, True /* left */ );
19993      return delta;
19994
19995   case 0xA5: /* SHLDv %cl,Gv,Ev */
19996      modrm = getUChar(delta);
19997      delta = dis_SHLRD_Gv_Ev (
19998                 vbi, pfx, delta, modrm, sz,
19999                 getIRegCL(), False, /* not literal */
20000                 "%cl", True /* left */ );
20001      return delta;
20002
20003   case 0xAB: /* BTS Gv,Ev */
20004      if (haveF2orF3(pfx)) goto decode_failure;
20005      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
20006      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet );
20007      return delta;
20008
20009   case 0xAC: /* SHRDv imm8,Gv,Ev */
20010      modrm = getUChar(delta);
20011      d64   = delta + lengthAMode(pfx, delta);
20012      vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
20013      delta = dis_SHLRD_Gv_Ev (
20014                 vbi, pfx, delta, modrm, sz,
20015                 mkU8(getUChar(d64)), True, /* literal */
20016                 dis_buf, False /* right */ );
20017      return delta;
20018
20019   case 0xAD: /* SHRDv %cl,Gv,Ev */
20020      modrm = getUChar(delta);
20021      delta = dis_SHLRD_Gv_Ev (
20022                 vbi, pfx, delta, modrm, sz,
20023                 getIRegCL(), False, /* not literal */
20024                 "%cl", False /* right */);
20025      return delta;
20026
20027   case 0xAF: /* IMUL Ev, Gv */
20028      if (haveF2orF3(pfx)) goto decode_failure;
20029      delta = dis_mul_E_G ( vbi, pfx, sz, delta );
20030      return delta;
20031
20032   case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
20033      Bool ok = True;
20034      if (haveF2orF3(pfx)) goto decode_failure;
20035      if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
20036      delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
20037      if (!ok) goto decode_failure;
20038      return delta;
20039   }
20040
20041   case 0xB0: { /* CMPXCHG Gb,Eb */
20042      Bool ok = True;
20043      if (haveF2orF3(pfx)) goto decode_failure;
20044      delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
20045      if (!ok) goto decode_failure;
20046      return delta;
20047   }
20048
20049   case 0xB3: /* BTR Gv,Ev */
20050      if (haveF2orF3(pfx)) goto decode_failure;
20051      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
20052      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset );
20053      return delta;
20054
20055   case 0xB6: /* MOVZXb Eb,Gv */
20056      if (haveF2orF3(pfx)) goto decode_failure;
20057      if (sz != 2 && sz != 4 && sz != 8)
20058         goto decode_failure;
20059      delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
20060      return delta;
20061
20062   case 0xB7: /* MOVZXw Ew,Gv */
20063      if (haveF2orF3(pfx)) goto decode_failure;
20064      if (sz != 4 && sz != 8)
20065         goto decode_failure;
20066      delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
20067      return delta;
20068
20069   case 0xBA: { /* Grp8 Ib,Ev */
20070      Bool decode_OK = False;
20071      if (haveF2orF3(pfx)) goto decode_failure;
20072      modrm = getUChar(delta);
20073      am_sz = lengthAMode(pfx,delta);
20074      d64   = getSDisp8(delta + am_sz);
20075      delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
20076                             &decode_OK );
20077      if (!decode_OK)
20078         goto decode_failure;
20079      return delta;
20080   }
20081
20082   case 0xBB: /* BTC Gv,Ev */
20083      if (haveF2orF3(pfx)) goto decode_failure;
20084      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
20085      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp );
20086      return delta;
20087
20088   case 0xBC: /* BSF Gv,Ev */
20089      if (haveF2(pfx)) goto decode_failure;
20090      delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
20091      return delta;
20092
20093   case 0xBD: /* BSR Gv,Ev */
20094      if (!haveF2orF3(pfx)
20095          || (haveF3noF2(pfx)
20096              && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
20097         /* no-F2 no-F3 0F BD = BSR
20098                  or F3 0F BD = REP; BSR on older CPUs.  */
20099         delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
20100         return delta;
20101      }
20102      /* Fall through, since F3 0F BD is LZCNT, and needs to
20103         be handled by dis_ESC_0F__SSE4. */
20104      break;
20105
20106   case 0xBE: /* MOVSXb Eb,Gv */
20107      if (haveF2orF3(pfx)) goto decode_failure;
20108      if (sz != 2 && sz != 4 && sz != 8)
20109         goto decode_failure;
20110      delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
20111      return delta;
20112
20113   case 0xBF: /* MOVSXw Ew,Gv */
20114      if (haveF2orF3(pfx)) goto decode_failure;
20115      if (sz != 4 && sz != 8)
20116         goto decode_failure;
20117      delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
20118      return delta;
20119
20120   case 0xC1: { /* XADD Gv,Ev */
20121      Bool decode_OK = False;
20122      delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
20123      if (!decode_OK)
20124         goto decode_failure;
20125      return delta;
20126   }
20127
20128   case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
20129      IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
20130      IRTemp  expdHi     = newTemp(elemTy);
20131      IRTemp  expdLo     = newTemp(elemTy);
20132      IRTemp  dataHi     = newTemp(elemTy);
20133      IRTemp  dataLo     = newTemp(elemTy);
20134      IRTemp  oldHi      = newTemp(elemTy);
20135      IRTemp  oldLo      = newTemp(elemTy);
20136      IRTemp  flags_old  = newTemp(Ity_I64);
20137      IRTemp  flags_new  = newTemp(Ity_I64);
20138      IRTemp  success    = newTemp(Ity_I1);
20139      IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
20140      IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
20141      IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
20142      IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
20143      IRTemp expdHi64    = newTemp(Ity_I64);
20144      IRTemp expdLo64    = newTemp(Ity_I64);
20145
20146      /* Translate this using a DCAS, even if there is no LOCK
20147         prefix.  Life is too short to bother with generating two
20148         different translations for the with/without-LOCK-prefix
20149         cases. */
20150      *expect_CAS = True;
20151
20152      /* Decode, and generate address. */
20153      if (have66orF2orF3(pfx)) goto decode_failure;
20154      if (sz != 4 && sz != 8) goto decode_failure;
20155      if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
20156         goto decode_failure;
20157      modrm = getUChar(delta);
20158      if (epartIsReg(modrm)) goto decode_failure;
20159      if (gregLO3ofRM(modrm) != 1) goto decode_failure;
20160      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20161      delta += alen;
20162
20163      /* cmpxchg16b requires an alignment check. */
20164      if (sz == 8)
20165         gen_SEGV_if_not_16_aligned( addr );
20166
20167      /* Get the expected and new values. */
20168      assign( expdHi64, getIReg64(R_RDX) );
20169      assign( expdLo64, getIReg64(R_RAX) );
20170
20171      /* These are the correctly-sized expected and new values.
20172         However, we also get expdHi64/expdLo64 above as 64-bits
20173         regardless, because we will need them later in the 32-bit
20174         case (paradoxically). */
20175      assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
20176                            : mkexpr(expdHi64) );
20177      assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
20178                            : mkexpr(expdLo64) );
20179      assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
20180      assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
20181
20182      /* Do the DCAS */
20183      stmt( IRStmt_CAS(
20184               mkIRCAS( oldHi, oldLo,
20185                        Iend_LE, mkexpr(addr),
20186                        mkexpr(expdHi), mkexpr(expdLo),
20187                        mkexpr(dataHi), mkexpr(dataLo)
20188            )));
20189
20190      /* success when oldHi:oldLo == expdHi:expdLo */
20191      assign( success,
20192              binop(opCasCmpEQ,
20193                    binop(opOR,
20194                          binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
20195                          binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
20196                    ),
20197                    zero
20198              ));
20199
20200      /* If the DCAS is successful, that is to say oldHi:oldLo ==
20201         expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
20202         which is where they came from originally.  Both the actual
20203         contents of these two regs, and any shadow values, are
20204         unchanged.  If the DCAS fails then we're putting into
20205         RDX:RAX the value seen in memory. */
20206      /* Now of course there's a complication in the 32-bit case
20207         (bah!): if the DCAS succeeds, we need to leave RDX:RAX
20208         unchanged; but if we use the same scheme as in the 64-bit
20209         case, we get hit by the standard rule that a write to the
20210         bottom 32 bits of an integer register zeros the upper 32
20211         bits.  And so the upper halves of RDX and RAX mysteriously
20212         become zero.  So we have to stuff back in the original
20213         64-bit values which we previously stashed in
20214         expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
20215      /* It's just _so_ much fun ... */
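      /* Recall that IRExpr_Mux0X(b, e0, eX) yields e0 when the
         condition byte b is zero and eX otherwise; so the failure
         (success == 0) arm below receives the value seen in
         memory. */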
20216      putIRegRDX( 8,
20217                  IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
20218                                sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
20219                                        : mkexpr(oldHi),
20220                                mkexpr(expdHi64)
20221                ));
20222      putIRegRAX( 8,
20223                  IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
20224                                sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
20225                                        : mkexpr(oldLo),
20226                                mkexpr(expdLo64)
20227                ));
20228
20229      /* Copy the success bit into the Z flag and leave the others
20230         unchanged */
20231      assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
20232      assign(
20233         flags_new,
20234         binop(Iop_Or64,
20235               binop(Iop_And64, mkexpr(flags_old),
20236                                mkU64(~AMD64G_CC_MASK_Z)),
20237               binop(Iop_Shl64,
20238                     binop(Iop_And64,
20239                           unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
20240                     mkU8(AMD64G_CC_SHIFT_Z)) ));
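      /* In other words: flags_new
            = (flags_old & ~AMD64G_CC_MASK_Z)
              | ((ULong)success << AMD64G_CC_SHIFT_Z). */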
20241
20242      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
20243      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
20244      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
20245      /* Set NDEP even though it isn't used.  This makes
20246         redundant-PUT elimination of previous stores to this field
20247         work better. */
20248      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
20249
20250      /* Sheesh.  Aren't you glad it was me and not you that had to
20251         write and validate all this grunge? */
20252
      DIP("cmpxchg%s %s\n", sz == 4 ? "8b" : "16b", dis_buf);
20254      return delta;
20255   }
20256
20257   case 0xC8: /* BSWAP %eax */
20258   case 0xC9:
20259   case 0xCA:
20260   case 0xCB:
20261   case 0xCC:
20262   case 0xCD:
20263   case 0xCE:
20264   case 0xCF: /* BSWAP %edi */
20265      if (haveF2orF3(pfx)) goto decode_failure;
20266      /* According to the AMD64 docs, this insn can have size 4 or
20267         8. */
20268      if (sz == 4) {
20269         t1 = newTemp(Ity_I32);
20270         assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
20271         t2 = math_BSWAP( t1, Ity_I32 );
20272         putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
20273         DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
20274         return delta;
20275      }
20276      if (sz == 8) {
20277         t1 = newTemp(Ity_I64);
         /* No need to newTemp t2 here: math_BSWAP allocates its own
            result temp. */
20279         assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
20280         t2 = math_BSWAP( t1, Ity_I64 );
20281         putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
20282         DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
20283         return delta;
20284      }
20285      goto decode_failure;
20286
20287   default:
20288      break;
20289
20290   } /* first switch */
20291
20292
20293   /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
20294   /* In the second switch, pick off MMX insns. */
20295
20296   if (!have66orF2orF3(pfx)) {
20297      /* So there's no SIMD prefix. */
20298
20299      vassert(sz == 4 || sz == 8);
20300
20301      switch (opc) { /* second switch */
20302
20303      case 0x71:
20304      case 0x72:
20305      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
20306
20307      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
20308      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
20309      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
20310      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
20311
20312      case 0xFC:
20313      case 0xFD:
20314      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
20315
20316      case 0xEC:
20317      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
20318
20319      case 0xDC:
20320      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
20321
20322      case 0xF8:
20323      case 0xF9:
20324      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
20325
20326      case 0xE8:
20327      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
20328
20329      case 0xD8:
20330      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
20331
20332      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
20333      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
20334
20335      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
20336
20337      case 0x74:
20338      case 0x75:
20339      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
20340
20341      case 0x64:
20342      case 0x65:
20343      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
20344
20345      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
20346      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
20347      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
20348
20349      case 0x68:
20350      case 0x69:
20351      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
20352
20353      case 0x60:
20354      case 0x61:
20355      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
20356
20357      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
20358      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
20359      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
20360      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
20361
20362      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
20363      case 0xF2:
20364      case 0xF3:
20365
20366      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
20367      case 0xD2:
20368      case 0xD3:
20369
20370      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
20371      case 0xE2: {
20372         Bool decode_OK = False;
20373         delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
20374         if (decode_OK)
20375            return delta;
20376         goto decode_failure;
20377      }
20378
20379      default:
20380         break;
20381      } /* second switch */
20382
20383   }
20384
20385   /* A couple of MMX corner cases */
20386   if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
20387      if (sz != 4)
20388         goto decode_failure;
20389      do_EMMS_preamble();
20390      DIP("{f}emms\n");
20391      return delta;
20392   }
20393
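   /* From here on, decoding is handed to a chain of family-specific
      sub-decoders.  The convention: each one sets *decode_OK and
      returns an updated delta on success; on failure we fall
      through and let the next family try, finally returning deltaIN
      (failure) at the bottom. */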
20394   /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
20395   /* Perhaps it's an SSE or SSE2 instruction.  We can try this
20396      without checking the guest hwcaps because SSE2 is a baseline
20397      facility in 64 bit mode. */
20398   {
20399      Bool decode_OK = False;
20400      delta = dis_ESC_0F__SSE2 ( &decode_OK, vbi, pfx, sz, deltaIN, dres );
20401      if (decode_OK)
20402         return delta;
20403   }
20404
20405   /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSE3 instruction.  FIXME: check guest hwcaps
      first. */
20408   {
20409      Bool decode_OK = False;
20410      delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
20411      if (decode_OK)
20412         return delta;
20413   }
20414
20415   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSE4 instruction.  FIXME: check guest hwcaps
      first. */
20418   {
20419      Bool decode_OK = False;
20420      delta = dis_ESC_0F__SSE4 ( &decode_OK,
20421                                 archinfo, vbi, pfx, sz, deltaIN );
20422      if (decode_OK)
20423         return delta;
20424   }
20425
20426  decode_failure:
20427   return deltaIN; /* fail */
20428}
20429
20430
20431/*------------------------------------------------------------*/
20432/*---                                                      ---*/
20433/*--- Top-level post-escape decoders: dis_ESC_0F38         ---*/
20434/*---                                                      ---*/
20435/*------------------------------------------------------------*/
20436
20437__attribute__((noinline))
20438static
20439Long dis_ESC_0F38 (
20440        /*MB_OUT*/DisResult* dres,
20441        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
20442        Bool         resteerCisOk,
20443        void*        callback_opaque,
20444        VexArchInfo* archinfo,
20445        VexAbiInfo*  vbi,
20446        Prefix pfx, Int sz, Long deltaIN
20447     )
20448{
20449   Long   delta = deltaIN;
20450   UChar  opc   = getUChar(delta);
20451   delta++;
20452   switch (opc) {
20453
20454   case 0xF0:   /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
20455   case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
20456      if (!haveF2orF3(pfx) && !haveVEX(pfx)
20457          && (sz == 2 || sz == 4 || sz == 8)) {
20458         IRTemp addr  = IRTemp_INVALID;
20459         UChar  modrm = 0;
20460         Int    alen  = 0;
20461         HChar  dis_buf[50];
20462         modrm = getUChar(delta);
20463         if (epartIsReg(modrm)) break;
20464         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20465         delta += alen;
20466         IRType ty = szToITy(sz);
20467         IRTemp src = newTemp(ty);
20468         if (opc == 0xF0) { /* LOAD */
20469            assign(src, loadLE(ty, mkexpr(addr)));
20470            IRTemp dst = math_BSWAP(src, ty);
20471            putIRegG(sz, pfx, modrm, mkexpr(dst));
20472            DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
20473         } else { /* STORE */
20474            assign(src, getIRegG(sz, pfx, modrm));
20475            IRTemp dst = math_BSWAP(src, ty);
20476            storeLE(mkexpr(addr), mkexpr(dst));
20477            DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
20478         }
20479         return delta;
20480      }
20481      /* else fall through; maybe one of the decoders below knows what
20482         it is. */
20483      break;
20484   }
20485
20486   default:
20487      break;
20488
20489   }
20490
20491   /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
20492   /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
20493      rather than proceeding indiscriminately. */
20494   {
20495      Bool decode_OK = False;
20496      delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
20497      if (decode_OK)
20498         return delta;
20499   }
20500
20501   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
20502   /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
20503      rather than proceeding indiscriminately. */
20504   {
20505      Bool decode_OK = False;
20506      delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
20507      if (decode_OK)
20508         return delta;
20509   }
20510
20511  /*decode_failure:*/
20512   return deltaIN; /* fail */
20513}
20514
20515
20516/*------------------------------------------------------------*/
20517/*---                                                      ---*/
20518/*--- Top-level post-escape decoders: dis_ESC_0F3A         ---*/
20519/*---                                                      ---*/
20520/*------------------------------------------------------------*/
20521
20522__attribute__((noinline))
20523static
20524Long dis_ESC_0F3A (
20525        /*MB_OUT*/DisResult* dres,
20526        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
20527        Bool         resteerCisOk,
20528        void*        callback_opaque,
20529        VexArchInfo* archinfo,
20530        VexAbiInfo*  vbi,
20531        Prefix pfx, Int sz, Long deltaIN
20532     )
20533{
20534   Long   delta = deltaIN;
20535   UChar  opc   = getUChar(delta);
20536   delta++;
20537   switch (opc) {
20538
20539   default:
20540      break;
20541
20542   }
20543
20544   /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
20545   /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
20546      rather than proceeding indiscriminately. */
20547   {
20548      Bool decode_OK = False;
20549      delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
20550      if (decode_OK)
20551         return delta;
20552   }
20553
20554   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
20555   /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
20556      rather than proceeding indiscriminately. */
20557   {
20558      Bool decode_OK = False;
20559      delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
20560      if (decode_OK)
20561         return delta;
20562   }
20563
20564   return deltaIN; /* fail */
20565}
20566
20567
20568/*------------------------------------------------------------*/
20569/*---                                                      ---*/
20570/*--- Top-level post-escape decoders: dis_ESC_0F__VEX      ---*/
20571/*---                                                      ---*/
20572/*------------------------------------------------------------*/
20573
20574/* FIXME: common up with the _256_ version below? */
20575static
20576Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
20577        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
20578        Prefix pfx, Long delta, HChar* name,
        /* The actual operation.  Use either 'op' or 'opFn',
           but not both. */
20581        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
20582        Bool invertLeftArg,
20583        Bool swapArgs
20584     )
20585{
20586   UChar  modrm = getUChar(delta);
20587   UInt   rD    = gregOfRexRM(pfx, modrm);
20588   UInt   rSL   = getVexNvvvv(pfx);
20589   IRTemp tSL   = newTemp(Ity_V128);
20590   IRTemp tSR   = newTemp(Ity_V128);
20591   IRTemp addr  = IRTemp_INVALID;
20592   HChar  dis_buf[50];
20593   Int    alen  = 0;
20594   vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);
20595
20596   assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
20597                             : getXMMReg(rSL));
20598
20599   if (epartIsReg(modrm)) {
20600      UInt rSR = eregOfRexRM(pfx, modrm);
20601      delta += 1;
20602      assign(tSR, getXMMReg(rSR));
20603      DIP("%s %s,%s,%s\n",
20604          name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
20605   } else {
20606      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
20607      delta += alen;
20608      assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
20609      DIP("%s %s,%s,%s\n",
20610          name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
20611   }
20612
20613   IRTemp res = IRTemp_INVALID;
20614   if (op != Iop_INVALID) {
20615      vassert(opFn == NULL);
20616      res = newTemp(Ity_V128);
20617      assign(res, swapArgs ? binop(op, mkexpr(tSR), mkexpr(tSL))
20618                           : binop(op, mkexpr(tSL), mkexpr(tSR)));
20619   } else {
20620      vassert(opFn != NULL);
20621      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
20622   }
20623
20624   putYMMRegLoAndZU(rD, mkexpr(res));
20625
20626   *uses_vvvv = True;
20627   return delta;
20628}
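
/* Illustrative use (a hypothetical caller, not taken from any real
   opcode handler): a plain 3-operand 128-bit AND could be routed
   here as

      delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                 uses_vvvv, vbi, pfx, delta, "vpand",
                 Iop_AndV128, NULL, False/*!invertLeftArg*/,
                 False/*!swapArgs*/ );

   whereas an ANDN-style insn would pass invertLeftArg=True to get
   (~V) & E.  Exactly one of 'op' and 'opFn' may be supplied. */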
20629
20630
20631/* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
20632   for the operation, no inversion of the left arg, and no swapping of
20633   args. */
20634static
20635Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
20636        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
20637        Prefix pfx, Long delta, HChar* name,
20638        IROp op
20639     )
20640{
20641   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
20642             uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
20643}
20644
20645
20646/* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
20647   generator to compute the result, no inversion of the left
20648   arg, and no swapping of args. */
20649static
20650Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
20651        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
20652        Prefix pfx, Long delta, HChar* name,
20653        IRTemp(*opFn)(IRTemp,IRTemp)
20654     )
20655{
20656   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
20657             uses_vvvv, vbi, pfx, delta, name,
20658             Iop_INVALID, opFn, False, False );
20659}
20660
20661
20662/* Vector by scalar shift of V by the amount specified at the bottom
20663   of E. */
static Long dis_AVX128_shiftV_byE ( VexAbiInfo* vbi,
                                    Prefix pfx, Long delta,
                                    HChar* opname, IROp op )
20667{
20668   HChar   dis_buf[50];
20669   Int     alen, size;
20670   IRTemp  addr;
20671   Bool    shl, shr, sar;
20672   UChar   modrm = getUChar(delta);
20673   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);
20675   IRTemp  g0    = newTemp(Ity_V128);
20676   IRTemp  g1    = newTemp(Ity_V128);
20677   IRTemp  amt   = newTemp(Ity_I64);
20678   IRTemp  amt8  = newTemp(Ity_I8);
20679   if (epartIsReg(modrm)) {
20680      UInt rE = eregOfRexRM(pfx,modrm);
20681      assign( amt, getXMMRegLane64(rE, 0) );
20682      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
20683          nameXMMReg(rV), nameXMMReg(rG) );
20684      delta++;
20685   } else {
20686      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20687      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
20688      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
20689      delta += alen;
20690   }
20691   assign( g0, getXMMReg(rV) );
20692   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
20693
20694   shl = shr = sar = False;
20695   size = 0;
20696   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
20698      case Iop_ShlN32x4: shl = True; size = 32; break;
20699      case Iop_ShlN64x2: shl = True; size = 64; break;
20700      case Iop_SarN16x8: sar = True; size = 16; break;
20701      case Iop_SarN32x4: sar = True; size = 32; break;
20702      case Iop_ShrN16x8: shr = True; size = 16; break;
20703      case Iop_ShrN32x4: shr = True; size = 32; break;
20704      case Iop_ShrN64x2: shr = True; size = 64; break;
20705      default: vassert(0);
20706   }
20707
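   /* x86 semantics for out-of-range shift amounts: for logical
      shifts (shl/shr) any amount >= the lane width must yield an
      all-zeroes result, whilst for arithmetic right shifts (sar) it
      must behave as a shift by (width - 1), replicating the sign
      bit.  The Mux0X below selects the real shift when amt < size
      and the saturated value otherwise. */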
20708   if (shl || shr) {
20709     assign(
20710        g1,
20711        IRExpr_Mux0X(
20712           unop(Iop_1Uto8,
20713                binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size))),
20714           mkV128(0x0000),
20715           binop(op, mkexpr(g0), mkexpr(amt8))
20716        )
20717     );
20718   } else
20719   if (sar) {
20720     assign(
20721        g1,
20722        IRExpr_Mux0X(
20723           unop(Iop_1Uto8,
20724                binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size))),
20725           binop(op, mkexpr(g0), mkU8(size-1)),
20726           binop(op, mkexpr(g0), mkexpr(amt8))
20727        )
20728     );
20729   } else {
20730      vassert(0);
20731   }
20732
20733   putYMMRegLoAndZU( rG, mkexpr(g1) );
20734   return delta;
20735}
20736
20737
20738/* Vector by scalar shift of E into V, by an immediate byte.  Modified
20739   version of dis_SSE_shiftE_imm. */
20740static
20741Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
20742                                 Long delta, HChar* opname, IROp op )
20743{
20744   Bool    shl, shr, sar;
20745   UChar   rm   = getUChar(delta);
20746   IRTemp  e0   = newTemp(Ity_V128);
20747   IRTemp  e1   = newTemp(Ity_V128);
20748   UInt    rD   = getVexNvvvv(pfx);
20749   UChar   amt, size;
20750   vassert(epartIsReg(rm));
20751   vassert(gregLO3ofRM(rm) == 2
20752           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
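   /* For these 0F 71/72/73 forms the modrm reg field selects the
      operation: /2 = shift right logical, /4 = shift right
      arithmetic, /6 = shift left -- hence the vassert above. */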
20753   amt = getUChar(delta+1);
20754   delta += 2;
20755   DIP("%s $%d,%s,%s\n", opname,
20756                         (Int)amt,
20757                         nameXMMReg(eregOfRexRM(pfx,rm)),
20758                         nameXMMReg(rD));
20759   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
20760
20761   shl = shr = sar = False;
20762   size = 0;
20763   switch (op) {
20764      case Iop_ShlN16x8: shl = True; size = 16; break;
20765      case Iop_ShlN32x4: shl = True; size = 32; break;
20766      case Iop_ShlN64x2: shl = True; size = 64; break;
20767      case Iop_SarN16x8: sar = True; size = 16; break;
20768      case Iop_SarN32x4: sar = True; size = 32; break;
20769      case Iop_ShrN16x8: shr = True; size = 16; break;
20770      case Iop_ShrN32x4: shr = True; size = 32; break;
20771      case Iop_ShrN64x2: shr = True; size = 64; break;
20772      default: vassert(0);
20773   }
20774
20775   if (shl || shr) {
20776     assign( e1, amt >= size
20777                    ? mkV128(0x0000)
20778                    : binop(op, mkexpr(e0), mkU8(amt))
20779     );
20780   } else
20781   if (sar) {
20782     assign( e1, amt >= size
20783                    ? binop(op, mkexpr(e0), mkU8(size-1))
20784                    : binop(op, mkexpr(e0), mkU8(amt))
20785     );
20786   } else {
20787      vassert(0);
20788   }
20789
20790   putYMMRegLoAndZU( rD, mkexpr(e1) );
20791   return delta;
20792}
20793
20794
20795/* Lower 64-bit lane only AVX128 binary operation:
20796   G[63:0]    = V[63:0] `op` E[63:0]
20797   G[127:64]  = V[127:64]
20798   G[255:128] = 0.
20799   The specified op must be of the 64F0x2 kind, so that it
20800   copies the upper half of the left operand to the result.
20801*/
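/* Illustrative (hypothetical caller): a VADDSD-style scalar add
   could be routed here with op = Iop_Add64F0x2, which adds the low
   64-bit lanes and copies the left operand's upper half through to
   the result, matching the semantics described above. */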
20802static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
20803                                       VexAbiInfo* vbi,
20804                                       Prefix pfx, Long delta,
20805                                       HChar* opname, IROp op )
20806{
20807   HChar   dis_buf[50];
20808   Int     alen;
20809   IRTemp  addr;
20810   UChar   rm    = getUChar(delta);
20811   UInt    rG    = gregOfRexRM(pfx,rm);
20812   UInt    rV    = getVexNvvvv(pfx);
20813   IRExpr* vpart = getXMMReg(rV);
20814   if (epartIsReg(rm)) {
20815      UInt rE = eregOfRexRM(pfx,rm);
20816      putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
20817      DIP("%s %s,%s,%s\n", opname,
20818          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
20819      delta = delta+1;
20820   } else {
20821      /* We can only do a 64-bit memory read, so the upper half of the
20822         E operand needs to be made simply of zeroes. */
20823      IRTemp epart = newTemp(Ity_V128);
20824      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20825      assign( epart, unop( Iop_64UtoV128,
20826                           loadLE(Ity_I64, mkexpr(addr))) );
20827      putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
20828      DIP("%s %s,%s,%s\n", opname,
20829          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
20830      delta = delta+alen;
20831   }
20832   putYMMRegLane128( rG, 1, mkV128(0) );
20833   *uses_vvvv = True;
20834   return delta;
20835}
20836
20837
20838/* Lower 64-bit lane only AVX128 unary operation:
20839   G[63:0]    = op(E[63:0])
20840   G[127:64]  = V[127:64]
20841   G[255:128] = 0
20842   The specified op must be of the 64F0x2 kind, so that it
20843   copies the upper half of the operand to the result.
20844*/
20845static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
20846                                             VexAbiInfo* vbi,
20847                                             Prefix pfx, Long delta,
20848                                             HChar* opname, IROp op )
20849{
20850   HChar   dis_buf[50];
20851   Int     alen;
20852   IRTemp  addr;
20853   UChar   rm  = getUChar(delta);
20854   UInt    rG  = gregOfRexRM(pfx,rm);
20855   UInt    rV  = getVexNvvvv(pfx);
20856   IRTemp  e64 = newTemp(Ity_I64);
20857
20858   /* Fetch E[63:0] */
20859   if (epartIsReg(rm)) {
20860      UInt rE = eregOfRexRM(pfx,rm);
20861      assign(e64, getXMMRegLane64(rE, 0));
20862      DIP("%s %s,%s,%s\n", opname,
20863          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
20864      delta += 1;
20865   } else {
20866      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20867      assign(e64, loadLE(Ity_I64, mkexpr(addr)));
20868      DIP("%s %s,%s,%s\n", opname,
20869          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
20870      delta += alen;
20871   }
20872
20873   /* Create a value 'arg' as V[127:64]++E[63:0] */
20874   IRTemp arg = newTemp(Ity_V128);
20875   assign(arg,
20876          binop(Iop_SetV128lo64,
20877                getXMMReg(rV), mkexpr(e64)));
20878   /* and apply op to it */
20879   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
20880   *uses_vvvv = True;
20881   return delta;
20882}
20883
20884
20885/* Lower 32-bit lane only AVX128 unary operation:
20886   G[31:0]    = op(E[31:0])
20887   G[127:32]  = V[127:32]
20888   G[255:128] = 0
20889   The specified op must be of the 32F0x4 kind, so that it
20890   copies the upper 3/4 of the operand to the result.
20891*/
20892static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
20893                                             VexAbiInfo* vbi,
20894                                             Prefix pfx, Long delta,
20895                                             HChar* opname, IROp op )
20896{
20897   HChar   dis_buf[50];
20898   Int     alen;
20899   IRTemp  addr;
20900   UChar   rm  = getUChar(delta);
20901   UInt    rG  = gregOfRexRM(pfx,rm);
20902   UInt    rV  = getVexNvvvv(pfx);
20903   IRTemp  e32 = newTemp(Ity_I32);
20904
20905   /* Fetch E[31:0] */
20906   if (epartIsReg(rm)) {
20907      UInt rE = eregOfRexRM(pfx,rm);
20908      assign(e32, getXMMRegLane32(rE, 0));
20909      DIP("%s %s,%s,%s\n", opname,
20910          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
20911      delta += 1;
20912   } else {
20913      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20914      assign(e32, loadLE(Ity_I32, mkexpr(addr)));
20915      DIP("%s %s,%s,%s\n", opname,
20916          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
20917      delta += alen;
20918   }
20919
20920   /* Create a value 'arg' as V[127:32]++E[31:0] */
20921   IRTemp arg = newTemp(Ity_V128);
20922   assign(arg,
20923          binop(Iop_SetV128lo32,
20924                getXMMReg(rV), mkexpr(e32)));
20925   /* and apply op to it */
20926   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
20927   *uses_vvvv = True;
20928   return delta;
20929}
20930
20931
20932/* Lower 32-bit lane only AVX128 binary operation:
20933   G[31:0]    = V[31:0] `op` E[31:0]
20934   G[127:32]  = V[127:32]
20935   G[255:128] = 0.
20936   The specified op must be of the 32F0x4 kind, so that it
20937   copies the upper 3/4 of the left operand to the result.
20938*/
20939static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
20940                                       VexAbiInfo* vbi,
20941                                       Prefix pfx, Long delta,
20942                                       HChar* opname, IROp op )
20943{
20944   HChar   dis_buf[50];
20945   Int     alen;
20946   IRTemp  addr;
20947   UChar   rm    = getUChar(delta);
20948   UInt    rG    = gregOfRexRM(pfx,rm);
20949   UInt    rV    = getVexNvvvv(pfx);
20950   IRExpr* vpart = getXMMReg(rV);
20951   if (epartIsReg(rm)) {
20952      UInt rE = eregOfRexRM(pfx,rm);
20953      putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
20954      DIP("%s %s,%s,%s\n", opname,
20955          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
20956      delta = delta+1;
20957   } else {
20958      /* We can only do a 32-bit memory read, so the upper 3/4 of the
20959         E operand needs to be made simply of zeroes. */
20960      IRTemp epart = newTemp(Ity_V128);
20961      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20962      assign( epart, unop( Iop_32UtoV128,
20963                           loadLE(Ity_I32, mkexpr(addr))) );
20964      putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
20965      DIP("%s %s,%s,%s\n", opname,
20966          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
20967      delta = delta+alen;
20968   }
20969   putYMMRegLane128( rG, 1, mkV128(0) );
20970   *uses_vvvv = True;
20971   return delta;
20972}
20973
20974
20975/* All-lanes AVX128 binary operation:
20976   G[127:0]   = V[127:0] `op` E[127:0]
20977   G[255:128] = 0.
20978*/
20979static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
20980                                  VexAbiInfo* vbi,
20981                                  Prefix pfx, Long delta,
20982                                  HChar* opname, IROp op )
20983{
20984   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
20985             uses_vvvv, vbi, pfx, delta, opname, op,
20986             NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
20987   );
20988}
20989
20990
20991/* Handles AVX128 32F/64F comparisons.  A derivative of
20992   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
20993   original delta to indicate failure. */
20994static
20995Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
20996                               VexAbiInfo* vbi,
20997                               Prefix pfx, Long delta,
20998                               HChar* opname, Bool all_lanes, Int sz )
20999{
21000   vassert(sz == 4 || sz == 8);
21001   Long    deltaIN = delta;
21002   HChar   dis_buf[50];
21003   Int     alen;
21004   UInt    imm8;
21005   IRTemp  addr;
21006   Bool    preSwap = False;
21007   IROp    op      = Iop_INVALID;
21008   Bool    postNot = False;
21009   IRTemp  plain   = newTemp(Ity_V128);
21010   UChar   rm      = getUChar(delta);
21011   UInt    rG      = gregOfRexRM(pfx, rm);
21012   UInt    rV      = getVexNvvvv(pfx);
21013   IRTemp argL     = newTemp(Ity_V128);
21014   IRTemp argR     = newTemp(Ity_V128);
21015
21016   assign(argL, getXMMReg(rV));
21017   if (epartIsReg(rm)) {
21018      imm8 = getUChar(delta+1);
21019      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
21020      if (!ok) return deltaIN; /* FAIL */
21021      UInt rE = eregOfRexRM(pfx,rm);
21022      assign(argR, getXMMReg(rE));
21023      delta += 1+1;
21024      DIP("%s $%d,%s,%s,%s\n",
21025          opname, (Int)imm8,
21026          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
21027   } else {
21028      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
21029      imm8 = getUChar(delta+alen);
21030      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
21031      if (!ok) return deltaIN; /* FAIL */
21032      assign(argR,
21033             all_lanes   ? loadLE(Ity_V128, mkexpr(addr))
21034             : sz == 8   ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
21035             : /*sz==4*/   unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
21036      delta += alen+1;
21037      DIP("%s $%d,%s,%s,%s\n",
21038          opname, (Int)imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
21039   }
21040
21041   assign(plain, preSwap ? binop(op, mkexpr(argR), mkexpr(argL))
21042                         : binop(op, mkexpr(argL), mkexpr(argR)));
21043
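   /* Note on the mkV128 masks used below: mkV128 takes a 16-bit
      immediate in which each bit stands for one byte of the 128-bit
      constant (bit set => byte is 0xFF).  Hence 0x000F covers the
      low 32-bit lane and 0x00FF the low 64-bit lane. */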
21044   if (all_lanes) {
21045      /* This is simple: just invert the result, if necessary, and
21046         have done. */
21047      if (postNot) {
21048         putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
21049      } else {
21050         putYMMRegLoAndZU( rG, mkexpr(plain) );
21051      }
21052   }
21053   else
21054   if (!preSwap) {
      /* More complex.  It's a one-lane-only operation, so we may
         need to invert just that one lane.  At least the other
         lanes are already correct in the result, having been copied
         from the left operand (argL). */
21059      if (postNot) {
21060         IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
21061         putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
21062                                                  mask) );
21063      } else {
21064         putYMMRegLoAndZU( rG, mkexpr(plain) );
21065      }
21066   }
21067   else {
21068      /* This is the most complex case.  One-lane-only, but the args
21069         were swapped.  So we have to possibly invert the bottom lane,
21070         and (definitely) we have to copy the upper lane(s) from argL
21071         since, due to the swapping, what's currently there is from
21072         argR, which is not correct. */
21073      IRTemp res     = newTemp(Ity_V128);
21074      IRTemp mask    = newTemp(Ity_V128);
21075      IRTemp notMask = newTemp(Ity_V128);
21076      assign(mask,    mkV128(sz==4 ? 0x000F : 0x00FF));
21077      assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
21078      if (postNot) {
21079         assign(res,
21080                binop(Iop_OrV128,
21081                      binop(Iop_AndV128,
21082                            unop(Iop_NotV128, mkexpr(plain)),
21083                            mkexpr(mask)),
21084                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
21085      } else {
21086         assign(res,
21087                binop(Iop_OrV128,
21088                      binop(Iop_AndV128,
21089                            mkexpr(plain),
21090                            mkexpr(mask)),
21091                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
21092      }
21093      putYMMRegLoAndZU( rG, mkexpr(res) );
21094   }
21095
21096   *uses_vvvv = True;
21097   return delta;
21098}
21099
21100
21101/* Handles AVX256 32F/64F comparisons.  A derivative of
21102   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
21103   original delta to indicate failure. */
21104static
21105Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
21106                               VexAbiInfo* vbi,
21107                               Prefix pfx, Long delta,
21108                               HChar* opname, Int sz )
21109{
21110   vassert(sz == 4 || sz == 8);
21111   Long    deltaIN = delta;
21112   HChar   dis_buf[50];
21113   Int     alen;
21114   UInt    imm8;
21115   IRTemp  addr;
21116   Bool    preSwap = False;
21117   IROp    op      = Iop_INVALID;
21118   Bool    postNot = False;
21119   IRTemp  plain   = newTemp(Ity_V256);
21120   UChar   rm      = getUChar(delta);
21121   UInt    rG      = gregOfRexRM(pfx, rm);
21122   UInt    rV      = getVexNvvvv(pfx);
21123   IRTemp argL     = newTemp(Ity_V256);
21124   IRTemp argR     = newTemp(Ity_V256);
21125   IRTemp argLhi   = IRTemp_INVALID;
21126   IRTemp argLlo   = IRTemp_INVALID;
21127   IRTemp argRhi   = IRTemp_INVALID;
21128   IRTemp argRlo   = IRTemp_INVALID;
21129
21130   assign(argL, getYMMReg(rV));
21131   if (epartIsReg(rm)) {
21132      imm8 = getUChar(delta+1);
21133      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
21134                             True/*all_lanes*/, sz);
21135      if (!ok) return deltaIN; /* FAIL */
21136      UInt rE = eregOfRexRM(pfx,rm);
21137      assign(argR, getYMMReg(rE));
21138      delta += 1+1;
21139      DIP("%s $%d,%s,%s,%s\n",
21140          opname, (Int)imm8,
21141          nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
21142   } else {
21143      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
21144      imm8 = getUChar(delta+alen);
21145      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
21146                             True/*all_lanes*/, sz);
21147      if (!ok) return deltaIN; /* FAIL */
21148      assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
21149      delta += alen+1;
21150      DIP("%s $%d,%s,%s,%s\n",
21151          opname, (Int)imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
21152   }
21153
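   /* findSSECmpOp only hands back 128-bit IROps, so do the 256-bit
      compare as two independent 128-bit compares on the halves and
      glue the results back together with Iop_V128HLtoV256. */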
21154   breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
21155   breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
21156   assign(plain, binop( Iop_V128HLtoV256,
21157                        binop(op, mkexpr(argLhi), mkexpr(argRhi)),
21158                        binop(op, mkexpr(argLlo), mkexpr(argRlo)) ) );
21159
21160   /* This is simple: just invert the result, if necessary, and
21161      have done. */
21162   if (postNot) {
21163      putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
21164   } else {
21165      putYMMReg( rG, mkexpr(plain) );
21166   }
21167
21168   *uses_vvvv = True;
21169   return delta;
21170}
21171
21172
/* Handles AVX128 unary E-to-G all-lanes operations, using the given
   IR generator to compute the result. */
21174static
21175Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
21176                               VexAbiInfo* vbi,
21177                               Prefix pfx, Long delta,
21178                               HChar* opname,
21179                               IRTemp (*opFn)(IRTemp) )
21180{
21181   HChar  dis_buf[50];
21182   Int    alen;
21183   IRTemp addr;
   IRTemp res  = IRTemp_INVALID;
21185   IRTemp arg  = newTemp(Ity_V128);
21186   UChar  rm   = getUChar(delta);
21187   UInt   rG   = gregOfRexRM(pfx, rm);
21188   if (epartIsReg(rm)) {
21189      UInt rE = eregOfRexRM(pfx,rm);
21190      assign(arg, getXMMReg(rE));
21191      delta += 1;
21192      DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
21193   } else {
21194      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21195      assign(arg, loadLE(Ity_V128, mkexpr(addr)));
21196      delta += alen;
21197      DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
21198   }
21199   res = opFn(arg);
21200   putYMMRegLoAndZU( rG, mkexpr(res) );
21201   *uses_vvvv = False;
21202   return delta;
21203}
21204
21205
/* Handles AVX128 unary E-to-G all-lanes operations, using a single
   IROp to compute the result. */
21207static
21208Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
21209                                   VexAbiInfo* vbi,
21210                                   Prefix pfx, Long delta,
21211                                   HChar* opname, IROp op )
21212{
21213   HChar  dis_buf[50];
21214   Int    alen;
21215   IRTemp addr;
21216   IRTemp arg  = newTemp(Ity_V128);
21217   UChar  rm   = getUChar(delta);
21218   UInt   rG   = gregOfRexRM(pfx, rm);
21219   if (epartIsReg(rm)) {
21220      UInt rE = eregOfRexRM(pfx,rm);
21221      assign(arg, getXMMReg(rE));
21222      delta += 1;
21223      DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
21224   } else {
21225      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21226      assign(arg, loadLE(Ity_V128, mkexpr(addr)));
21227      delta += alen;
21228      DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
21229   }
21230   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
21231   *uses_vvvv = False;
21232   return delta;
21233}
21234
21235
21236/* FIXME: common up with the _128_ version above? */
21237static
21238Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
21239        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
21240        Prefix pfx, Long delta, HChar* name,
        /* The actual operation.  Use either 'op' or 'opFn',
           but not both. */
21243        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
21244        Bool invertLeftArg,
21245        Bool swapArgs
21246     )
21247{
21248   UChar  modrm = getUChar(delta);
21249   UInt   rD    = gregOfRexRM(pfx, modrm);
21250   UInt   rSL   = getVexNvvvv(pfx);
21251   IRTemp tSL   = newTemp(Ity_V256);
21252   IRTemp tSR   = newTemp(Ity_V256);
21253   IRTemp addr  = IRTemp_INVALID;
21254   HChar  dis_buf[50];
21255   Int    alen  = 0;
21256   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);
21257
21258   assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
21259                             : getYMMReg(rSL));
21260
21261   if (epartIsReg(modrm)) {
21262      UInt rSR = eregOfRexRM(pfx, modrm);
21263      delta += 1;
21264      assign(tSR, getYMMReg(rSR));
21265      DIP("%s %s,%s,%s\n",
21266          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
21267   } else {
21268      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
21269      delta += alen;
21270      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
21271      DIP("%s %s,%s,%s\n",
21272          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
21273   }
21274
21275   IRTemp res = IRTemp_INVALID;
21276   if (op != Iop_INVALID) {
21277      vassert(opFn == NULL);
21278      res = newTemp(Ity_V256);
21279      assign(res, swapArgs ? binop(op, mkexpr(tSR), mkexpr(tSL))
21280                           : binop(op, mkexpr(tSL), mkexpr(tSR)));
21281   } else {
21282      vassert(opFn != NULL);
21283      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
21284   }
21285
21286   putYMMReg(rD, mkexpr(res));
21287
21288   *uses_vvvv = True;
21289   return delta;
21290}
21291
21292
21293/* All-lanes AVX256 binary operation:
21294   G[255:0] = V[255:0] `op` E[255:0]
21295*/
21296static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
21297                                  VexAbiInfo* vbi,
21298                                  Prefix pfx, Long delta,
21299                                  HChar* opname, IROp op )
21300{
21301   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
21302             uses_vvvv, vbi, pfx, delta, opname, op,
21303             NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
21304   );
21305}
21306
21307
21308/* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
21309   generator to compute the result, no inversion of the left
21310   arg, and no swapping of args. */
21311static
21312Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
21313        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
21314        Prefix pfx, Long delta, HChar* name,
21315        IRTemp(*opFn)(IRTemp,IRTemp)
21316     )
21317{
21318   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
21319             uses_vvvv, vbi, pfx, delta, name,
21320             Iop_INVALID, opFn, False, False );
21321}
21322
21323
21324/* Handles AVX256 unary E-to-G all-lanes operations. */
21325static
21326Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
21327                                   VexAbiInfo* vbi,
21328                                   Prefix pfx, Long delta,
21329                                   HChar* opname, IROp op )
21330{
21331   HChar  dis_buf[50];
21332   Int    alen;
21333   IRTemp addr;
21334   IRTemp arg  = newTemp(Ity_V256);
21335   UChar  rm   = getUChar(delta);
21336   UInt   rG   = gregOfRexRM(pfx, rm);
21337   if (epartIsReg(rm)) {
21338      UInt rE = eregOfRexRM(pfx,rm);
21339      assign(arg, getYMMReg(rE));
21340      delta += 1;
21341      DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
21342   } else {
21343      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21344      assign(arg, loadLE(Ity_V256, mkexpr(addr)));
21345      delta += alen;
21346      DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
21347   }
21348   putYMMReg( rG, unop(op, mkexpr(arg)) );
21349   *uses_vvvv = False;
21350   return delta;
21351}
21352
21353
21354/* The use of ReinterpF64asI64 is ugly.  Surely could do better if we
21355   had a variant of Iop_64x4toV256 that took F64s as args instead. */
21356static Long dis_CVTDQ2PD_256 ( VexAbiInfo* vbi, Prefix pfx,
21357                               Long delta )
21358{
21359   IRTemp addr  = IRTemp_INVALID;
21360   Int    alen  = 0;
21361   HChar  dis_buf[50];
21362   UChar  modrm = getUChar(delta);
21363   IRTemp sV    = newTemp(Ity_V128);
21364   UInt   rG    = gregOfRexRM(pfx,modrm);
21365   if (epartIsReg(modrm)) {
21366      UInt rE = eregOfRexRM(pfx,modrm);
21367      assign( sV, getXMMReg(rE) );
21368      delta += 1;
21369      DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
21370   } else {
21371      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21372      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
21373      delta += alen;
21374      DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
21375   }
21376   IRTemp s3, s2, s1, s0;
21377   s3 = s2 = s1 = s0 = IRTemp_INVALID;
21378   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
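   /* Each I32 lane is widened with Iop_I32StoF64.  That conversion
      is exact (every 32-bit int is representable as an F64), which
      is why no rounding mode operand is needed here, unlike the
      PD->PS direction below. */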
21379   IRExpr* res
21380      = IRExpr_Qop(
21381           Iop_64x4toV256,
21382           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
21383           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
21384           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
21385           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
21386        );
21387   putYMMReg(rG, res);
21388   return delta;
21389}
21390
21391
21392static Long dis_CVTPD2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
21393                               Long delta )
21394{
21395   IRTemp addr  = IRTemp_INVALID;
21396   Int    alen  = 0;
21397   HChar  dis_buf[50];
21398   UChar  modrm = getUChar(delta);
21399   UInt   rG    = gregOfRexRM(pfx,modrm);
21400   IRTemp argV  = newTemp(Ity_V256);
21401   IRTemp rmode = newTemp(Ity_I32);
21402   if (epartIsReg(modrm)) {
21403      UInt rE = eregOfRexRM(pfx,modrm);
21404      assign( argV, getYMMReg(rE) );
21405      delta += 1;
21406      DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
21407   } else {
21408      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21409      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
21410      delta += alen;
21411      DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
21412   }
21413
21414   assign( rmode, get_sse_roundingmode() );
21415   IRTemp t3, t2, t1, t0;
21416   t3 = t2 = t1 = t0 = IRTemp_INVALID;
21417   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
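   /* F64->F32 can lose precision, so Iop_F64toF32 takes the current
      SSE rounding mode (fetched into 'rmode' above) as its first
      argument. */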
21418#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), \
21419                          unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
21420   putXMMRegLane32F( rG, 3, CVT(t3) );
21421   putXMMRegLane32F( rG, 2, CVT(t2) );
21422   putXMMRegLane32F( rG, 1, CVT(t1) );
21423   putXMMRegLane32F( rG, 0, CVT(t0) );
21424#  undef CVT
21425   putYMMRegLane128( rG, 1, mkV128(0) );
21426   return delta;
21427}
21428
21429
21430__attribute__((noinline))
21431static
21432Long dis_ESC_0F__VEX (
21433        /*MB_OUT*/DisResult* dres,
21434        /*OUT*/   Bool*      uses_vvvv,
21435        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
21436        Bool         resteerCisOk,
21437        void*        callback_opaque,
21438        VexArchInfo* archinfo,
21439        VexAbiInfo*  vbi,
21440        Prefix pfx, Int sz, Long deltaIN
21441     )
21442{
21443   IRTemp addr  = IRTemp_INVALID;
21444   Int    alen  = 0;
21445   HChar  dis_buf[50];
21446   Long   delta = deltaIN;
21447   UChar  opc   = getUChar(delta);
21448   delta++;
21449   *uses_vvvv = False;
21450
21451   switch (opc) {
21452
21453   case 0x10:
21454      /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
21455      /* Move 64 bits from E (mem only) to G (lo half xmm).
21456         Bits 255-64 of the dest are zeroed out. */
21457      if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
21458         UChar modrm = getUChar(delta);
21459         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21460         UInt   rG   = gregOfRexRM(pfx,modrm);
21461         IRTemp z128 = newTemp(Ity_V128);
21462         assign(z128, mkV128(0));
21463         putXMMReg( rG, mkexpr(z128) );
21464         /* FIXME: ALIGNMENT CHECK? */
21465         putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
21466         putYMMRegLane128( rG, 1, mkexpr(z128) );
21467         DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
21468         delta += alen;
21469         goto decode_success;
21470      }
21471      /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
21472      /* Reg form. */
21473      if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
21474         UChar modrm = getUChar(delta);
21475         UInt  rG    = gregOfRexRM(pfx, modrm);
21476         UInt  rE    = eregOfRexRM(pfx, modrm);
21477         UInt  rV    = getVexNvvvv(pfx);
21478         delta++;
21479         DIP("vmovsd %s,%s,%s\n",
21480             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
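         /* res = V[127:64] : E[63:0] -- the low lane comes from E
            and the high lane from the vvvv register. */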
21481         IRTemp res = newTemp(Ity_V128);
21482         assign(res, binop(Iop_64HLtoV128,
21483                           getXMMRegLane64(rV, 1),
21484                           getXMMRegLane64(rE, 0)));
21485         putYMMRegLoAndZU(rG, mkexpr(res));
21486         *uses_vvvv = True;
21487         goto decode_success;
21488      }
21489      /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
21490      /* Move 32 bits from E (mem only) to G (lo half xmm).
21491         Bits 255-32 of the dest are zeroed out. */
21492      if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
21493         UChar modrm = getUChar(delta);
21494         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21495         UInt   rG   = gregOfRexRM(pfx,modrm);
21496         IRTemp z128 = newTemp(Ity_V128);
21497         assign(z128, mkV128(0));
21498         putXMMReg( rG, mkexpr(z128) );
21499         /* FIXME: ALIGNMENT CHECK? */
21500         putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
21501         putYMMRegLane128( rG, 1, mkexpr(z128) );
21502         DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
21503         delta += alen;
21504         goto decode_success;
21505      }
21506      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
21507      /* Reg form. */
21508      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
21509         UChar modrm = getUChar(delta);
21510         UInt  rG    = gregOfRexRM(pfx, modrm);
21511         UInt  rE    = eregOfRexRM(pfx, modrm);
21512         UInt  rV    = getVexNvvvv(pfx);
21513         delta++;
21514         DIP("vmovss %s,%s,%s\n",
21515             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
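         /* res = V[127:64] : V[63:32] : E[31:0] -- only the low
            32-bit lane comes from E; the rest is copied from the
            vvvv register. */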
21516         IRTemp res = newTemp(Ity_V128);
21517         assign( res, binop( Iop_64HLtoV128,
21518                             getXMMRegLane64(rV, 1),
21519                             binop(Iop_32HLto64,
21520                                   getXMMRegLane32(rV, 1),
21521                                   getXMMRegLane32(rE, 0)) ) );
21522         putYMMRegLoAndZU(rG, mkexpr(res));
21523         *uses_vvvv = True;
21524         goto decode_success;
21525      }
21526      /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
21527      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
21528         UChar modrm = getUChar(delta);
21529         UInt  rG    = gregOfRexRM(pfx, modrm);
21530         if (epartIsReg(modrm)) {
21531            UInt rE = eregOfRexRM(pfx,modrm);
21532            putYMMRegLoAndZU( rG, getXMMReg( rE ));
21533            DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
21534            delta += 1;
21535         } else {
21536            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21537            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
21538            DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
21539            delta += alen;
21540         }
21541         goto decode_success;
21542      }
21543      /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
21544      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
21545         UChar modrm = getUChar(delta);
21546         UInt  rG    = gregOfRexRM(pfx, modrm);
21547         if (epartIsReg(modrm)) {
21548            UInt rE = eregOfRexRM(pfx,modrm);
21549            putYMMReg( rG, getYMMReg( rE ));
21550            DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
21551            delta += 1;
21552         } else {
21553            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21554            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
21555            DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
21556            delta += alen;
21557         }
21558         goto decode_success;
21559      }
21560      /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
21561      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
21562         UChar modrm = getUChar(delta);
21563         UInt  rG    = gregOfRexRM(pfx, modrm);
21564         if (epartIsReg(modrm)) {
21565            UInt rE = eregOfRexRM(pfx,modrm);
21566            putYMMRegLoAndZU( rG, getXMMReg( rE ));
21567            DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
21568            delta += 1;
21569         } else {
21570            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21571            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
21572            DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
21573            delta += alen;
21574         }
21575         goto decode_success;
21576      }
21577      /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
21578      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
21579         UChar modrm = getUChar(delta);
21580         UInt  rG    = gregOfRexRM(pfx, modrm);
21581         if (epartIsReg(modrm)) {
21582            UInt rE = eregOfRexRM(pfx,modrm);
21583            putYMMReg( rG, getYMMReg( rE ));
21584            DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
21585            delta += 1;
21586         } else {
21587            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21588            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
21589            DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
21590            delta += alen;
21591         }
21592         goto decode_success;
21593      }
21594      break;
21595
21596   case 0x11:
21597      /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
21598      /* Move 64 bits from G (low half xmm) to mem only. */
21599      if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
21600         UChar modrm = getUChar(delta);
21601         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21602         UInt   rG   = gregOfRexRM(pfx,modrm);
21603         /* FIXME: ALIGNMENT CHECK? */
21604         storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
21605         DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
21606         delta += alen;
21607         goto decode_success;
21608      }
21609      /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
21610      /* Reg form. */
21611      if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
21612         UChar modrm = getUChar(delta);
21613         UInt  rG    = gregOfRexRM(pfx, modrm);
21614         UInt  rE    = eregOfRexRM(pfx, modrm);
21615         UInt  rV    = getVexNvvvv(pfx);
21616         delta++;
21617         DIP("vmovsd %s,%s,%s\n",
21618             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
21619         IRTemp res = newTemp(Ity_V128);
21620         assign(res, binop(Iop_64HLtoV128,
21621                           getXMMRegLane64(rV, 1),
21622                           getXMMRegLane64(rE, 0)));
21623         putYMMRegLoAndZU(rG, mkexpr(res));
21624         *uses_vvvv = True;
21625         goto decode_success;
21626      }
      /* VMOVSS xmm1, m32 = VEX.LIG.F3.0F.WIG 11 /r */
21628      /* Move 32 bits from G (low 1/4 xmm) to mem only. */
21629      if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
21630         UChar modrm = getUChar(delta);
21631         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21632         UInt   rG   = gregOfRexRM(pfx,modrm);
21633         /* FIXME: ALIGNMENT CHECK? */
21634         storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
21635         DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
21636         delta += alen;
21637         goto decode_success;
21638      }
21639      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
21640      /* Reg form. */
21641      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
21642         UChar modrm = getUChar(delta);
21643         UInt  rG    = gregOfRexRM(pfx, modrm);
21644         UInt  rE    = eregOfRexRM(pfx, modrm);
21645         UInt  rV    = getVexNvvvv(pfx);
21646         delta++;
21647         DIP("vmovss %s,%s,%s\n",
21648             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
21649         IRTemp res = newTemp(Ity_V128);
21650         assign( res, binop( Iop_64HLtoV128,
21651                             getXMMRegLane64(rV, 1),
21652                             binop(Iop_32HLto64,
21653                                   getXMMRegLane32(rV, 1),
21654                                   getXMMRegLane32(rE, 0)) ) );
21655         putYMMRegLoAndZU(rG, mkexpr(res));
21656         *uses_vvvv = True;
21657         goto decode_success;
21658      }
21659      /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
21660      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
21661         UChar modrm = getUChar(delta);
21662         UInt  rG    = gregOfRexRM(pfx,modrm);
21663         if (epartIsReg(modrm)) {
21664            UInt rE = eregOfRexRM(pfx,modrm);
21665            putYMMRegLoAndZU( rE, getXMMReg(rG) );
21666            DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
21667            delta += 1;
21668         } else {
21669            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21670            storeLE( mkexpr(addr), getXMMReg(rG) );
21671            DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
21672            delta += alen;
21673         }
21674         goto decode_success;
21675      }
21676      /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
21677      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
21678         UChar modrm = getUChar(delta);
21679         UInt  rG    = gregOfRexRM(pfx,modrm);
21680         if (epartIsReg(modrm)) {
21681            UInt rE = eregOfRexRM(pfx,modrm);
21682            putYMMReg( rE, getYMMReg(rG) );
21683            DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
21684            delta += 1;
21685         } else {
21686            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21687            storeLE( mkexpr(addr), getYMMReg(rG) );
21688            DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
21689            delta += alen;
21690         }
21691         goto decode_success;
21692      }
21693      /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
21694      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
21695         UChar modrm = getUChar(delta);
21696         UInt  rG    = gregOfRexRM(pfx,modrm);
21697         if (epartIsReg(modrm)) {
21698            UInt rE = eregOfRexRM(pfx,modrm);
21699            putYMMRegLoAndZU( rE, getXMMReg(rG) );
21700            DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
21701            delta += 1;
21702         } else {
21703            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21704            storeLE( mkexpr(addr), getXMMReg(rG) );
21705            DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
21706            delta += alen;
21707         }
21708         goto decode_success;
21709      }
21710      /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
21711      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
21712         UChar modrm = getUChar(delta);
21713         UInt  rG    = gregOfRexRM(pfx,modrm);
21714         if (epartIsReg(modrm)) {
21715            UInt rE = eregOfRexRM(pfx,modrm);
21716            putYMMReg( rE, getYMMReg(rG) );
21717            DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
21718            delta += 1;
21719         } else {
21720            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21721            storeLE( mkexpr(addr), getYMMReg(rG) );
21722            DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
21723            delta += alen;
21724         }
21725         goto decode_success;
21726      }
21727      break;
21728
21729   case 0x12:
      /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG 12 /r */
21731      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
21732         delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
21733         goto decode_success;
21734      }
      /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 12 /r */
21736      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
21737         delta = dis_MOVDDUP_256( vbi, pfx, delta );
21738         goto decode_success;
21739      }
21740      /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
21741      /* Insn only exists in reg form */
21742      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
21743          && epartIsReg(getUChar(delta))) {
21744         UChar modrm = getUChar(delta);
21745         UInt  rG    = gregOfRexRM(pfx, modrm);
21746         UInt  rE    = eregOfRexRM(pfx, modrm);
21747         UInt  rV    = getVexNvvvv(pfx);
21748         delta++;
21749         DIP("vmovhlps %s,%s,%s\n",
21750             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
21751         IRTemp res = newTemp(Ity_V128);
21752         assign(res, binop(Iop_64HLtoV128,
21753                           getXMMRegLane64(rV, 1),
21754                           getXMMRegLane64(rE, 1)));
21755         putYMMRegLoAndZU(rG, mkexpr(res));
21756         *uses_vvvv = True;
21757         goto decode_success;
21758      }
21759      /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
21760      /* Insn exists only in mem form, it appears. */
21761      /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
21762      /* Insn exists only in mem form, it appears. */
21763      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
21764          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
21765         UChar modrm = getUChar(delta);
21766         UInt  rG    = gregOfRexRM(pfx, modrm);
21767         UInt  rV    = getVexNvvvv(pfx);
21768         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21769         delta += alen;
21770         DIP("vmovlpd %s,%s,%s\n",
21771             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
21772         IRTemp res = newTemp(Ity_V128);
21773         assign(res, binop(Iop_64HLtoV128,
21774                           getXMMRegLane64(rV, 1),
21775                           loadLE(Ity_I64, mkexpr(addr))));
21776         putYMMRegLoAndZU(rG, mkexpr(res));
21777         *uses_vvvv = True;
21778         goto decode_success;
21779      }
21780      /* VMOVSLDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 12 /r */
21781      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
21782         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
21783                                   True/*isL*/ );
21784         goto decode_success;
21785      }
21786      /* VMOVSLDUP ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 12 /r */
21787      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
21788         delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
21789         goto decode_success;
21790      }
21791      break;
21792
21793   case 0x13:
21794      /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
21795      /* Insn exists only in mem form, it appears. */
21796      /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
21797      /* Insn exists only in mem form, it appears. */
21798      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
21799          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
21800         UChar modrm = getUChar(delta);
21801         UInt  rG    = gregOfRexRM(pfx, modrm);
21802         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21803         delta += alen;
21804         storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
21805         DIP("vmovlp%c %s,%s\n", have66(pfx) ? 'd' : 's', nameXMMReg(rG), dis_buf);
21806         goto decode_success;
21807      }
21808      break;
21809
21810   case 0x14:
21811   case 0x15:
21812      /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
21813      /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
21814      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
21815         Bool   hi    = opc == 0x15;
21816         UChar  modrm = getUChar(delta);
21817         UInt   rG    = gregOfRexRM(pfx,modrm);
21818         UInt   rV    = getVexNvvvv(pfx);
21819         IRTemp eV    = newTemp(Ity_V128);
21820         IRTemp vV    = newTemp(Ity_V128);
21821         assign( vV, getXMMReg(rV) );
21822         if (epartIsReg(modrm)) {
21823            UInt rE = eregOfRexRM(pfx,modrm);
21824            assign( eV, getXMMReg(rE) );
21825            delta += 1;
21826            DIP("vunpck%sps %s,%s,%s\n", hi ? "h" : "l",
21827                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
21828         } else {
21829            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21830            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
21831            delta += alen;
21832            DIP("vunpck%sps %s,%s,%s\n", hi ? "h" : "l",
21833                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
21834         }
21835         IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
21836         putYMMRegLoAndZU( rG, mkexpr(res) );
21837         *uses_vvvv = True;
21838         goto decode_success;
21839      }
21840      /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
21841      /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
21842      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
21843         Bool   hi    = opc == 0x15;
21844         UChar  modrm = getUChar(delta);
21845         UInt   rG    = gregOfRexRM(pfx,modrm);
21846         UInt   rV    = getVexNvvvv(pfx);
21847         IRTemp eV    = newTemp(Ity_V256);
21848         IRTemp vV    = newTemp(Ity_V256);
21849         assign( vV, getYMMReg(rV) );
21850         if (epartIsReg(modrm)) {
21851            UInt rE = eregOfRexRM(pfx,modrm);
21852            assign( eV, getYMMReg(rE) );
21853            delta += 1;
21854            DIP("vunpck%sps %s,%s,%s\n", hi ? "h" : "l",
21855                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
21856         } else {
21857            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21858            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
21859            delta += alen;
21860            DIP("vunpck%sps %s,%s,%s\n", hi ? "h" : "l",
21861                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
21862         }
21863         IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
21864         putYMMReg( rG, mkexpr(res) );
21865         *uses_vvvv = True;
21866         goto decode_success;
21867      }
21868      /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
21869      /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
21870      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
21871         Bool   hi    = opc == 0x15;
21872         UChar  modrm = getUChar(delta);
21873         UInt   rG    = gregOfRexRM(pfx,modrm);
21874         UInt   rV    = getVexNvvvv(pfx);
21875         IRTemp eV    = newTemp(Ity_V128);
21876         IRTemp vV    = newTemp(Ity_V128);
21877         assign( vV, getXMMReg(rV) );
21878         if (epartIsReg(modrm)) {
21879            UInt rE = eregOfRexRM(pfx,modrm);
21880            assign( eV, getXMMReg(rE) );
21881            delta += 1;
21882            DIP("vunpck%spd %s,%s,%s\n", hi ? "h" : "l",
21883                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
21884         } else {
21885            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21886            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
21887            delta += alen;
21888            DIP("vunpck%spd %s,%s,%s\n", hi ? "h" : "l",
21889                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
21890         }
21891         IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
21892         putYMMRegLoAndZU( rG, mkexpr(res) );
21893         *uses_vvvv = True;
21894         goto decode_success;
21895      }
21896      /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
21897      /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
21898      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
21899         Bool   hi    = opc == 0x15;
21900         UChar  modrm = getUChar(delta);
21901         UInt   rG    = gregOfRexRM(pfx,modrm);
21902         UInt   rV    = getVexNvvvv(pfx);
21903         IRTemp eV    = newTemp(Ity_V256);
21904         IRTemp vV    = newTemp(Ity_V256);
21905         assign( vV, getYMMReg(rV) );
21906         if (epartIsReg(modrm)) {
21907            UInt rE = eregOfRexRM(pfx,modrm);
21908            assign( eV, getYMMReg(rE) );
21909            delta += 1;
21910            DIP("vunpck%spd %s,%s,%s\n", hi ? "h" : "l",
21911                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
21912         } else {
21913            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21914            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
21915            delta += alen;
21916            DIP("vunpck%spd %s,%s,%s\n", hi ? "h" : "l",
21917                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
21918         }
21919         IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
21920         putYMMReg( rG, mkexpr(res) );
21921         *uses_vvvv = True;
21922         goto decode_success;
21923      }
21924      break;
21925
21926   case 0x16:
21927      /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
21928      /* Insn only exists in reg form */
21929      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
21930          && epartIsReg(getUChar(delta))) {
21931         UChar modrm = getUChar(delta);
21932         UInt  rG    = gregOfRexRM(pfx, modrm);
21933         UInt  rE    = eregOfRexRM(pfx, modrm);
21934         UInt  rV    = getVexNvvvv(pfx);
21935         delta++;
21936         DIP("vmovlhps %s,%s,%s\n",
21937             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
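         /* Result: lo64 = rV's lo64, hi64 = rE's lo64. */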
21938         IRTemp res = newTemp(Ity_V128);
21939         assign(res, binop(Iop_64HLtoV128,
21940                           getXMMRegLane64(rE, 0),
21941                           getXMMRegLane64(rV, 0)));
21942         putYMMRegLoAndZU(rG, mkexpr(res));
21943         *uses_vvvv = True;
21944         goto decode_success;
21945      }
21946      /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
21947      /* Insn exists only in mem form, it appears. */
21948      /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
21949      /* Insn exists only in mem form, it appears. */
21950      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
21951          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
21952         UChar modrm = getUChar(delta);
21953         UInt  rG    = gregOfRexRM(pfx, modrm);
21954         UInt  rV    = getVexNvvvv(pfx);
21955         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21956         delta += alen;
21957         DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
21958             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
21959         IRTemp res = newTemp(Ity_V128);
21960         assign(res, binop(Iop_64HLtoV128,
21961                           loadLE(Ity_I64, mkexpr(addr)),
21962                           getXMMRegLane64(rV, 0)));
21963         putYMMRegLoAndZU(rG, mkexpr(res));
21964         *uses_vvvv = True;
21965         goto decode_success;
21966      }
21967      /* VMOVSHDUP xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 16 /r */
21968      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
21969         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
21970                                   False/*!isL*/ );
21971         goto decode_success;
21972      }
21973      /* VMOVSHDUP ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 16 /r */
21974      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
21975         delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
21976         goto decode_success;
21977      }
21978      break;
21979
21980   case 0x17:
21981      /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
21982      /* Insn exists only in mem form, it appears. */
21983      /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
21984      /* Insn exists only in mem form, it appears. */
21985      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
21986          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
21987         UChar modrm = getUChar(delta);
21988         UInt  rG    = gregOfRexRM(pfx, modrm);
21989         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21990         delta += alen;
21991         storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
21992         DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
21993             nameXMMReg(rG), dis_buf);
21994         goto decode_success;
21995      }
21996      break;
21997
21998   case 0x28:
21999      /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
22000      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22001         UChar modrm = getUChar(delta);
22002         UInt  rG    = gregOfRexRM(pfx, modrm);
22003         if (epartIsReg(modrm)) {
22004            UInt rE = eregOfRexRM(pfx,modrm);
22005            putYMMRegLoAndZU( rG, getXMMReg( rE ));
22006            DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
22007            delta += 1;
22008         } else {
22009            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
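            /* The aligned-move forms fault unless the memory operand
               is 16-aligned (32-aligned for the 256-bit forms). */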
22010            gen_SEGV_if_not_16_aligned( addr );
22011            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
22012            DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
22013            delta += alen;
22014         }
22015         goto decode_success;
22016      }
22017      /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
22018      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22019         UChar modrm = getUChar(delta);
22020         UInt  rG    = gregOfRexRM(pfx, modrm);
22021         if (epartIsReg(modrm)) {
22022            UInt rE = eregOfRexRM(pfx,modrm);
22023            putYMMReg( rG, getYMMReg( rE ));
22024            DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
22025            delta += 1;
22026         } else {
22027            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22028            gen_SEGV_if_not_32_aligned( addr );
22029            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
22030            DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
22031            delta += alen;
22032         }
22033         goto decode_success;
22034      }
22035      /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
22036      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22037         UChar modrm = getUChar(delta);
22038         UInt  rG    = gregOfRexRM(pfx, modrm);
22039         if (epartIsReg(modrm)) {
22040            UInt rE = eregOfRexRM(pfx,modrm);
22041            putYMMRegLoAndZU( rG, getXMMReg( rE ));
22042            DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
22043            delta += 1;
22044         } else {
22045            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22046            gen_SEGV_if_not_16_aligned( addr );
22047            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
22048            DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
22049            delta += alen;
22050         }
22051         goto decode_success;
22052      }
22053      /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
22054      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22055         UChar modrm = getUChar(delta);
22056         UInt  rG    = gregOfRexRM(pfx, modrm);
22057         if (epartIsReg(modrm)) {
22058            UInt rE = eregOfRexRM(pfx,modrm);
22059            putYMMReg( rG, getYMMReg( rE ));
22060            DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
22061            delta += 1;
22062         } else {
22063            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22064            gen_SEGV_if_not_32_aligned( addr );
22065            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
22066            DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
22067            delta += alen;
22068         }
22069         goto decode_success;
22070      }
22071      break;
22072
22073   case 0x29:
22074      /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
22075      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22076         UChar modrm = getUChar(delta);
22077         UInt  rG    = gregOfRexRM(pfx,modrm);
22078         if (epartIsReg(modrm)) {
22079            UInt rE = eregOfRexRM(pfx,modrm);
22080            putYMMRegLoAndZU( rE, getXMMReg(rG) );
22081            DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
22082            delta += 1;
22083         } else {
22084            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22085            gen_SEGV_if_not_16_aligned( addr );
22086            storeLE( mkexpr(addr), getXMMReg(rG) );
22087            DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
22088            delta += alen;
22089         }
22090         goto decode_success;
22091      }
22092      /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
22093      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22094         UChar modrm = getUChar(delta);
22095         UInt  rG    = gregOfRexRM(pfx,modrm);
22096         if (epartIsReg(modrm)) {
22097            UInt rE = eregOfRexRM(pfx,modrm);
22098            putYMMReg( rE, getYMMReg(rG) );
22099            DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
22100            delta += 1;
22101         } else {
22102            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22103            gen_SEGV_if_not_32_aligned( addr );
22104            storeLE( mkexpr(addr), getYMMReg(rG) );
22105            DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
22106            delta += alen;
22107         }
22108         goto decode_success;
22109      }
22110      /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
22111      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22112         UChar modrm = getUChar(delta);
22113         UInt  rG    = gregOfRexRM(pfx,modrm);
22114         if (epartIsReg(modrm)) {
22115            UInt rE = eregOfRexRM(pfx,modrm);
22116            putYMMRegLoAndZU( rE, getXMMReg(rG) );
22117            DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
22118            delta += 1;
22119            goto decode_success;
22120         } else {
22121            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22122            gen_SEGV_if_not_16_aligned( addr );
22123            storeLE( mkexpr(addr), getXMMReg(rG) );
22124            DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
22125            delta += alen;
22126            goto decode_success;
22127         }
22128      }
22129      /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
22130      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22131         UChar modrm = getUChar(delta);
22132         UInt  rG    = gregOfRexRM(pfx,modrm);
22133         if (epartIsReg(modrm)) {
22134            UInt rE = eregOfRexRM(pfx,modrm);
22135            putYMMReg( rE, getYMMReg(rG) );
22136            DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
22137            delta += 1;
22138            goto decode_success;
22139         } else {
22140            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22141            gen_SEGV_if_not_32_aligned( addr );
22142            storeLE( mkexpr(addr), getYMMReg(rG) );
22143            DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
22144            delta += alen;
22145            goto decode_success;
22146         }
22147      }
22148      break;
22149
22150   case 0x2A: {
22151      IRTemp rmode = newTemp(Ity_I32);
22152      assign( rmode, get_sse_roundingmode() );
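      /* rmode caches the guest's SSE rounding mode (MXCSR.RC), for
         use by those conversions below which can actually round. */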
22153      /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
22154      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
22155         UChar  modrm = getUChar(delta);
22156         UInt   rV    = getVexNvvvv(pfx);
22157         UInt   rD    = gregOfRexRM(pfx, modrm);
22158         IRTemp arg32 = newTemp(Ity_I32);
22159         if (epartIsReg(modrm)) {
22160            UInt rS = eregOfRexRM(pfx,modrm);
22161            assign( arg32, getIReg32(rS) );
22162            delta += 1;
22163            DIP("vcvtsi2sdl %s,%s,%s\n",
22164                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
22165         } else {
22166            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22167            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
22168            delta += alen;
22169            DIP("vcvtsi2sdl %s,%s,%s\n",
22170                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
22171         }
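         /* The I32 -> F64 conversion is exact, so no rounding mode
            is needed (hence unop, not binop, here). */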
22172         putXMMRegLane64F( rD, 0,
22173                           unop(Iop_I32StoF64, mkexpr(arg32)));
22174         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
22175         putYMMRegLane128( rD, 1, mkV128(0) );
22176         *uses_vvvv = True;
22177         goto decode_success;
22178      }
22179      /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
22180      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
22181         UChar  modrm = getUChar(delta);
22182         UInt   rV    = getVexNvvvv(pfx);
22183         UInt   rD    = gregOfRexRM(pfx, modrm);
22184         IRTemp arg64 = newTemp(Ity_I64);
22185         if (epartIsReg(modrm)) {
22186            UInt rS = eregOfRexRM(pfx,modrm);
22187            assign( arg64, getIReg64(rS) );
22188            delta += 1;
22189            DIP("vcvtsi2sdq %s,%s,%s\n",
22190                nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
22191         } else {
22192            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22193            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
22194            delta += alen;
22195            DIP("vcvtsi2sdq %s,%s,%s\n",
22196                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
22197         }
22198         putXMMRegLane64F( rD, 0,
22199                           binop( Iop_I64StoF64,
22200                                  mkexpr(rmode),
22201                                  mkexpr(arg64)) );
22202         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
22203         putYMMRegLane128( rD, 1, mkV128(0) );
22204         *uses_vvvv = True;
22205         goto decode_success;
22206      }
22207      /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
22208      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
22209         UChar  modrm = getUChar(delta);
22210         UInt   rV    = getVexNvvvv(pfx);
22211         UInt   rD    = gregOfRexRM(pfx, modrm);
22212         IRTemp arg64 = newTemp(Ity_I64);
22213         if (epartIsReg(modrm)) {
22214            UInt rS = eregOfRexRM(pfx,modrm);
22215            assign( arg64, getIReg64(rS) );
22216            delta += 1;
22217            DIP("vcvtsi2ssq %s,%s,%s\n",
22218                nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
22219         } else {
22220            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22221            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
22222            delta += alen;
22223            DIP("vcvtsi2ssq %s,%s,%s\n",
22224                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
22225         }
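         /* Note: this rounds twice (once for I64 -> F64, once for
            F64 -> F32), whereas the real insn rounds only once, so
            the result can differ in the last place for some args. */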
22226         putXMMRegLane32F( rD, 0,
22227                           binop(Iop_F64toF32,
22228                                 mkexpr(rmode),
22229                                 binop(Iop_I64StoF64, mkexpr(rmode),
22230                                                      mkexpr(arg64)) ) );
22231         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
22232         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
22233         putYMMRegLane128( rD, 1, mkV128(0) );
22234         *uses_vvvv = True;
22235         goto decode_success;
22236      }
22237      /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
22238      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
22239         UChar  modrm = getUChar(delta);
22240         UInt   rV    = getVexNvvvv(pfx);
22241         UInt   rD    = gregOfRexRM(pfx, modrm);
22242         IRTemp arg32 = newTemp(Ity_I32);
22243         if (epartIsReg(modrm)) {
22244            UInt rS = eregOfRexRM(pfx,modrm);
22245            assign( arg32, getIReg32(rS) );
22246            delta += 1;
22247            DIP("vcvtsi2ssl %s,%s,%s\n",
22248                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
22249         } else {
22250            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22251            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
22252            delta += alen;
22253            DIP("vcvtsi2ssl %s,%s,%s\n",
22254                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
22255         }
22256         putXMMRegLane32F( rD, 0,
22257                           binop(Iop_F64toF32,
22258                                 mkexpr(rmode),
22259                                 unop(Iop_I32StoF64, mkexpr(arg32)) ) );
22260         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
22261         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
22262         putYMMRegLane128( rD, 1, mkV128(0) );
22263         *uses_vvvv = True;
22264         goto decode_success;
22265      }
22266      break;
22267   }
22268
22269   case 0x2B:
22270      /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
22271      /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
22272      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
22273          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
22274         UChar  modrm = getUChar(delta);
22275         UInt   rS    = gregOfRexRM(pfx, modrm);
22276         IRTemp tS    = newTemp(Ity_V128);
22277         assign(tS, getXMMReg(rS));
22278         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
22279         delta += alen;
22280         gen_SEGV_if_not_16_aligned(addr);
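         /* The non-temporal hint is not representable in IR; this
            is handled as an ordinary aligned store. */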
22281         storeLE(mkexpr(addr), mkexpr(tS));
22282         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
22283             nameXMMReg(rS), dis_buf);
22284         goto decode_success;
22285      }
22286      /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
22287      /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
22288      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
22289          && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
22290         UChar  modrm = getUChar(delta);
22291         UInt   rS    = gregOfRexRM(pfx, modrm);
22292         IRTemp tS    = newTemp(Ity_V256);
22293         assign(tS, getYMMReg(rS));
22294         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
22295         delta += alen;
22296         gen_SEGV_if_not_32_aligned(addr);
22297         storeLE(mkexpr(addr), mkexpr(tS));
22298         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
22299             nameYMMReg(rS), dis_buf);
22300         goto decode_success;
22301      }
22302      break;
22303
22304   case 0x2C:
22305      /* VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */
22306      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
22307         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
22308         goto decode_success;
22309      }
22310      /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
22311      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
22312         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
22313         goto decode_success;
22314      }
22315      /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
22316      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
22317         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
22318         goto decode_success;
22319      }
22320      /* VCVTTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2C /r */
22321      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
22322         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
22323         goto decode_success;
22324      }
22325      break;
22326
22327   case 0x2D:
22328      /* VCVTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2D /r */
22329      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
22330         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
22331         goto decode_success;
22332      }
22333      /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
22334      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
22335         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
22336         goto decode_success;
22337      }
22338      /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
22339      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
22340         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
22341         goto decode_success;
22342      }
22343      /* VCVTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2D /r */
22344      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
22345         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
22346         goto decode_success;
22347      }
22348      break;
22349
22350   case 0x2E:
22351   case 0x2F:
22352      /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
22353      /* VCOMISD  xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
22354      if (have66noF2noF3(pfx)) {
22355         delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
22356         goto decode_success;
22357      }
22358      /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
22359      /* VCOMISS xmm2/m32, xmm1  = VEX.LIG.0F.WIG 2F /r */
22360      if (haveNo66noF2noF3(pfx)) {
22361         delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
22362         goto decode_success;
22363      }
22364      break;
22365
22366   case 0x50:
22367      /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
22368      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22369         delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
22370         goto decode_success;
22371      }
22372      /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
22373      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22374         delta = dis_MOVMSKPD_256( vbi, pfx, delta );
22375         goto decode_success;
22376      }
22377      /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
22378      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22379         delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
22380         goto decode_success;
22381      }
22382      /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
22383      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22384         delta = dis_MOVMSKPS_256( vbi, pfx, delta );
22385         goto decode_success;
22386      }
22387      break;
22388
22389   case 0x51:
22390      /* VSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
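      /* For the scalar (SS/SD) forms here, the op is applied to lane
         0 only; the other lanes of the low 128 bits come from V, and
         bits 255:128 of the destination are zeroed. */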
22391      if (haveF3no66noF2(pfx)) {
22392         delta = dis_AVX128_E_V_to_G_lo32_unary(
22393                    uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
22394         goto decode_success;
22395      }
22396      /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 51 /r */
22397      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22398         delta = dis_AVX128_E_to_G_unary_all(
22399                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
22400         goto decode_success;
22401      }
22402      /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 51 /r */
22403      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22404         delta = dis_AVX256_E_to_G_unary_all(
22405                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
22406         goto decode_success;
22407      }
22408      /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
22409      if (haveF2no66noF3(pfx)) {
22410         delta = dis_AVX128_E_V_to_G_lo64_unary(
22411                    uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
22412         goto decode_success;
22413      }
22414      /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.128.66.0F.WIG 51 /r */
22415      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22416         delta = dis_AVX128_E_to_G_unary_all(
22417                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
22418         goto decode_success;
22419      }
22420      /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.256.66.0F.WIG 51 /r */
22421      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22422         delta = dis_AVX256_E_to_G_unary_all(
22423                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
22424         goto decode_success;
22425      }
22426      break;
22427
22428   case 0x52:
22429      /* VRSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
22430      if (haveF3no66noF2(pfx)) {
22431         delta = dis_AVX128_E_V_to_G_lo32_unary(
22432                    uses_vvvv, vbi, pfx, delta, "vrsqrtss", Iop_RSqrt32F0x4 );
22433         goto decode_success;
22434      }
22435      /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 52 /r */
22436      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22437         delta = dis_AVX128_E_to_G_unary_all(
22438                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrt32Fx4 );
22439         goto decode_success;
22440      }
22441      /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 52 /r */
22442      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22443         delta = dis_AVX256_E_to_G_unary_all(
22444                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrt32Fx8 );
22445         goto decode_success;
22446      }
22447      break;
22448
22449   case 0x53:
22450      /* VRCPSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
22451      if (haveF3no66noF2(pfx)) {
22452         delta = dis_AVX128_E_V_to_G_lo32_unary(
22453                    uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_Recip32F0x4 );
22454         goto decode_success;
22455      }
22456      /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 53 /r */
22457      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22458         delta = dis_AVX128_E_to_G_unary_all(
22459                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_Recip32Fx4 );
22460         goto decode_success;
22461      }
22462      /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 53 /r */
22463      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22464         delta = dis_AVX256_E_to_G_unary_all(
22465                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_Recip32Fx8 );
22466         goto decode_success;
22467      }
22468      break;
22469
22470   case 0x54:
22471      /* VANDPD r/m, rV, r ::: r = rV & r/m */
22472      /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
22473      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22474         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
22475                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
22476         goto decode_success;
22477      }
22478      /* VANDPD r/m, rV, r ::: r = rV & r/m */
22479      /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
22480      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22481         delta = dis_AVX256_E_V_to_G(
22482                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
22483         goto decode_success;
22484      }
22485      /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
22486      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22487         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
22488                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
22489         goto decode_success;
22490      }
22491      /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
22492      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22493         delta = dis_AVX256_E_V_to_G(
22494                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
22495         goto decode_success;
22496      }
22497      break;
22498
22499   case 0x55:
22500      /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
22501      /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
22502      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22503         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22504                    uses_vvvv, vbi, pfx, delta, "vandnpd", Iop_AndV128,
22505                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
22506         goto decode_success;
22507      }
22508      /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
22509      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22510         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
22511                    uses_vvvv, vbi, pfx, delta, "vandnpd", Iop_AndV256,
22512                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
22513         goto decode_success;
22514      }
22515      /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
22516      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22517         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22518                    uses_vvvv, vbi, pfx, delta, "vandnps", Iop_AndV128,
22519                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
22520         goto decode_success;
22521      }
22522      /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
22523      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22524         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
22525                    uses_vvvv, vbi, pfx, delta, "vandnps", Iop_AndV256,
22526                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
22527         goto decode_success;
22528      }
22529      break;
22530
22531   case 0x56:
22532      /* VORPD r/m, rV, r ::: r = rV | r/m */
22533      /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
22534      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22535         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
22536                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
22537         goto decode_success;
22538      }
22539      /* VORPD r/m, rV, r ::: r = rV | r/m */
22540      /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
22541      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22542         delta = dis_AVX256_E_V_to_G(
22543                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
22544         goto decode_success;
22545      }
22546      /* VORPS r/m, rV, r ::: r = rV | r/m */
22547      /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
22548      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22549         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
22550                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
22551         goto decode_success;
22552      }
22553      /* VORPS r/m, rV, r ::: r = rV | r/m */
22554      /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
22555      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22556         delta = dis_AVX256_E_V_to_G(
22557                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
22558         goto decode_success;
22559      }
22560      break;
22561
22562   case 0x57:
22563      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
22564      /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
22565      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22566         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
22567                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
22568         goto decode_success;
22569      }
22570      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
22571      /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
22572      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22573         delta = dis_AVX256_E_V_to_G(
22574                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
22575         goto decode_success;
22576      }
22577      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
22578      /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
22579      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22580         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
22581                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
22582         goto decode_success;
22583      }
22584      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
22585      /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
22586      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22587         delta = dis_AVX256_E_V_to_G(
22588                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
22589         goto decode_success;
22590      }
22591      break;
22592
22593   case 0x58:
22594      /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
22595      if (haveF2no66noF3(pfx)) {
22596         delta = dis_AVX128_E_V_to_G_lo64(
22597                    uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
22598         goto decode_success;
22599      }
22600      /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
22601      if (haveF3no66noF2(pfx)) {
22602         delta = dis_AVX128_E_V_to_G_lo32(
22603                    uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
22604         goto decode_success;
22605      }
22606      /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
22607      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22608         delta = dis_AVX128_E_V_to_G(
22609                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
22610         goto decode_success;
22611      }
22612      /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
22613      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22614         delta = dis_AVX256_E_V_to_G(
22615                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
22616         goto decode_success;
22617      }
22618      /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
22619      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22620         delta = dis_AVX128_E_V_to_G(
22621                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
22622         goto decode_success;
22623      }
22624      /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
22625      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22626         delta = dis_AVX256_E_V_to_G(
22627                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
22628         goto decode_success;
22629      }
22630      break;
22631
22632   case 0x59:
22633      /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
22634      if (haveF2no66noF3(pfx)) {
22635         delta = dis_AVX128_E_V_to_G_lo64(
22636                    uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
22637         goto decode_success;
22638      }
22639      /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
22640      if (haveF3no66noF2(pfx)) {
22641         delta = dis_AVX128_E_V_to_G_lo32(
22642                    uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
22643         goto decode_success;
22644      }
22645      /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
22646      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22647         delta = dis_AVX128_E_V_to_G(
22648                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
22649         goto decode_success;
22650      }
22651      /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
22652      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22653         delta = dis_AVX256_E_V_to_G(
22654                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
22655         goto decode_success;
22656      }
22657      /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
22658      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22659         delta = dis_AVX128_E_V_to_G(
22660                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
22661         goto decode_success;
22662      }
22663      /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
22664      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22665         delta = dis_AVX256_E_V_to_G(
22666                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
22667         goto decode_success;
22668      }
22669      break;
22670
22671   case 0x5A:
22672      /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
22673      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22674         delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
22675         goto decode_success;
22676      }
22677      /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
22678      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22679         delta = dis_CVTPS2PD_256( vbi, pfx, delta );
22680         goto decode_success;
22681      }
22682      /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
22683      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22684         delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
22685         goto decode_success;
22686      }
22687      /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
22688      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22689         delta = dis_CVTPD2PS_256( vbi, pfx, delta );
22690         goto decode_success;
22691      }
22692      /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
22693      if (haveF2no66noF3(pfx)) {
22694         UChar  modrm = getUChar(delta);
22695         UInt   rV    = getVexNvvvv(pfx);
22696         UInt   rD    = gregOfRexRM(pfx, modrm);
22697         IRTemp f64lo = newTemp(Ity_F64);
22698         IRTemp rmode = newTemp(Ity_I32);
22699         assign( rmode, get_sse_roundingmode() );
22700         if (epartIsReg(modrm)) {
22701            UInt rS = eregOfRexRM(pfx,modrm);
22702            assign(f64lo, getXMMRegLane64F(rS, 0));
22703            delta += 1;
22704            DIP("vcvtsd2ss %s,%s,%s\n",
22705                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
22706         } else {
22707            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22708            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
22709            delta += alen;
22710            DIP("vcvtsd2ss %s,%s,%s\n",
22711                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
22712         }
22713         putXMMRegLane32F( rD, 0,
22714                           binop( Iop_F64toF32, mkexpr(rmode),
22715                                                mkexpr(f64lo)) );
22716         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
22717         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
22718         putYMMRegLane128( rD, 1, mkV128(0) );
22719         *uses_vvvv = True;
22720         goto decode_success;
22721      }
22722      /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
22723      if (haveF3no66noF2(pfx)) {
22724         UChar  modrm = getUChar(delta);
22725         UInt   rV    = getVexNvvvv(pfx);
22726         UInt   rD    = gregOfRexRM(pfx, modrm);
22727         IRTemp f32lo = newTemp(Ity_F32);
22728         if (epartIsReg(modrm)) {
22729            UInt rS = eregOfRexRM(pfx,modrm);
22730            assign(f32lo, getXMMRegLane32F(rS, 0));
22731            delta += 1;
22732            DIP("vcvtss2sd %s,%s,%s\n",
22733                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
22734         } else {
22735            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22736            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
22737            delta += alen;
22738            DIP("vcvtss2sd %s,%s,%s\n",
22739                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
22740         }
22741         putXMMRegLane64F( rD, 0,
22742                           unop( Iop_F32toF64, mkexpr(f32lo)) );
22743         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
22744         putYMMRegLane128( rD, 1, mkV128(0) );
22745         *uses_vvvv = True;
22746         goto decode_success;
22747      }
22748      break;
22749
22750   case 0x5B:
22751      /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
22752      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22753         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
22754                                    True/*isAvx*/, False/*!r2zero*/ );
22755         goto decode_success;
22756      }
22757      /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
22758      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22759         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
22760                                    False/*!r2zero*/ );
22761         goto decode_success;
22762      }
22763      /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
22764      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
22765         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
22766                                    True/*isAvx*/, True/*r2zero*/ );
22767         goto decode_success;
22768      }
22769      /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
22770      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
22771         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
22772                                    True/*r2zero*/ );
22773         goto decode_success;
22774      }
22775      /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
22776      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22777         delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
22778         goto decode_success;
22779      }
22780      /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
22781      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22782         delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
22783         goto decode_success;
22784      }
22785      break;
22786
22787   case 0x5C:
22788      /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
22789      if (haveF2no66noF3(pfx)) {
22790         delta = dis_AVX128_E_V_to_G_lo64(
22791                    uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
22792         goto decode_success;
22793      }
22794      /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
22795      if (haveF3no66noF2(pfx)) {
22796         delta = dis_AVX128_E_V_to_G_lo32(
22797                    uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
22798         goto decode_success;
22799      }
22800      /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
22801      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22802         delta = dis_AVX128_E_V_to_G(
22803                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
22804         goto decode_success;
22805      }
22806      /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
22807      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22808         delta = dis_AVX256_E_V_to_G(
22809                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
22810         goto decode_success;
22811      }
22812      /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
22813      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22814         delta = dis_AVX128_E_V_to_G(
22815                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
22816         goto decode_success;
22817      }
22818      /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
22819      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22820         delta = dis_AVX256_E_V_to_G(
22821                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
22822         goto decode_success;
22823      }
22824      break;
22825
22826   case 0x5D:
22827      /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
22828      if (haveF2no66noF3(pfx)) {
22829         delta = dis_AVX128_E_V_to_G_lo64(
22830                    uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
22831         goto decode_success;
22832      }
22833      /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
22834      if (haveF3no66noF2(pfx)) {
22835         delta = dis_AVX128_E_V_to_G_lo32(
22836                    uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
22837         goto decode_success;
22838      }
22839      /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
22840      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22841         delta = dis_AVX128_E_V_to_G(
22842                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
22843         goto decode_success;
22844      }
22845      /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
22846      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22847         delta = dis_AVX256_E_V_to_G(
22848                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
22849         goto decode_success;
22850      }
22851      /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
22852      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22853         delta = dis_AVX128_E_V_to_G(
22854                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
22855         goto decode_success;
22856      }
22857      /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
22858      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22859         delta = dis_AVX256_E_V_to_G(
22860                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
22861         goto decode_success;
22862      }
22863      break;
22864
22865   case 0x5E:
22866      /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
22867      if (haveF2no66noF3(pfx)) {
22868         delta = dis_AVX128_E_V_to_G_lo64(
22869                    uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
22870         goto decode_success;
22871      }
22872      /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
22873      if (haveF3no66noF2(pfx)) {
22874         delta = dis_AVX128_E_V_to_G_lo32(
22875                    uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
22876         goto decode_success;
22877      }
22878      /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
22879      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22880         delta = dis_AVX128_E_V_to_G(
22881                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
22882         goto decode_success;
22883      }
22884      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
22885      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22886         delta = dis_AVX256_E_V_to_G(
22887                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
22888         goto decode_success;
22889      }
22890      /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
22891      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22892         delta = dis_AVX128_E_V_to_G(
22893                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
22894         goto decode_success;
22895      }
22896      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
22897      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22898         delta = dis_AVX256_E_V_to_G(
22899                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
22900         goto decode_success;
22901      }
22902      break;
22903
22904   case 0x5F:
22905      /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
22906      if (haveF2no66noF3(pfx)) {
22907         delta = dis_AVX128_E_V_to_G_lo64(
22908                    uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
22909         goto decode_success;
22910      }
22911      /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
22912      if (haveF3no66noF2(pfx)) {
22913         delta = dis_AVX128_E_V_to_G_lo32(
22914                    uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
22915         goto decode_success;
22916      }
22917      /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
22918      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22919         delta = dis_AVX128_E_V_to_G(
22920                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
22921         goto decode_success;
22922      }
22923      /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
22924      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22925         delta = dis_AVX256_E_V_to_G(
22926                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
22927         goto decode_success;
22928      }
22929      /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
22930      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22931         delta = dis_AVX128_E_V_to_G(
22932                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
22933         goto decode_success;
22934      }
22935      /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
22936      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
22937         delta = dis_AVX256_E_V_to_G(
22938                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
22939         goto decode_success;
22940      }
22941      break;
22942
22943   case 0x60:
22944      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
22945      /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
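      /* Example: with rV = v15..v0 and r/m = e15..e0 (bytes, msb
         first), the result is e7 v7 e6 v6 e5 v5 e4 v4 e3 v3 e2 v2
         e1 v1 e0 v0.  swapArgs adjusts for the argument order of the
         IR interleave op relative to the x86 operand order. */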
22946      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22947         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22948                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
22949                    Iop_InterleaveLO8x16, NULL,
22950                    False/*!invertLeftArg*/, True/*swapArgs*/ );
22951         goto decode_success;
22952      }
22953      break;
22954
22955   case 0x61:
22956      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
22957      /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
22958      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22959         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22960                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
22961                    Iop_InterleaveLO16x8, NULL,
22962                    False/*!invertLeftArg*/, True/*swapArgs*/ );
22963         goto decode_success;
22964      }
22965      break;
22966
22967   case 0x62:
22968      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
22969      /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
22970      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22971         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22972                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
22973                    Iop_InterleaveLO32x4, NULL,
22974                    False/*!invertLeftArg*/, True/*swapArgs*/ );
22975         goto decode_success;
22976      }
22977      break;
22978
22979   case 0x63:
22980      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
22981      /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
22982      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22983         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22984                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
22985                    Iop_QNarrowBin16Sto8Sx16, NULL,
22986                    False/*!invertLeftArg*/, True/*swapArgs*/ );
22987         goto decode_success;
22988      }
22989      break;
22990
22991   case 0x64:
22992      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
22993      /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
22994      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
22995         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
22996                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
22997         goto decode_success;
22998      }
22999      break;
23000
23001   case 0x65:
23002      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
23003      /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
23004      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23005         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
23006                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
23007         goto decode_success;
23008      }
23009      break;
23010
23011   case 0x66:
23012      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
23013      /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
23014      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23015         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
23016                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
23017         goto decode_success;
23018      }
23019      break;
23020
23021   case 0x67:
23022      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
23023      /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
23024      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23025         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23026                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
23027                    Iop_QNarrowBin16Sto8Ux16, NULL,
23028                    False/*!invertLeftArg*/, True/*swapArgs*/ );
23029         goto decode_success;
23030      }
23031      break;
23032
23033   case 0x68:
23034      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r */
23036      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23037         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23038                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
23039                    Iop_InterleaveHI8x16, NULL,
23040                    False/*!invertLeftArg*/, True/*swapArgs*/ );
23041         goto decode_success;
23042      }
23043      break;
23044
23045   case 0x69:
23046      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r */
23048      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23049         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23050                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
23051                    Iop_InterleaveHI16x8, NULL,
23052                    False/*!invertLeftArg*/, True/*swapArgs*/ );
23053         goto decode_success;
23054      }
23055      break;
23056
23057   case 0x6A:
23058      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
23059      /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
23060      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23061         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23062                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
23063                    Iop_InterleaveHI32x4, NULL,
23064                    False/*!invertLeftArg*/, True/*swapArgs*/ );
23065         goto decode_success;
23066      }
23067      break;
23068
23069   case 0x6B:
23070      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
23071      /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
23072      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23073         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23074                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
23075                    Iop_QNarrowBin32Sto16Sx8, NULL,
23076                    False/*!invertLeftArg*/, True/*swapArgs*/ );
23077         goto decode_success;
23078      }
23079      break;
23080
23081   case 0x6C:
23082      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r */
23084      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23085         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23086                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
23087                    Iop_InterleaveLO64x2, NULL,
23088                    False/*!invertLeftArg*/, True/*swapArgs*/ );
23089         goto decode_success;
23090      }
23091      break;
23092
23093   case 0x6D:
23094      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r */
23096      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23097         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23098                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
23099                    Iop_InterleaveHI64x2, NULL,
23100                    False/*!invertLeftArg*/, True/*swapArgs*/ );
23101         goto decode_success;
23102      }
23103      break;
23104
23105   case 0x6E:
      /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E /r */
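      /* As with all VEX.128 writes to an XMM register, the result is
         written via putYMMRegLoAndZU, which zeroes bits 255:128 of
         the containing YMM register. */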
23107      if (have66noF2noF3(pfx)
23108          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
23109         vassert(sz == 2); /* even tho we are transferring 4, not 2. */
23110         UChar modrm = getUChar(delta);
23111         if (epartIsReg(modrm)) {
23112            delta += 1;
23113            putYMMRegLoAndZU(
23114               gregOfRexRM(pfx,modrm),
23115               unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
23116            );
23117            DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
23118                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)) )
            );
23126            DIP("vmovd %s, %s\n", dis_buf,
23127                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
23128         }
23129         goto decode_success;
23130      }
      /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E /r */
23132      if (have66noF2noF3(pfx)
23133          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
23134         vassert(sz == 2); /* even tho we are transferring 8, not 2. */
23135         UChar modrm = getUChar(delta);
23136         if (epartIsReg(modrm)) {
23137            delta += 1;
23138            putYMMRegLoAndZU(
23139               gregOfRexRM(pfx,modrm),
23140               unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
23141            );
23142            DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
23143                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)) )
            );
23151            DIP("vmovq %s, %s\n", dis_buf,
23152                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
23153         }
23154         goto decode_success;
23155      }
23156      break;
23157
23158   case 0x6F:
23159      /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
23160      /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
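      /* The A (66) and U (F3) forms differ only in that the aligned
         variant must fault if the memory operand is not 32-aligned;
         the U variant does no alignment check. */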
23161      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
23162          && 1==getVexL(pfx)/*256*/) {
23163         UChar  modrm = getUChar(delta);
23164         UInt   rD    = gregOfRexRM(pfx, modrm);
23165         IRTemp tD    = newTemp(Ity_V256);
23166         Bool   isA   = have66noF2noF3(pfx);
23167         UChar  ch    = isA ? 'a' : 'u';
23168         if (epartIsReg(modrm)) {
23169            UInt rS = eregOfRexRM(pfx, modrm);
23170            delta += 1;
23171            assign(tD, getYMMReg(rS));
23172            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
23173         } else {
23174            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
23175            delta += alen;
23176            if (isA)
23177               gen_SEGV_if_not_32_aligned(addr);
23178            assign(tD, loadLE(Ity_V256, mkexpr(addr)));
23179            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
23180         }
23181         putYMMReg(rD, mkexpr(tD));
23182         goto decode_success;
23183      }
23184      /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
23185      /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
23186      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
23187          && 0==getVexL(pfx)/*128*/) {
23188         UChar  modrm = getUChar(delta);
23189         UInt   rD    = gregOfRexRM(pfx, modrm);
23190         IRTemp tD    = newTemp(Ity_V128);
23191         Bool   isA   = have66noF2noF3(pfx);
23192         UChar  ch    = isA ? 'a' : 'u';
23193         if (epartIsReg(modrm)) {
23194            UInt rS = eregOfRexRM(pfx, modrm);
23195            delta += 1;
23196            assign(tD, getXMMReg(rS));
23197            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
23198         } else {
23199            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
23200            delta += alen;
23201            if (isA)
23202               gen_SEGV_if_not_16_aligned(addr);
23203            assign(tD, loadLE(Ity_V128, mkexpr(addr)));
23204            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
23205         }
23206         putYMMRegLoAndZU(rD, mkexpr(tD));
23207         goto decode_success;
23208      }
23209      break;
23210
23211   case 0x70:
23212      /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
23213      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23214         delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
23215         goto decode_success;
23216      }
23217      /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
23218      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23219         delta = dis_PSHUFxW_128( vbi, pfx, delta,
23220                                  True/*isAvx*/, False/*!xIsH*/ );
23221         goto decode_success;
23222      }
23223      /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
23224      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
23225         delta = dis_PSHUFxW_128( vbi, pfx, delta,
23226                                  True/*isAvx*/, True/*xIsH*/ );
23227         goto decode_success;
23228      }
23229      break;
23230
23231   case 0x71:
23232      /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
23233      /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
23234      /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
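      /* These are NDD forms: vvvv names the destination, ModRM.rm
         names the source, and the ModRM reg field (read via
         gregLO3ofRM) is the /2, /4 or /6 opcode extension selecting
         the operation. */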
23235      if (have66noF2noF3(pfx)
23236          && 0==getVexL(pfx)/*128*/
23237          && epartIsReg(getUChar(delta))) {
23238         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
23239            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
23240                                                "vpsrlw", Iop_ShrN16x8 );
23241            *uses_vvvv = True;
23242            goto decode_success;
23243         }
23244         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
23245            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
23246                                                "vpsraw", Iop_SarN16x8 );
23247            *uses_vvvv = True;
23248            goto decode_success;
23249         }
23250         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
23251            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
23252                                                "vpsllw", Iop_ShlN16x8 );
23253            *uses_vvvv = True;
23254            goto decode_success;
23255         }
23256         /* else fall through */
23257      }
23258      break;
23259
23260   case 0x72:
23261      /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
23262      /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
23263      /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
23264      if (have66noF2noF3(pfx)
23265          && 0==getVexL(pfx)/*128*/
23266          && epartIsReg(getUChar(delta))) {
23267         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
23268            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
23269                                                "vpsrld", Iop_ShrN32x4 );
23270            *uses_vvvv = True;
23271            goto decode_success;
23272         }
23273         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
23274            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
23275                                                "vpsrad", Iop_SarN32x4 );
23276            *uses_vvvv = True;
23277            goto decode_success;
23278         }
23279         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
23280            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
23281                                                "vpslld", Iop_ShlN32x4 );
23282            *uses_vvvv = True;
23283            goto decode_success;
23284         }
23285         /* else fall through */
23286      }
23287      break;
23288
23289   case 0x73:
23290      /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
23291      /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
23292      /* VPSRLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
23293      /* VPSLLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
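      /* /3 and /7 (VPSRLDQ/VPSLLDQ) shift the entire 128-bit value
         by a byte count, so they are handled specially via
         math_PSRLDQ/math_PSLLDQ; /2 and /6 are ordinary 64x2 bit
         shifts. */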
23294      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
23295          && epartIsReg(getUChar(delta))) {
23296         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
23297         Int    rD   = getVexNvvvv(pfx);
23298         IRTemp vecS = newTemp(Ity_V128);
23299         if (gregLO3ofRM(getUChar(delta)) == 3) {
23300            Int imm = (Int)getUChar(delta+1);
23301            DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
23302            delta += 2;
23303            assign( vecS, getXMMReg(rS) );
23304            putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
23305            *uses_vvvv = True;
23306            goto decode_success;
23307         }
23308         if (gregLO3ofRM(getUChar(delta)) == 7) {
23309            Int imm = (Int)getUChar(delta+1);
23310            DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
23311            delta += 2;
23312            assign( vecS, getXMMReg(rS) );
23313            putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
23314            *uses_vvvv = True;
23315            goto decode_success;
23316         }
23317         if (gregLO3ofRM(getUChar(delta)) == 2) {
23318            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
23319                                                "vpsrlq", Iop_ShrN64x2 );
23320            *uses_vvvv = True;
23321            goto decode_success;
23322         }
23323         if (gregLO3ofRM(getUChar(delta)) == 6) {
23324            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
23325                                                "vpsllq", Iop_ShlN64x2 );
23326            *uses_vvvv = True;
23327            goto decode_success;
23328         }
23329         /* else fall through */
23330      }
23331      break;
23332
23333   case 0x74:
23334      /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
23335      /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
23336      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23337         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
23338                    uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
23339         goto decode_success;
23340      }
23341      break;
23342
23343   case 0x75:
23344      /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
23345      /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
23346      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23347         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
23348                    uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
23349         goto decode_success;
23350      }
23351      break;
23352
23353   case 0x76:
23354      /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
23355      /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
23356      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23357         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
23358                    uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
23359         goto decode_success;
23360      }
23361      break;
23362
23363   case 0x77:
23364      /* VZEROUPPER = VEX.128.0F.WIG 77 */
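      /* Zeroes bits 255:128 of all sixteen YMM registers, leaving
         the low (XMM) halves unchanged. */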
23365      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23366         Int i;
23367         IRTemp zero128 = newTemp(Ity_V128);
23368         assign(zero128, mkV128(0));
23369         for (i = 0; i < 16; i++) {
23370            putYMMRegLane128(i, 1, mkexpr(zero128));
23371         }
23372         DIP("vzeroupper\n");
23373         goto decode_success;
23374      }
23375      /* VZEROALL = VEX.256.0F.WIG 77 */
23376      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23377         Int i;
23378         IRTemp zero128 = newTemp(Ity_V128);
23379         assign(zero128, mkV128(0));
23380         for (i = 0; i < 16; i++) {
23381            putYMMRegLoAndZU(i, mkexpr(zero128));
23382         }
23383         DIP("vzeroall\n");
23384         goto decode_success;
23385      }
23386      break;
23387
23388   case 0x7C:
23389   case 0x7D:
23390      /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
23391      /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
23392      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23393         IRTemp sV     = newTemp(Ity_V128);
23394         IRTemp dV     = newTemp(Ity_V128);
23395         Bool   isAdd  = opc == 0x7C;
23396         HChar* str    = isAdd ? "add" : "sub";
23397         UChar modrm   = getUChar(delta);
23398         UInt   rG     = gregOfRexRM(pfx,modrm);
23399         UInt   rV     = getVexNvvvv(pfx);
23400         if (epartIsReg(modrm)) {
23401            UInt rE = eregOfRexRM(pfx,modrm);
23402            assign( sV, getXMMReg(rE) );
23403            DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
23404                nameXMMReg(rV), nameXMMReg(rG));
23405            delta += 1;
23406         } else {
23407            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23408            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
23409            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
23410                nameXMMReg(rV), nameXMMReg(rG));
23411            delta += alen;
23412         }
23413         assign( dV, getXMMReg(rV) );
23414         putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
23415         *uses_vvvv = True;
23416         goto decode_success;
23417      }
23418      /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
23419      /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
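      /* Like all 256-bit AVX horizontal ops, this works on the two
         128-bit lanes independently, so split the V256 values and
         apply the 128-bit helper to each half. */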
23420      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23421         IRTemp sV     = newTemp(Ity_V256);
23422         IRTemp dV     = newTemp(Ity_V256);
23423         IRTemp s1, s0, d1, d0;
23424         Bool   isAdd  = opc == 0x7C;
23425         HChar* str    = isAdd ? "add" : "sub";
23426         UChar modrm   = getUChar(delta);
23427         UInt   rG     = gregOfRexRM(pfx,modrm);
23428         UInt   rV     = getVexNvvvv(pfx);
23429         s1 = s0 = d1 = d0 = IRTemp_INVALID;
23430         if (epartIsReg(modrm)) {
23431            UInt rE = eregOfRexRM(pfx,modrm);
23432            assign( sV, getYMMReg(rE) );
23433            DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
23434                nameYMMReg(rV), nameYMMReg(rG));
23435            delta += 1;
23436         } else {
23437            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23438            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
23439            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
23440                nameYMMReg(rV), nameYMMReg(rG));
23441            delta += alen;
23442         }
23443         assign( dV, getYMMReg(rV) );
23444         breakupV256toV128s( dV, &d1, &d0 );
23445         breakupV256toV128s( sV, &s1, &s0 );
23446         putYMMReg( rG, binop(Iop_V128HLtoV256,
23447                              mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
23448                              mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
23449         *uses_vvvv = True;
23450         goto decode_success;
23451      }
23452      /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
23453      /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
23454      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23455         IRTemp sV     = newTemp(Ity_V128);
23456         IRTemp dV     = newTemp(Ity_V128);
23457         Bool   isAdd  = opc == 0x7C;
23458         HChar* str    = isAdd ? "add" : "sub";
23459         UChar modrm   = getUChar(delta);
23460         UInt   rG     = gregOfRexRM(pfx,modrm);
23461         UInt   rV     = getVexNvvvv(pfx);
23462         if (epartIsReg(modrm)) {
23463            UInt rE = eregOfRexRM(pfx,modrm);
23464            assign( sV, getXMMReg(rE) );
23465            DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
23466                nameXMMReg(rV), nameXMMReg(rG));
23467            delta += 1;
23468         } else {
23469            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23470            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
23471            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
23472                nameXMMReg(rV), nameXMMReg(rG));
23473            delta += alen;
23474         }
23475         assign( dV, getXMMReg(rV) );
23476         putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
23477         *uses_vvvv = True;
23478         goto decode_success;
23479      }
23480      /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
23481      /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
23482      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23483         IRTemp sV     = newTemp(Ity_V256);
23484         IRTemp dV     = newTemp(Ity_V256);
23485         IRTemp s1, s0, d1, d0;
23486         Bool   isAdd  = opc == 0x7C;
23487         HChar* str    = isAdd ? "add" : "sub";
23488         UChar modrm   = getUChar(delta);
23489         UInt   rG     = gregOfRexRM(pfx,modrm);
23490         UInt   rV     = getVexNvvvv(pfx);
23491         s1 = s0 = d1 = d0 = IRTemp_INVALID;
23492         if (epartIsReg(modrm)) {
23493            UInt rE = eregOfRexRM(pfx,modrm);
23494            assign( sV, getYMMReg(rE) );
23495            DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
23496                nameYMMReg(rV), nameYMMReg(rG));
23497            delta += 1;
23498         } else {
23499            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23500            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
23501            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
23502                nameYMMReg(rV), nameYMMReg(rG));
23503            delta += alen;
23504         }
23505         assign( dV, getYMMReg(rV) );
23506         breakupV256toV128s( dV, &d1, &d0 );
23507         breakupV256toV128s( sV, &s1, &s0 );
23508         putYMMReg( rG, binop(Iop_V128HLtoV256,
23509                              mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
23510                              mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
23511         *uses_vvvv = True;
23512         goto decode_success;
23513      }
23514      break;
23515
23516   case 0x7E:
23517      /* Note the Intel docs don't make sense for this.  I think they
23518         are wrong.  They seem to imply it is a store when in fact I
23519         think it is a load.  Also it's unclear whether this is W0, W1
23520         or WIG. */
23521      /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
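      /* Treated as a load: 64 bits from xmm2/m64 go to the low half
         of xmm1, and bits 255:64 of the destination are zeroed. */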
23522      if (haveF3no66noF2(pfx)
23523          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
23524         vassert(sz == 4); /* even tho we are transferring 8, not 4. */
23525         UChar modrm = getUChar(delta);
23526         UInt  rG    = gregOfRexRM(pfx,modrm);
23527         if (epartIsReg(modrm)) {
23528            UInt rE = eregOfRexRM(pfx,modrm);
23529            putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
23530            DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
23531            delta += 1;
23532         } else {
23533            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23534            putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
23535            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
23536            delta += alen;
23537         }
23538         /* zero bits 255:64 */
23539         putXMMRegLane64( rG, 1, mkU64(0) );
23540         putYMMRegLane128( rG, 1, mkV128(0) );
23541         goto decode_success;
23542      }
      /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only per
         the docs; the mem case is nonetheless handled below) */
23544      /* Moves from G to E, so is a store-form insn */
23545      /* Intel docs list this in the VMOVD entry for some reason. */
23546      if (have66noF2noF3(pfx)
23547          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
23548         UChar modrm = getUChar(delta);
23549         UInt  rG    = gregOfRexRM(pfx,modrm);
23550         if (epartIsReg(modrm)) {
23551            UInt rE = eregOfRexRM(pfx,modrm);
23552            DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
23553            putIReg64(rE, getXMMRegLane64(rG, 0));
23554            delta += 1;
23555         } else {
23556            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23557            storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
23558            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
23559            delta += alen;
23560         }
23561         goto decode_success;
23562      }
      /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r */
23564      /* Moves from G to E, so is a store-form insn */
23565      if (have66noF2noF3(pfx)
23566          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
23567         UChar modrm = getUChar(delta);
23568         UInt  rG    = gregOfRexRM(pfx,modrm);
23569         if (epartIsReg(modrm)) {
23570            UInt rE = eregOfRexRM(pfx,modrm);
23571            DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
23572            putIReg32(rE, getXMMRegLane32(rG, 0));
23573            delta += 1;
23574         } else {
23575            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23576            storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
23577            DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
23578            delta += alen;
23579         }
23580         goto decode_success;
23581      }
23582      break;
23583
23584   case 0x7F:
23585      /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
23586      /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
23587      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
23588          && 1==getVexL(pfx)/*256*/) {
23589         UChar  modrm = getUChar(delta);
23590         UInt   rS    = gregOfRexRM(pfx, modrm);
23591         IRTemp tS    = newTemp(Ity_V256);
23592         Bool   isA   = have66noF2noF3(pfx);
23593         UChar  ch    = isA ? 'a' : 'u';
23594         assign(tS, getYMMReg(rS));
23595         if (epartIsReg(modrm)) {
23596            UInt rD = eregOfRexRM(pfx, modrm);
23597            delta += 1;
23598            putYMMReg(rD, mkexpr(tS));
23599            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
23600         } else {
23601            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
23602            delta += alen;
23603            if (isA)
23604               gen_SEGV_if_not_32_aligned(addr);
23605            storeLE(mkexpr(addr), mkexpr(tS));
23606            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
23607         }
23608         goto decode_success;
23609      }
23610      /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
23611      /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
23612      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
23613          && 0==getVexL(pfx)/*128*/) {
23614         UChar  modrm = getUChar(delta);
23615         UInt   rS    = gregOfRexRM(pfx, modrm);
23616         IRTemp tS    = newTemp(Ity_V128);
23617         Bool   isA   = have66noF2noF3(pfx);
23618         UChar  ch    = isA ? 'a' : 'u';
23619         assign(tS, getXMMReg(rS));
23620         if (epartIsReg(modrm)) {
23621            UInt rD = eregOfRexRM(pfx, modrm);
23622            delta += 1;
23623            putYMMRegLoAndZU(rD, mkexpr(tS));
23624            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
23625         } else {
23626            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
23627            delta += alen;
23628            if (isA)
23629               gen_SEGV_if_not_16_aligned(addr);
23630            storeLE(mkexpr(addr), mkexpr(tS));
23631            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
23632         }
23633         goto decode_success;
23634      }
23635      break;
23636
23637   case 0xAE:
23638      /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
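      /* LZ: encodings with VEX.L == 1 are #UD, hence the getVexL
         check. */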
23639      if (haveNo66noF2noF3(pfx)
23640          && 0==getVexL(pfx)/*LZ*/
23641          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
23642          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
23643          && sz == 4) {
23644         delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
23645         goto decode_success;
23646      }
23647      /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
23648      if (haveNo66noF2noF3(pfx)
23649          && 0==getVexL(pfx)/*LZ*/
23650          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
23651          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
23652          && sz == 4) {
23653         delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
23654         goto decode_success;
23655      }
23656      break;
23657
23658   case 0xC2:
23659      /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
23660      /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
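      /* The cmp helpers return delta unchanged if they cannot decode
         the imm8 condition code, so "delta > delta0" is the success
         test; on failure, fall through and try the next form. */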
23661      if (haveF2no66noF3(pfx)) {
23662         Long delta0 = delta;
23663         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
23664                                          "vcmpsd", False/*!all_lanes*/,
23665                                          8/*sz*/);
23666         if (delta > delta0) goto decode_success;
23667         /* else fall through -- decoding has failed */
23668      }
23669      /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
23670      /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
23671      if (haveF3no66noF2(pfx)) {
23672         Long delta0 = delta;
23673         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
23674                                          "vcmpss", False/*!all_lanes*/,
23675                                          4/*sz*/);
23676         if (delta > delta0) goto decode_success;
23677         /* else fall through -- decoding has failed */
23678      }
23679      /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
23680      /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
23681      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23682         Long delta0 = delta;
23683         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
23684                                          "vcmppd", True/*all_lanes*/,
23685                                          8/*sz*/);
23686         if (delta > delta0) goto decode_success;
23687         /* else fall through -- decoding has failed */
23688      }
23689      /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
23690      /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
23691      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23692         Long delta0 = delta;
23693         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
23694                                          "vcmppd", 8/*sz*/);
23695         if (delta > delta0) goto decode_success;
23696         /* else fall through -- decoding has failed */
23697      }
23698      /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
23699      /* = VEX.NDS.128.0F.WIG C2 /r ib */
23700      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23701         Long delta0 = delta;
23702         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
23703                                          "vcmpps", True/*all_lanes*/,
23704                                          4/*sz*/);
23705         if (delta > delta0) goto decode_success;
23706         /* else fall through -- decoding has failed */
23707      }
23708      /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
23709      /* = VEX.NDS.256.0F.WIG C2 /r ib */
23710      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23711         Long delta0 = delta;
23712         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
23713                                          "vcmpps", 4/*sz*/);
23714         if (delta > delta0) goto decode_success;
23715         /* else fall through -- decoding has failed */
23716      }
23717      break;
23718
23719   case 0xC4:
23720      /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
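      /* Copies xmm2 with 16-bit lane imm8&7 replaced by the low 16
         bits of r32 (or the m16 value) into xmm1, zeroing the upper
         YMM bits. */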
23721      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23722         UChar  modrm = getUChar(delta);
23723         UInt   rG    = gregOfRexRM(pfx, modrm);
23724         UInt   rV    = getVexNvvvv(pfx);
23725         Int    imm8;
23726         IRTemp new16 = newTemp(Ity_I16);
23727
23728         if ( epartIsReg( modrm ) ) {
23729            imm8 = (Int)(getUChar(delta+1) & 7);
23730            assign( new16, unop(Iop_32to16,
23731                                getIReg32(eregOfRexRM(pfx,modrm))) );
23732            delta += 1+1;
23733            DIP( "vpinsrw $%d,%s,%s\n", imm8,
23734                 nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
23735         } else {
23736            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
23737            imm8 = (Int)(getUChar(delta+alen) & 7);
23738            assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
23739            delta += alen+1;
23740            DIP( "vpinsrw $%d,%s,%s\n",
23741                 imm8, dis_buf, nameXMMReg(rG) );
23742         }
23743
23744         IRTemp src_vec = newTemp(Ity_V128);
23745         assign(src_vec, getXMMReg( rV ));
23746         IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
23747         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
23748         *uses_vvvv = True;
23749         goto decode_success;
23750      }
23751      break;
23752
23753   case 0xC5:
23754      /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
23755      if (have66noF2noF3(pfx)
23756         && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
23757         Long delta0 = delta;
23758         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
23759                                              True/*isAvx*/ );
23760         if (delta > delta0) goto decode_success;
23761         /* else fall through -- decoding has failed */
23762      }
23763      break;
23764
23765   case 0xC6:
      /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
23767      /* = VEX.NDS.128.0F.WIG C6 /r ib */
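      /* imm8 is four 2-bit selectors: the low two choose the
         result's lower elements from vV, the high two choose its
         upper elements from eV (see math_SHUFPS_128). */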
23768      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23769         Int    imm8 = 0;
23770         IRTemp eV   = newTemp(Ity_V128);
23771         IRTemp vV   = newTemp(Ity_V128);
23772         UInt  modrm = getUChar(delta);
23773         UInt  rG    = gregOfRexRM(pfx,modrm);
23774         UInt  rV    = getVexNvvvv(pfx);
23775         assign( vV, getXMMReg(rV) );
23776         if (epartIsReg(modrm)) {
23777            UInt rE = eregOfRexRM(pfx,modrm);
23778            assign( eV, getXMMReg(rE) );
23779            imm8 = (Int)getUChar(delta+1);
23780            delta += 1+1;
23781            DIP("vshufps $%d,%s,%s,%s\n",
23782                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23783         } else {
23784            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
23785            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
23786            imm8 = (Int)getUChar(delta+alen);
23787            delta += 1+alen;
23788            DIP("vshufps $%d,%s,%s,%s\n",
23789                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23790         }
23791         IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
23792         putYMMRegLoAndZU( rG, mkexpr(res) );
23793         *uses_vvvv = True;
23794         goto decode_success;
23795      }
      /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
23797      /* = VEX.NDS.256.0F.WIG C6 /r ib */
23798      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23799         Int    imm8 = 0;
23800         IRTemp eV   = newTemp(Ity_V256);
23801         IRTemp vV   = newTemp(Ity_V256);
23802         UInt  modrm = getUChar(delta);
23803         UInt  rG    = gregOfRexRM(pfx,modrm);
23804         UInt  rV    = getVexNvvvv(pfx);
23805         assign( vV, getYMMReg(rV) );
23806         if (epartIsReg(modrm)) {
23807            UInt rE = eregOfRexRM(pfx,modrm);
23808            assign( eV, getYMMReg(rE) );
23809            imm8 = (Int)getUChar(delta+1);
23810            delta += 1+1;
23811            DIP("vshufps $%d,%s,%s,%s\n",
23812                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
23813         } else {
23814            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
23815            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
23816            imm8 = (Int)getUChar(delta+alen);
23817            delta += 1+alen;
23818            DIP("vshufps $%d,%s,%s,%s\n",
23819                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
23820         }
23821         IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
23822         putYMMReg( rG, mkexpr(res) );
23823         *uses_vvvv = True;
23824         goto decode_success;
23825      }
      /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
23827      /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
23828      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23829         Int    imm8 = 0;
23830         IRTemp eV   = newTemp(Ity_V128);
23831         IRTemp vV   = newTemp(Ity_V128);
23832         UInt  modrm = getUChar(delta);
23833         UInt  rG    = gregOfRexRM(pfx,modrm);
23834         UInt  rV    = getVexNvvvv(pfx);
23835         assign( vV, getXMMReg(rV) );
23836         if (epartIsReg(modrm)) {
23837            UInt rE = eregOfRexRM(pfx,modrm);
23838            assign( eV, getXMMReg(rE) );
23839            imm8 = (Int)getUChar(delta+1);
23840            delta += 1+1;
23841            DIP("vshufpd $%d,%s,%s,%s\n",
23842                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23843         } else {
23844            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
23845            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
23846            imm8 = (Int)getUChar(delta+alen);
23847            delta += 1+alen;
23848            DIP("vshufpd $%d,%s,%s,%s\n",
23849                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23850         }
23851         IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
23852         putYMMRegLoAndZU( rG, mkexpr(res) );
23853         *uses_vvvv = True;
23854         goto decode_success;
23855      }
      /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
23857      /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
23858      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23859         Int    imm8 = 0;
23860         IRTemp eV   = newTemp(Ity_V256);
23861         IRTemp vV   = newTemp(Ity_V256);
23862         UInt  modrm = getUChar(delta);
23863         UInt  rG    = gregOfRexRM(pfx,modrm);
23864         UInt  rV    = getVexNvvvv(pfx);
23865         assign( vV, getYMMReg(rV) );
23866         if (epartIsReg(modrm)) {
23867            UInt rE = eregOfRexRM(pfx,modrm);
23868            assign( eV, getYMMReg(rE) );
23869            imm8 = (Int)getUChar(delta+1);
23870            delta += 1+1;
23871            DIP("vshufpd $%d,%s,%s,%s\n",
23872                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
23873         } else {
23874            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
23875            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
23876            imm8 = (Int)getUChar(delta+alen);
23877            delta += 1+alen;
23878            DIP("vshufpd $%d,%s,%s,%s\n",
23879                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
23880         }
23881         IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
23882         putYMMReg( rG, mkexpr(res) );
23883         *uses_vvvv = True;
23884         goto decode_success;
23885      }
23886      break;
23887
23888   case 0xD0:
23889      /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
23890      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23891         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
23892                    uses_vvvv, vbi, pfx, delta,
23893                    "vaddsubpd", math_ADDSUBPD_128 );
23894         goto decode_success;
23895      }
23896      /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
23897      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23898         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
23899                    uses_vvvv, vbi, pfx, delta,
23900                    "vaddsubpd", math_ADDSUBPD_256 );
23901         goto decode_success;
23902      }
23903      /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
23904      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23905         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
23906                    uses_vvvv, vbi, pfx, delta,
23907                    "vaddsubps", math_ADDSUBPS_128 );
23908         goto decode_success;
23909      }
23910      /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
23911      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23912         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
23913                    uses_vvvv, vbi, pfx, delta,
23914                    "vaddsubps", math_ADDSUBPS_256 );
23915         goto decode_success;
23916      }
23917      break;
23918
23919   case 0xD1:
23920      /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
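      /* For these shift-by-vector forms (also D2, D3, E1, E2 and
         F1..F3 below) the count is the low 64 bits of the xmm3/m128
         operand; counts >= the lane width zero the result for
         logical shifts and fill with the sign bit for arithmetic
         shifts. */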
23921      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23922         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
23923                                        "vpsrlw", Iop_ShrN16x8 );
23924         *uses_vvvv = True;
         goto decode_success;
      }
23928      break;
23929
23930   case 0xD2:
23931      /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
23932      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23933         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
23934                                        "vpsrld", Iop_ShrN32x4 );
23935         *uses_vvvv = True;
23936         goto decode_success;
23937      }
23938      break;
23939
23940   case 0xD3:
23941      /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
23942      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23943         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
23944                                        "vpsrlq", Iop_ShrN64x2 );
23945         *uses_vvvv = True;
23946         goto decode_success;
23947      }
23948      break;
23949
23950   case 0xD4:
23951      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
23952      /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
23953      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23954         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
23955                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
23956         goto decode_success;
23957      }
23958      break;
23959
23960   case 0xD5:
23961      /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
23962      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23963         delta = dis_AVX128_E_V_to_G(
23964                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
23965         goto decode_success;
23966      }
23967      break;
23968
23969   case 0xD6:
23970      /* I can't even find any Intel docs for this one. */
23971      /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
23972         xmm) to E (mem or lo half xmm).  Looks like L==0(128), W==0
23973         (WIG, maybe?) */
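      /* Later Intel SDM revisions do document it: VMOVQ xmm2/m64,
         xmm1 = VEX.128.66.0F.WIG D6 /r, the store form of VMOVQ. */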
23974      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
23975          && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
23976         UChar modrm = getUChar(delta);
23977         UInt  rG    = gregOfRexRM(pfx,modrm);
23978         if (epartIsReg(modrm)) {
23979            /* fall through, awaiting test case */
23980            /* dst: lo half copied, hi half zeroed */
23981         } else {
23982            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23983            storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
23984            DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
23985            delta += alen;
23986            goto decode_success;
23987         }
23988      }
23989      break;
23990
23991   case 0xD7:
23992      /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
23993      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23994         delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
23995         goto decode_success;
23996      }
23997      break;
23998
23999   case 0xD8:
24000      /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
24001      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24002         delta = dis_AVX128_E_V_to_G(
24003                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
24004         goto decode_success;
24005      }
      break;
24007
24008   case 0xD9:
24009      /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
24010      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24011         delta = dis_AVX128_E_V_to_G(
24012                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
24013         goto decode_success;
24014      }
24015      break;
24016
24017   case 0xDA:
24018      /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
24019      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24020         delta = dis_AVX128_E_V_to_G(
24021                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
24022         goto decode_success;
24023      }
24024      break;
24025
24026   case 0xDB:
24027      /* VPAND r/m, rV, r ::: r = rV & r/m */
24028      /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
24029      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24030         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24031                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
24032         goto decode_success;
24033      }
24034      break;
24035
24036   case 0xDC:
24037      /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
24038      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24039         delta = dis_AVX128_E_V_to_G(
24040                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
24041         goto decode_success;
24042      }
24043      break;
24044
24045   case 0xDD:
24046      /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
24047      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24048         delta = dis_AVX128_E_V_to_G(
24049                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
24050         goto decode_success;
24051      }
24052      break;
24053
24054   case 0xDE:
24055      /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
24056      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24057         delta = dis_AVX128_E_V_to_G(
24058                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
24059         goto decode_success;
24060      }
24061      break;
24062
24063   case 0xDF:
      /* VPANDN r/m, rV, r ::: r = ~rV & r/m (per Intel, the first
         source, rV, is the inverted operand) */
24065      /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
24066      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24067         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
24068                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
24069                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
24070         goto decode_success;
24071      }
24072      break;
24073
24074   case 0xE0:
24075      /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
24076      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24077         delta = dis_AVX128_E_V_to_G(
24078                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
24079         goto decode_success;
24080      }
24081      break;
24082
24083   case 0xE1:
24084      /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
24085      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24086         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
24087                                        "vpsraw", Iop_SarN16x8 );
24088         *uses_vvvv = True;
24089         goto decode_success;
24090      }
24091      break;
24092
24093   case 0xE2:
24094      /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
24095      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24096         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
24097                                        "vpsrad", Iop_SarN32x4 );
24098         *uses_vvvv = True;
24099         goto decode_success;
24100      }
24101      break;
24102
24103   case 0xE3:
24104      /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
24105      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24106         delta = dis_AVX128_E_V_to_G(
24107                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
24108         goto decode_success;
24109      }
24110      break;
24111
24112   case 0xE4:
24113      /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
24114      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24115         delta = dis_AVX128_E_V_to_G(
24116                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
24117         goto decode_success;
24118      }
24119      break;
24120
24121   case 0xE5:
24122      /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
24123      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24124         delta = dis_AVX128_E_V_to_G(
24125                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
24126         goto decode_success;
24127      }
24128      break;
24129
24130   case 0xE6:
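      /* Three prefix-selected conversions share this opcode: F3 is
         VCVTDQ2PD (int32 -> f64, widening), 66 is VCVTTPD2DQ
         (f64 -> int32, truncating) and F2 is VCVTPD2DQ (f64 -> int32,
         rounding per MXCSR). */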
24131      /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
24132      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
24133         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
24134         goto decode_success;
24135      }
24136      /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
24137      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
24138         delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
24139         goto decode_success;
24140      }
24141      /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
24142      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24143         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
24144                                   True/*r2zero*/);
24145         goto decode_success;
24146      }
24147      /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
24148      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24149         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
24150         goto decode_success;
24151      }
24152      /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
24153      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24154         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
24155                                   False/*!r2zero*/);
24156         goto decode_success;
24157      }
24158      /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
24159      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24160         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
24161         goto decode_success;
24162      }
24163      break;
24164
24165   case 0xE7:
24166      /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
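      /* Non-temporal store: the cache hint is ignored and a normal
         store is generated, but the architectural alignment
         requirement is kept. */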
24167      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24168         UChar modrm = getUChar(delta);
24169         UInt rG     = gregOfRexRM(pfx,modrm);
24170         if (!epartIsReg(modrm)) {
24171            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24172            gen_SEGV_if_not_16_aligned( addr );
24173            storeLE( mkexpr(addr), getXMMReg(rG) );
24174            DIP("vmovntdq %s,%s\n", dis_buf, nameXMMReg(rG));
24175            delta += alen;
24176            goto decode_success;
24177         }
24178         /* else fall through */
24179      }
24180      /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
24181      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24182         UChar modrm = getUChar(delta);
24183         UInt rG     = gregOfRexRM(pfx,modrm);
24184         if (!epartIsReg(modrm)) {
24185            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24186            gen_SEGV_if_not_32_aligned( addr );
24187            storeLE( mkexpr(addr), getYMMReg(rG) );
24188            DIP("vmovntdq %s,%s\n", dis_buf, nameYMMReg(rG));
24189            delta += alen;
24190            goto decode_success;
24191         }
24192         /* else fall through */
24193      }
24194      break;
24195
24196   case 0xE8:
24197      /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
24198      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24199         delta = dis_AVX128_E_V_to_G(
24200                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
24201         goto decode_success;
24202      }
24203      break;
24204
24205   case 0xE9:
24206      /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
24207      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24208         delta = dis_AVX128_E_V_to_G(
24209                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
24210         goto decode_success;
24211      }
24212      break;
24213
24214   case 0xEA:
24215      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
24216      /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
24217      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24218         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24219                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
24220         goto decode_success;
24221      }
24222      break;
24223
24224   case 0xEB:
24225      /* VPOR r/m, rV, r ::: r = rV | r/m */
24226      /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
24227      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24228         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24229                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
24230         goto decode_success;
24231      }
24232      break;
24233
24234   case 0xEC:
24235      /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
24236      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24237         delta = dis_AVX128_E_V_to_G(
24238                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
24239         goto decode_success;
24240      }
24241      break;
24242
24243   case 0xED:
24244      /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
24245      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24246         delta = dis_AVX128_E_V_to_G(
24247                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
24248         goto decode_success;
24249      }
24250      break;
24251
24252   case 0xEE:
24253      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
24254      /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
24255      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24256         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24257                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
24258         goto decode_success;
24259      }
24260      break;
24261
24262   case 0xEF:
24263      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
24264      /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
24265      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24266         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24267                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
24268         goto decode_success;
24269      }
24270      break;
24271
24272   case 0xF0:
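      /* LDDQU imposes no alignment requirement on its source, so,
         unlike VMOVDQA, no alignment check is generated here. */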
24273      /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
24274      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24275         UChar  modrm = getUChar(delta);
24276         UInt   rD    = gregOfRexRM(pfx, modrm);
24277         IRTemp tD    = newTemp(Ity_V256);
24278         if (epartIsReg(modrm)) break;
24279         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24280         delta += alen;
24281         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
24282         DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
24283         putYMMReg(rD, mkexpr(tD));
24284         goto decode_success;
24285      }
24286      /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
24287      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24288         UChar  modrm = getUChar(delta);
24289         UInt   rD    = gregOfRexRM(pfx, modrm);
24290         IRTemp tD    = newTemp(Ity_V128);
24291         if (epartIsReg(modrm)) break;
24292         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24293         delta += alen;
24294         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
24295         DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
24296         putYMMRegLoAndZU(rD, mkexpr(tD));
24297         goto decode_success;
24298      }
24299      break;
24300
24301   case 0xF1:
24302      /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
24303      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24304         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
24305                                        "vpsllw", Iop_ShlN16x8 );
24306         *uses_vvvv = True;
24307         goto decode_success;
24309      }
24310      break;
24311
24312   case 0xF2:
24313      /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
24314      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24315         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
24316                                        "vpslld", Iop_ShlN32x4 );
24317         *uses_vvvv = True;
24318         goto decode_success;
24319      }
24320      break;
24321
24322   case 0xF3:
24323      /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
24324      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24325         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
24326                                        "vpsllq", Iop_ShlN64x2 );
24327         *uses_vvvv = True;
24328         goto decode_success;
24329      }
24330      break;
24331
24332   case 0xF4:
24333      /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
24334      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24335         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
24336                    uses_vvvv, vbi, pfx, delta,
24337                    "vpmuludq", math_PMULUDQ_128 );
24338         goto decode_success;
24339      }
24340      break;
24341
24342   case 0xF5:
24343      /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
24344      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24345         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
24346                    uses_vvvv, vbi, pfx, delta,
24347                    "vpmaddwd", math_PMADDWD_128 );
24348         goto decode_success;
24349      }
24350      break;
24351
24352   case 0xF6:
24353      /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
24354      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24355         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
24356                    uses_vvvv, vbi, pfx, delta,
24357                    "vpsadbw", math_PSADBW_128 );
24358         goto decode_success;
24359      }
24360      break;
24361
24362   case 0xF7:
24363      /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
24364      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
24365          && epartIsReg(getUChar(delta))) {
24366         delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
24367         goto decode_success;
24368      }
24369      break;
24370
24371   case 0xF8:
24372      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
24373      /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
24374      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24375         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24376                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
24377         goto decode_success;
24378      }
24379      break;
24380
24381   case 0xF9:
24382      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
24383      /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
24384      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24385         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24386                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
24387         goto decode_success;
24388      }
24389      break;
24390
24391   case 0xFA:
24392      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
24393      /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
24394      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24395         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24396                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
24397         goto decode_success;
24398      }
24399      break;
24400
24401   case 0xFB:
24402      /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
24403      /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
24404      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24405         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24406                    uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
24407         goto decode_success;
24408      }
24409      break;
24410
24411   case 0xFC:
24412      /* VPADDB r/m, rV, r ::: r = rV + r/m */
24413      /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
24414      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24415         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24416                    uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
24417         goto decode_success;
24418      }
24419      break;
24420
24421   case 0xFD:
24422      /* VPADDW r/m, rV, r ::: r = rV + r/m */
24423      /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
24424      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24425         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24426                    uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
24427         goto decode_success;
24428      }
24429      break;
24430
24431   case 0xFE:
24432      /* VPADDD r/m, rV, r ::: r = rV + r/m */
24433      /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
24434      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24435         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24436                    uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
24437         goto decode_success;
24438      }
24439      break;
24440
24441   default:
24442      break;
24443
24444   }
24445
24446  //decode_failure:
24447   return deltaIN;
24448
24449  decode_success:
24450   return delta;
24451}
24452
24453
24454/*------------------------------------------------------------*/
24455/*---                                                      ---*/
24456/*--- Top-level post-escape decoders: dis_ESC_0F38__VEX    ---*/
24457/*---                                                      ---*/
24458/*------------------------------------------------------------*/
24459
24460static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
24461{
24462   /* In the control vector, zero out all but the bottom two bits of
24463      each 32-bit lane. */
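   /* (Shifting each lane left by 30 and then right by 30 clears bits
      31:2 without needing a vector constant; Iop_ShrN32x4 is a
      logical shift, so e.g. a control lane holding 7 becomes 3.) */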
24464   IRExpr* cv1 = binop(Iop_ShrN32x4,
24465                       binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
24466                       mkU8(30));
24467   /* And use the resulting cleaned-up control vector as steering
24468      in a Perm operation. */
24469   IRTemp res = newTemp(Ity_V128);
24470   assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
24471   return res;
24472}
24473
24474static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
24475{
24476   IRTemp dHi, dLo, cHi, cLo;
24477   dHi = dLo = cHi = cLo = IRTemp_INVALID;
24478   breakupV256toV128s( dataV, &dHi, &dLo );
24479   breakupV256toV128s( ctrlV, &cHi, &cLo );
24480   IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
24481   IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
24482   IRTemp res = newTemp(Ity_V256);
24483   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
24484   return res;
24485}
24486
24487static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
24488{
24489   /* No cleverness here .. */
24490   IRTemp dHi, dLo, cHi, cLo;
24491   dHi = dLo = cHi = cLo = IRTemp_INVALID;
24492   breakupV128to64s( dataV, &dHi, &dLo );
24493   breakupV128to64s( ctrlV, &cHi, &cLo );
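   /* IRExpr_Mux0X(cond, expr0, exprX) yields expr0 when cond is zero
      and exprX otherwise, so bit 1 of each 64-bit control lane
      selects between the low and high data lanes. */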
24494   IRExpr* rHi
24495      = IRExpr_Mux0X( unop(Iop_64to8,
24496                           binop(Iop_And64, mkexpr(cHi), mkU64(2))),
24497                      mkexpr(dLo), mkexpr(dHi) );
24498   IRExpr* rLo
24499      = IRExpr_Mux0X( unop(Iop_64to8,
24500                           binop(Iop_And64, mkexpr(cLo), mkU64(2))),
24501                      mkexpr(dLo), mkexpr(dHi) );
24502   IRTemp res = newTemp(Ity_V128);
24503   assign(res, binop(Iop_64HLtoV128, rHi, rLo));
24504   return res;
24505}
24506
24507static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
24508{
24509   IRTemp dHi, dLo, cHi, cLo;
24510   dHi = dLo = cHi = cLo = IRTemp_INVALID;
24511   breakupV256toV128s( dataV, &dHi, &dLo );
24512   breakupV256toV128s( ctrlV, &cHi, &cLo );
24513   IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
24514   IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
24515   IRTemp res = newTemp(Ity_V256);
24516   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
24517   return res;
24518}
24519
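/* Decode VEX-encoded instructions in the 0F 38 escape space.  Returns
   deltaIN unchanged if no instruction was decoded, else the delta
   advanced past the decoded instruction.  *uses_vvvv is set to True
   if the decoded instruction makes use of the VEX.vvvv field. */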
24520__attribute__((noinline))
24521static
24522Long dis_ESC_0F38__VEX (
24523        /*MB_OUT*/DisResult* dres,
24524        /*OUT*/   Bool*      uses_vvvv,
24525        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
24526        Bool         resteerCisOk,
24527        void*        callback_opaque,
24528        VexArchInfo* archinfo,
24529        VexAbiInfo*  vbi,
24530        Prefix pfx, Int sz, Long deltaIN
24531     )
24532{
24533   IRTemp addr  = IRTemp_INVALID;
24534   Int    alen  = 0;
24535   HChar  dis_buf[50];
24536   Long   delta = deltaIN;
24537   UChar  opc   = getUChar(delta);
24538   delta++;
24539   *uses_vvvv = False;
24540
24541   switch (opc) {
24542
24543   case 0x00:
24544      /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
24545      /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
24546      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24547         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
24548                    uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
24549         goto decode_success;
24550      }
24551      break;
24552
24553   case 0x01:
24554   case 0x02:
24555   case 0x03:
24556      /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
24557      /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
24558      /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
24559      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24560         delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
24561         *uses_vvvv = True;
24562         goto decode_success;
24563      }
24564      break;
24565
24566   case 0x04:
24567      /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
24568      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24569         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
24570                    uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
24571                    math_PMADDUBSW_128 );
24572         goto decode_success;
24573      }
24574      break;
24575
24576   case 0x05:
24577   case 0x06:
24578   case 0x07:
24579      /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
24580      /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
24581      /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
24582      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24583         delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
24584         *uses_vvvv = True;
24585         goto decode_success;
24586      }
24587      break;
24588
24589   case 0x08:
24590   case 0x09:
24591   case 0x0A:
24592      /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
24593      /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
24594      /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
24595      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24596         IRTemp sV      = newTemp(Ity_V128);
24597         IRTemp dV      = newTemp(Ity_V128);
24598         IRTemp sHi, sLo, dHi, dLo;
24599         sHi = sLo = dHi = dLo = IRTemp_INVALID;
24600         UChar  ch      = '?';
24601         Int    laneszB = 0;
24602         UChar  modrm   = getUChar(delta);
24603         UInt   rG      = gregOfRexRM(pfx,modrm);
24604         UInt   rV      = getVexNvvvv(pfx);
24605
24606         switch (opc) {
24607            case 0x08: laneszB = 1; ch = 'b'; break;
24608            case 0x09: laneszB = 2; ch = 'w'; break;
24609            case 0x0A: laneszB = 4; ch = 'd'; break;
24610            default: vassert(0);
24611         }
24612
24613         assign( dV, getXMMReg(rV) );
24614
24615         if (epartIsReg(modrm)) {
24616            UInt rE = eregOfRexRM(pfx,modrm);
24617            assign( sV, getXMMReg(rE) );
24618            delta += 1;
24619            DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
24620                nameXMMReg(rV), nameXMMReg(rG));
24621         } else {
24622            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24623            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
24624            delta += alen;
24625            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
24626                nameXMMReg(rV), nameXMMReg(rG));
24627         }
24628
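         /* The PSIGN helper works on 64-bit chunks, so split both
            operands into halves, process each, and recombine. */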
24629         breakupV128to64s( dV, &dHi, &dLo );
24630         breakupV128to64s( sV, &sHi, &sLo );
24631
24632         putYMMRegLoAndZU(
24633            rG,
24634            binop(Iop_64HLtoV128,
24635                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
24636                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
24637            )
24638         );
24639         *uses_vvvv = True;
24640         goto decode_success;
24641      }
24642      break;
24643
24644   case 0x0B:
24645      /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
24646      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24647         IRTemp sV      = newTemp(Ity_V128);
24648         IRTemp dV      = newTemp(Ity_V128);
24649         IRTemp sHi, sLo, dHi, dLo;
24650         sHi = sLo = dHi = dLo = IRTemp_INVALID;
24651         UChar  modrm   = getUChar(delta);
24652         UInt   rG      = gregOfRexRM(pfx,modrm);
24653         UInt   rV      = getVexNvvvv(pfx);
24654
24655         assign( dV, getXMMReg(rV) );
24656
24657         if (epartIsReg(modrm)) {
24658            UInt rE = eregOfRexRM(pfx,modrm);
24659            assign( sV, getXMMReg(rE) );
24660            delta += 1;
24661            DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
24662                nameXMMReg(rV), nameXMMReg(rG));
24663         } else {
24664            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24665            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
24666            delta += alen;
24667            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
24668                nameXMMReg(rV), nameXMMReg(rG));
24669         }
24670
24671         breakupV128to64s( dV, &dHi, &dLo );
24672         breakupV128to64s( sV, &sHi, &sLo );
24673
24674         putYMMRegLoAndZU(
24675            rG,
24676            binop(Iop_64HLtoV128,
24677                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
24678                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
24679            )
24680         );
24681         *uses_vvvv = True;
24682         goto decode_success;
24683      }
24684      break;
24685
24686   case 0x0C:
24687      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
24688      if (have66noF2noF3(pfx)
24689          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
24690         UChar  modrm = getUChar(delta);
24691         UInt   rG    = gregOfRexRM(pfx, modrm);
24692         UInt   rV    = getVexNvvvv(pfx);
24693         IRTemp ctrlV = newTemp(Ity_V128);
24694         if (epartIsReg(modrm)) {
24695            UInt rE = eregOfRexRM(pfx, modrm);
24696            delta += 1;
24697            DIP("vpermilps %s,%s,%s\n",
24698                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24699            assign(ctrlV, getXMMReg(rE));
24700         } else {
24701            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24702            delta += alen;
24703            DIP("vpermilps %s,%s,%s\n",
24704                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
24705            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
24706         }
24707         IRTemp dataV = newTemp(Ity_V128);
24708         assign(dataV, getXMMReg(rV));
24709         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
24710         putYMMRegLoAndZU(rG, mkexpr(resV));
24711         *uses_vvvv = True;
24712         goto decode_success;
24713      }
24714      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
24715      if (have66noF2noF3(pfx)
24716          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
24717         UChar  modrm = getUChar(delta);
24718         UInt   rG    = gregOfRexRM(pfx, modrm);
24719         UInt   rV    = getVexNvvvv(pfx);
24720         IRTemp ctrlV = newTemp(Ity_V256);
24721         if (epartIsReg(modrm)) {
24722            UInt rE = eregOfRexRM(pfx, modrm);
24723            delta += 1;
24724            DIP("vpermilps %s,%s,%s\n",
24725                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
24726            assign(ctrlV, getYMMReg(rE));
24727         } else {
24728            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24729            delta += alen;
24730            DIP("vpermilps %s,%s,%s\n",
24731                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
24732            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
24733         }
24734         IRTemp dataV = newTemp(Ity_V256);
24735         assign(dataV, getYMMReg(rV));
24736         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
24737         putYMMReg(rG, mkexpr(resV));
24738         *uses_vvvv = True;
24739         goto decode_success;
24740      }
24741      break;
24742
24743   case 0x0D:
24744      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
24745      if (have66noF2noF3(pfx)
24746          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
24747         UChar  modrm = getUChar(delta);
24748         UInt   rG    = gregOfRexRM(pfx, modrm);
24749         UInt   rV    = getVexNvvvv(pfx);
24750         IRTemp ctrlV = newTemp(Ity_V128);
24751         if (epartIsReg(modrm)) {
24752            UInt rE = eregOfRexRM(pfx, modrm);
24753            delta += 1;
24754            DIP("vpermilpd %s,%s,%s\n",
24755                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24756            assign(ctrlV, getXMMReg(rE));
24757         } else {
24758            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24759            delta += alen;
24760            DIP("vpermilpd %s,%s,%s\n",
24761                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
24762            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
24763         }
24764         IRTemp dataV = newTemp(Ity_V128);
24765         assign(dataV, getXMMReg(rV));
24766         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
24767         putYMMRegLoAndZU(rG, mkexpr(resV));
24768         *uses_vvvv = True;
24769         goto decode_success;
24770      }
24771      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
24772      if (have66noF2noF3(pfx)
24773          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
24774         UChar  modrm = getUChar(delta);
24775         UInt   rG    = gregOfRexRM(pfx, modrm);
24776         UInt   rV    = getVexNvvvv(pfx);
24777         IRTemp ctrlV = newTemp(Ity_V256);
24778         if (epartIsReg(modrm)) {
24779            UInt rE = eregOfRexRM(pfx, modrm);
24780            delta += 1;
24781            DIP("vpermilpd %s,%s,%s\n",
24782                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
24783            assign(ctrlV, getYMMReg(rE));
24784         } else {
24785            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24786            delta += alen;
24787            DIP("vpermilpd %s,%s,%s\n",
24788                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
24789            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
24790         }
24791         IRTemp dataV = newTemp(Ity_V256);
24792         assign(dataV, getYMMReg(rV));
24793         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
24794         putYMMReg(rG, mkexpr(resV));
24795         *uses_vvvv = True;
24796         goto decode_success;
24797      }
24798      break;
24799
24800   case 0x0E:
24801      /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
24802      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24803         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
24804         goto decode_success;
24805      }
24806      /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
24807      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24808         delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
24809         goto decode_success;
24810      }
24811      break;
24812
24813   case 0x0F:
24814      /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
24815      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24816         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
24817         goto decode_success;
24818      }
24819      /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
24820      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24821         delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
24822         goto decode_success;
24823      }
24824      break;
24825
24826   case 0x17:
24827      /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
24828      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24829         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
24830         goto decode_success;
24831      }
24832      /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
24833      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24834         delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
24835         goto decode_success;
24836      }
24837      break;
24838
24839   case 0x18:
24840      /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
24841      if (have66noF2noF3(pfx)
24842          && 0==getVexL(pfx)/*128*/
24843          && !epartIsReg(getUChar(delta))) {
24844         UChar modrm = getUChar(delta);
24845         UInt  rG    = gregOfRexRM(pfx, modrm);
24846         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24847         delta += alen;
24848         DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
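         /* Splat the loaded 32-bit value across all four lanes by
            pairing it into 64 bits and then pairing that into 128. */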
24849         IRTemp t32 = newTemp(Ity_I32);
24850         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
24851         IRTemp t64 = newTemp(Ity_I64);
24852         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
24853         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
24854         putYMMRegLoAndZU(rG, res);
24855         goto decode_success;
24856      }
24857      /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
24858      if (have66noF2noF3(pfx)
24859          && 1==getVexL(pfx)/*256*/
24860          && !epartIsReg(getUChar(delta))) {
24861         UChar modrm = getUChar(delta);
24862         UInt  rG    = gregOfRexRM(pfx, modrm);
24863         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24864         delta += alen;
24865         DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
24866         IRTemp t32 = newTemp(Ity_I32);
24867         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
24868         IRTemp t64 = newTemp(Ity_I64);
24869         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
24870         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
24871                                                  mkexpr(t64), mkexpr(t64));
24872         putYMMReg(rG, res);
24873         goto decode_success;
24874      }
24875      break;
24876
24877   case 0x19:
24878      /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
24879      if (have66noF2noF3(pfx)
24880          && 1==getVexL(pfx)/*256*/
24881          && !epartIsReg(getUChar(delta))) {
24882         UChar modrm = getUChar(delta);
24883         UInt  rG    = gregOfRexRM(pfx, modrm);
24884         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24885         delta += alen;
24886         DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
24887         IRTemp t64 = newTemp(Ity_I64);
24888         assign(t64, loadLE(Ity_I64, mkexpr(addr)));
24889         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
24890                                                  mkexpr(t64), mkexpr(t64));
24891         putYMMReg(rG, res);
24892         goto decode_success;
24893      }
24894      break;
24895
24896   case 0x1A:
24897      /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
24898      if (have66noF2noF3(pfx)
24899          && 1==getVexL(pfx)/*256*/
24900          && !epartIsReg(getUChar(delta))) {
24901         UChar modrm = getUChar(delta);
24902         UInt  rG    = gregOfRexRM(pfx, modrm);
24903         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24904         delta += alen;
24905         DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
24906         IRTemp t128 = newTemp(Ity_V128);
24907         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
24908         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
24909         goto decode_success;
24910      }
24911      break;
24912
24913   case 0x1C:
24914      /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
24915      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24916         delta = dis_AVX128_E_to_G_unary(
24917                    uses_vvvv, vbi, pfx, delta,
24918                    "vpabsb", math_PABS_XMM_pap1 );
24919         goto decode_success;
24920      }
24921      break;
24922
24923   case 0x1D:
24924      /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
24925      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24926         delta = dis_AVX128_E_to_G_unary(
24927                    uses_vvvv, vbi, pfx, delta,
24928                    "vpabsw", math_PABS_XMM_pap2 );
24929         goto decode_success;
24930      }
24931      break;
24932
24933   case 0x1E:
24934      /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
24935      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24936         delta = dis_AVX128_E_to_G_unary(
24937                    uses_vvvv, vbi, pfx, delta,
24938                    "vpabsd", math_PABS_XMM_pap4 );
24939         goto decode_success;
24940      }
24941      break;
24942
24943   case 0x20:
24944      /* VPMOVSXBW xmm2/m64, xmm1 */
24945      /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
24946      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24947         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
24948                                   True/*isAvx*/, False/*!xIsZ*/ );
24949         goto decode_success;
24950      }
24951      break;
24952
24953   case 0x21:
24954      /* VPMOVSXBD xmm2/m32, xmm1 */
24955      /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
24956      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24957         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
24958                                   True/*isAvx*/, False/*!xIsZ*/ );
24959         goto decode_success;
24960      }
24961      break;
24962
24963   case 0x22:
24964      /* VPMOVSXBQ xmm2/m16, xmm1 */
24965      /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
24966      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24967         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
24968         goto decode_success;
24969      }
24970      break;
24971
24972   case 0x23:
24973      /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
24974      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24975         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
24976                                   True/*isAvx*/, False/*!xIsZ*/ );
24977         goto decode_success;
24978      }
24979      break;
24980
24981   case 0x24:
24982      /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
24983      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24984         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
24985         goto decode_success;
24986      }
24987      break;
24988
24989   case 0x25:
24990      /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
24991      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24992         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
24993                                   True/*isAvx*/, False/*!xIsZ*/ );
24994         goto decode_success;
24995      }
24996      break;
24997
24998   case 0x28:
24999      /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
25000      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25001         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
25002                    uses_vvvv, vbi, pfx, delta,
25003                    "vpmuldq", math_PMULDQ_128 );
25004         goto decode_success;
25005      }
25006      break;
25007
25008   case 0x29:
25009      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
25010      /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
25011      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25012         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25013                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
25014         goto decode_success;
25015      }
25016      break;
25017
25018   case 0x2A:
25019      /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
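      /* Non-temporal loads require a 16-aligned source, hence the
         explicit alignment check below. */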
25020      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
25021          && !epartIsReg(getUChar(delta))) {
25022         UChar  modrm = getUChar(delta);
25023         UInt   rD    = gregOfRexRM(pfx, modrm);
25024         IRTemp tD    = newTemp(Ity_V128);
25025         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
25026         delta += alen;
25027         gen_SEGV_if_not_16_aligned(addr);
25028         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
25029         DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
25030         putYMMRegLoAndZU(rD, mkexpr(tD));
25031         goto decode_success;
25032      }
25033      break;
25034
25035   case 0x2B:
25036      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
25037      /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
25038      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25039         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25040                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
25041                    Iop_QNarrowBin32Sto16Ux8, NULL,
25042                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25043         goto decode_success;
25044      }
25045      break;
25046
25047   case 0x30:
25048      /* VPMOVZXBW xmm2/m64, xmm1 */
25049      /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
25050      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25051         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
25052                                   True/*isAvx*/, True/*xIsZ*/ );
25053         goto decode_success;
25054      }
25055      break;
25056
25057   case 0x31:
25058      /* VPMOVZXBD xmm2/m32, xmm1 */
25059      /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
25060      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25061         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
25062                                   True/*isAvx*/, True/*xIsZ*/ );
25063         goto decode_success;
25064      }
25065      break;
25066
25067   case 0x32:
25068      /* VPMOVZXBQ xmm2/m16, xmm1 */
25069      /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
25070      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25071         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
25072         goto decode_success;
25073      }
25074      break;
25075
25076   case 0x33:
25077      /* VPMOVZXWD xmm2/m64, xmm1 */
25078      /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
25079      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25080         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
25081                                   True/*isAvx*/, True/*xIsZ*/ );
25082         goto decode_success;
25083      }
25084      break;
25085
25086   case 0x34:
25087      /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
25088      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25089         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
25090         goto decode_success;
25091      }
25092      break;
25093
25094   case 0x35:
25095      /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
25096      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25097         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
25098                                   True/*isAvx*/, True/*xIsZ*/ );
25099         goto decode_success;
25100      }
25101      break;
25102
25103   case 0x37:
25104      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
25105      /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
25106      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25107         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25108                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
25109         goto decode_success;
25110      }
25111      break;
25112
25113   case 0x38:
25114      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
25115      /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
25116      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25117         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25118                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
25119         goto decode_success;
25120      }
25121      break;
25122
25123   case 0x39:
25124      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
25125      /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
25126      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25127         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25128                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
25129         goto decode_success;
25130      }
25131      break;
25132
25133   case 0x3A:
25134      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
25135      /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
25136      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25137         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25138                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
25139         goto decode_success;
25140      }
25141      break;
25142
25143   case 0x3B:
25144      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
25145      /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
25146      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25147         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25148                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
25149         goto decode_success;
25150      }
25151      break;
25152
25153   case 0x3C:
25154      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
25155      /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
25156      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25157         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25158                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
25159         goto decode_success;
25160      }
25161      break;
25162
25163   case 0x3D:
25164      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
25165      /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
25166      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25167         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25168                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
25169         goto decode_success;
25170      }
25171      break;
25172
25173   case 0x3E:
25174      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
25175      /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
25176      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25177         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25178                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
25179         goto decode_success;
25180      }
25181      break;
25182
25183   case 0x3F:
25184      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
25185      /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
25186      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25187         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25188                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
25189         goto decode_success;
25190      }
25191      break;
25192
25193   case 0x40:
25194      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
25195      /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
25196      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25197         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25198                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
25199         goto decode_success;
25200      }
25201      break;
25202
25203   case 0x41:
25204      /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
25205      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25206         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
25207         goto decode_success;
25208      }
25209      break;
25210
25211   case 0xDB:
25212   case 0xDC:
25213   case 0xDD:
25214   case 0xDE:
25215   case 0xDF:
25216      /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
25217      /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
25218      /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
25219      /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
25220      /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
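      /* VAESIMC is the only two-operand instruction here, so it
         alone leaves VEX.vvvv unused. */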
25221      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
25223         if (opc != 0xDB) *uses_vvvv = True;
25224         goto decode_success;
25225      }
25226      break;
25227
25228   default:
25229      break;
25230
25231   }
25232
25233  //decode_failure:
25234   return deltaIN;
25235
25236  decode_success:
25237   return delta;
25238}
25239
25240
25241/*------------------------------------------------------------*/
25242/*---                                                      ---*/
25243/*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX    ---*/
25244/*---                                                      ---*/
25245/*------------------------------------------------------------*/
25246
25247static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
25248{
25249   vassert(imm8 < 256);
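   /* imm8 supplies four 2-bit source-lane selectors, one per result
      lane; e.g. imm8 == 0x1B (0b00011011) reverses the lane order. */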
25250   IRTemp s3, s2, s1, s0;
25251   s3 = s2 = s1 = s0 = IRTemp_INVALID;
25252   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
25253#  define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
25254                                    : ((_nn)==2) ? s2 : s3)
25255   IRTemp res = newTemp(Ity_V128);
25256   assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
25257                              SEL((imm8 >> 4) & 3),
25258                              SEL((imm8 >> 2) & 3),
25259                              SEL((imm8 >> 0) & 3) ));
25260#  undef SEL
25261   return res;
25262}
25263
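/* Decode VEX-encoded instructions in the 0F 3A escape space.  Returns
   deltaIN unchanged if no instruction was decoded, else the delta
   advanced past the decoded instruction.  *uses_vvvv is set to True
   if the decoded instruction makes use of the VEX.vvvv field. */
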
25264__attribute__((noinline))
25265static
25266Long dis_ESC_0F3A__VEX (
25267        /*MB_OUT*/DisResult* dres,
25268        /*OUT*/   Bool*      uses_vvvv,
25269        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
25270        Bool         resteerCisOk,
25271        void*        callback_opaque,
25272        VexArchInfo* archinfo,
25273        VexAbiInfo*  vbi,
25274        Prefix pfx, Int sz, Long deltaIN
25275     )
25276{
25277   IRTemp addr  = IRTemp_INVALID;
25278   Int    alen  = 0;
25279   HChar  dis_buf[50];
25280   Long   delta = deltaIN;
25281   UChar  opc   = getUChar(delta);
25282   delta++;
25283   *uses_vvvv = False;
25284
25285   switch (opc) {
25286
25287   case 0x04:
25288      /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
25289      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25290         UChar  modrm = getUChar(delta);
25291         UInt   imm8  = 0;
25292         UInt   rG    = gregOfRexRM(pfx, modrm);
25293         IRTemp sV    = newTemp(Ity_V256);
25294         if (epartIsReg(modrm)) {
25295            UInt rE = eregOfRexRM(pfx, modrm);
25296            delta += 1;
25297            imm8 = getUChar(delta);
25298            DIP("vpermilps $%u,%s,%s\n",
25299                imm8, nameYMMReg(rE), nameYMMReg(rG));
25300            assign(sV, getYMMReg(rE));
25301         } else {
25302            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25303            delta += alen;
25304            imm8 = getUChar(delta);
25305            DIP("vpermilps $%u,%s,%s\n",
25306                imm8, dis_buf, nameYMMReg(rG));
25307            assign(sV, loadLE(Ity_V256, mkexpr(addr)));
25308         }
25309         delta++;
25310         IRTemp  sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
25311         breakupV256toV128s( sV, &sVhi, &sVlo );
25312         IRTemp  dVhi = math_VPERMILPS_128( sVhi, imm8 );
25313         IRTemp  dVlo = math_VPERMILPS_128( sVlo, imm8 );
25314         IRExpr* res  = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
25315         putYMMReg(rG, res);
25316         goto decode_success;
25317      }
25318      /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
25319      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25320         UChar  modrm = getUChar(delta);
25321         UInt   imm8  = 0;
25322         UInt   rG    = gregOfRexRM(pfx, modrm);
25323         IRTemp sV    = newTemp(Ity_V128);
25324         if (epartIsReg(modrm)) {
25325            UInt rE = eregOfRexRM(pfx, modrm);
25326            delta += 1;
25327            imm8 = getUChar(delta);
25328            DIP("vpermilps $%u,%s,%s\n",
25329                imm8, nameXMMReg(rE), nameXMMReg(rG));
25330            assign(sV, getXMMReg(rE));
25331         } else {
25332            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25333            delta += alen;
25334            imm8 = getUChar(delta);
25335            DIP("vpermilps $%u,%s,%s\n",
25336                imm8, dis_buf, nameXMMReg(rG));
25337            assign(sV, loadLE(Ity_V128, mkexpr(addr)));
25338         }
25339         delta++;
25340         putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
25341         goto decode_success;
25342      }
25343      break;
25344
25345   case 0x05:
25346      /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
25347      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25348         UChar  modrm = getUChar(delta);
25349         UInt   imm8  = 0;
25350         UInt   rG    = gregOfRexRM(pfx, modrm);
25351         IRTemp sV    = newTemp(Ity_V128);
25352         if (epartIsReg(modrm)) {
25353            UInt rE = eregOfRexRM(pfx, modrm);
25354            delta += 1;
25355            imm8 = getUChar(delta);
25356            DIP("vpermilpd $%u,%s,%s\n",
25357                imm8, nameXMMReg(rE), nameXMMReg(rG));
25358            assign(sV, getXMMReg(rE));
25359         } else {
25360            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25361            delta += alen;
25362            imm8 = getUChar(delta);
25363            DIP("vpermilpd $%u,%s,%s\n",
25364                imm8, dis_buf, nameXMMReg(rG));
25365            assign(sV, loadLE(Ity_V128, mkexpr(addr)));
25366         }
25367         delta++;
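         /* imm8 bit 0 selects the source (low or high input lane)
            for the low result lane, and bit 1 likewise for the
            high result lane. */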
25368         IRTemp s1 = newTemp(Ity_I64);
25369         IRTemp s0 = newTemp(Ity_I64);
25370         assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
25371         assign(s0, unop(Iop_V128to64,   mkexpr(sV)));
25372         IRTemp dV = newTemp(Ity_V128);
25373         assign(dV, binop(Iop_64HLtoV128,
25374                               mkexpr((imm8 & (1<<1)) ? s1 : s0),
25375                               mkexpr((imm8 & (1<<0)) ? s1 : s0)));
25376         putYMMRegLoAndZU(rG, mkexpr(dV));
25377         goto decode_success;
25378      }
25379      /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
25380      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25381         UChar  modrm = getUChar(delta);
25382         UInt   imm8  = 0;
25383         UInt   rG    = gregOfRexRM(pfx, modrm);
25384         IRTemp sV    = newTemp(Ity_V256);
25385         if (epartIsReg(modrm)) {
25386            UInt rE = eregOfRexRM(pfx, modrm);
25387            delta += 1;
25388            imm8 = getUChar(delta);
25389            DIP("vpermilpd $%u,%s,%s\n",
25390                imm8, nameYMMReg(rE), nameYMMReg(rG));
25391            assign(sV, getYMMReg(rE));
25392         } else {
25393            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25394            delta += alen;
25395            imm8 = getUChar(delta);
25396            DIP("vpermilpd $%u,%s,%s\n",
25397                imm8, dis_buf, nameYMMReg(rG));
25398            assign(sV, loadLE(Ity_V256, mkexpr(addr)));
25399         }
25400         delta++;
25401         IRTemp s3, s2, s1, s0;
25402         s3 = s2 = s1 = s0 = IRTemp_INVALID;
25403         breakupV256to64s(sV, &s3, &s2, &s1, &s0);
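         /* Selection is confined to each 128-bit half: imm8 bits 1:0
            steer the low half's lanes and bits 3:2 the high half's. */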
25404         IRTemp dV = newTemp(Ity_V256);
25405         assign(dV, IRExpr_Qop(Iop_64x4toV256,
25406                               mkexpr((imm8 & (1<<3)) ? s3 : s2),
25407                               mkexpr((imm8 & (1<<2)) ? s3 : s2),
25408                               mkexpr((imm8 & (1<<1)) ? s1 : s0),
25409                               mkexpr((imm8 & (1<<0)) ? s1 : s0)));
25410         putYMMReg(rG, mkexpr(dV));
25411         goto decode_success;
25412      }
25413      break;
25414
25415   case 0x06:
      /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 06 /r ib */
25417      if (have66noF2noF3(pfx)
25418          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
25419         UChar  modrm = getUChar(delta);
25420         UInt   imm8  = 0;
25421         UInt   rG    = gregOfRexRM(pfx, modrm);
25422         UInt   rV    = getVexNvvvv(pfx);
25423         IRTemp s00   = newTemp(Ity_V128);
25424         IRTemp s01   = newTemp(Ity_V128);
25425         IRTemp s10   = newTemp(Ity_V128);
25426         IRTemp s11   = newTemp(Ity_V128);
25427         assign(s00, getYMMRegLane128(rV, 0));
25428         assign(s01, getYMMRegLane128(rV, 1));
25429         if (epartIsReg(modrm)) {
25430            UInt rE = eregOfRexRM(pfx, modrm);
25431            delta += 1;
25432            imm8 = getUChar(delta);
25433            DIP("vperm2f128 $%u,%s,%s,%s\n",
25434                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
25435            assign(s10, getYMMRegLane128(rE, 0));
25436            assign(s11, getYMMRegLane128(rE, 1));
25437         } else {
25438            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25439            delta += alen;
25440            imm8 = getUChar(delta);
25441            DIP("vperm2f128 $%u,%s,%s,%s\n",
25442                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
25443            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
25444                                               mkexpr(addr), mkU64(0))));
25445            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
25446                                               mkexpr(addr), mkU64(16))));
25447         }
25448         delta++;
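         /* imm8 bits 1:0 select the 128-bit source for the result's
            low lane and bits 5:4 for its high lane, from the four
            candidates {vvvv.lo, vvvv.hi, rm.lo, rm.hi}; bits 3 and 7
            force the corresponding result lane to zero. */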
25449#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
25450                                           : ((_nn)==2) ? s10 : s11)
25451         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
25452         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
25453#        undef SEL
25454         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
25455         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
25456         *uses_vvvv = True;
25457         goto decode_success;
25458      }
25459      break;
25460
25461   case 0x08:
25462      /* VROUNDPS imm8, xmm2/m128, xmm1 */
      /* VROUNDPS = VEX.128.66.0F3A.WIG 08 /r ib */
25464      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25465         UChar  modrm = getUChar(delta);
25466         UInt   rG    = gregOfRexRM(pfx, modrm);
25467         IRTemp src   = newTemp(Ity_V128);
25468         IRTemp s0    = IRTemp_INVALID;
25469         IRTemp s1    = IRTemp_INVALID;
25470         IRTemp s2    = IRTemp_INVALID;
25471         IRTemp s3    = IRTemp_INVALID;
25472         IRTemp rm    = newTemp(Ity_I32);
25473         Int    imm   = 0;
25474
25477         if (epartIsReg(modrm)) {
25478            UInt rE = eregOfRexRM(pfx, modrm);
25479            assign( src, getXMMReg( rE ) );
25480            imm = getUChar(delta+1);
25481            if (imm & ~15) break;
25482            delta += 1+1;
25483            DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
25484         } else {
25485            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25486            assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
25487            imm = getUChar(delta+alen);
25488            if (imm & ~15) break;
25489            delta += alen+1;
25490            DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
25491         }
25492
25493         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
25494            that encoding is the same as the encoding for IRRoundingMode,
25495            we can use that value directly in the IR as a rounding
25496            mode. */
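         /* In both encodings: 0 = to-nearest-even, 1 = towards -inf,
            2 = towards +inf, 3 = towards zero.  Bit 2 of imm set
            means "use the dynamic MXCSR.RC rounding mode" instead. */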
25497         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
25498
25499         breakupV128to32s( src, &s3, &s2, &s1, &s0 );
25500         putYMMRegLane128( rG, 1, mkV128(0) );
25501#        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
25502                             unop(Iop_ReinterpI32asF32, mkexpr(s)))
25503         putYMMRegLane32F( rG, 3, CVT(s3) );
25504         putYMMRegLane32F( rG, 2, CVT(s2) );
25505         putYMMRegLane32F( rG, 1, CVT(s1) );
25506         putYMMRegLane32F( rG, 0, CVT(s0) );
25507#        undef CVT
25508         goto decode_success;
25509      }
25510      /* VROUNDPS imm8, ymm2/m256, ymm1 */
      /* VROUNDPS = VEX.256.66.0F3A.WIG 08 /r ib */
25512      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25513         UChar  modrm = getUChar(delta);
25514         UInt   rG    = gregOfRexRM(pfx, modrm);
25515         IRTemp src   = newTemp(Ity_V256);
25516         IRTemp s0    = IRTemp_INVALID;
25517         IRTemp s1    = IRTemp_INVALID;
25518         IRTemp s2    = IRTemp_INVALID;
25519         IRTemp s3    = IRTemp_INVALID;
25520         IRTemp s4    = IRTemp_INVALID;
25521         IRTemp s5    = IRTemp_INVALID;
25522         IRTemp s6    = IRTemp_INVALID;
25523         IRTemp s7    = IRTemp_INVALID;
25524         IRTemp rm    = newTemp(Ity_I32);
25525         Int    imm   = 0;
25526
25529         if (epartIsReg(modrm)) {
25530            UInt rE = eregOfRexRM(pfx, modrm);
25531            assign( src, getYMMReg( rE ) );
25532            imm = getUChar(delta+1);
25533            if (imm & ~15) break;
25534            delta += 1+1;
25535            DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
25536         } else {
25537            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25538            assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
25539            imm = getUChar(delta+alen);
25540            if (imm & ~15) break;
25541            delta += alen+1;
25542            DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
25543         }
25544
25545         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
25546            that encoding is the same as the encoding for IRRoundingMode,
25547            we can use that value directly in the IR as a rounding
25548            mode. */
25549         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
25550
25551         breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
25552#        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
25553                             unop(Iop_ReinterpI32asF32, mkexpr(s)))
25554         putYMMRegLane32F( rG, 7, CVT(s7) );
25555         putYMMRegLane32F( rG, 6, CVT(s6) );
25556         putYMMRegLane32F( rG, 5, CVT(s5) );
25557         putYMMRegLane32F( rG, 4, CVT(s4) );
25558         putYMMRegLane32F( rG, 3, CVT(s3) );
25559         putYMMRegLane32F( rG, 2, CVT(s2) );
25560         putYMMRegLane32F( rG, 1, CVT(s1) );
25561         putYMMRegLane32F( rG, 0, CVT(s0) );
25562#        undef CVT
25563         goto decode_success;
      }
      break;

25566   case 0x09:
25567      /* VROUNDPD imm8, xmm2/m128, xmm1 */
      /* VROUNDPD = VEX.128.66.0F3A.WIG 09 /r ib */
25569      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25570         UChar  modrm = getUChar(delta);
25571         UInt   rG    = gregOfRexRM(pfx, modrm);
25572         IRTemp src   = newTemp(Ity_V128);
25573         IRTemp s0    = IRTemp_INVALID;
25574         IRTemp s1    = IRTemp_INVALID;
25575         IRTemp rm    = newTemp(Ity_I32);
25576         Int    imm   = 0;
25577
25580         if (epartIsReg(modrm)) {
25581            UInt rE = eregOfRexRM(pfx, modrm);
25582            assign( src, getXMMReg( rE ) );
25583            imm = getUChar(delta+1);
25584            if (imm & ~15) break;
25585            delta += 1+1;
25586            DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
25587         } else {
25588            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25589            assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
25590            imm = getUChar(delta+alen);
25591            if (imm & ~15) break;
25592            delta += alen+1;
25593            DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
25594         }
25595
25596         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
25597            that encoding is the same as the encoding for IRRoundingMode,
25598            we can use that value directly in the IR as a rounding
25599            mode. */
25600         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
25601
25602         breakupV128to64s( src, &s1, &s0 );
25603         putYMMRegLane128( rG, 1, mkV128(0) );
25604#        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
25605                             unop(Iop_ReinterpI64asF64, mkexpr(s)))
25606         putYMMRegLane64F( rG, 1, CVT(s1) );
25607         putYMMRegLane64F( rG, 0, CVT(s0) );
25608#        undef CVT
25609         goto decode_success;
25610      }
25611      /* VROUNDPD imm8, ymm2/m256, ymm1 */
      /* VROUNDPD = VEX.256.66.0F3A.WIG 09 /r ib */
25613      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25614         UChar  modrm = getUChar(delta);
25615         UInt   rG    = gregOfRexRM(pfx, modrm);
25616         IRTemp src   = newTemp(Ity_V256);
25617         IRTemp s0    = IRTemp_INVALID;
25618         IRTemp s1    = IRTemp_INVALID;
25619         IRTemp s2    = IRTemp_INVALID;
25620         IRTemp s3    = IRTemp_INVALID;
25621         IRTemp rm    = newTemp(Ity_I32);
25622         Int    imm   = 0;
25623
25626         if (epartIsReg(modrm)) {
25627            UInt rE = eregOfRexRM(pfx, modrm);
25628            assign( src, getYMMReg( rE ) );
25629            imm = getUChar(delta+1);
25630            if (imm & ~15) break;
25631            delta += 1+1;
25632            DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
25633         } else {
25634            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25635            assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
25636            imm = getUChar(delta+alen);
25637            if (imm & ~15) break;
25638            delta += alen+1;
            DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
25640         }
25641
25642         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
25643            that encoding is the same as the encoding for IRRoundingMode,
25644            we can use that value directly in the IR as a rounding
25645            mode. */
25646         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
25647
25648         breakupV256to64s( src, &s3, &s2, &s1, &s0 );
25649#        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
25650                             unop(Iop_ReinterpI64asF64, mkexpr(s)))
25651         putYMMRegLane64F( rG, 3, CVT(s3) );
25652         putYMMRegLane64F( rG, 2, CVT(s2) );
25653         putYMMRegLane64F( rG, 1, CVT(s1) );
25654         putYMMRegLane64F( rG, 0, CVT(s0) );
25655#        undef CVT
25656         goto decode_success;
      }
      break;
25658
25659   case 0x0A:
25660   case 0x0B:
25661      /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
25662      /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
25663      /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
25664      /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
25665      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25666         UChar  modrm = getUChar(delta);
25667         UInt   rG    = gregOfRexRM(pfx, modrm);
25668         UInt   rV    = getVexNvvvv(pfx);
25669         Bool   isD   = opc == 0x0B;
25670         IRTemp src   = newTemp(isD ? Ity_F64 : Ity_F32);
25671         IRTemp res   = newTemp(isD ? Ity_F64 : Ity_F32);
25672         Int    imm   = 0;
25673
25674         if (epartIsReg(modrm)) {
25675            UInt rE = eregOfRexRM(pfx, modrm);
25676            assign( src,
25677                    isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
25678            imm = getUChar(delta+1);
25679            if (imm & ~15) break;
25680            delta += 1+1;
25681            DIP( "vrounds%c $%d,%s,%s,%s\n",
25682                 isD ? 'd' : 's',
25683                 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
25684         } else {
25685            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25686            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
25687            imm = getUChar(delta+alen);
25688            if (imm & ~15) break;
25689            delta += alen+1;
25690            DIP( "vrounds%c $%d,%s,%s,%s\n",
25691                 isD ? 'd' : 's',
25692                 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
25693         }
25694
25695         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
25696            that encoding is the same as the encoding for IRRoundingMode,
25697            we can use that value directly in the IR as a rounding
25698            mode. */
25699         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
25700                           (imm & 4) ? get_sse_roundingmode()
25701                                     : mkU32(imm & 3),
25702                           mkexpr(src)) );
25703
25704         if (isD)
25705            putXMMRegLane64F( rG, 0, mkexpr(res) );
25706         else {
25707            putXMMRegLane32F( rG, 0, mkexpr(res) );
25708            putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
25709         }
25710         putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
25711         putYMMRegLane128( rG, 1, mkV128(0) );
25712         *uses_vvvv = True;
25713         goto decode_success;
25714      }
25715      break;
25716
25717   case 0x0C:
25718      /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
25719      /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
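      /* imm8 bit i selects 32-bit lane i of the result: 1 takes the
         lane from the E operand (ymm3/m256), 0 from the vvvv
         register. */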
25720      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25721         UChar  modrm = getUChar(delta);
25722         UInt   imm8;
25723         UInt   rG    = gregOfRexRM(pfx, modrm);
25724         UInt   rV    = getVexNvvvv(pfx);
25725         IRTemp sV    = newTemp(Ity_V256);
25726         IRTemp sE    = newTemp(Ity_V256);
25727         assign ( sV, getYMMReg(rV) );
25728         if (epartIsReg(modrm)) {
25729            UInt rE = eregOfRexRM(pfx, modrm);
25730            delta += 1;
25731            imm8 = getUChar(delta);
25732            DIP("vblendps $%u,%s,%s,%s\n",
25733                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
25734            assign(sE, getYMMReg(rE));
25735         } else {
25736            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25737            delta += alen;
25738            imm8 = getUChar(delta);
25739            DIP("vblendps $%u,%s,%s,%s\n",
25740                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
25741            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
25742         }
25743         delta++;
25744         putYMMReg( rG,
25745                    mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
25746         *uses_vvvv = True;
25747         goto decode_success;
25748      }
25749      /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
25750      /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
25751      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25752         UChar  modrm = getUChar(delta);
25753         UInt   imm8;
25754         UInt   rG    = gregOfRexRM(pfx, modrm);
25755         UInt   rV    = getVexNvvvv(pfx);
25756         IRTemp sV    = newTemp(Ity_V128);
25757         IRTemp sE    = newTemp(Ity_V128);
25758         assign ( sV, getXMMReg(rV) );
25759         if (epartIsReg(modrm)) {
25760            UInt rE = eregOfRexRM(pfx, modrm);
25761            delta += 1;
25762            imm8 = getUChar(delta);
25763            DIP("vblendps $%u,%s,%s,%s\n",
25764                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
25765            assign(sE, getXMMReg(rE));
25766         } else {
25767            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25768            delta += alen;
25769            imm8 = getUChar(delta);
25770            DIP("vblendps $%u,%s,%s,%s\n",
25771                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
25772            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
25773         }
25774         delta++;
25775         putYMMRegLoAndZU( rG,
25776                           mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
25777         *uses_vvvv = True;
25778         goto decode_success;
25779      }
25780      break;
25781
25782   case 0x0D:
25783      /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
25784      /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
25785      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25786         UChar  modrm = getUChar(delta);
25787         UInt   imm8;
25788         UInt   rG    = gregOfRexRM(pfx, modrm);
25789         UInt   rV    = getVexNvvvv(pfx);
25790         IRTemp sV    = newTemp(Ity_V256);
25791         IRTemp sE    = newTemp(Ity_V256);
25792         assign ( sV, getYMMReg(rV) );
25793         if (epartIsReg(modrm)) {
25794            UInt rE = eregOfRexRM(pfx, modrm);
25795            delta += 1;
25796            imm8 = getUChar(delta);
25797            DIP("vblendpd $%u,%s,%s,%s\n",
25798                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
25799            assign(sE, getYMMReg(rE));
25800         } else {
25801            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25802            delta += alen;
25803            imm8 = getUChar(delta);
25804            DIP("vblendpd $%u,%s,%s,%s\n",
25805                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
25806            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
25807         }
25808         delta++;
25809         putYMMReg( rG,
25810                    mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
25811         *uses_vvvv = True;
25812         goto decode_success;
25813      }
25814      /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
25815      /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
25816      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25817         UChar  modrm = getUChar(delta);
25818         UInt   imm8;
25819         UInt   rG    = gregOfRexRM(pfx, modrm);
25820         UInt   rV    = getVexNvvvv(pfx);
25821         IRTemp sV    = newTemp(Ity_V128);
25822         IRTemp sE    = newTemp(Ity_V128);
25823         assign ( sV, getXMMReg(rV) );
25824         if (epartIsReg(modrm)) {
25825            UInt rE = eregOfRexRM(pfx, modrm);
25826            delta += 1;
25827            imm8 = getUChar(delta);
25828            DIP("vblendpd $%u,%s,%s,%s\n",
25829                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
25830            assign(sE, getXMMReg(rE));
25831         } else {
25832            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25833            delta += alen;
25834            imm8 = getUChar(delta);
25835            DIP("vblendpd $%u,%s,%s,%s\n",
25836                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
25837            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
25838         }
25839         delta++;
25840         putYMMRegLoAndZU( rG,
25841                           mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
25842         *uses_vvvv = True;
25843         goto decode_success;
25844      }
25845      break;
25846
25847   case 0x0E:
25848      /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
25849      /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
25850      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25851         UChar  modrm = getUChar(delta);
25852         UInt   imm8;
25853         UInt   rG    = gregOfRexRM(pfx, modrm);
25854         UInt   rV    = getVexNvvvv(pfx);
25855         IRTemp sV    = newTemp(Ity_V128);
25856         IRTemp sE    = newTemp(Ity_V128);
25857         assign ( sV, getXMMReg(rV) );
25858         if (epartIsReg(modrm)) {
25859            UInt rE = eregOfRexRM(pfx, modrm);
25860            delta += 1;
25861            imm8 = getUChar(delta);
25862            DIP("vpblendw $%u,%s,%s,%s\n",
25863                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
25864            assign(sE, getXMMReg(rE));
25865         } else {
25866            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25867            delta += alen;
25868            imm8 = getUChar(delta);
25869            DIP("vpblendw $%u,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
25871            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
25872         }
25873         delta++;
25874         putYMMRegLoAndZU( rG,
25875                           mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
25876         *uses_vvvv = True;
25877         goto decode_success;
25878      }
25879      break;
25880
25881   case 0x0F:
25882      /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
25883      /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
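      /* PALIGNR concatenates the vvvv register (high half) with the E
         operand (low half), shifts the 256-bit result right by imm8
         bytes, and keeps the low 128 bits. */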
25884      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25885         UChar  modrm = getUChar(delta);
25886         UInt   rG    = gregOfRexRM(pfx, modrm);
25887         UInt   rV    = getVexNvvvv(pfx);
25888         IRTemp sV    = newTemp(Ity_V128);
25889         IRTemp dV    = newTemp(Ity_V128);
25890         UInt   imm8;
25891
25892         assign( dV, getXMMReg(rV) );
25893
25894         if ( epartIsReg( modrm ) ) {
25895            UInt   rE = eregOfRexRM(pfx, modrm);
25896            assign( sV, getXMMReg(rE) );
25897            imm8 = getUChar(delta+1);
25898            delta += 1+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameXMMReg(rE),
25900                                           nameXMMReg(rV), nameXMMReg(rG));
25901         } else {
25902            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25903            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
25904            imm8 = getUChar(delta+alen);
25905            delta += alen+1;
            DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
25907                                           nameXMMReg(rV), nameXMMReg(rG));
25908         }
25909
25910         IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
25911         putYMMRegLoAndZU( rG, mkexpr(res) );
25912         *uses_vvvv = True;
25913         goto decode_success;
25914      }
25915      break;
25916
25917   case 0x14:
25918      /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
25919      if (have66noF2noF3(pfx)
25920          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, True/*isAvx*/ );
25922         goto decode_success;
25923      }
25924      break;
25925
25926   case 0x15:
      /* VPEXTRW imm8, xmm2, reg/m16 */
25928      /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
25929      if (have66noF2noF3(pfx)
25930          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
25931         delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
25932         goto decode_success;
25933      }
25934      break;
25935
25936   case 0x16:
      /* VPEXTRD imm8, xmm2, r32/m32 */
25938      /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
25939      if (have66noF2noF3(pfx)
25940          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
25941         delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
25942         goto decode_success;
25943      }
25944      /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
25945      if (have66noF2noF3(pfx)
25946          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
25947         delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
25948         goto decode_success;
25949      }
25950      break;
25951
25952   case 0x17:
25953      /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
25954      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25955         delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
25956         goto decode_success;
25957      }
25958      break;
25959
25960   case 0x18:
25961      /* VINSERTF128 r/m, rV, rD
25962         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
25963      /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
25964      if (have66noF2noF3(pfx)
25965          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
25966         UChar  modrm = getUChar(delta);
25967         UInt   ib    = 0;
25968         UInt   rG    = gregOfRexRM(pfx, modrm);
25969         UInt   rV    = getVexNvvvv(pfx);
25970         IRTemp t128  = newTemp(Ity_V128);
25971         if (epartIsReg(modrm)) {
25972            UInt rE = eregOfRexRM(pfx, modrm);
25973            delta += 1;
25974            assign(t128, getXMMReg(rE));
25975            ib = getUChar(delta);
25976            DIP("vinsertf128 $%u,%s,%s,%s\n",
25977                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
25978         } else {
25979            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
25980            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
25981            delta += alen;
25982            ib = getUChar(delta);
25983            DIP("vinsertf128 $%u,%s,%s,%s\n",
25984                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
25985         }
25986         delta++;
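         /* Copy rV through to rG, then overwrite whichever 128-bit
            lane imm8 bit 0 selects with the newly loaded value. */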
25987         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
25988         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
25989         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
25990         *uses_vvvv = True;
25991         goto decode_success;
25992      }
25993      break;
25994
25995   case 0x19:
25996     /* VEXTRACTF128 $lane_no, rS, r/m
25997        ::: r/m:V128 = a lane of rS:V256 (RM format) */
25998     /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
25999      if (have66noF2noF3(pfx)
26000          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
26001         UChar  modrm = getUChar(delta);
26002         UInt   ib    = 0;
26003         UInt   rS    = gregOfRexRM(pfx, modrm);
26004         IRTemp t128  = newTemp(Ity_V128);
26005         if (epartIsReg(modrm)) {
26006            UInt rD = eregOfRexRM(pfx, modrm);
26007            delta += 1;
26008            ib = getUChar(delta);
26009            assign(t128, getYMMRegLane128(rS, ib & 1));
26010            putYMMRegLoAndZU(rD, mkexpr(t128));
            DIP("vextractf128 $%u,%s,%s\n",
                ib, nameYMMReg(rS), nameXMMReg(rD));
26013         } else {
26014            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26015            delta += alen;
26016            ib = getUChar(delta);
26017            assign(t128, getYMMRegLane128(rS, ib & 1));
26018            storeLE(mkexpr(addr), mkexpr(t128));
26019            DIP("vextractf128 $%u,%s,%s\n",
26020                ib, nameYMMReg(rS), dis_buf);
26021         }
26022         delta++;
26023         /* doesn't use vvvv */
26024         goto decode_success;
26025      }
26026      break;
26027
26028   case 0x20:
26029      /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
26030      if (have66noF2noF3(pfx)
26031          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
26032         UChar  modrm  = getUChar(delta);
26033         UInt   rG     = gregOfRexRM(pfx, modrm);
26034         UInt   rV     = getVexNvvvv(pfx);
26035         Int    imm8;
26036         IRTemp src_u8 = newTemp(Ity_I8);
26037
26038         if ( epartIsReg( modrm ) ) {
26039            UInt rE = eregOfRexRM(pfx,modrm);
26040            imm8 = (Int)(getUChar(delta+1) & 15);
26041            assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
26042            delta += 1+1;
26043            DIP( "vpinsrb $%d,%s,%s,%s\n",
26044                 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
26045         } else {
26046            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26047            imm8 = (Int)(getUChar(delta+alen) & 15);
26048            assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
26049            delta += alen+1;
26050            DIP( "vpinsrb $%d,%s,%s,%s\n",
26051                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
26052         }
26053
26054         IRTemp src_vec = newTemp(Ity_V128);
26055         assign(src_vec, getXMMReg( rV ));
26056         IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
26057         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
26058         *uses_vvvv = True;
26059         goto decode_success;
26060      }
26061      break;
26062
26063   case 0x21:
26064      /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
26065         = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
26066      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26067         UChar  modrm = getUChar(delta);
26068         UInt   rG    = gregOfRexRM(pfx, modrm);
26069         UInt   rV    = getVexNvvvv(pfx);
26070         UInt   imm8;
26071         IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
26072         const IRTemp inval = IRTemp_INVALID;
26073
26074         if ( epartIsReg( modrm ) ) {
26075            UInt   rE = eregOfRexRM(pfx, modrm);
26076            IRTemp vE = newTemp(Ity_V128);
26077            assign( vE, getXMMReg(rE) );
26078            IRTemp dsE[4] = { inval, inval, inval, inval };
26079            breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
26080            imm8 = getUChar(delta+1);
26081            d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
26082            delta += 1+1;
            DIP( "vinsertps $%u,%s,%s,%s\n",
                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
26085         } else {
26086            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26087            assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
26088            imm8 = getUChar(delta+alen);
26089            delta += alen+1;
            DIP( "vinsertps $%u,%s,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
26092         }
26093
26094         IRTemp vV = newTemp(Ity_V128);
26095         assign( vV, getXMMReg(rV) );
26096
26097         putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
26098         *uses_vvvv = True;
26099         goto decode_success;
26100      }
26101      break;
26102
26103   case 0x22:
26104      /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
26105      if (have66noF2noF3(pfx)
26106          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
26107         UChar  modrm = getUChar(delta);
26108         UInt   rG    = gregOfRexRM(pfx, modrm);
26109         UInt   rV    = getVexNvvvv(pfx);
26110         Int    imm8_10;
26111         IRTemp src_u32 = newTemp(Ity_I32);
26112
26113         if ( epartIsReg( modrm ) ) {
26114            UInt rE = eregOfRexRM(pfx,modrm);
26115            imm8_10 = (Int)(getUChar(delta+1) & 3);
26116            assign( src_u32, getIReg32( rE ) );
26117            delta += 1+1;
26118            DIP( "vpinsrd $%d,%s,%s,%s\n",
26119                 imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
26120         } else {
26121            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26122            imm8_10 = (Int)(getUChar(delta+alen) & 3);
26123            assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
26124            delta += alen+1;
26125            DIP( "vpinsrd $%d,%s,%s,%s\n",
26126                 imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
26127         }
26128
26129         IRTemp src_vec = newTemp(Ity_V128);
26130         assign(src_vec, getXMMReg( rV ));
26131         IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
26132         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
26133         *uses_vvvv = True;
26134         goto decode_success;
26135      }
26136      /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
26137      if (have66noF2noF3(pfx)
26138          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
26139         UChar  modrm = getUChar(delta);
26140         UInt   rG    = gregOfRexRM(pfx, modrm);
26141         UInt   rV    = getVexNvvvv(pfx);
26142         Int    imm8_0;
26143         IRTemp src_u64 = newTemp(Ity_I64);
26144
26145         if ( epartIsReg( modrm ) ) {
26146            UInt rE = eregOfRexRM(pfx,modrm);
26147            imm8_0 = (Int)(getUChar(delta+1) & 1);
26148            assign( src_u64, getIReg64( rE ) );
26149            delta += 1+1;
26150            DIP( "vpinsrq $%d,%s,%s,%s\n",
26151                 imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
26152         } else {
26153            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26154            imm8_0 = (Int)(getUChar(delta+alen) & 1);
26155            assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
26156            delta += alen+1;
            DIP( "vpinsrq $%d,%s,%s,%s\n",
26158                 imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
26159         }
26160
26161         IRTemp src_vec = newTemp(Ity_V128);
26162         assign(src_vec, getXMMReg( rV ));
26163         IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
26164         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
26165         *uses_vvvv = True;
26166         goto decode_success;
26167      }
26168      break;
26169
26170   case 0x40:
26171      /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib */
26172      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26173         UChar  modrm   = getUChar(delta);
26174         UInt   rG      = gregOfRexRM(pfx, modrm);
26175         UInt   rV      = getVexNvvvv(pfx);
26176         IRTemp dst_vec = newTemp(Ity_V128);
26177         Int    imm8;
26178         if (epartIsReg( modrm )) {
26179            UInt rE = eregOfRexRM(pfx,modrm);
26180            imm8 = (Int)getUChar(delta+1);
26181            assign( dst_vec, getXMMReg( rE ) );
26182            delta += 1+1;
26183            DIP( "vdpps $%d,%s,%s,%s\n",
26184                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
26185         } else {
26186            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26187            imm8 = (Int)getUChar(delta+alen);
26188            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
26189            delta += alen+1;
26190            DIP( "vdpps $%d,%s,%s,%s\n",
26191                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
26192         }
26193
26194         IRTemp src_vec = newTemp(Ity_V128);
26195         assign(src_vec, getXMMReg( rV ));
26196         IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
26197         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
26198         *uses_vvvv = True;
26199         goto decode_success;
26200      }
      /* VDPPS imm8, ymm3/m256,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib */
26202      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26203         UChar  modrm   = getUChar(delta);
26204         UInt   rG      = gregOfRexRM(pfx, modrm);
26205         UInt   rV      = getVexNvvvv(pfx);
26206         IRTemp dst_vec = newTemp(Ity_V256);
26207         Int    imm8;
26208         if (epartIsReg( modrm )) {
26209            UInt rE = eregOfRexRM(pfx,modrm);
26210            imm8 = (Int)getUChar(delta+1);
26211            assign( dst_vec, getYMMReg( rE ) );
26212            delta += 1+1;
26213            DIP( "vdpps $%d,%s,%s,%s\n",
26214                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
26215         } else {
26216            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26217            imm8 = (Int)getUChar(delta+alen);
26218            assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
26219            delta += alen+1;
26220            DIP( "vdpps $%d,%s,%s,%s\n",
26221                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
26222         }
26223
26224         IRTemp src_vec = newTemp(Ity_V256);
26225         assign(src_vec, getYMMReg( rV ));
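         /* DPPS confines its dot products to 128-bit lanes, so the
            256-bit form is just two independent 128-bit operations. */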
26226         IRTemp s0, s1, d0, d1;
26227         s0 = s1 = d0 = d1 = IRTemp_INVALID;
26228         breakupV256toV128s( dst_vec, &d1, &d0 );
26229         breakupV256toV128s( src_vec, &s1, &s0 );
26230         putYMMReg( rG, binop( Iop_V128HLtoV256,
26231                               mkexpr( math_DPPS_128(s1, d1, imm8) ),
26232                               mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
26233         *uses_vvvv = True;
26234         goto decode_success;
26235      }
26236      break;
26237
26238   case 0x41:
26239      /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib */
26240      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26241         UChar  modrm   = getUChar(delta);
26242         UInt   rG      = gregOfRexRM(pfx, modrm);
26243         UInt   rV      = getVexNvvvv(pfx);
26244         IRTemp dst_vec = newTemp(Ity_V128);
26245         Int    imm8;
26246         if (epartIsReg( modrm )) {
26247            UInt rE = eregOfRexRM(pfx,modrm);
26248            imm8 = (Int)getUChar(delta+1);
26249            assign( dst_vec, getXMMReg( rE ) );
26250            delta += 1+1;
26251            DIP( "vdppd $%d,%s,%s,%s\n",
26252                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
26253         } else {
26254            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26255            imm8 = (Int)getUChar(delta+alen);
26256            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
26257            delta += alen+1;
26258            DIP( "vdppd $%d,%s,%s,%s\n",
26259                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
26260         }
26261
26262         IRTemp src_vec = newTemp(Ity_V128);
26263         assign(src_vec, getXMMReg( rV ));
26264         IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
26265         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
26266         *uses_vvvv = True;
26267         goto decode_success;
26268      }
26269      break;
26270
26271   case 0x42:
26272      /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
26273      /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib */
26274      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26275         UChar  modrm   = getUChar(delta);
26276         Int    imm8;
26277         IRTemp src_vec = newTemp(Ity_V128);
26278         IRTemp dst_vec = newTemp(Ity_V128);
26279         UInt   rG      = gregOfRexRM(pfx, modrm);
26280         UInt   rV      = getVexNvvvv(pfx);
26281
26282         assign( dst_vec, getXMMReg(rV) );
26283
26284         if ( epartIsReg( modrm ) ) {
26285            UInt rE = eregOfRexRM(pfx, modrm);
26286
26287            imm8 = (Int)getUChar(delta+1);
26288            assign( src_vec, getXMMReg(rE) );
26289            delta += 1+1;
26290            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
26291                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
26292         } else {
26293            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
26294                             1/* imm8 is 1 byte after the amode */ );
26295            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
26296            imm8 = (Int)getUChar(delta+alen);
26297            delta += alen+1;
26298            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
26299                 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
26300         }
26301
26302         putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
26303                                                        src_vec, imm8) ) );
26304         *uses_vvvv = True;
26305         goto decode_success;
26306      }
26307      break;
26308
26309   case 0x44:
26310      /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
26311      /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
26312      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
26313       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a. multiplication of polynomials over GF(2))
26315       */
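      /* imm8 bit 0 selects which 64-bit half of the first source
         enters the multiply and bit 4 does the same for the second;
         the 64x64 carry-less product fills all 128 result bits. */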
26316      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26317         UChar  modrm = getUChar(delta);
26318         Int imm8;
26319         IRTemp sV    = newTemp(Ity_V128);
26320         IRTemp dV    = newTemp(Ity_V128);
26321         UInt   rG    = gregOfRexRM(pfx, modrm);
26322         UInt   rV    = getVexNvvvv(pfx);
26323
26324         assign( dV, getXMMReg(rV) );
26325
26326         if ( epartIsReg( modrm ) ) {
26327            UInt rE = eregOfRexRM(pfx, modrm);
26328            imm8 = (Int)getUChar(delta+1);
26329            assign( sV, getXMMReg(rE) );
26330            delta += 1+1;
26331            DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
26332                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
26333         } else {
26334            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
26335                             1/* imm8 is 1 byte after the amode */ );
26336            assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
26337            imm8 = (Int)getUChar(delta+alen);
26338            delta += alen+1;
26339            DIP( "vpclmulqdq $%d, %s,%s,%s\n",
26340                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
26341         }
26342
26343         putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
26344         *uses_vvvv = True;
26345         goto decode_success;
26346      }
26347      break;
26348
26349   case 0x4A:
26350      /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
26351         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
26352      /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4 */
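      /* The blend mask is taken from the sign bit of each lane of the
         IS4 register; the helper broadcasts it across the lane with an
         arithmetic right shift (here Iop_SarN32x4 on 4-byte lanes). */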
26353      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26354         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
26355                                   "vblendvps", 4, Iop_SarN32x4 );
26356         *uses_vvvv = True;
26357         goto decode_success;
26358      }
26359      /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
26360         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
26361      /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
26362      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26363         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
26364                                   "vblendvps", 4, Iop_SarN32x4 );
26365         *uses_vvvv = True;
26366         goto decode_success;
26367      }
26368      break;
26369
26370   case 0x4B:
26371      /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
26372         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
26373      /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4 */
26374      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26375         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
26376                                   "vblendvpd", 8, Iop_SarN64x2 );
26377         *uses_vvvv = True;
26378         goto decode_success;
26379      }
26380      /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
26381         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
26382      /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
26383      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26384         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
26385                                   "vblendvpd", 8, Iop_SarN64x2 );
26386         *uses_vvvv = True;
26387         goto decode_success;
26388      }
26389      break;
26390
26391   case 0x4C:
26392      /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
26393         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
26394      /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4 */
26395      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26396         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
26397                                   "vpblendvb", 1, Iop_SarN8x16 );
26398         *uses_vvvv = True;
26399         goto decode_success;
26400      }
26401      break;
26402
26403   case 0x60:
26404   case 0x61:
26405   case 0x62:
26406   case 0x63:
26407      /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
26408         VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
26409         VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
26410         VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
26411         (selected special cases that actually occur in glibc,
26412          not by any means a complete implementation.)
26413      */
26414      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26415         Long delta0 = delta;
26416         delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
26417         if (delta > delta0) goto decode_success;
         /* else fall through; dis_PCMPxSTRx failed to decode it */
26419      }
26420      break;
26421
26422   case 0xDF:
26423      /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
26424      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
26426         goto decode_success;
26427      }
26428      break;
26429
26430   default:
26431      break;
26432
26433   }
26434
26435  //decode_failure:
26436   return deltaIN;
26437
26438  decode_success:
26439   return delta;
26440}
26441
26442
26443/*------------------------------------------------------------*/
26444/*---                                                      ---*/
26445/*--- Disassemble a single instruction                     ---*/
26446/*---                                                      ---*/
26447/*------------------------------------------------------------*/
26448
26449/* Disassemble a single instruction into IR.  The instruction is
26450   located in host memory at &guest_code[delta]. */
26451
26452static
26453DisResult disInstr_AMD64_WRK (
26454             /*OUT*/Bool* expect_CAS,
26455             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
26456             Bool         resteerCisOk,
26457             void*        callback_opaque,
26458             Long         delta64,
26459             VexArchInfo* archinfo,
26460             VexAbiInfo*  vbi
26461          )
26462{
26463   IRTemp    t1, t2, t3, t4, t5, t6;
26464   UChar     pre;
26465   Int       n, n_prefixes;
26466   DisResult dres;
26467
26468   /* The running delta */
26469   Long delta = delta64;
26470
26471   /* Holds eip at the start of the insn, so that we can print
26472      consistent error messages for unimplemented insns. */
26473   Long delta_start = delta;
26474
26475   /* sz denotes the nominal data-op size of the insn; we change it to
26476      2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
26477      conflict REX.W takes precedence. */
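   /* Example: "66 48 01 d8" carries both 0x66 and REX.W=1 and is
      add %rbx,%rax, with sz == 8 since REX.W wins. */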
26478   Int sz = 4;
26479
26480   /* pfx holds the summary of prefixes. */
26481   Prefix pfx = PFX_EMPTY;
26482
26483   /* Holds the computed opcode-escape indication. */
26484   Escape esc = ESC_NONE;
26485
26486   /* Set result defaults. */
26487   dres.whatNext    = Dis_Continue;
26488   dres.len         = 0;
26489   dres.continueAt  = 0;
26490   dres.jk_StopHere = Ijk_INVALID;
26491   *expect_CAS = False;
26492
26493   vassert(guest_RIP_next_assumed == 0);
26494   vassert(guest_RIP_next_mustcheck == False);
26495
26496   t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
26497
26498   DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
26499
26500   /* Spot "Special" instructions (see comment at top of file). */
26501   {
26502      UChar* code = (UChar*)(guest_code + delta);
26503      /* Spot the 16-byte preamble:
26504         48C1C703   rolq $3,  %rdi
26505         48C1C70D   rolq $13, %rdi
26506         48C1C73D   rolq $61, %rdi
26507         48C1C733   rolq $51, %rdi
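
         The net rotate amount is 3+13+61+51 = 128 bits, i.e. two full
         rotations of a 64-bit register, so %rdi is unchanged and the
         preamble is harmless if it is ever executed natively.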
26508      */
26509      if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
26510                                               && code[ 3] == 0x03 &&
26511          code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
26512                                               && code[ 7] == 0x0D &&
26513          code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
26514                                               && code[11] == 0x3D &&
26515          code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
26516                                               && code[15] == 0x33) {
26517         /* Got a "Special" instruction preamble.  Which one is it? */
26518         if (code[16] == 0x48 && code[17] == 0x87
26519                              && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
26520            /* %RDX = client_request ( %RAX ) */
26521            DIP("%%rdx = client_request ( %%rax )\n");
26522            delta += 19;
26523            jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
26524            vassert(dres.whatNext == Dis_StopHere);
26525            goto decode_success;
26526         }
26527         else
26528         if (code[16] == 0x48 && code[17] == 0x87
26529                              && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
26530            /* %RAX = guest_NRADDR */
26531            DIP("%%rax = guest_NRADDR\n");
26532            delta += 19;
26533            putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
26534            goto decode_success;
26535         }
26536         else
26537         if (code[16] == 0x48 && code[17] == 0x87
26538                              && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
26539            /* call-noredir *%RAX */
26540            DIP("call-noredir *%%rax\n");
26541            delta += 19;
26542            t1 = newTemp(Ity_I64);
26543            assign(t1, getIRegRAX(8));
26544            t2 = newTemp(Ity_I64);
26545            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
26546            putIReg64(R_RSP, mkexpr(t2));
26547            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
26548            jmp_treg(&dres, Ijk_NoRedir, t1);
26549            vassert(dres.whatNext == Dis_StopHere);
26550            goto decode_success;
26551         }
26552         /* We don't know what it is. */
26553         goto decode_failure;
26554         /*NOTREACHED*/
26555      }
26556   }
26557
26558   /* Eat prefixes, summarising the result in pfx and sz, and rejecting
26559      as many invalid combinations as possible. */
26560   n_prefixes = 0;
26561   while (True) {
26562      if (n_prefixes > 7) goto decode_failure;
26563      pre = getUChar(delta);
26564      switch (pre) {
26565         case 0x66: pfx |= PFX_66; break;
26566         case 0x67: pfx |= PFX_ASO; break;
26567         case 0xF2: pfx |= PFX_F2; break;
26568         case 0xF3: pfx |= PFX_F3; break;
26569         case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
26570         case 0x2E: pfx |= PFX_CS; break;
26571         case 0x3E: pfx |= PFX_DS; break;
26572         case 0x26: pfx |= PFX_ES; break;
26573         case 0x64: pfx |= PFX_FS; break;
26574         case 0x65: pfx |= PFX_GS; break;
26575         case 0x36: pfx |= PFX_SS; break;
26576         case 0x40 ... 0x4F:
26577            pfx |= PFX_REX;
26578            if (pre & (1<<3)) pfx |= PFX_REXW;
26579            if (pre & (1<<2)) pfx |= PFX_REXR;
26580            if (pre & (1<<1)) pfx |= PFX_REXX;
26581            if (pre & (1<<0)) pfx |= PFX_REXB;
26582            break;
26583         default:
26584            goto not_a_legacy_prefix;
26585      }
26586      n_prefixes++;
26587      delta++;
26588   }
26589
26590   not_a_legacy_prefix:
26591   /* We've used up all the non-VEX prefixes.  Parse and validate a
26592      VEX prefix if that's appropriate. */
26593   if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
26594      /* Used temporarily for holding VEX prefixes. */
26595      UChar vex0 = getUChar(delta);
26596      if (vex0 == 0xC4) {
26597         /* 3-byte VEX */
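         /* Byte 1 is ~R ~X ~B m-mmmm and byte 2 is W ~v3..~v0 L pp;
            the R, X, B and vvvv fields arrive inverted. */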
26598         UChar vex1 = getUChar(delta+1);
26599         UChar vex2 = getUChar(delta+2);
26600         delta += 3;
26601         pfx |= PFX_VEX;
26602         /* Snarf contents of byte 1 */
26603         /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
26604         /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
26605         /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
26606         /* m-mmmm */
26607         switch (vex1 & 0x1F) {
26608            case 1: esc = ESC_0F;   break;
26609            case 2: esc = ESC_0F38; break;
26610            case 3: esc = ESC_0F3A; break;
26611            /* Any other m-mmmm field will #UD */
26612            default: goto decode_failure;
26613         }
26614         /* Snarf contents of byte 2 */
26615         /* W */    pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
26616         /* ~v3 */  pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
26617         /* ~v2 */  pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
26618         /* ~v1 */  pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
26619         /* ~v0 */  pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
26620         /* L */    pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
26621         /* pp */
26622         switch (vex2 & 3) {
26623            case 0: break;
26624            case 1: pfx |= PFX_66; break;
26625            case 2: pfx |= PFX_F3; break;
26626            case 3: pfx |= PFX_F2; break;
26627            default: vassert(0);
26628         }
26629      }
26630      else if (vex0 == 0xC5) {
26631         /* 2-byte VEX */
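         /* Byte 1 is ~R ~v3..~v0 L pp; X, B and W are implicitly
            zero and the escape is implicitly 0F. */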
26632         UChar vex1 = getUChar(delta+1);
26633         delta += 2;
26634         pfx |= PFX_VEX;
26635         /* Snarf contents of byte 1 */
26636         /* R */    pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
26637         /* ~v3 */  pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
26638         /* ~v2 */  pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
26639         /* ~v1 */  pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
26640         /* ~v0 */  pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
26641         /* L */    pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
26642         /* pp */
26643         switch (vex1 & 3) {
26644            case 0: break;
26645            case 1: pfx |= PFX_66; break;
26646            case 2: pfx |= PFX_F3; break;
26647            case 3: pfx |= PFX_F2; break;
26648            default: vassert(0);
26649         }
26650         /* implied: */
26651         esc = ESC_0F;
26652      }
      /* Can't have both VEX and REX */
      if ((pfx & PFX_VEX) && (pfx & PFX_REX))
         goto decode_failure;
26656   }
26657
26658   /* Dump invalid combinations */
26659   n = 0;
26660   if (pfx & PFX_F2) n++;
26661   if (pfx & PFX_F3) n++;
26662   if (n > 1)
26663      goto decode_failure; /* can't have both */
26664
26665   n = 0;
26666   if (pfx & PFX_CS) n++;
26667   if (pfx & PFX_DS) n++;
26668   if (pfx & PFX_ES) n++;
26669   if (pfx & PFX_FS) n++;
26670   if (pfx & PFX_GS) n++;
26671   if (pfx & PFX_SS) n++;
26672   if (n > 1)
26673      goto decode_failure; /* multiple seg overrides == illegal */
26674
   /* If there's an %fs prefix, reject it unless there's evidence in
      'vbi' that we should accept it. */
26677   if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_zero)
26678      goto decode_failure;
26679
26680   /* Ditto for %gs prefixes. */
26681   if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_0x60)
26682      goto decode_failure;
26683
26684   /* Set up sz. */
26685   sz = 4;
26686   if (pfx & PFX_66) sz = 2;
26687   if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
26688
26689   /* Now we should be looking at the primary opcode byte or the
26690      leading escapes.  Check that any LOCK prefix is actually
26691      allowed. */
26692   if (pfx & PFX_LOCK) {
26693      if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
26694         DIP("lock ");
26695      } else {
26696         *expect_CAS = False;
26697         goto decode_failure;
26698      }
26699   }
26700
26701   /* Eat up opcode escape bytes, until we're really looking at the
26702      primary opcode byte.  But only if there's no VEX present. */
26703   if (!(pfx & PFX_VEX)) {
26704      vassert(esc == ESC_NONE);
26705      pre = getUChar(delta);
26706      if (pre == 0x0F) {
26707         delta++;
26708         pre = getUChar(delta);
26709         switch (pre) {
26710            case 0x38: esc = ESC_0F38; delta++; break;
26711            case 0x3A: esc = ESC_0F3A; delta++; break;
26712            default:   esc = ESC_0F; break;
26713         }
26714      }
26715   }
26716
26717   /* So now we're really really looking at the primary opcode
26718      byte. */
26719   Long delta_at_primary_opcode = delta;
26720
26721   if (!(pfx & PFX_VEX)) {
26722      /* Handle non-VEX prefixed instructions.  "Legacy" (non-VEX) SSE
26723         instructions preserve the upper 128 bits of YMM registers;
26724         iow we can simply ignore the presence of the upper halves of
26725         these registers. */
26726      switch (esc) {
26727         case ESC_NONE:
26728            delta = dis_ESC_NONE( &dres, expect_CAS,
26729                                  resteerOkFn, resteerCisOk, callback_opaque,
26730                                  archinfo, vbi, pfx, sz, delta );
26731            break;
26732         case ESC_0F:
26733            delta = dis_ESC_0F  ( &dres, expect_CAS,
26734                                  resteerOkFn, resteerCisOk, callback_opaque,
26735                                  archinfo, vbi, pfx, sz, delta );
26736            break;
26737         case ESC_0F38:
26738            delta = dis_ESC_0F38( &dres,
26739                                  resteerOkFn, resteerCisOk, callback_opaque,
26740                                  archinfo, vbi, pfx, sz, delta );
26741            break;
26742         case ESC_0F3A:
26743            delta = dis_ESC_0F3A( &dres,
26744                                  resteerOkFn, resteerCisOk, callback_opaque,
26745                                  archinfo, vbi, pfx, sz, delta );
26746            break;
26747         default:
26748            vassert(0);
26749      }
26750   } else {
26751      /* VEX prefixed instruction */
26752      /* Sloppy Intel wording: "An instruction encoded with a VEX.128
26753         prefix that loads a YMM register operand ..." zeroes out bits
26754         128 and above of the register. */
26755      Bool uses_vvvv = False;
26756      switch (esc) {
26757         case ESC_0F:
26758            delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
26759                                      resteerOkFn, resteerCisOk,
26760                                      callback_opaque,
26761                                      archinfo, vbi, pfx, sz, delta );
26762            break;
26763         case ESC_0F38:
26764            delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
26765                                        resteerOkFn, resteerCisOk,
26766                                        callback_opaque,
26767                                        archinfo, vbi, pfx, sz, delta );
26768            break;
26769         case ESC_0F3A:
26770            delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
26771                                        resteerOkFn, resteerCisOk,
26772                                        callback_opaque,
26773                                        archinfo, vbi, pfx, sz, delta );
26774            break;
26775         case ESC_NONE:
26776            /* The presence of a VEX prefix, by Intel definition,
26777               always implies at least an 0F escape. */
26778            goto decode_failure;
26779         default:
26780            vassert(0);
26781      }
26782      /* If the insn doesn't use VEX.vvvv then it must be all ones.
26783         Check this. */
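      /* (The stored vvvv bits are the one's complement of the encoded
         field, so an all-ones field reads back from getVexNvvvv as
         zero.) */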
26784      if (!uses_vvvv) {
26785         if (getVexNvvvv(pfx) != 0)
26786            goto decode_failure;
26787      }
26788   }
26789
26790   vassert(delta - delta_at_primary_opcode >= 0);
26791   vassert(delta - delta_at_primary_opcode < 16/*let's say*/);
26792
26793   /* Use delta == delta_at_primary_opcode to denote decode failure.
26794      This implies that any successful decode must use at least one
26795      byte up. */
26796   if (delta == delta_at_primary_opcode)
26797      goto decode_failure;
26798   else
26799      goto decode_success; /* \o/ */
26800
26801#if 0 /* XYZZY */
26802
26803   /* ---------------------------------------------------- */
26804   /* --- The SSE/SSE2 decoder.                        --- */
26805   /* ---------------------------------------------------- */
26806
26807   /* What did I do to deserve SSE ?  Perhaps I was really bad in a
26808      previous life? */
26809
26810   /* Note, this doesn't handle SSE3 right now.  All amd64s support
26811      SSE2 as a minimum so there is no point distinguishing SSE1 vs
26812      SSE2. */
26813
26814   insn = (UChar*)&guest_code[delta];
26815
26816   /* FXSAVE is spuriously at the start here only because it is
26817      thusly placed in guest-x86/toIR.c. */
26818
26819   /* ------ SSE decoder main ------ */
26820
26821   /* ---------------------------------------------------- */
26822   /* --- end of the SSE decoder.                      --- */
26823   /* ---------------------------------------------------- */
26824
26825   /* ---------------------------------------------------- */
26826   /* --- start of the SSE2 decoder.                   --- */
26827   /* ---------------------------------------------------- */
26828
26829   /* ---------------------------------------------------- */
26830   /* --- end of the SSE/SSE2 decoder.                 --- */
26831   /* ---------------------------------------------------- */
26832
26833   /* ---------------------------------------------------- */
26834   /* --- start of the SSE3 decoder.                   --- */
26835   /* ---------------------------------------------------- */
26836
26837   /* ---------------------------------------------------- */
26838   /* --- end of the SSE3 decoder.                     --- */
26839   /* ---------------------------------------------------- */
26840
26841   /* ---------------------------------------------------- */
26842   /* --- start of the SSSE3 decoder.                  --- */
26843   /* ---------------------------------------------------- */
26844
26845   /* ---------------------------------------------------- */
26846   /* --- end of the SSSE3 decoder.                    --- */
26847   /* ---------------------------------------------------- */
26848
26849   /* ---------------------------------------------------- */
26850   /* --- start of the SSE4 decoder                    --- */
26851   /* ---------------------------------------------------- */
26852
26853   /* ---------------------------------------------------- */
26854   /* --- end of the SSE4 decoder                      --- */
26855   /* ---------------------------------------------------- */
26856
26857   /*after_sse_decoders:*/
26858
26859   /* Get the primary opcode. */
26860   opc = getUChar(delta); delta++;
26861
26862   /* We get here if the current insn isn't SSE, or this CPU doesn't
26863      support SSE. */
26864
26865   switch (opc) {
26866
26867   /* ------------------------ Control flow --------------- */
26868
26869   /* ------------------------ CWD/CDQ -------------------- */
26870
26871   /* ------------------------ FPU ops -------------------- */
26872
26873   /* ------------------------ INT ------------------------ */
26874
26875   case 0xCD: { /* INT imm8 */
26876      IRJumpKind jk = Ijk_Boring;
26877      if (have66orF2orF3(pfx)) goto decode_failure;
26878      d64 = getUChar(delta); delta++;
26879      switch (d64) {
26880         case 32: jk = Ijk_Sys_int32; break;
26881         default: goto decode_failure;
26882      }
26883      guest_RIP_next_mustcheck = True;
26884      guest_RIP_next_assumed = guest_RIP_bbstart + delta;
26885      jmp_lit(jk, guest_RIP_next_assumed);
26886      /* It's important that all ArchRegs carry their up-to-date value
26887         at this point.  So we declare an end-of-block here, which
26888         forces any TempRegs caching ArchRegs to be flushed. */
26889      vassert(dres.whatNext == Dis_StopHere);
26890      DIP("int $0x%02x\n", (UInt)d64);
26891      break;
26892   }
26893
26894   /* ------------------------ Jcond, byte offset --------- */
26895
26896   /* ------------------------ IMUL ----------------------- */
26897
26898   /* ------------------------ MOV ------------------------ */
26899
26900   /* ------------------------ MOVx ------------------------ */
26901
26902   /* ------------------------ opl imm, A ----------------- */
26903
26904   /* ------------------------ opl Ev, Gv ----------------- */
26905
26906   /* ------------------------ opl Gv, Ev ----------------- */
26907
26908   /* ------------------------ POP ------------------------ */
26909
26910   /* ------------------------ PUSH ----------------------- */
26911
26912   /* ------ AE: SCAS variants ------ */
26913
26914   /* ------ A6, A7: CMPS variants ------ */
26915
26916   /* ------ AA, AB: STOS variants ------ */
26917
26918   /* ------ A4, A5: MOVS variants ------ */
26919
26920   /* ------------------------ XCHG ----------------------- */
26921
26922   /* ------------------------ IN / OUT ----------------------- */
26923
26924   /* ------------------------ (Grp1 extensions) ---------- */
26925
26926   /* ------------------------ (Grp2 extensions) ---------- */
26927
26928   /* ------------------------ (Grp3 extensions) ---------- */
26929
26930   /* ------------------------ (Grp4 extensions) ---------- */
26931
26932   /* ------------------------ (Grp5 extensions) ---------- */
26933
26934   /* ------------------------ Escapes to 2-byte opcodes -- */
26935
26936   case 0x0F: {
26937      opc = getUChar(delta); delta++;
26938      switch (opc) {
26939
26940      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
26941
26942      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
26943
26944      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
26945
26946      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
26947
26948      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
26949
26950      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
26951
26952      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
26953
26954      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
26955
26956      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
26957
26958      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
26959
26960      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
26961
26962      /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
26963
26964      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
26965
26966      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
26967
26968      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
26969
26970      /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
26971
26972      /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
26973
26974      case 0xC0: { /* XADD Gb,Eb */
         Bool decode_OK = False;
         delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
         if (!decode_OK)
            goto decode_failure;
         break;
      }

      /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */

      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */

      default:
         goto decode_failure;
   } /* switch (opc) for the 2-byte opcodes */
   goto decode_success;
   } /* case 0x0F: of primary opcode */

   /* ------------------------ ??? ------------------------ */
#endif /* XYZZY */

     //default:
  decode_failure:
   /* All decode failures end up here. */
   vex_printf("vex amd64->IR: unhandled instruction bytes: "
              "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
              (Int)getUChar(delta_start+0),
              (Int)getUChar(delta_start+1),
              (Int)getUChar(delta_start+2),
              (Int)getUChar(delta_start+3),
              (Int)getUChar(delta_start+4),
              (Int)getUChar(delta_start+5),
              (Int)getUChar(delta_start+6),
              (Int)getUChar(delta_start+7) );
   vex_printf("vex amd64->IR:   REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
              haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
              getRexX(pfx), getRexB(pfx));
   vex_printf("vex amd64->IR:   VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
              haveVEX(pfx) ? 1 : 0, getVexL(pfx),
              getVexNvvvv(pfx),
              esc==ESC_NONE ? "NONE" :
                esc==ESC_0F ? "0F" :
                esc==ESC_0F38 ? "0F38" :
                esc==ESC_0F3A ? "0F3A" : "???");
   vex_printf("vex amd64->IR:   PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
              have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
              haveF3(pfx) ? 1 : 0);

   /* Tell the dispatcher that this insn cannot be decoded, and so
      has not been executed, and (is currently) the next to be
      executed.  RIP should be up-to-date, since it is set at the
      start of each insn, but nevertheless be paranoid and update
      it again right now. */
   stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
   jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
   vassert(dres.whatNext == Dis_StopHere);
   dres.len = 0;
   /* We also need to say that a CAS is not expected now, regardless
      of what it might have been set to at the start of the function,
      since the IR that we've emitted just above (to synthesise a
      SIGILL) does not involve any CAS, and presumably no other IR has
      been emitted for this (non-decoded) insn. */
   *expect_CAS = False;
   return dres;

   //   } /* switch (opc) for the main (primary) opcode switch. */

  decode_success:
   /* All decode successes end up here. */
   switch (dres.whatNext) {
      case Dis_Continue:
         stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
         break;
      case Dis_ResteerU:
      case Dis_ResteerC:
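         /* Disassembly was resteered, either unconditionally (U) or
            conditionally (C): the next insn to consider is at
            dres.continueAt, so make RIP say so. */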
         stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
         break;
      case Dis_StopHere:
         break;
      default:
         vassert(0);
   }

   DIP("\n");
   dres.len = (Int)toUInt(delta - delta_start);
   return dres;
}

#undef DIP
#undef DIS


/*------------------------------------------------------------*/
/*--- Top-level fn                                         ---*/
/*------------------------------------------------------------*/

/* Disassemble a single instruction into IR.  The instruction
   is located in host memory at &guest_code[delta]. */
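
// A minimal caller sketch (hypothetical -- in reality this function
// is invoked by the generic bb-to-IR driver; 'my_neverResteer' is
// illustrative only, not part of VEX):
//
//   static Bool my_neverResteer ( void* opaque, Addr64 addr ) {
//      return False;   /* refuse all resteer requests */
//   }
//
//   DisResult dr
//      = disInstr_AMD64 ( irsb, my_neverResteer,
//                         False/*resteerCisOk*/, NULL/*opaque*/,
//                         guest_code, 0/*delta*/, guest_IP,
//                         VexArchAMD64, &archinfo, &abiinfo,
//                         False/*host is little-endian*/ );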

DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
                           Bool         (*resteerOkFn) ( void*, Addr64 ),
                           Bool         resteerCisOk,
                           void*        callback_opaque,
                           UChar*       guest_code_IN,
                           Long         delta,
                           Addr64       guest_IP,
                           VexArch      guest_arch,
                           VexArchInfo* archinfo,
                           VexAbiInfo*  abiinfo,
                           Bool         host_bigendian_IN )
{
   Int       i, x1, x2;
   Bool      expect_CAS, has_CAS;
   DisResult dres;

   /* Set globals (see top of this file) */
   vassert(guest_arch == VexArchAMD64);
   guest_code           = guest_code_IN;
   irsb                 = irsb_IN;
   host_is_bigendian    = host_bigendian_IN;
   guest_RIP_curr_instr = guest_IP;
   guest_RIP_bbstart    = guest_IP - delta;
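   /* Since 'delta' is the offset of this insn within guest_code,
      guest_RIP_bbstart is the guest address at which the current
      superblock starts. */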

   /* We'll consult these after doing disInstr_AMD64_WRK. */
   guest_RIP_next_assumed   = 0;
   guest_RIP_next_mustcheck = False;

   x1 = irsb_IN->stmts_used;
   expect_CAS = False;
   dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
                               resteerCisOk,
                               callback_opaque,
                               delta, archinfo, abiinfo );
   x2 = irsb_IN->stmts_used;
   vassert(x2 >= x1);

   /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
      got it right.  Failure of this assertion is serious and denotes
      a bug in disInstr. */
   if (guest_RIP_next_mustcheck
       && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
      vex_printf("\n");
      vex_printf("assumed next %%rip = 0x%llx\n",
                 guest_RIP_next_assumed );
      vex_printf(" actual next %%rip = 0x%llx\n",
                 guest_RIP_curr_instr + dres.len );
      vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
   }

   /* See comment at the top of disInstr_AMD64_WRK for meaning of
      expect_CAS.  Here, we (sanity-)check for the presence/absence of
      IRCAS as directed by the returned expect_CAS value. */
   has_CAS = False;
   for (i = x1; i < x2; i++) {
      if (irsb_IN->stmts[i]->tag == Ist_CAS)
         has_CAS = True;
   }

   if (expect_CAS != has_CAS) {
      /* Inconsistency detected.  Re-disassemble the instruction so as
         to generate a useful error message; then assert. */
      vex_traceflags |= VEX_TRACE_FE;
      dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
                                  resteerCisOk,
                                  callback_opaque,
                                  delta, archinfo, abiinfo );
      for (i = x1; i < x2; i++) {
         vex_printf("\t\t");
         ppIRStmt(irsb_IN->stmts[i]);
         vex_printf("\n");
      }
      /* Failure of this assertion is serious and denotes a bug in
         disInstr. */
      vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
   }

   return dres;
}


/*------------------------------------------------------------*/
/*--- Unused stuff                                         ---*/
/*------------------------------------------------------------*/

// A potentially more Memcheck-friendly version of gen_LZCNT, if
// this should ever be needed.
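// (The idea, presumably, being that Memcheck can track definedness
// bit-by-bit through the Or/Shr/Not/popcount sequence below, whereas
// a single opaque count-leading-zeroes operation would force it to
// approximate.)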
//
//static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
//{
//   /* Scheme is simple: propagate the most significant 1-bit into all
//      lower positions in the word.  This gives a word of the form
//      0---01---1.  Now invert it, giving a word of the form
//      1---10---0, then do a population-count idiom (to count the 1s,
//      which is the number of leading zeroes, or the word size if the
//      original word was 0).
//   */
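//
//   Worked example (illustrative): for 16-bit src = 0x0008,
//   propagation gives 0x000F, inversion gives 0xFFF0, and
//   popcount(0xFFF0) = 12 -- exactly the number of leading zeroes
//   in 0x0008.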
//   Int i;
//   IRTemp t[7];
//   for (i = 0; i < 7; i++) {
//      t[i] = newTemp(ty);
//   }
//   if (ty == Ity_I64) {
//      assign(t[0], binop(Iop_Or64, mkexpr(src),
//                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
//      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
//                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
//      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
//                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
//      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
//                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
//      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
//                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
//      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
//                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
//      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
//      return gen_POPCOUNT(ty, t[6]);
//   }
//   if (ty == Ity_I32) {
//      assign(t[0], binop(Iop_Or32, mkexpr(src),
//                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
//      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
//                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
//      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
//                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
//      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
//                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
//      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
//                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
//      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
//      return gen_POPCOUNT(ty, t[5]);
//   }
//   if (ty == Ity_I16) {
//      assign(t[0], binop(Iop_Or16, mkexpr(src),
//                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
//      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
//                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
//      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
//                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
//      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
//                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
//      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
//      return gen_POPCOUNT(ty, t[4]);
//   }
//   vassert(0);
//}
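//
// The same scheme in plain C, for reference (a sketch; 'popcount32'
// is assumed to be available and is not part of VEX):
//
//   static UInt lzcnt32 ( UInt x )
//   {
//      x |= (x >> 1);  x |= (x >> 2);  x |= (x >> 4);
//      x |= (x >> 8);  x |= (x >> 16);
//      return popcount32(~x);   /* 32 if x was 0 */
//   }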


/*--------------------------------------------------------------------*/
/*--- end                                       guest_amd64_toIR.c ---*/
/*--------------------------------------------------------------------*/
