
/*--------------------------------------------------------------------*/
/*--- begin                                     guest_amd64_toIR.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2017 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Translates AMD64 code to IR. */

/* TODO:

   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
   to ensure a 64-bit value is being written.

   x87 FP Limitations:

   * all arithmetic done at 64 bits

   * no FP exceptions, except for handling stack over/underflow

   * FP rounding mode observed only for float->int conversions and
     int->float conversions which could lose accuracy, and for
     float-to-float rounding.  For all other operations,
     round-to-nearest is used, regardless.

   * some of the FCOM cases could do with testing -- not convinced
     that the args are the right way round.

   * FSAVE does not re-initialise the FPU; it should do so.

   * FINIT not only initialises the FPU environment, it also zeroes
     all the FP registers.  It should leave the registers unchanged.

   SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
   per Intel docs this bit has no meaning anyway.  Since PUSHF is the
   only way to observe eflags[1], a proper fix would be to make that
   bit be set by PUSHF.

   This module uses global variables and so is not MT-safe (if that
   should ever become relevant).
*/

/* Notes re address size overrides (0x67).

   According to the AMD documentation (24594 Rev 3.09, Sept 2003,
   "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
   and System Instructions"), Section 1.2.3 ("Address-Size Override
   Prefix"):

   0x67 applies to all explicit memory references, causing the top
   32 bits of the effective address to become zero.

   0x67 has no effect on stack references (push/pop); these always
   use a 64-bit address.

   0x67 changes the interpretation of instructions which implicitly
   reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
   instead.  These are:

      cmp{s,sb,sw,sd,sq}
      in{s,sb,sw,sd}
      jcxz, jecxz, jrcxz
      lod{s,sb,sw,sd,sq}
      loop{,e,z,ne,nz}
      mov{s,sb,sw,sd,sq}
      out{s,sb,sw,sd}
      rep{,e,ne,nz}
      sca{s,sb,sw,sd,sq}
      sto{s,sb,sw,sd,sq}
      xlat{,b} */
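
/* For illustration only -- a minimal sketch (not a helper that exists
   in this module) of the 0x67 truncation rule described above,
   assuming 'ea' holds a fully computed 64-bit effective address and
   'aso' says whether an address-size override was seen:

      ULong apply_ASO ( ULong ea, Bool aso ) {
         return aso ? (ea & 0xFFFFFFFFULL) : ea;  // zero top 32 bits
      }
*/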

/* "Special" instructions.

   This instruction decoder can decode four special instructions
   which mean nothing natively (are no-ops as far as regs/mem are
   concerned) but have meaning for supporting Valgrind.  A special
   instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
   48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
   $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
   Following that, one of the following 4 is allowed (standard
   interpretation in parentheses):

      4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
      4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
      4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
      4887F6 (xchgq %rdi,%rdi)   IR injection

   Any other bytes following the 16-byte preamble are illegal and
   constitute a failure in instruction decoding.  This all assumes
   that the preamble will never occur except in specific code
   fragments designed for Valgrind to catch.

   No prefixes may precede a "Special" instruction.
*/
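
/* For illustration only -- a sketch of how the preamble match could be
   done, assuming 'code' points at the 16 candidate bytes.  (The real
   decoder performs an equivalent inline byte-by-byte check; the helper
   name here is made up.)

      static Bool looksLikeSpecialPreamble ( const UChar* code ) {
         static const UChar preamble[16] = {
            0x48, 0xC1, 0xC7, 0x03,   // rolq $3,  %rdi
            0x48, 0xC1, 0xC7, 0x0D,   // rolq $13, %rdi
            0x48, 0xC1, 0xC7, 0x3D,   // rolq $61, %rdi
            0x48, 0xC1, 0xC7, 0x33    // rolq $51, %rdi
         };
         Int i;
         for (i = 0; i < 16; i++)
            if (code[i] != preamble[i]) return False;
         return True;
      }
*/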

/* casLE (implementation of lock-prefixed insns) and rep-prefixed
   insns: the side-exit back to the start of the insn is done with
   Ijk_Boring.  This is quite wrong, it should be done with
   Ijk_NoRedir, since otherwise the side exit, which is intended to
   restart the instruction for whatever reason, could go somewhere
   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
   no-redir jumps performance critical, at least for rep-prefixed
   instructions, since all iterations thereof would involve such a
   jump.  It's not such a big deal with casLE since the side exit is
   only taken if the CAS fails, that is, the location is contended,
   which is relatively unlikely.

   Note also, the test for CAS success vs failure is done using
   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
   shouldn't definedness-check these comparisons.  See
   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
   background/rationale.
*/

/* LOCK prefixed instructions.  These are translated using IR-level
   CAS statements (IRCAS) and are believed to preserve atomicity, even
   from the point of view of some other process racing against a
   simulated one (presumably they communicate via a shared memory
   segment).

   Handlers which are aware of LOCK prefixes are:
      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
      dis_cmpxchg_G_E  (cmpxchg)
      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
      dis_Grp3         (not, neg)
      dis_Grp4         (inc, dec)
      dis_Grp5         (inc, dec)
      dis_Grp8_Imm     (bts, btc, btr)
      dis_bt_G_E       (bts, btc, btr)
      dis_xadd_G_E     (xadd)
*/


#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "libvex_guest_amd64.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_generic_x87.h"
#include "guest_amd64_defs.h"


/*------------------------------------------------------------*/
/*--- Globals                                              ---*/
/*------------------------------------------------------------*/

/* These are set at the start of the translation of an insn, right
   down in disInstr_AMD64, so that we don't have to pass them around
   endlessly.  They are all constant during the translation of any
   given insn. */

/* These are set at the start of the translation of a BB, so
   that we don't have to pass them around endlessly. */

/* We need to know this to do sub-register accesses correctly. */
static VexEndness host_endness;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed). */
static const UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr64 guest_RIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr64 guest_RIP_curr_instr;

/* The IRSB* into which we're generating code. */
static IRSB* irsb;

/* For ensuring that %rip-relative addressing is done right.  A read
   of %rip generates the address of the next instruction.  It may be
   that we don't conveniently know that inside disAMode().  For sanity
   checking, if the next insn %rip is needed, we make a guess at what
   it is, record that guess here, and set the accompanying Bool to
   indicate that -- after this insn's decode is finished -- that guess
   needs to be checked.  */

/* At the start of each insn decode, these are set to (0, False).
   After the decode, if _mustcheck is now True, _assumed is
   checked. */

static Addr64 guest_RIP_next_assumed;
static Bool   guest_RIP_next_mustcheck;


/*------------------------------------------------------------*/
/*--- Helpers for constructing IR.                         ---*/
/*------------------------------------------------------------*/

/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}

static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

static IRExpr* mkU ( IRType ty, ULong i )
{
   switch (ty) {
      case Ity_I8:  return mkU8(i);
      case Ity_I16: return mkU16(i);
      case Ity_I32: return mkU32(i);
      case Ity_I64: return mkU64(i);
      default: vpanic("mkU(amd64)");
   }
}

static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}

static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_Not8 );
   switch (ty) {
      case Ity_I8:  return 0 +op8;
      case Ity_I16: return 1 +op8;
      case Ity_I32: return 2 +op8;
      case Ity_I64: return 3 +op8;
      default: vpanic("mkSizedOp(amd64)");
   }
}
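
/* E.g. mkSizedOp(Ity_I32, Iop_Add8) yields Iop_Add32.  This relies on
   the Iop_..8/16/32/64 variants of each operation being declared
   consecutively in libvex_ir.h, which is why the switch above can
   simply add 0..3 to the 8-bit opcode. */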

static
IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
{
   if (szSmall == 1 && szBig == 4) {
      return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
   }
   if (szSmall == 1 && szBig == 2) {
      return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
   }
   if (szSmall == 2 && szBig == 4) {
      return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
   }
   if (szSmall == 1 && szBig == 8 && !signd) {
      return unop(Iop_8Uto64, src);
   }
   if (szSmall == 1 && szBig == 8 && signd) {
      return unop(Iop_8Sto64, src);
   }
   if (szSmall == 2 && szBig == 8 && !signd) {
      return unop(Iop_16Uto64, src);
   }
   if (szSmall == 2 && szBig == 8 && signd) {
      return unop(Iop_16Sto64, src);
   }
   vpanic("doScalarWidening(amd64)");
}

static
void putGuarded ( Int gstOffB, IRExpr* guard, IRExpr* value )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, value);
   stmt( IRStmt_Put(gstOffB,
                    IRExpr_ITE(guard, value, IRExpr_Get(gstOffB, ty))) );
}


/*------------------------------------------------------------*/
/*--- Debugging output                                     ---*/
/*------------------------------------------------------------*/

/* Bomb out if we can't handle something. */
__attribute__ ((noreturn))
static void unimplemented ( const HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}

#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)


/*------------------------------------------------------------*/
/*--- Offsets of various parts of the amd64 guest state.   ---*/
/*------------------------------------------------------------*/

#define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
#define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
#define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
#define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
#define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
#define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
#define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
#define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
#define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
#define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
#define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
#define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
#define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
#define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
#define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
#define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)

#define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)

#define OFFB_FS_CONST  offsetof(VexGuestAMD64State,guest_FS_CONST)
#define OFFB_GS_CONST  offsetof(VexGuestAMD64State,guest_GS_CONST)

#define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)

#define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
#define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)

#define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
#define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
#define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
#define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
#define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
#define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
#define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
#define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
#define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
#define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
#define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
#define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
#define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
#define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
#define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
#define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
#define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
#define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)

#define OFFB_EMNOTE    offsetof(VexGuestAMD64State,guest_EMNOTE)
#define OFFB_CMSTART   offsetof(VexGuestAMD64State,guest_CMSTART)
#define OFFB_CMLEN     offsetof(VexGuestAMD64State,guest_CMLEN)

#define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)


/*------------------------------------------------------------*/
/*--- Helper bits and pieces for deconstructing the        ---*/
/*--- amd64 insn stream.                                   ---*/
/*------------------------------------------------------------*/

/* This is the AMD64 register encoding -- integer regs. */
#define R_RAX 0
#define R_RCX 1
#define R_RDX 2
#define R_RBX 3
#define R_RSP 4
#define R_RBP 5
#define R_RSI 6
#define R_RDI 7
#define R_R8  8
#define R_R9  9
#define R_R10 10
#define R_R11 11
#define R_R12 12
#define R_R13 13
#define R_R14 14
#define R_R15 15

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5


/* Various simple conversions */

static ULong extend_s_8to64 ( UChar x )
{
   return (ULong)((Long)(((ULong)x) << 56) >> 56);
}

static ULong extend_s_16to64 ( UShort x )
{
   return (ULong)((Long)(((ULong)x) << 48) >> 48);
}

static ULong extend_s_32to64 ( UInt x )
{
   return (ULong)((Long)(((ULong)x) << 32) >> 32);
}

/* Figure out whether the mod and rm parts of a modRM byte refer to a
   register or memory.  If so, the byte will have the form 11XXXYYY,
   where YYY is the register number. */
inline
static Bool epartIsReg ( UChar mod_reg_rm )
{
   return toBool(0xC0 == (mod_reg_rm & 0xC0));
}
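
/* For example: a modRM byte of 0xC3 has mod == 11, so its e part
   denotes a register (rm == 3), whereas 0x03 has mod == 00 and so
   denotes a memory reference. */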

/* Extract the 'g' field from a modRM byte.  This only produces 3
   bits, which is not a complete register number.  You should avoid
   this function if at all possible. */
inline
static Int gregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)( (mod_reg_rm >> 3) & 7 );
}

/* Ditto the 'e' field of a modRM byte. */
inline
static Int eregLO3ofRM ( UChar mod_reg_rm )
{
   return (Int)(mod_reg_rm & 0x7);
}

/* Get an 8/16/32-bit unsigned value out of the insn stream. */

static inline UChar getUChar ( Long delta )
{
   UChar v = guest_code[delta+0];
   return v;
}

static UInt getUDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v & 0xFFFF;
}

//.. static UInt getUDisp ( Int size, Long delta )
//.. {
//..    switch (size) {
//..       case 4: return getUDisp32(delta);
//..       case 2: return getUDisp16(delta);
//..       case 1: return getUChar(delta);
//..       default: vpanic("getUDisp(x86)");
//..    }
//..    return 0; /*notreached*/
//.. }


/* Get a byte value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}

/* Get a 16-bit value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return extend_s_16to64( (UShort)v );
}

/* Get a 32-bit value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp32 ( Long delta )
{
   UInt v = guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return extend_s_32to64( v );
}

/* Get a 64-bit value out of the insn stream. */
static Long getDisp64 ( Long delta )
{
   ULong v = 0;
   v |= guest_code[delta+7]; v <<= 8;
   v |= guest_code[delta+6]; v <<= 8;
   v |= guest_code[delta+5]; v <<= 8;
   v |= guest_code[delta+4]; v <<= 8;
   v |= guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v;
}

/* Note: because AMD64 doesn't allow 64-bit literals, it is an error
   if this is called with size==8.  Should not happen. */
static Long getSDisp ( Int size, Long delta )
{
   switch (size) {
      case 4: return getSDisp32(delta);
      case 2: return getSDisp16(delta);
      case 1: return getSDisp8(delta);
      default: vpanic("getSDisp(amd64)");
  }
}

static ULong mkSizeMask ( Int sz )
{
   switch (sz) {
      case 1: return 0x00000000000000FFULL;
      case 2: return 0x000000000000FFFFULL;
      case 4: return 0x00000000FFFFFFFFULL;
      case 8: return 0xFFFFFFFFFFFFFFFFULL;
      default: vpanic("mkSizeMask(amd64)");
   }
}

static Int imin ( Int a, Int b )
{
   return (a < b) ? a : b;
}

static IRType szToITy ( Int n )
{
   switch (n) {
      case 1: return Ity_I8;
      case 2: return Ity_I16;
      case 4: return Ity_I32;
      case 8: return Ity_I64;
      default: vex_printf("\nszToITy(%d)\n", n);
               vpanic("szToITy(amd64)");
   }
}


/*------------------------------------------------------------*/
/*--- For dealing with prefixes.                           ---*/
/*------------------------------------------------------------*/

/* The idea is to pass around an int holding a bitmask summarising
   info from the prefixes seen on the current instruction, including
   info from the REX byte.  This info is used in various places, but
   most especially when making sense of register fields in
   instructions.

   The top 8 bits of the prefix are 0x55, just as a hacky way to
   ensure it really is a valid prefix.

   Things you can safely assume about a well-formed prefix:
   * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
   * if REX is not present then REXW,REXR,REXX,REXB will read
     as zero.
   * F2 and F3 will not both be 1.
*/

typedef UInt  Prefix;

#define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
#define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
#define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
#define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
#define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
#define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
#define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
#define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
#define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
#define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
#define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
#define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
#define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
#define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
#define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
#define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
#define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
/* The extra register field VEX.vvvv is encoded (after not-ing it) as
   PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
   positions. */
#define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
#define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
#define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
#define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */
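
/* Worked example (a sketch of what the prefix-scanning loop computes,
   not a helper defined here): a REX byte of 0x4D -- 0100WRXB with W=1,
   R=1, X=0, B=1 -- contributes
      PFX_REX | PFX_REXW | PFX_REXR | PFX_REXB
   to the summary. */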


#define PFX_EMPTY 0x55000000

static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFF000000) == PFX_EMPTY);
}

static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}

static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}

/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
static Bool haveF2andF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
}
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}

static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}
static Bool haveLOCK ( Prefix pfx ) {
   return toBool((pfx & PFX_LOCK) > 0);
}

/* Return True iff pfx has 66 set and F2 and F3 clear */
static Bool have66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
}

/* Return True iff pfx has F2 set and 66 and F3 clear */
static Bool haveF2no66noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has F3 set and 66 and F2 clear */
static Bool haveF3no66noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F3 set and F2 clear */
static Bool haveF3noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F2 set and F3 clear */
static Bool haveF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has 66, F2 and F3 clear */
static Bool haveNo66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
}

/* Return True iff pfx has any of 66, F2 and F3 set */
static Bool have66orF2orF3 ( Prefix pfx )
{
  return toBool( ! haveNo66noF2noF3(pfx) );
}

/* Return True iff pfx has 66 or F3 set */
static Bool have66orF3 ( Prefix pfx )
{
   return toBool((pfx & (PFX_66|PFX_F3)) > 0);
}

/* Clear all the segment-override bits in a prefix. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}

/* Get the (inverted, hence back to "normal") VEX.vvvv field. */
static UInt getVexNvvvv ( Prefix pfx ) {
   UInt r = (UInt)pfx;
   r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
   return r & 0xF;
}

static Bool haveVEX ( Prefix pfx ) {
   return toBool(pfx & PFX_VEX);
}

static Int getVexL ( Prefix pfx ) {
   return (pfx & PFX_VEXL) ? 1 : 0;
}


/*------------------------------------------------------------*/
/*--- For dealing with escapes                             ---*/
/*------------------------------------------------------------*/


/* Escapes come after the prefixes, but before the primary opcode
   byte.  They escape the primary opcode byte into a bigger space.
   The 0xF0000000 isn't significant, except so as to make it not
   overlap valid Prefix values, for sanity checking.
*/

typedef
   enum {
      ESC_NONE=0xF0000000, // none
      ESC_0F,              // 0F
      ESC_0F38,            // 0F 38
      ESC_0F3A             // 0F 3A
   }
   Escape;


/*------------------------------------------------------------*/
/*--- For dealing with integer registers                   ---*/
/*------------------------------------------------------------*/

/* This is somewhat complex.  The rules are:

   For 64, 32 and 16 bit register references, the e or g fields in the
   modrm bytes supply the low 3 bits of the register number.  The
   fourth (most-significant) bit of the register number is supplied by
   the REX byte, if it is present; else that bit is taken to be zero.

   The REX.R bit supplies the high bit corresponding to the g register
   field, and the REX.B bit supplies the high bit corresponding to the
   e register field (when the mod part of modrm indicates that modrm's
   e component refers to a register and not to memory).

   The REX.X bit supplies a high register bit for certain registers
   in SIB address modes, and is rarely used.

   For 8 bit register references, the presence of the REX byte itself
   has significance.  If there is no REX present, then the 3-bit
   number extracted from the modrm e or g field is treated as an index
   into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
   old x86 encoding scheme.

   But if there is a REX present, the register reference is
   interpreted in the same way as for 64/32/16-bit references: a high
   bit is extracted from REX, giving a 4-bit number, and the denoted
   register is the lowest 8 bits of the 16 integer registers denoted
   by the number.  In particular, values 4 through 7 of this sequence
   do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
   %rsp %rbp %rsi %rdi.

   The REX.W bit has no bearing at all on register numbers.  Instead
   its presence indicates that the operand size is to be overridden
   from its default value (32 bits) to 64 bits instead.  This is in
   the same fashion that an 0x66 prefix indicates the operand size is
   to be overridden from 32 bits down to 16 bits.  When both REX.W and
   0x66 are present there is a conflict, and REX.W takes precedence.

   Rather than try to handle this complexity using a single huge
   function, several smaller ones are provided.  The aim is to make it
   as difficult as possible to screw up register decoding in a subtle
   and hard-to-track-down way.

   Because these routines fish around in the host's memory (that is,
   in the guest state area) for sub-parts of guest registers, their
   correctness depends on the host's endianness.  So far these
   routines only work for little-endian hosts.  Those for which
   endianness is important have assertions to ensure sanity.
*/
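
/* Worked example: in an insn with 1-byte operands and no REX byte, an
   e or g field of 4 denotes %ah; with a REX byte present (even an
   "empty" one, 0x40), the same field denotes %spl instead. */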


/* About the simplest question you can ask: where do the 64-bit
   integer registers live (in the guest state) ? */

static Int integerGuestReg64Offset ( UInt reg )
{
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}


/* Produce the name of an integer register, for printing purposes.
   reg is a number in the range 0 .. 15 that has been generated from a
   3-bit reg-field number and a REX extension bit.  irregular denotes
   the case where sz==1 and no REX byte is present. */

static
const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static const HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static const HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   static const HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}

/* Using the same argument conventions as nameIReg, produce the
   guest state offset of an integer register. */

static
Int offsetIReg ( Int sz, UInt reg, Bool irregular )
{
   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   /* Deal with irregular case -- sz==1 and no REX present */
   if (sz == 1 && irregular) {
      switch (reg) {
         case R_RSP: return 1+ OFFB_RAX;
         case R_RBP: return 1+ OFFB_RCX;
         case R_RSI: return 1+ OFFB_RDX;
         case R_RDI: return 1+ OFFB_RBX;
         default:    break; /* use the normal case */
      }
   }

   /* Normal case */
   return integerGuestReg64Offset(reg);
}


/* Read the %CL register :: Ity_I8, for shift/rotate operations. */

static IRExpr* getIRegCL ( void )
{
   vassert(host_endness == VexEndnessLE);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}


/* Write to the %AH register. */

static void putIRegAH ( IRExpr* e )
{
   vassert(host_endness == VexEndnessLE);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}


/* Read/write various widths of %RAX, as it has various
   special-purpose uses. */

static const HChar* nameIRegRAX ( Int sz )
{
   switch (sz) {
      case 1: return "%al";
      case 2: return "%ax";
      case 4: return "%eax";
      case 8: return "%rax";
      default: vpanic("nameIRegRAX(amd64)");
   }
}

static IRExpr* getIRegRAX ( Int sz )
{
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
      default: vpanic("getIRegRAX(amd64)");
   }
}

static void putIRegRAX ( Int sz, IRExpr* e )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, e);
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 8: vassert(ty == Ity_I64);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 4: vassert(ty == Ity_I32);
              stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
              break;
      case 2: vassert(ty == Ity_I16);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 1: vassert(ty == Ity_I8);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      default: vpanic("putIRegRAX(amd64)");
   }
}


/* Read/write various widths of %RDX, as it has various
   special-purpose uses. */

static const HChar* nameIRegRDX ( Int sz )
{
   switch (sz) {
      case 1: return "%dl";
      case 2: return "%dx";
      case 4: return "%edx";
      case 8: return "%rdx";
      default: vpanic("nameIRegRDX(amd64)");
   }
}

static IRExpr* getIRegRDX ( Int sz )
{
   vassert(host_endness == VexEndnessLE);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
      default: vpanic("getIRegRDX(amd64)");
   }
}

static void putIRegRDX ( Int sz, IRExpr* e )
{
   vassert(host_endness == VexEndnessLE);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   switch (sz) {
      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
              break;
      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      default: vpanic("putIRegRDX(amd64)");
   }
}


/* Simplistic functions to deal with the integer registers as a
   straightforward bank of 16 64-bit regs. */

static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}

static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}

static const HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}


/* Simplistic functions to deal with the lower halves of integer
   registers as a straightforward bank of 16 32-bit regs. */

static IRExpr* getIReg32 ( UInt regno )
{
   vassert(host_endness == VexEndnessLE);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}

static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}

static const HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}


/* Simplistic functions to deal with the lower quarters of integer
   registers as a straightforward bank of 16 16-bit regs. */

static IRExpr* getIReg16 ( UInt regno )
{
   vassert(host_endness == VexEndnessLE);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}

static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}

static const HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}


/* Sometimes what we know is a 3-bit register number, a REX byte, and
   which field of the REX byte is to be used to extend to a 4-bit
   number.  These functions cater for that situation.
*/
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}

static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}

static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}

static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get(
                     offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                     False/*!irregular*/ ),
                     szToITy(sz)
                 )
             );
   } else {
      return IRExpr_Get(
                offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                toBool(sz==1 && !haveREX(pfx)) ),
                szToITy(sz)
             );
   }
}

static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   stmt( IRStmt_Put(
            offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                            toBool(sz==1 && !haveREX(pfx)) ),
            sz==4 ? unop(Iop_32Uto64,e) : e
   ));
}


/* Functions for getting register numbers from modrm bytes and REX
   when we don't have to consider the complexities of integer subreg
   accesses.
*/
/* Extract the g reg field from a modRM byte, and augment it using the
   REX.R bit from the supplied REX byte.  The R bit usually is
   associated with the g register field.
*/
static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
{
   Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   reg += (pfx & PFX_REXR) ? 8 : 0;
   return reg;
}

/* Extract the e reg field from a modRM byte, and augment it using the
   REX.B bit from the supplied REX byte.  The B bit usually is
   associated with the e register field (when modrm indicates e is a
   register, that is).
*/
static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
{
   Int rm;
   vassert(epartIsReg(mod_reg_rm));
   rm = (Int)(mod_reg_rm & 0x7);
   rm += (pfx & PFX_REXB) ? 8 : 0;
   return rm;
}


/* General functions for dealing with integer register access. */

/* Produce the guest state offset for a reference to the 'g' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.
*/
static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(host_endness == VexEndnessLE);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = gregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

static
IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

static
void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
}

static
const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


static
IRExpr* getIRegV ( Int sz, Prefix pfx )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
                         szToITy(sz) );
   }
}

static
void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
}

static
const HChar* nameIRegV ( Int sz, Prefix pfx )
{
   return nameIReg( sz, getVexNvvvv(pfx), False );
}



/* Produce the guest state offset for a reference to the 'e' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   denotes a memory access rather than a register access.
*/
static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(host_endness == VexEndnessLE);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = eregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

static
IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

static
void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
}

static
const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


/*------------------------------------------------------------*/
/*--- For dealing with XMM registers                       ---*/
/*------------------------------------------------------------*/

static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}

static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   return ymmGuestRegOffset( xmmreg );
}

/* Lanes of vector registers are always numbered from zero, zero being
   the least significant lane (rightmost in the register).  */

static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}

static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}

static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}

static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 2);
   return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
}

static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 4);
   return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
}

static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(host_endness == VexEndnessLE);
   vassert(laneno >= 0 && laneno < 8);
   return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
}

static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}

static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}

static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}

static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}

static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}

static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}

static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}

static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

static IRExpr* getYMMReg ( UInt xmmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
}

static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}

static IRExpr* getYMMRegLane64F ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_F64 );
}

static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}

static IRExpr* getYMMRegLane32F ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_F32 );
}

static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}

static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}

static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}

/* Write the low half of a YMM reg and zero out the upper half. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   putYMMRegLane128( ymmreg, 0, e );
   putYMMRegLane128( ymmreg, 1, mkV128(0) );
}

static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
{
   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   return unop(Iop_64to1,
               binop(Iop_And64,
                     unop(Iop_1Uto64,x),
                     unop(Iop_1Uto64,y)));
}

/* Generate a compare-and-swap operation, operating on memory at
   'addr'.  The expected value is 'expVal' and the new value is
   'newVal'.  If the operation fails, then transfer control (with a
   no-redir jump (XXX no -- see comment at top of this file)) to
   'restart_point', which is presumably the address of the guest
   instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point ),
            OFFB_RIP
         ));
}
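
/* Example use (a sketch, with made-up temporary names): a LOCK-prefixed
   add to memory could load the old value, compute the sum, and then

      assign( t_old, loadLE(Ity_I64, mkexpr(t_addr)) );
      assign( t_new, binop(Iop_Add64, mkexpr(t_old), mkexpr(t_src)) );
      casLE( mkexpr(t_addr), mkexpr(t_old), mkexpr(t_new),
             guest_RIP_curr_instr );

   The IRCAS performs the store only if memory still holds the expected
   value; otherwise the exit re-runs the insn from the start. */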
1643
1644
1645/*------------------------------------------------------------*/
1646/*--- Helpers for %rflags.                                 ---*/
1647/*------------------------------------------------------------*/
1648
1649/* -------------- Evaluating the flags-thunk. -------------- */
1650
1651/* Build IR to calculate all the eflags from stored
1652   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
1653   Ity_I64. */
1654static IRExpr* mk_amd64g_calculate_rflags_all ( void )
1655{
1656   IRExpr** args
1657      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1658                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1659                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1660                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1661   IRExpr* call
1662      = mkIRExprCCall(
1663           Ity_I64,
1664           0/*regparm*/,
1665           "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
1666           args
1667        );
1668   /* Exclude OP and NDEP from definedness checking.  We're only
1669      interested in DEP1 and DEP2. */
1670   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1671   return call;
1672}
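
/* A word on the mcx_mask fiddling above: bit i set in mcx_mask means
   'exclude args[i] from definedness checking' (the mask is consumed
   by Memcheck).  Here args[0] is CC_OP and args[3] is CC_NDEP, hence
   (1<<0) | (1<<3).  In mk_amd64g_calculate_condition below, the
   condition code is prepended as args[0], shifting the other
   arguments along by one; that is why its mask is
   (1<<0) | (1<<1) | (1<<4) instead. */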
1673
1674/* Build IR to calculate some particular condition from stored
1675   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
1676   Ity_I1. */
1677static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
1678{
1679   IRExpr** args
1680      = mkIRExprVec_5( mkU64(cond),
1681                       IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1682                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1683                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1684                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1685   IRExpr* call
1686      = mkIRExprCCall(
1687           Ity_I64,
1688           0/*regparm*/,
1689           "amd64g_calculate_condition", &amd64g_calculate_condition,
1690           args
1691        );
1692   /* Exclude the requested condition, OP and NDEP from definedness
1693      checking.  We're only interested in DEP1 and DEP2. */
1694   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
1695   return unop(Iop_64to1, call);
1696}
1697
1698/* Build IR to calculate just the carry flag from stored
1699   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
1700static IRExpr* mk_amd64g_calculate_rflags_c ( void )
1701{
1702   IRExpr** args
1703      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1704                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1705                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1706                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1707   IRExpr* call
1708      = mkIRExprCCall(
1709           Ity_I64,
1710           0/*regparm*/,
1711           "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
1712           args
1713        );
1714   /* Exclude OP and NDEP from definedness checking.  We're only
1715      interested in DEP1 and DEP2. */
1716   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1717   return call;
1718}
1719
1720
1721/* -------------- Building the flags-thunk. -------------- */
1722
1723/* The machinery in this section builds the flag-thunk following a
1724   flag-setting operation.  Hence the various setFlags_* functions.
1725*/
1726
1727static Bool isAddSub ( IROp op8 )
1728{
1729   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
1730}
1731
1732static Bool isLogic ( IROp op8 )
1733{
1734   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
1735}
1736
1737/* U-widen 1/8/16/32/64 bit int expr to 64. */
1738static IRExpr* widenUto64 ( IRExpr* e )
1739{
1740   switch (typeOfIRExpr(irsb->tyenv,e)) {
1741      case Ity_I64: return e;
1742      case Ity_I32: return unop(Iop_32Uto64, e);
1743      case Ity_I16: return unop(Iop_16Uto64, e);
1744      case Ity_I8:  return unop(Iop_8Uto64, e);
1745      case Ity_I1:  return unop(Iop_1Uto64, e);
1746      default: vpanic("widenUto64");
1747   }
1748}
1749
1750/* S-widen 8/16/32/64 bit int expr to 64. */
1751static IRExpr* widenSto64 ( IRExpr* e )
1752{
1753   switch (typeOfIRExpr(irsb->tyenv,e)) {
1754      case Ity_I64: return e;
1755      case Ity_I32: return unop(Iop_32Sto64, e);
1756      case Ity_I16: return unop(Iop_16Sto64, e);
1757      case Ity_I8:  return unop(Iop_8Sto64, e);
1758      default: vpanic("widenSto64");
1759   }
1760}
1761
1762/* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
1763   of these combinations make sense. */
1764static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
1765{
1766   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
1767   if (src_ty == dst_ty)
1768      return e;
1769   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
1770      return unop(Iop_32to16, e);
1771   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
1772      return unop(Iop_32to8, e);
1773   if (src_ty == Ity_I64 && dst_ty == Ity_I32)
1774      return unop(Iop_64to32, e);
1775   if (src_ty == Ity_I64 && dst_ty == Ity_I16)
1776      return unop(Iop_64to16, e);
1777   if (src_ty == Ity_I64 && dst_ty == Ity_I8)
1778      return unop(Iop_64to8, e);
1779
1780   vex_printf("\nsrc, dst tys are: ");
1781   ppIRType(src_ty);
1782   vex_printf(", ");
1783   ppIRType(dst_ty);
1784   vex_printf("\n");
1785   vpanic("narrowTo(amd64)");
1786}
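
/* A small worked example of the widen/narrow helpers: the thunk
   fields are always 64 bits wide, so a (hypothetical) 16-bit result
   'res16' would be stored with widenUto64, while a carry bit computed
   at 64 bits is brought down to operand width with narrowTo, exactly
   as helper_ADC does further down:

      stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res16)) ) );
      assign( oldcn, narrowTo(Ity_I16, mkexpr(oldc)) );

   Note that narrowTo only truncates; asking it to widen panics. */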
1787
1788
1789/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
1790   auto-sized up to the real op. */
1791
1792static
1793void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
1794{
1795   Int ccOp = 0;
1796   switch (ty) {
1797      case Ity_I8:  ccOp = 0; break;
1798      case Ity_I16: ccOp = 1; break;
1799      case Ity_I32: ccOp = 2; break;
1800      case Ity_I64: ccOp = 3; break;
1801      default: vassert(0);
1802   }
1803   switch (op8) {
1804      case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
1805      case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
1806      default:       ppIROp(op8);
1807                     vpanic("setFlags_DEP1_DEP2(amd64)");
1808   }
1809   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
1810   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
1811   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
1812}
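
/* Worked example of the auto-sizing above: the AMD64G_CC_OP_ constants
   come in runs of four (B, W, L, Q variants), so adding the 0..3 size
   index to the B variant selects the correct width.  For a 32-bit add,

      setFlags_DEP1_DEP2( Iop_Add8, t_dst0, t_src, Ity_I32 );

   writes AMD64G_CC_OP_ADDB + 2, which is AMD64G_CC_OP_ADDL, into
   CC_OP, and the zero-widened operands into CC_DEP1 and CC_DEP2. */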
1813
1814
1815/* Set the OP and DEP1 fields only, and write zero to DEP2. */
1816
1817static
1818void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
1819{
1820   Int ccOp = 0;
1821   switch (ty) {
1822      case Ity_I8:  ccOp = 0; break;
1823      case Ity_I16: ccOp = 1; break;
1824      case Ity_I32: ccOp = 2; break;
1825      case Ity_I64: ccOp = 3; break;
1826      default: vassert(0);
1827   }
1828   switch (op8) {
1829      case Iop_Or8:
1830      case Iop_And8:
1831      case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
1832      default:       ppIROp(op8);
1833                     vpanic("setFlags_DEP1(amd64)");
1834   }
1835   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
1836   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
1837   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
1838}
1839
1840
1841/* For shift operations, we put in the result and the undershifted
1842   result.  If the shift amount is zero, however, the thunk is left
1843   unchanged. */
1844
1845static void setFlags_DEP1_DEP2_shift ( IROp    op64,
1846                                       IRTemp  res,
1847                                       IRTemp  resUS,
1848                                       IRType  ty,
1849                                       IRTemp  guard )
1850{
1851   Int ccOp = 0;
1852   switch (ty) {
1853      case Ity_I8:  ccOp = 0; break;
1854      case Ity_I16: ccOp = 1; break;
1855      case Ity_I32: ccOp = 2; break;
1856      case Ity_I64: ccOp = 3; break;
1857      default: vassert(0);
1858   }
1859
1860   vassert(guard);
1861
1862   /* Both kinds of right shifts are handled by the same thunk
1863      operation. */
1864   switch (op64) {
1865      case Iop_Shr64:
1866      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
1867      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
1868      default:        ppIROp(op64);
1869                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
1870   }
1871
1872   /* guard :: Ity_I8.  We need to convert it to I1. */
1873   IRTemp guardB = newTemp(Ity_I1);
1874   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
1875
1876   /* DEP1 contains the result, DEP2 contains the undershifted value. */
1877   stmt( IRStmt_Put( OFFB_CC_OP,
1878                     IRExpr_ITE( mkexpr(guardB),
1879                                 mkU64(ccOp),
1880                                 IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
1881   stmt( IRStmt_Put( OFFB_CC_DEP1,
1882                     IRExpr_ITE( mkexpr(guardB),
1883                                 widenUto64(mkexpr(res)),
1884                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
1885   stmt( IRStmt_Put( OFFB_CC_DEP2,
1886                     IRExpr_ITE( mkexpr(guardB),
1887                                 widenUto64(mkexpr(resUS)),
1888                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
1889}
1890
1891
1892/* For the inc/dec case, we store in DEP1 the result value and in NDEP
1893   the former value of the carry flag, which unfortunately we have to
1894   compute. */
1895
1896static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
1897{
1898   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
1899
1900   switch (ty) {
1901      case Ity_I8:  ccOp += 0; break;
1902      case Ity_I16: ccOp += 1; break;
1903      case Ity_I32: ccOp += 2; break;
1904      case Ity_I64: ccOp += 3; break;
1905      default: vassert(0);
1906   }
1907
1908   /* This has to come first, because calculating the C flag
1909      may require reading all four thunk fields. */
1910   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
1911   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
1912   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
1913   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
1914}
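
/* Why the extra NDEP write: INC and DEC update all the arithmetic
   flags except C, which must be left unchanged.  The old C value
   therefore has to be carried through the thunk, and NDEP is the slot
   for precisely that kind of extra dependency; the evaluation helper
   folds it back in when the flags are eventually demanded. */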
1915
1916
1917/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
1918   two arguments. */
1919
1920static
1921void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
1922{
1923   switch (ty) {
1924      case Ity_I8:
1925         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
1926         break;
1927      case Ity_I16:
1928         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
1929         break;
1930      case Ity_I32:
1931         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
1932         break;
1933      case Ity_I64:
1934         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
1935         break;
1936      default:
1937         vpanic("setFlags_MUL(amd64)");
1938   }
1939   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
1940   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
1941}
1942
1943
1944/* -------------- Condition codes. -------------- */
1945
1946/* Condition codes, using the AMD encoding.  */
1947
1948static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
1949{
1950   switch (cond) {
1951      case AMD64CondO:      return "o";
1952      case AMD64CondNO:     return "no";
1953      case AMD64CondB:      return "b";
1954      case AMD64CondNB:     return "ae"; /*"nb";*/
1955      case AMD64CondZ:      return "e"; /*"z";*/
1956      case AMD64CondNZ:     return "ne"; /*"nz";*/
1957      case AMD64CondBE:     return "be";
1958      case AMD64CondNBE:    return "a"; /*"nbe";*/
1959      case AMD64CondS:      return "s";
1960      case AMD64CondNS:     return "ns";
1961      case AMD64CondP:      return "p";
1962      case AMD64CondNP:     return "np";
1963      case AMD64CondL:      return "l";
1964      case AMD64CondNL:     return "ge"; /*"nl";*/
1965      case AMD64CondLE:     return "le";
1966      case AMD64CondNLE:    return "g"; /*"nle";*/
1967      case AMD64CondAlways: return "ALWAYS";
1968      default: vpanic("name_AMD64Condcode");
1969   }
1970}
1971
1972static
1973AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
1974                                          /*OUT*/Bool*   needInvert )
1975{
1976   vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
1977   if (cond & 1) {
1978      *needInvert = True;
1979      return cond-1;
1980   } else {
1981      *needInvert = False;
1982      return cond;
1983   }
1984}
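
/* This works because the AMD condition encoding pairs each condition
   with its negation: even codes are the 'positive' form and the next
   odd code is the complement.  For example:

      Bool invert;
      AMD64Condcode c
         = positiveIse_AMD64Condcode( AMD64CondNZ, &invert );
      // now c == AMD64CondZ and invert == True

   Callers then evaluate the positive condition and invert the result,
   or swap branch targets, as jcc_01 below does. */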
1985
1986
1987/* -------------- Helpers for ADD/SUB with carry. -------------- */
1988
1989/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
1990   appropriately.
1991
1992   Optionally, generate a store for the 'tres' value.  This can either
1993   be a normal store, or it can be a cas-with-possible-failure style
1994   store:
1995
1996   if taddr is IRTemp_INVALID, then no store is generated.
1997
1998   if taddr is not IRTemp_INVALID, then a store (using taddr as
1999   the address) is generated:
2000
2001     if texpVal is IRTemp_INVALID then a normal store is
2002     generated, and restart_point must be zero (it is irrelevant).
2003
2004     if texpVal is not IRTemp_INVALID then a cas-style store is
2005     generated.  texpVal is the expected value, restart_point
2006     is the restart point if the store fails, and texpVal must
2007     have the same type as tres.
2008
2009*/
2010static void helper_ADC ( Int sz,
2011                         IRTemp tres, IRTemp ta1, IRTemp ta2,
2012                         /* info about optional store: */
2013                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
2014{
2015   UInt    thunkOp;
2016   IRType  ty    = szToITy(sz);
2017   IRTemp  oldc  = newTemp(Ity_I64);
2018   IRTemp  oldcn = newTemp(ty);
2019   IROp    plus  = mkSizedOp(ty, Iop_Add8);
2020   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
2021
2022   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
2023
2024   switch (sz) {
2025      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
2026      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
2027      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
2028      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
2029      default: vassert(0);
2030   }
2031
2032   /* oldc = old carry flag, 0 or 1 */
2033   assign( oldc,  binop(Iop_And64,
2034                        mk_amd64g_calculate_rflags_c(),
2035                        mkU64(1)) );
2036
2037   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
2038
2039   assign( tres, binop(plus,
2040                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
2041                       mkexpr(oldcn)) );
2042
2043   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
2044      start of this function. */
2045   if (taddr != IRTemp_INVALID) {
2046      if (texpVal == IRTemp_INVALID) {
2047         vassert(restart_point == 0);
2048         storeLE( mkexpr(taddr), mkexpr(tres) );
2049      } else {
2050         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
2051         /* .. and hence 'texpVal' has the same type as 'tres'. */
2052         casLE( mkexpr(taddr),
2053                mkexpr(texpVal), mkexpr(tres), restart_point );
2054      }
2055   }
2056
2057   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
2058   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
2059   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
2060                                                         mkexpr(oldcn)) )) );
2061   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
2062}
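
/* A note on the DEP2 value stored above: keeping ta2 XOR oldcn
   (rather than ta2 itself) lets the flags-evaluation helper recover
   both inputs, since it also receives the old carry in NDEP:

      ta2 == (ta2 ^ oldc) ^ oldc     // XOR is self-inverse

   That way the thunk describes a three-input operation (ta1, ta2,
   carry-in) using only its three data fields. */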
2063
2064
2065/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
2066   appropriately.  As with helper_ADC, possibly generate a store of
2067   the result -- see comments on helper_ADC for details.
2068*/
2069static void helper_SBB ( Int sz,
2070                         IRTemp tres, IRTemp ta1, IRTemp ta2,
2071                         /* info about optional store: */
2072                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
2073{
2074   UInt    thunkOp;
2075   IRType  ty    = szToITy(sz);
2076   IRTemp  oldc  = newTemp(Ity_I64);
2077   IRTemp  oldcn = newTemp(ty);
2078   IROp    minus = mkSizedOp(ty, Iop_Sub8);
2079   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
2080
2081   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
2082
2083   switch (sz) {
2084      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
2085      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
2086      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
2087      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
2088      default: vassert(0);
2089   }
2090
2091   /* oldc = old carry flag, 0 or 1 */
2092   assign( oldc, binop(Iop_And64,
2093                       mk_amd64g_calculate_rflags_c(),
2094                       mkU64(1)) );
2095
2096   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
2097
2098   assign( tres, binop(minus,
2099                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
2100                       mkexpr(oldcn)) );
2101
2102   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
2103      start of this function. */
2104   if (taddr != IRTemp_INVALID) {
2105      if (texpVal == IRTemp_INVALID) {
2106         vassert(restart_point == 0);
2107         storeLE( mkexpr(taddr), mkexpr(tres) );
2108      } else {
2109         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
2110         /* .. and hence 'texpVal' has the same type as 'tres'. */
2111         casLE( mkexpr(taddr),
2112                mkexpr(texpVal), mkexpr(tres), restart_point );
2113      }
2114   }
2115
2116   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
2117   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
2118   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
2119                                                         mkexpr(oldcn)) )) );
2120   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
2121}
2122
2123
2124/* Given ta1, ta2 and tres, compute tres = ADCX(ta1,ta2) or tres = ADOX(ta1,ta2)
2125   and set flags appropriately.
2126*/
2127static void helper_ADCX_ADOX ( Bool isADCX, Int sz,
2128                               IRTemp tres, IRTemp ta1, IRTemp ta2 )
2129{
2130   UInt    thunkOp;
2131   IRType  ty        = szToITy(sz);
2132   IRTemp  oldflags  = newTemp(Ity_I64);
2133   IRTemp  oldOC     = newTemp(Ity_I64); // old O or C flag
2134   IRTemp  oldOCn    = newTemp(ty);      // old O or C flag, narrowed
2135   IROp    plus      = mkSizedOp(ty, Iop_Add8);
2136   IROp    xor       = mkSizedOp(ty, Iop_Xor8);
2137
2138   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
2139
2140   switch (sz) {
2141      case 8:  thunkOp = isADCX ? AMD64G_CC_OP_ADCX64
2142                                : AMD64G_CC_OP_ADOX64; break;
2143      case 4:  thunkOp = isADCX ? AMD64G_CC_OP_ADCX32
2144                                : AMD64G_CC_OP_ADOX32; break;
2145      default: vassert(0);
2146   }
2147
2148   assign( oldflags, mk_amd64g_calculate_rflags_all() );
2149
2150   /* oldOC = old overflow/carry flag, 0 or 1 */
2151   assign( oldOC, binop(Iop_And64,
2152                        binop(Iop_Shr64,
2153                              mkexpr(oldflags),
2154                              mkU8(isADCX ? AMD64G_CC_SHIFT_C
2155                                          : AMD64G_CC_SHIFT_O)),
2156                        mkU64(1)) );
2157
2158   assign( oldOCn, narrowTo(ty, mkexpr(oldOC)) );
2159
2160   assign( tres, binop(plus,
2161                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
2162                       mkexpr(oldOCn)) );
2163
2164   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
2165   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
2166   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
2167                                                         mkexpr(oldOCn)) )) );
2168   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldflags) ) );
2169}
2170
2171
2172/* -------------- Helpers for disassembly printing. -------------- */
2173
2174static const HChar* nameGrp1 ( Int opc_aux )
2175{
2176   static const HChar* grp1_names[8]
2177     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
2178   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
2179   return grp1_names[opc_aux];
2180}
2181
2182static const HChar* nameGrp2 ( Int opc_aux )
2183{
2184   static const HChar* grp2_names[8]
2185     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
2186   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
2187   return grp2_names[opc_aux];
2188}
2189
2190static const HChar* nameGrp4 ( Int opc_aux )
2191{
2192   static const HChar* grp4_names[8]
2193     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
2194   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
2195   return grp4_names[opc_aux];
2196}
2197
2198static const HChar* nameGrp5 ( Int opc_aux )
2199{
2200   static const HChar* grp5_names[8]
2201     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
2202   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
2203   return grp5_names[opc_aux];
2204}
2205
2206static const HChar* nameGrp8 ( Int opc_aux )
2207{
2208   static const HChar* grp8_names[8]
2209      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
2210   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
2211   return grp8_names[opc_aux];
2212}
2213
2214static const HChar* nameSReg ( UInt sreg )
2215{
2216   switch (sreg) {
2217      case R_ES: return "%es";
2218      case R_CS: return "%cs";
2219      case R_SS: return "%ss";
2220      case R_DS: return "%ds";
2221      case R_FS: return "%fs";
2222      case R_GS: return "%gs";
2223      default: vpanic("nameSReg(amd64)");
2224   }
2225}
2226
2227static const HChar* nameMMXReg ( Int mmxreg )
2228{
2229   static const HChar* mmx_names[8]
2230     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
2231   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
2232   return mmx_names[mmxreg];
2233}
2234
2235static const HChar* nameXMMReg ( Int xmmreg )
2236{
2237   static const HChar* xmm_names[16]
2238     = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
2239         "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
2240         "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
2241         "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
2242   if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
2243   return xmm_names[xmmreg];
2244}
2245
2246static const HChar* nameMMXGran ( Int gran )
2247{
2248   switch (gran) {
2249      case 0: return "b";
2250      case 1: return "w";
2251      case 2: return "d";
2252      case 3: return "q";
2253      default: vpanic("nameMMXGran(amd64,guest)");
2254   }
2255}
2256
2257static HChar nameISize ( Int size )
2258{
2259   switch (size) {
2260      case 8: return 'q';
2261      case 4: return 'l';
2262      case 2: return 'w';
2263      case 1: return 'b';
2264      default: vpanic("nameISize(amd64)");
2265   }
2266}
2267
2268static const HChar* nameYMMReg ( Int ymmreg )
2269{
2270   static const HChar* ymm_names[16]
2271     = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
2272         "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
2273         "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
2274         "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
2275   if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
2276   return ymm_names[ymmreg];
2277}
2278
2279
2280/*------------------------------------------------------------*/
2281/*--- JMP helpers                                          ---*/
2282/*------------------------------------------------------------*/
2283
2284static void jmp_lit( /*MOD*/DisResult* dres,
2285                     IRJumpKind kind, Addr64 d64 )
2286{
2287   vassert(dres->whatNext    == Dis_Continue);
2288   vassert(dres->len         == 0);
2289   vassert(dres->continueAt  == 0);
2290   vassert(dres->jk_StopHere == Ijk_INVALID);
2291   dres->whatNext    = Dis_StopHere;
2292   dres->jk_StopHere = kind;
2293   stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
2294}
2295
2296static void jmp_treg( /*MOD*/DisResult* dres,
2297                      IRJumpKind kind, IRTemp t )
2298{
2299   vassert(dres->whatNext    == Dis_Continue);
2300   vassert(dres->len         == 0);
2301   vassert(dres->continueAt  == 0);
2302   vassert(dres->jk_StopHere == Ijk_INVALID);
2303   dres->whatNext    = Dis_StopHere;
2304   dres->jk_StopHere = kind;
2305   stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
2306}
2307
2308static
2309void jcc_01 ( /*MOD*/DisResult* dres,
2310              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
2311{
2312   Bool          invert;
2313   AMD64Condcode condPos;
2314   vassert(dres->whatNext    == Dis_Continue);
2315   vassert(dres->len         == 0);
2316   vassert(dres->continueAt  == 0);
2317   vassert(dres->jk_StopHere == Ijk_INVALID);
2318   dres->whatNext    = Dis_StopHere;
2319   dres->jk_StopHere = Ijk_Boring;
2320   condPos = positiveIse_AMD64Condcode ( cond, &invert );
2321   if (invert) {
2322      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
2323                         Ijk_Boring,
2324                         IRConst_U64(d64_false),
2325                         OFFB_RIP ) );
2326      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
2327   } else {
2328      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
2329                         Ijk_Boring,
2330                         IRConst_U64(d64_true),
2331                         OFFB_RIP ) );
2332      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
2333   }
2334}
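
/* Sketch of the net effect: for a branch such as 'jnz is_true' with
   fall-through address is_false, cond arrives as AMD64CondNZ.  That
   is odd-numbered, so it is positivised to AMD64CondZ with
   invert==True, and the IR generated is

      if (Z) goto is_false;   // side-exit on the inverted condition
      goto is_true;           // unconditional continuation

   so that only the even-numbered ('positive') conditions are ever
   handed to mk_amd64g_calculate_condition from here. */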
2335
2336/* Let new_rsp be the %rsp value after a call/return.  Let nia be the
2337   guest address of the next instruction to be executed.
2338
2339   This function generates an AbiHint to say that -128(%rsp)
2340   .. -1(%rsp) should now be regarded as uninitialised.
2341*/
2342static
2343void make_redzone_AbiHint ( const VexAbiInfo* vbi,
2344                            IRTemp new_rsp, IRTemp nia, const HChar* who )
2345{
2346   Int szB = vbi->guest_stack_redzone_size;
2347   vassert(szB >= 0);
2348
2349   /* A bit of a kludge.  Currently the only ABI we've guested AMD64
2350      for is ELF.  So just check it's the expected 128 value
2351      (paranoia). */
2352   vassert(szB == 128);
2353
2354   if (0) vex_printf("AbiHint: %s\n", who);
2355   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
2356   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
2357   if (szB > 0)
2358      stmt( IRStmt_AbiHint(
2359               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
2360               szB,
2361               mkexpr(nia)
2362            ));
2363}
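
/* For instance, on a 'ret', new_rsp is the stack pointer after the
   return address has been popped and nia is the popped address; the
   hint tells tools such as Memcheck that the 128 bytes below new_rsp
   (the ELF red zone, now out of scope) may be treated as undefined,
   which catches subsequent reads of dead stack. */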
2364
2365
2366/*------------------------------------------------------------*/
2367/*--- Disassembling addressing modes                       ---*/
2368/*------------------------------------------------------------*/
2369
2370static
2371const HChar* segRegTxt ( Prefix pfx )
2372{
2373   if (pfx & PFX_CS) return "%cs:";
2374   if (pfx & PFX_DS) return "%ds:";
2375   if (pfx & PFX_ES) return "%es:";
2376   if (pfx & PFX_FS) return "%fs:";
2377   if (pfx & PFX_GS) return "%gs:";
2378   if (pfx & PFX_SS) return "%ss:";
2379   return ""; /* no override */
2380}
2381
2382
2383/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
2384   linear address by adding any required segment override as indicated
2385   by pfx, and also dealing with any address size override
2386   present. */
2387static
2388IRExpr* handleAddrOverrides ( const VexAbiInfo* vbi,
2389                              Prefix pfx, IRExpr* virtual )
2390{
2391   /* --- address size override --- */
2392   if (haveASO(pfx))
2393      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));
2394
2395   /* Note that the below are hacks that rely on the assumption
2396      that %fs and %gs are constant.
2397      Typically, %fs is always 0x63 on linux (in the main thread, it
2398      stays at value 0), %gs always 0x60 on Darwin, ... */
2399   /* --- segment overrides --- */
2400   if (pfx & PFX_FS) {
2401      if (vbi->guest_amd64_assume_fs_is_const) {
2402         /* return virtual + guest_FS_CONST. */
2403         virtual = binop(Iop_Add64, virtual,
2404                                    IRExpr_Get(OFFB_FS_CONST, Ity_I64));
2405      } else {
2406         unimplemented("amd64 %fs segment override");
2407      }
2408   }
2409
2410   if (pfx & PFX_GS) {
2411      if (vbi->guest_amd64_assume_gs_is_const) {
2412         /* return virtual + guest_GS_CONST. */
2413         virtual = binop(Iop_Add64, virtual,
2414                                    IRExpr_Get(OFFB_GS_CONST, Ity_I64));
2415      } else {
2416         unimplemented("amd64 %gs segment override");
2417      }
2418   }
2419
2420   /* cs, ds, es and ss are simply ignored in 64-bit mode. */
2421
2422   return virtual;
2423}
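
/* Concrete example, assuming a Linux guest with
   guest_amd64_assume_fs_is_const set: the stack-protector load
   'mov %fs:0x28, %rax' reaches this function with virtual = 0x28 and
   PFX_FS set, and the address handed back is

      Add64( 0x28, GET:I64(OFFB_FS_CONST) )

   i.e. the constant %fs base recorded in the guest state plus the
   displacement.  (0x28 is where glibc keeps its stack canary; the
   specific value is illustrative only.) */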
2424
2425//.. {
2426//..    Int    sreg;
2427//..    IRType hWordTy;
2428//..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
2429//..
2430//..    if (sorb == 0)
2431//..       /* the common case - no override */
2432//..       return virtual;
2433//..
2434//..    switch (sorb) {
2435//..       case 0x3E: sreg = R_DS; break;
2436//..       case 0x26: sreg = R_ES; break;
2437//..       case 0x64: sreg = R_FS; break;
2438//..       case 0x65: sreg = R_GS; break;
2439//..       default: vpanic("handleAddrOverrides(x86,guest)");
2440//..    }
2441//..
2442//..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
2443//..
2444//..    seg_selector = newTemp(Ity_I32);
2445//..    ldt_ptr      = newTemp(hWordTy);
2446//..    gdt_ptr      = newTemp(hWordTy);
2447//..    r64          = newTemp(Ity_I64);
2448//..
2449//..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
2450//..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
2451//..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
2452//..
2453//..    /*
2454//..    Call this to do the translation and limit checks:
2455//..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
2456//..                                  UInt seg_selector, UInt virtual_addr )
2457//..    */
2458//..    assign(
2459//..       r64,
2460//..       mkIRExprCCall(
2461//..          Ity_I64,
2462//..          0/*regparms*/,
2463//..          "x86g_use_seg_selector",
2464//..          &x86g_use_seg_selector,
2465//..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
2466//..                         mkexpr(seg_selector), virtual)
2467//..       )
2468//..    );
2469//..
2470//..    /* If the high 32 of the result are non-zero, there was a
2471//..       failure in address translation.  In which case, make a
2472//..       quick exit.
2473//..    */
2474//..    stmt(
2475//..       IRStmt_Exit(
2476//..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
2477//..          Ijk_MapFail,
2478//..          IRConst_U32( guest_eip_curr_instr )
2479//..       )
2480//..    );
2481//..
2482//..    /* otherwise, here's the translated result. */
2483//..    return unop(Iop_64to32, mkexpr(r64));
2484//.. }
2485
2486
2487/* Generate IR to calculate an address indicated by a ModRM and
2488   following SIB bytes.  The expression, and the number of bytes in
2489   the address mode, are returned (the latter in *len).  Note that
2490   this fn should not be called if the R/M part of the address denotes
2491   a register instead of memory.  The text of the addressing mode
2492   is placed in buf.
2493
2494   The computed address is stored in a new tempreg, and the
2495   identity of the tempreg is returned.
2496
2497   extra_bytes holds the number of bytes after the amode, as supplied
2498   by the caller.  This is needed to make sense of %rip-relative
2499   addresses.  Note that the value that *len is set to is only the
2500   length of the amode itself and does not include the value supplied
2501   in extra_bytes.
2502 */
2503
2504static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
2505{
2506   IRTemp tmp = newTemp(Ity_I64);
2507   assign( tmp, addr64 );
2508   return tmp;
2509}
2510
2511static
2512IRTemp disAMode ( /*OUT*/Int* len,
2513                  const VexAbiInfo* vbi, Prefix pfx, Long delta,
2514                  /*OUT*/HChar* buf, Int extra_bytes )
2515{
2516   UChar mod_reg_rm = getUChar(delta);
2517   delta++;
2518
2519   buf[0] = (UChar)0;
2520   vassert(extra_bytes >= 0 && extra_bytes < 10);
2521
2522   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
2523      jump table seems a bit excessive.
2524   */
2525   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
2526   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
2527                                               /* is now XX0XXYYY */
2528   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
2529   switch (mod_reg_rm) {
2530
2531      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
2532         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
2533      */
2534      case 0x00: case 0x01: case 0x02: case 0x03:
2535      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
2536         { UChar rm = toUChar(mod_reg_rm & 7);
2537           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
2538           *len = 1;
2539           return disAMode_copy2tmp(
2540                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
2541         }
2542
2543      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
2544         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
2545      */
2546      case 0x08: case 0x09: case 0x0A: case 0x0B:
2547      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
2548         { UChar rm = toUChar(mod_reg_rm & 7);
2549           Long d   = getSDisp8(delta);
2550           if (d == 0) {
2551              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
2552           } else {
2553              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
2554           }
2555           *len = 2;
2556           return disAMode_copy2tmp(
2557                  handleAddrOverrides(vbi, pfx,
2558                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
2559         }
2560
2561      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
2562         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
2563      */
2564      case 0x10: case 0x11: case 0x12: case 0x13:
2565      /* ! 14 */ case 0x15: case 0x16: case 0x17:
2566         { UChar rm = toUChar(mod_reg_rm & 7);
2567           Long  d  = getSDisp32(delta);
2568           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
2569           *len = 5;
2570           return disAMode_copy2tmp(
2571                  handleAddrOverrides(vbi, pfx,
2572                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
2573         }
2574
2575      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
2576      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
2577      case 0x18: case 0x19: case 0x1A: case 0x1B:
2578      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
2579         vpanic("disAMode(amd64): not an addr!");
2580
2581      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
2582         correctly at the start of handling each instruction. */
2583      case 0x05:
2584         { Long d = getSDisp32(delta);
2585           *len = 5;
2586           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
2587           /* We need to know the next instruction's start address.
2588              Try and figure out what it is, record the guess, and ask
2589              the top-level driver logic (bbToIR_AMD64) to check we
2590              guessed right, after the instruction is completely
2591              decoded. */
2592           guest_RIP_next_mustcheck = True;
2593           guest_RIP_next_assumed = guest_RIP_bbstart
2594                                    + delta+4 + extra_bytes;
2595           return disAMode_copy2tmp(
2596                     handleAddrOverrides(vbi, pfx,
2597                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
2598                                         mkU64(d))));
2599         }
2600
2601      case 0x04: {
2602         /* SIB, with no displacement.  Special cases:
2603            -- %rsp cannot act as an index value.
2604               If index_r indicates %rsp, zero is used for the index.
2605            -- when mod is zero and base indicates RBP or R13, base is
2606               instead a 32-bit sign-extended literal.
2607            It's all madness, I tell you.  Extract %index, %base and
2608            scale from the SIB byte.  The value denoted is then:
2609               | %index == %RSP && (%base == %RBP || %base == %R13)
2610               = d32 following SIB byte
2611               | %index == %RSP && !(%base == %RBP || %base == %R13)
2612               = %base
2613               | %index != %RSP && (%base == %RBP || %base == %R13)
2614               = d32 following SIB byte + (%index << scale)
2615               | %index != %RSP && !(%base == %RBP || %base == %R13)
2616               = %base + (%index << scale)
2617         */
2618         UChar sib     = getUChar(delta);
2619         UChar scale   = toUChar((sib >> 6) & 3);
2620         UChar index_r = toUChar((sib >> 3) & 7);
2621         UChar base_r  = toUChar(sib & 7);
2622         /* correct since #(R13) == 8 + #(RBP) */
2623         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
2624         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
2625         delta++;
2626
2627         if ((!index_is_SP) && (!base_is_BPor13)) {
2628            if (scale == 0) {
2629               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
2630                         nameIRegRexB(8,pfx,base_r),
2631                         nameIReg64rexX(pfx,index_r));
2632            } else {
2633               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
2634                         nameIRegRexB(8,pfx,base_r),
2635                         nameIReg64rexX(pfx,index_r), 1<<scale);
2636            }
2637            *len = 2;
2638            return
2639               disAMode_copy2tmp(
2640               handleAddrOverrides(vbi, pfx,
2641                  binop(Iop_Add64,
2642                        getIRegRexB(8,pfx,base_r),
2643                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
2644                              mkU8(scale)))));
2645         }
2646
2647         if ((!index_is_SP) && base_is_BPor13) {
2648            Long d = getSDisp32(delta);
2649            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
2650                      nameIReg64rexX(pfx,index_r), 1<<scale);
2651            *len = 6;
2652            return
2653               disAMode_copy2tmp(
2654               handleAddrOverrides(vbi, pfx,
2655                  binop(Iop_Add64,
2656                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
2657                                         mkU8(scale)),
2658                        mkU64(d))));
2659         }
2660
2661         if (index_is_SP && (!base_is_BPor13)) {
2662            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
2663            *len = 2;
2664            return disAMode_copy2tmp(
2665                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
2666         }
2667
2668         if (index_is_SP && base_is_BPor13) {
2669            Long d = getSDisp32(delta);
2670            DIS(buf, "%s%lld", segRegTxt(pfx), d);
2671            *len = 6;
2672            return disAMode_copy2tmp(
2673                   handleAddrOverrides(vbi, pfx, mkU64(d)));
2674         }
2675
2676         vassert(0);
2677      }
2678
2679      /* SIB, with 8-bit displacement.  Special cases:
2680         -- %rsp cannot act as an index value.
2681            If index_r indicates %rsp, zero is used for the index.
2682         Denoted value is:
2683            | %index == %RSP
2684            = d8 + %base
2685            | %index != %RSP
2686            = d8 + %base + (%index << scale)
2687      */
2688      case 0x0C: {
2689         UChar sib     = getUChar(delta);
2690         UChar scale   = toUChar((sib >> 6) & 3);
2691         UChar index_r = toUChar((sib >> 3) & 7);
2692         UChar base_r  = toUChar(sib & 7);
2693         Long d        = getSDisp8(delta+1);
2694
2695         if (index_r == R_RSP && 0==getRexX(pfx)) {
2696            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
2697                                   d, nameIRegRexB(8,pfx,base_r));
2698            *len = 3;
2699            return disAMode_copy2tmp(
2700                   handleAddrOverrides(vbi, pfx,
2701                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
2702         } else {
2703            if (scale == 0) {
2704               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2705                         nameIRegRexB(8,pfx,base_r),
2706                         nameIReg64rexX(pfx,index_r));
2707            } else {
2708               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2709                         nameIRegRexB(8,pfx,base_r),
2710                         nameIReg64rexX(pfx,index_r), 1<<scale);
2711            }
2712            *len = 3;
2713            return
2714                disAMode_copy2tmp(
2715                handleAddrOverrides(vbi, pfx,
2716                  binop(Iop_Add64,
2717                        binop(Iop_Add64,
2718                              getIRegRexB(8,pfx,base_r),
2719                              binop(Iop_Shl64,
2720                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
2721                        mkU64(d))));
2722         }
2723         vassert(0); /*NOTREACHED*/
2724      }
2725
2726      /* SIB, with 32-bit displacement.  Special cases:
2727         -- %rsp cannot act as an index value.
2728            If index_r indicates %rsp, zero is used for the index.
2729         Denoted value is:
2730            | %index == %RSP
2731            = d32 + %base
2732            | %index != %RSP
2733            = d32 + %base + (%index << scale)
2734      */
2735      case 0x14: {
2736         UChar sib     = getUChar(delta);
2737         UChar scale   = toUChar((sib >> 6) & 3);
2738         UChar index_r = toUChar((sib >> 3) & 7);
2739         UChar base_r  = toUChar(sib & 7);
2740         Long d        = getSDisp32(delta+1);
2741
2742         if (index_r == R_RSP && 0==getRexX(pfx)) {
2743            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
2744                                   d, nameIRegRexB(8,pfx,base_r));
2745            *len = 6;
2746            return disAMode_copy2tmp(
2747                   handleAddrOverrides(vbi, pfx,
2748                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
2749         } else {
2750            if (scale == 0) {
2751               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2752                         nameIRegRexB(8,pfx,base_r),
2753                         nameIReg64rexX(pfx,index_r));
2754            } else {
2755               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2756                         nameIRegRexB(8,pfx,base_r),
2757                         nameIReg64rexX(pfx,index_r), 1<<scale);
2758            }
2759            *len = 6;
2760            return
2761                disAMode_copy2tmp(
2762                handleAddrOverrides(vbi, pfx,
2763                  binop(Iop_Add64,
2764                        binop(Iop_Add64,
2765                              getIRegRexB(8,pfx,base_r),
2766                              binop(Iop_Shl64,
2767                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
2768                        mkU64(d))));
2769         }
2770         vassert(0); /*NOTREACHED*/
2771      }
2772
2773      default:
2774         vpanic("disAMode(amd64)");
2775         return 0; /*notreached*/
2776   }
2777}
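
/* Worked example of the mod_reg_rm squeezing at the top of disAMode:
   for the bytes 4C 9D 10, mod_reg_rm is 0x4C (mod=01 reg=001 rm=100),
   and

      0x4C & 0xC7          == 0x44    (reg field zeroed)
      0x44 | (0x44 >> 3)   == 0x4C    (mod copied downwards)
      0x4C & 0x1F          == 0x0C    (SIB-with-d8 case)

   so the switch lands on case 0x0C.  The SIB byte 0x9D then gives
   scale=2, index=011 (%rbx), base=101 (%rbp), and the trailing 0x10
   is the d8, denoting the amode 0x10(%rbp,%rbx,4), with *len = 3
   (assuming no REX prefix, so the X and B extensions are zero). */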
2778
2779
2780/* Similarly for VSIB addressing.  This returns just the addend,
2781   and fills in *rI and *vscale with the register number of the vector
2782   index and its multiplicand.  */
2783static
2784IRTemp disAVSIBMode ( /*OUT*/Int* len,
2785                      const VexAbiInfo* vbi, Prefix pfx, Long delta,
2786                      /*OUT*/HChar* buf, /*OUT*/UInt* rI,
2787                      IRType ty, /*OUT*/Int* vscale )
2788{
2789   UChar mod_reg_rm = getUChar(delta);
2790   const HChar *vindex;
2791
2792   *len = 0;
2793   *rI = 0;
2794   *vscale = 0;
2795   buf[0] = (UChar)0;
2796   if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
2797      return IRTemp_INVALID;
2798
2799   UChar sib     = getUChar(delta+1);
2800   UChar scale   = toUChar((sib >> 6) & 3);
2801   UChar index_r = toUChar((sib >> 3) & 7);
2802   UChar base_r  = toUChar(sib & 7);
2803   Long  d       = 0;
2804   /* correct since #(R13) == 8 + #(RBP) */
2805   Bool  base_is_BPor13 = toBool(base_r == R_RBP);
2806   delta += 2;
2807   *len = 2;
2808
2809   *rI = index_r | (getRexX(pfx) << 3);
2810   if (ty == Ity_V128)
2811      vindex = nameXMMReg(*rI);
2812   else
2813      vindex = nameYMMReg(*rI);
2814   *vscale = 1<<scale;
2815
2816   switch (mod_reg_rm >> 6) {
2817   case 0:
2818      if (base_is_BPor13) {
2819         d = getSDisp32(delta);
2820         *len += 4;
2821         if (scale == 0) {
2822            DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
2823         } else {
2824            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
2825         }
2826         return disAMode_copy2tmp( mkU64(d) );
2827      } else {
2828         if (scale == 0) {
2829            DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
2830                     nameIRegRexB(8,pfx,base_r), vindex);
2831         } else {
2832            DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
2833                     nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
2834         }
2835      }
2836      break;
2837   case 1:
2838      d = getSDisp8(delta);
2839      *len += 1;
2840      goto have_disp;
2841   case 2:
2842      d = getSDisp32(delta);
2843      *len += 4;
2844   have_disp:
2845      if (scale == 0) {
2846         DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2847                  nameIRegRexB(8,pfx,base_r), vindex);
2848      } else {
2849         DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2850                  nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
2851      }
2852      break;
2853   }
2854
2855   if (!d)
2856      return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
2857   return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
2858                                   mkU64(d)) );
2859}
2860
2861
2862/* Figure out the number of (insn-stream) bytes constituting the amode
2863   beginning at delta.  This is useful for getting hold of literals
2864   beyond the end of the amode before it has been disassembled.  */
2865
2866static UInt lengthAMode ( Prefix pfx, Long delta )
2867{
2868   UChar mod_reg_rm = getUChar(delta);
2869   delta++;
2870
2871   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
2872      jump table seems a bit excessive.
2873   */
2874   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
2875   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
2876                                               /* is now XX0XXYYY */
2877   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
2878   switch (mod_reg_rm) {
2879
2880      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
2881         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
2882      */
2883      case 0x00: case 0x01: case 0x02: case 0x03:
2884      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
2885         return 1;
2886
2887      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
2888         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
2889      */
2890      case 0x08: case 0x09: case 0x0A: case 0x0B:
2891      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
2892         return 2;
2893
2894      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
2895         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
2896      */
2897      case 0x10: case 0x11: case 0x12: case 0x13:
2898      /* ! 14 */ case 0x15: case 0x16: case 0x17:
2899         return 5;
2900
2901      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
2902      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
2903      /* Not an address, but still handled. */
2904      case 0x18: case 0x19: case 0x1A: case 0x1B:
2905      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
2906         return 1;
2907
2908      /* RIP + disp32. */
2909      case 0x05:
2910         return 5;
2911
2912      case 0x04: {
2913         /* SIB, with no displacement. */
2914         UChar sib     = getUChar(delta);
2915         UChar base_r  = toUChar(sib & 7);
2916         /* correct since #(R13) == 8 + #(RBP) */
2917         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
2918
2919         if (base_is_BPor13) {
2920            return 6;
2921         } else {
2922            return 2;
2923         }
2924      }
2925
2926      /* SIB, with 8-bit displacement. */
2927      case 0x0C:
2928         return 3;
2929
2930      /* SIB, with 32-bit displacement. */
2931      case 0x14:
2932         return 6;
2933
2934      default:
2935         vpanic("lengthAMode(amd64)");
2936         return 0; /*notreached*/
2937   }
2938}
2939
2940
2941/*------------------------------------------------------------*/
2942/*--- Disassembling common idioms                          ---*/
2943/*------------------------------------------------------------*/
2944
2945typedef
2946  enum { WithFlagNone=2, WithFlagCarry, WithFlagCarryX, WithFlagOverX }
2947  WithFlag;
2948
2949/* Handle binary integer instructions of the form
2950      op E, G  meaning
2951      op reg-or-mem, reg
2952   Is passed the offset of the modRM byte, the actual operation, and
2953   the data size.  Returns delta advanced completely past this
2954   instruction.
2955
2956   E(src) is reg-or-mem
2957   G(dst) is reg.
2958
2959   If E is reg, -->    GET %G,  tmp
2960                       OP %E,   tmp
2961                       PUT tmp, %G
2962
2963   If E is mem and OP is not reversible,
2964                -->    (getAddr E) -> tmpa
2965                       LD (tmpa), tmpa
2966                       GET %G, tmp2
2967                       OP tmpa, tmp2
2968                       PUT tmp2, %G
2969
2970   If E is mem and OP is reversible
2971                -->    (getAddr E) -> tmpa
2972                       LD (tmpa), tmpa
2973                       OP %G, tmpa
2974                       PUT tmpa, %G
2975*/
2976static
2977ULong dis_op2_E_G ( const VexAbiInfo* vbi,
2978                    Prefix      pfx,
2979                    IROp        op8,
2980                    WithFlag    flag,
2981                    Bool        keep,
2982                    Int         size,
2983                    Long        delta0,
2984                    const HChar* t_amd64opc )
2985{
2986   HChar   dis_buf[50];
2987   Int     len;
2988   IRType  ty   = szToITy(size);
2989   IRTemp  dst1 = newTemp(ty);
2990   IRTemp  src  = newTemp(ty);
2991   IRTemp  dst0 = newTemp(ty);
2992   UChar   rm   = getUChar(delta0);
2993   IRTemp  addr = IRTemp_INVALID;
2994
2995   /* Stay sane -- check for valid (op8, flag, keep) combinations. */
2996   switch (op8) {
2997      case Iop_Add8:
2998         switch (flag) {
2999            case WithFlagNone: case WithFlagCarry:
3000            case WithFlagCarryX: case WithFlagOverX:
3001               vassert(keep);
3002               break;
3003            default:
3004               vassert(0);
3005         }
3006         break;
3007      case Iop_Sub8:
3008         vassert(flag == WithFlagNone || flag == WithFlagCarry);
3009         if (flag == WithFlagCarry) vassert(keep);
3010         break;
3011      case Iop_And8:
3012         vassert(flag == WithFlagNone);
3013         break;
3014      case Iop_Or8: case Iop_Xor8:
3015         vassert(flag == WithFlagNone);
3016         vassert(keep);
3017         break;
3018      default:
3019         vassert(0);
3020   }
3021
3022   if (epartIsReg(rm)) {
3023      /* Specially handle XOR reg,reg, because that doesn't really
3024         depend on reg, and doing the obvious thing potentially
3025         generates a spurious value check failure due to the bogus
3026         dependency.  Ditto SUB/SBB reg,reg. */
3027      if ((op8 == Iop_Xor8 || ((op8 == Iop_Sub8) && keep))
3028          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
3029         putIRegG(size,pfx,rm, mkU(ty,0));
3030      }
3031
3032      assign( dst0, getIRegG(size,pfx,rm) );
3033      assign( src,  getIRegE(size,pfx,rm) );
3034
3035      if (op8 == Iop_Add8 && flag == WithFlagCarry) {
3036         helper_ADC( size, dst1, dst0, src,
3037                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3038         putIRegG(size, pfx, rm, mkexpr(dst1));
3039      } else
3040      if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
3041         helper_SBB( size, dst1, dst0, src,
3042                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3043         putIRegG(size, pfx, rm, mkexpr(dst1));
3044      } else
3045      if (op8 == Iop_Add8 && flag == WithFlagCarryX) {
3046         helper_ADCX_ADOX( True/*isADCX*/, size, dst1, dst0, src );
3047         putIRegG(size, pfx, rm, mkexpr(dst1));
3048      } else
3049      if (op8 == Iop_Add8 && flag == WithFlagOverX) {
3050         helper_ADCX_ADOX( False/*!isADCX*/, size, dst1, dst0, src );
3051         putIRegG(size, pfx, rm, mkexpr(dst1));
3052      } else {
3053         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3054         if (isAddSub(op8))
3055            setFlags_DEP1_DEP2(op8, dst0, src, ty);
3056         else
3057            setFlags_DEP1(op8, dst1, ty);
3058         if (keep)
3059            putIRegG(size, pfx, rm, mkexpr(dst1));
3060      }
3061
3062      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3063                          nameIRegE(size,pfx,rm),
3064                          nameIRegG(size,pfx,rm));
3065      return 1+delta0;
3066   } else {
3067      /* E refers to memory */
3068      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3069      assign( dst0, getIRegG(size,pfx,rm) );
3070      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
3071
3072      if (op8 == Iop_Add8 && flag == WithFlagCarry) {
3073         helper_ADC( size, dst1, dst0, src,
3074                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3075         putIRegG(size, pfx, rm, mkexpr(dst1));
3076      } else
3077      if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
3078         helper_SBB( size, dst1, dst0, src,
3079                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3080         putIRegG(size, pfx, rm, mkexpr(dst1));
3081      } else
3082      if (op8 == Iop_Add8 && flag == WithFlagCarryX) {
3083         /* normal store */
3084         helper_ADCX_ADOX( True/*isADCX*/, size, dst1, dst0, src );
3085      } else
3086      if (op8 == Iop_Add8 && flag == WithFlagOverX) {
3087         /* normal store */
3088         helper_ADCX_ADOX( False/*!isADCX*/, size, dst1, dst0, src );
3089      } else {
3090         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3091         if (isAddSub(op8))
3092            setFlags_DEP1_DEP2(op8, dst0, src, ty);
3093         else
3094            setFlags_DEP1(op8, dst1, ty);
3095         if (keep)
3096            putIRegG(size, pfx, rm, mkexpr(dst1));
3097      }
3098
3099      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3100                          dis_buf, nameIRegG(size, pfx, rm));
3101      return len+delta0;
3102   }
3103}
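
/* Decode sketch for the E,G direction: for 48 03 03, i.e.
   'add (%rbx), %rax', the caller has consumed the REX.W prefix and
   the 0x03 (add Gv,Ev) opcode and calls something like

      delta = dis_op2_E_G( vbi, pfx, Iop_Add8, WithFlagNone,
                           True/*keep*/, 8, delta, "add" );

   The modRM byte 0x03 has mod=00 rm=011, so epartIsReg is False,
   disAMode yields the address held in %rbx, the 64-bit load feeds
   'src', the sum is written back to %rax, and the ADD thunk is set
   from (dst0,src).  The returned delta points just past the modRM
   byte. */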
3104
3105
3106
3107/* Handle binary integer instructions of the form
3108      op G, E  meaning
3109      op reg, reg-or-mem
3110   Is passed the offset of the modRM byte, the actual operation, and
3111   the data size.  Returns delta advanced completely past this
3112   instruction.
3113
3114   G(src) is reg.
3115   E(dst) is reg-or-mem
3116
3117   If E is reg, -->    GET %E,  tmp
3118                       OP %G,   tmp
3119                       PUT tmp, %E
3120
3121   If E is mem, -->    (getAddr E) -> tmpa
3122                       LD (tmpa), tmpv
3123                       OP %G, tmpv
3124                       ST tmpv, (tmpa)
3125*/
3126static
3127ULong dis_op2_G_E ( const VexAbiInfo* vbi,
3128                    Prefix      pfx,
3129                    IROp        op8,
3130                    WithFlag    flag,
3131                    Bool        keep,
3132                    Int         size,
3133                    Long        delta0,
3134                    const HChar* t_amd64opc )
3135{
3136   HChar   dis_buf[50];
3137   Int     len;
3138   IRType  ty   = szToITy(size);
3139   IRTemp  dst1 = newTemp(ty);
3140   IRTemp  src  = newTemp(ty);
3141   IRTemp  dst0 = newTemp(ty);
3142   UChar   rm   = getUChar(delta0);
3143   IRTemp  addr = IRTemp_INVALID;
3144
3145   /* Stay sane -- check for valid (op8, flag, keep) combinations. */
3146   switch (op8) {
3147      case Iop_Add8:
3148         vassert(flag == WithFlagNone || flag == WithFlagCarry);
3149         vassert(keep);
3150         break;
3151      case Iop_Sub8:
3152         vassert(flag == WithFlagNone || flag == WithFlagCarry);
3153         if (flag == WithFlagCarry) vassert(keep);
3154         break;
3155      case Iop_And8: case Iop_Or8: case Iop_Xor8:
3156         vassert(flag == WithFlagNone);
3157         vassert(keep);
3158         break;
3159      default:
3160         vassert(0);
3161   }
3162
3163   /* flag != WithFlagNone is only allowed for Add and Sub and indicates the
3164      intended operation is add-with-carry or subtract-with-borrow. */
3165
3166   if (epartIsReg(rm)) {
3167      /* Specially handle XOR reg,reg, because that doesn't really
3168         depend on reg, and doing the obvious thing potentially
3169         generates a spurious value check failure due to the bogus
3170         dependency.  Ditto SUB/SBB reg,reg. */
3171      if ((op8 == Iop_Xor8 || ((op8 == Iop_Sub8) && keep))
3172          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
3173         putIRegE(size,pfx,rm, mkU(ty,0));
3174      }
3175
3176      assign(dst0, getIRegE(size,pfx,rm));
3177      assign(src,  getIRegG(size,pfx,rm));
3178
3179      if (op8 == Iop_Add8 && flag == WithFlagCarry) {
3180         helper_ADC( size, dst1, dst0, src,
3181                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3182         putIRegE(size, pfx, rm, mkexpr(dst1));
3183      } else
3184      if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
3185         helper_SBB( size, dst1, dst0, src,
3186                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3187         putIRegE(size, pfx, rm, mkexpr(dst1));
3188      } else {
3189         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3190         if (isAddSub(op8))
3191            setFlags_DEP1_DEP2(op8, dst0, src, ty);
3192         else
3193            setFlags_DEP1(op8, dst1, ty);
3194         if (keep)
3195            putIRegE(size, pfx, rm, mkexpr(dst1));
3196      }
3197
3198      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3199                          nameIRegG(size,pfx,rm),
3200                          nameIRegE(size,pfx,rm));
3201      return 1+delta0;
3202   }
3203
3204   /* E refers to memory */
3205   {
3206      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3207      assign(dst0, loadLE(ty,mkexpr(addr)));
3208      assign(src,  getIRegG(size,pfx,rm));
3209
3210      if (op8 == Iop_Add8 && flag == WithFlagCarry) {
3211         if (haveLOCK(pfx)) {
3212            /* cas-style store */
3213            helper_ADC( size, dst1, dst0, src,
3214                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3215         } else {
3216            /* normal store */
3217            helper_ADC( size, dst1, dst0, src,
3218                        /*store*/addr, IRTemp_INVALID, 0 );
3219         }
3220      } else
3221      if (op8 == Iop_Sub8 && flag == WithFlagCarry) {
3222         if (haveLOCK(pfx)) {
3223            /* cas-style store */
3224            helper_SBB( size, dst1, dst0, src,
3225                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3226         } else {
3227            /* normal store */
3228            helper_SBB( size, dst1, dst0, src,
3229                        /*store*/addr, IRTemp_INVALID, 0 );
3230         }
3231      } else {
3232         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3233         if (keep) {
3234            if (haveLOCK(pfx)) {
3235               if (0) vex_printf("locked case\n" );
3236               casLE( mkexpr(addr),
3237                      mkexpr(dst0)/*expval*/,
3238                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
3239            } else {
3240               if (0) vex_printf("nonlocked case\n");
3241               storeLE(mkexpr(addr), mkexpr(dst1));
3242            }
3243         }
3244         if (isAddSub(op8))
3245            setFlags_DEP1_DEP2(op8, dst0, src, ty);
3246         else
3247            setFlags_DEP1(op8, dst1, ty);
3248      }
3249
3250      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3251                          nameIRegG(size,pfx,rm), dis_buf);
3252      return len+delta0;
3253   }
3254}
3255
3256
3257/* Handle move instructions of the form
3258      mov E, G  meaning
3259      mov reg-or-mem, reg
3260   Is passed a ptr to the modRM byte, and the data size.  Returns
3261   the address advanced completely over this instruction.
3262
3263   E(src) is reg-or-mem
3264   G(dst) is reg.
3265
3266   If E is reg, -->    GET %E,  tmpv
3267                       PUT tmpv, %G
3268
3269   If E is mem  -->    (getAddr E) -> tmpa
3270                       LD (tmpa), tmpb
3271                       PUT tmpb, %G
3272*/
3273static
3274ULong dis_mov_E_G ( const VexAbiInfo* vbi,
3275                    Prefix      pfx,
3276                    Int         size,
3277                    Long        delta0 )
3278{
3279   Int len;
3280   UChar rm = getUChar(delta0);
3281   HChar dis_buf[50];
3282
3283   if (epartIsReg(rm)) {
3284      putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
3285      DIP("mov%c %s,%s\n", nameISize(size),
3286                           nameIRegE(size,pfx,rm),
3287                           nameIRegG(size,pfx,rm));
3288      return 1+delta0;
3289   }
3290
3291   /* E refers to memory */
3292   {
3293      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3294      putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
3295      DIP("mov%c %s,%s\n", nameISize(size),
3296                           dis_buf,
3297                           nameIRegG(size,pfx,rm));
3298      return delta0+len;
3299   }
3300}
3301
3302
3303/* Handle move instructions of the form
3304      mov G, E  meaning
3305      mov reg, reg-or-mem
3306   Is passed a ptr to the modRM byte, and the data size.  Returns
3307   the address advanced completely over this instruction.
3308   We have to decide here whether F2 or F3 are acceptable.  F2 never is.
3309
3310   G(src) is reg.
3311   E(dst) is reg-or-mem
3312
3313   If E is reg, -->    GET %G,  tmp
3314                       PUT tmp, %E
3315
3316   If E is mem, -->    (getAddr E) -> tmpa
3317                       GET %G, tmpv
3318                       ST tmpv, (tmpa)
3319*/
3320static
3321ULong dis_mov_G_E ( const VexAbiInfo*  vbi,
3322                    Prefix       pfx,
3323                    Int          size,
3324                    Long         delta0,
3325                    /*OUT*/Bool* ok )
3326{
3327   Int   len;
3328   UChar rm = getUChar(delta0);
3329   HChar dis_buf[50];
3330
3331   *ok = True;
3332
3333   if (epartIsReg(rm)) {
3334      if (haveF2orF3(pfx)) { *ok = False; return delta0; }
3335      putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
3336      DIP("mov%c %s,%s\n", nameISize(size),
3337                           nameIRegG(size,pfx,rm),
3338                           nameIRegE(size,pfx,rm));
3339      return 1+delta0;
3340   }
3341
3342   /* E refers to memory */
3343   {
3344      if (haveF2(pfx)) { *ok = False; return delta0; }
3345      /* F3(XRELEASE) is acceptable, though. */
3346      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3347      storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
3348      DIP("mov%c %s,%s\n", nameISize(size),
3349                           nameIRegG(size,pfx,rm),
3350                           dis_buf);
3351      return len+delta0;
3352   }
3353}
3354
3355
3356/* op $immediate, AL/AX/EAX/RAX. */
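/* Worked example for the immediate handling below (illustrative):
   for "addq $-1, %rax" the encoding carries only a 4-byte immediate,
   0xFFFFFFFF, which getSDisp(4,..) sign-extends to the 64-bit value
   -1; the mkSizeMask(size) masking is then a no-op.  For
   "addb $0xFF, %al" the same literal -1 is masked down to 0xFF. */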
3357static
3358ULong dis_op_imm_A ( Int    size,
3359                     Bool   carrying,
3360                     IROp   op8,
3361                     Bool   keep,
3362                     Long   delta,
3363                     const HChar* t_amd64opc )
3364{
3365   Int    size4 = imin(size,4);
3366   IRType ty    = szToITy(size);
3367   IRTemp dst0  = newTemp(ty);
3368   IRTemp src   = newTemp(ty);
3369   IRTemp dst1  = newTemp(ty);
3370   Long  lit    = getSDisp(size4,delta);
3371   assign(dst0, getIRegRAX(size));
3372   assign(src,  mkU(ty,lit & mkSizeMask(size)));
3373
3374   if (isAddSub(op8) && !carrying) {
3375      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3376      setFlags_DEP1_DEP2(op8, dst0, src, ty);
3377   }
3378   else
3379   if (isLogic(op8)) {
3380      vassert(!carrying);
3381      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3382      setFlags_DEP1(op8, dst1, ty);
3383   }
3384   else
3385   if (op8 == Iop_Add8 && carrying) {
3386      helper_ADC( size, dst1, dst0, src,
3387                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3388   }
3389   else
3390   if (op8 == Iop_Sub8 && carrying) {
3391      helper_SBB( size, dst1, dst0, src,
3392                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3393   }
3394   else
3395      vpanic("dis_op_imm_A(amd64,guest)");
3396
3397   if (keep)
3398      putIRegRAX(size, mkexpr(dst1));
3399
3400   DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
3401                           lit, nameIRegRAX(size));
3402   return delta+size4;
3403}
3404
3405
3406/* Sign- and Zero-extending moves. */
3407static
3408ULong dis_movx_E_G ( const VexAbiInfo* vbi,
3409                     Prefix pfx,
3410                     Long delta, Int szs, Int szd, Bool sign_extend )
3411{
3412   UChar rm = getUChar(delta);
3413   if (epartIsReg(rm)) {
3414      putIRegG(szd, pfx, rm,
3415                    doScalarWidening(
3416                       szs,szd,sign_extend,
3417                       getIRegE(szs,pfx,rm)));
3418      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
3419                               nameISize(szs),
3420                               nameISize(szd),
3421                               nameIRegE(szs,pfx,rm),
3422                               nameIRegG(szd,pfx,rm));
3423      return 1+delta;
3424   }
3425
3426   /* E refers to memory */
3427   {
3428      Int    len;
3429      HChar  dis_buf[50];
3430      IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
3431      putIRegG(szd, pfx, rm,
3432                    doScalarWidening(
3433                       szs,szd,sign_extend,
3434                       loadLE(szToITy(szs),mkexpr(addr))));
3435      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
3436                               nameISize(szs),
3437                               nameISize(szd),
3438                               dis_buf,
3439                               nameIRegG(szd,pfx,rm));
3440      return len+delta;
3441   }
3442}
3443
3444
3445/* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
3446   the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
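/* Worked example (illustrative numbers): a 32-bit unsigned DIV with
   EDX:EAX = 0x00000001:0x00000005 and a divisor of 2 divides
   2^32 + 5 = 4294967301 by 2, leaving the quotient 0x80000002 in EAX
   and the remainder 1 in EDX.  The Iop_DivMod ops used below deliver
   exactly that layout: quotient in the low half of the result,
   remainder in the high half. */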
3447static
3448void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
3449{
3450   /* special-case the 64-bit case */
3451   if (sz == 8) {
3452      IROp   op     = signed_divide ? Iop_DivModS128to64
3453                                    : Iop_DivModU128to64;
3454      IRTemp src128 = newTemp(Ity_I128);
3455      IRTemp dst128 = newTemp(Ity_I128);
3456      assign( src128, binop(Iop_64HLto128,
3457                            getIReg64(R_RDX),
3458                            getIReg64(R_RAX)) );
3459      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
3460      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
3461      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
3462   } else {
3463      IROp   op    = signed_divide ? Iop_DivModS64to32
3464                                   : Iop_DivModU64to32;
3465      IRTemp src64 = newTemp(Ity_I64);
3466      IRTemp dst64 = newTemp(Ity_I64);
3467      switch (sz) {
3468      case 4:
3469         assign( src64,
3470                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
3471         assign( dst64,
3472                 binop(op, mkexpr(src64), mkexpr(t)) );
3473         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
3474         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
3475         break;
3476      case 2: {
3477         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
3478         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
3479         assign( src64, unop(widen3264,
3480                             binop(Iop_16HLto32,
3481                                   getIRegRDX(2),
3482                                   getIRegRAX(2))) );
3483         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
3484         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
3485         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
3486         break;
3487      }
3488      case 1: {
3489         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
3490         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
3491         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
3492         assign( src64, unop(widen3264,
3493                        unop(widen1632, getIRegRAX(2))) );
3494         assign( dst64,
3495                 binop(op, mkexpr(src64),
3496                           unop(widen1632, unop(widen816, mkexpr(t)))) );
3497         putIRegRAX( 1, unop(Iop_16to8,
3498                        unop(Iop_32to16,
3499                        unop(Iop_64to32,mkexpr(dst64)))) );
3500         putIRegAH( unop(Iop_16to8,
3501                    unop(Iop_32to16,
3502                    unop(Iop_64HIto32,mkexpr(dst64)))) );
3503         break;
3504      }
3505      default:
3506         vpanic("codegen_div(amd64)");
3507      }
3508   }
3509}
3510
3511static
3512ULong dis_Grp1 ( const VexAbiInfo* vbi,
3513                 Prefix pfx,
3514                 Long delta, UChar modrm,
3515                 Int am_sz, Int d_sz, Int sz, Long d64 )
3516{
3517   Int     len;
3518   HChar   dis_buf[50];
3519   IRType  ty   = szToITy(sz);
3520   IRTemp  dst1 = newTemp(ty);
3521   IRTemp  src  = newTemp(ty);
3522   IRTemp  dst0 = newTemp(ty);
3523   IRTemp  addr = IRTemp_INVALID;
3524   IROp    op8  = Iop_INVALID;
3525   ULong   mask = mkSizeMask(sz);
3526
3527   switch (gregLO3ofRM(modrm)) {
3528      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
3529      case 2: break;  // ADC
3530      case 3: break;  // SBB
3531      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
3532      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
3533      /*NOTREACHED*/
3534      default: vpanic("dis_Grp1(amd64): unhandled case");
3535   }
3536
3537   if (epartIsReg(modrm)) {
3538      vassert(am_sz == 1);
3539
3540      assign(dst0, getIRegE(sz,pfx,modrm));
3541      assign(src,  mkU(ty,d64 & mask));
3542
3543      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
3544         helper_ADC( sz, dst1, dst0, src,
3545                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3546      } else
3547      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
3548         helper_SBB( sz, dst1, dst0, src,
3549                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3550      } else {
3551         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3552         if (isAddSub(op8))
3553            setFlags_DEP1_DEP2(op8, dst0, src, ty);
3554         else
3555            setFlags_DEP1(op8, dst1, ty);
3556      }
3557
3558      if (gregLO3ofRM(modrm) < 7)
3559         putIRegE(sz, pfx, modrm, mkexpr(dst1));
3560
3561      delta += (am_sz + d_sz);
3562      DIP("%s%c $%lld, %s\n",
3563          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
3564          nameIRegE(sz,pfx,modrm));
3565   } else {
3566      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
3567
3568      assign(dst0, loadLE(ty,mkexpr(addr)));
3569      assign(src, mkU(ty,d64 & mask));
3570
3571      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
3572         if (haveLOCK(pfx)) {
3573            /* cas-style store */
3574            helper_ADC( sz, dst1, dst0, src,
3575                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3576         } else {
3577            /* normal store */
3578            helper_ADC( sz, dst1, dst0, src,
3579                        /*store*/addr, IRTemp_INVALID, 0 );
3580         }
3581      } else
3582      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
3583         if (haveLOCK(pfx)) {
3584            /* cas-style store */
3585            helper_SBB( sz, dst1, dst0, src,
3586                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3587         } else {
3588            /* normal store */
3589            helper_SBB( sz, dst1, dst0, src,
3590                        /*store*/addr, IRTemp_INVALID, 0 );
3591         }
3592      } else {
3593         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3594         if (gregLO3ofRM(modrm) < 7) {
3595            if (haveLOCK(pfx)) {
3596               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
3597                                    mkexpr(dst1)/*newVal*/,
3598                                    guest_RIP_curr_instr );
3599            } else {
3600               storeLE(mkexpr(addr), mkexpr(dst1));
3601            }
3602         }
3603         if (isAddSub(op8))
3604            setFlags_DEP1_DEP2(op8, dst0, src, ty);
3605         else
3606            setFlags_DEP1(op8, dst1, ty);
3607      }
3608
3609      delta += (len+d_sz);
3610      DIP("%s%c $%lld, %s\n",
3611          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
3612          d64, dis_buf);
3613   }
3614   return delta;
3615}
3616
3617
3618/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
3619   expression. */
3620
3621static
3622ULong dis_Grp2 ( const VexAbiInfo* vbi,
3623                 Prefix pfx,
3624                 Long delta, UChar modrm,
3625                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
3626                 const HChar* shift_expr_txt, Bool* decode_OK )
3627{
3628   /* delta on entry points at the modrm byte. */
3629   HChar  dis_buf[50];
3630   Int    len;
3631   Bool   isShift, isRotate, isRotateC;
3632   IRType ty    = szToITy(sz);
3633   IRTemp dst0  = newTemp(ty);
3634   IRTemp dst1  = newTemp(ty);
3635   IRTemp addr  = IRTemp_INVALID;
3636
3637   *decode_OK = True;
3638
3639   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);
3640
3641   /* Put value to shift/rotate in dst0. */
3642   if (epartIsReg(modrm)) {
3643      assign(dst0, getIRegE(sz, pfx, modrm));
3644      delta += (am_sz + d_sz);
3645   } else {
3646      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
3647      assign(dst0, loadLE(ty,mkexpr(addr)));
3648      delta += len + d_sz;
3649   }
3650
3651   isShift = False;
3652   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
3653
3654   isRotate = False;
3655   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }
3656
3657   isRotateC = False;
3658   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }
3659
3660   if (!isShift && !isRotate && !isRotateC) {
3661      /*NOTREACHED*/
3662      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
3663   }
3664
3665   if (isRotateC) {
3666      /* Call a helper; this insn is so ridiculous it does not deserve
3667         better.  One problem is, the helper has to calculate both the
3668         new value and the new flags.  This is more than 64 bits, and
3669         there is no way to return more than 64 bits from the helper.
3670         Hence the crude and obvious solution is to call it twice,
3671         using the sign of the sz field to indicate whether it is the
3672         value or rflags result we want.
3673      */
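      /* Why both results are needed -- a small example (hypothetical
         values): an 8-bit RCL by 1 with AL = 0x80 and CF = 0 yields
         the value 0x00 but a new CF of 1 (the old bit 7).  Neither
         output is derivable from the other, hence the paired calls
         with sz and -sz. */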
3674      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
3675      IRExpr** argsVALUE;
3676      IRExpr** argsRFLAGS;
3677
3678      IRTemp new_value  = newTemp(Ity_I64);
3679      IRTemp new_rflags = newTemp(Ity_I64);
3680      IRTemp old_rflags = newTemp(Ity_I64);
3681
3682      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );
3683
3684      argsVALUE
3685         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
3686                          widenUto64(shift_expr),   /* rotate amount */
3687                          mkexpr(old_rflags),
3688                          mkU64(sz) );
3689      assign( new_value,
3690                 mkIRExprCCall(
3691                    Ity_I64,
3692                    0/*regparm*/,
3693                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
3694                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
3695                    argsVALUE
3696                 )
3697            );
3698
3699      argsRFLAGS
3700         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
3701                          widenUto64(shift_expr),   /* rotate amount */
3702                          mkexpr(old_rflags),
3703                          mkU64(-sz) );
3704      assign( new_rflags,
3705                 mkIRExprCCall(
3706                    Ity_I64,
3707                    0/*regparm*/,
3708                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
3709                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
3710                    argsRFLAGS
3711                 )
3712            );
3713
3714      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
3715      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
3716      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
3717      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
3718      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
3719   }
3720
3721   else
3722   if (isShift) {
3723
3724      IRTemp pre64     = newTemp(Ity_I64);
3725      IRTemp res64     = newTemp(Ity_I64);
3726      IRTemp res64ss   = newTemp(Ity_I64);
3727      IRTemp shift_amt = newTemp(Ity_I8);
3728      UChar  mask      = toUChar(sz==8 ? 63 : 31);
3729      IROp   op64;
3730
3731      switch (gregLO3ofRM(modrm)) {
3732         case 4: op64 = Iop_Shl64; break;
3733         case 5: op64 = Iop_Shr64; break;
3734         case 6: op64 = Iop_Shl64; break;
3735         case 7: op64 = Iop_Sar64; break;
3736         /*NOTREACHED*/
3737         default: vpanic("dis_Grp2:shift"); break;
3738      }
3739
3740      /* Widen the value to be shifted to 64 bits, do the shift, and
3741         narrow back down.  This seems surprisingly long-winded, but
3742         unfortunately the AMD semantics requires that 8/16/32-bit
3743         shifts give defined results for shift values all the way up
3744         to 32, and this seems the simplest way to do it.  It has the
3745         advantage that the only IR level shifts generated are of 64
3746         bit values, and the shift amount is guaranteed to be in the
3747         range 0 .. 63, thereby observing the IR semantics requiring
3748         all shift values to be in the range 0 .. 2^word_size-1.
3749
3750         Therefore the shift amount is masked with 63 for 64-bit shifts
3751         and 31 for all others.
3752      */
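      /* Concrete example (illustrative): for "sarb $4, %al" with
         AL = 0x90, pre64 is the sign-extension 0xFFFFFFFFFFFFFF90;
         shifting right by 4 gives 0xFFFFFFFFFFFFFFF9, which narrows
         back down to 0xF9.  res64ss, the same shift by (amt-1),
         exists only so the flags thunk can recover the last bit
         shifted out, which becomes the new CF. */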
3753      /* shift_amt = shift_expr & MASK, regardless of operation size */
3754      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );
3755
3756      /* suitably widen the value to be shifted to 64 bits. */
3757      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
3758                                     : widenUto64(mkexpr(dst0)) );
3759
3760      /* res64 = pre64 `shift` shift_amt */
3761      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );
3762
3763      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK) */
3764      assign( res64ss,
3765              binop(op64,
3766                    mkexpr(pre64),
3767                    binop(Iop_And8,
3768                          binop(Iop_Sub8,
3769                                mkexpr(shift_amt), mkU8(1)),
3770                          mkU8(mask))) );
3771
3772      /* Build the flags thunk. */
3773      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);
3774
3775      /* Narrow the result back down. */
3776      assign( dst1, narrowTo(ty, mkexpr(res64)) );
3777
3778   } /* if (isShift) */
3779
3780   else
3781   if (isRotate) {
3782      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
3783                                        : (ty==Ity_I32 ? 2 : 3));
3784      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
3785      IRTemp rot_amt   = newTemp(Ity_I8);
3786      IRTemp rot_amt64 = newTemp(Ity_I8);
3787      IRTemp oldFlags  = newTemp(Ity_I64);
3788      UChar  mask      = toUChar(sz==8 ? 63 : 31);
3789
3790      /* rot_amt = shift_expr & mask */
3791      /* By masking the rotate amount thusly, the IR-level Shl/Shr
3792         expressions never shift beyond the word size and thus remain
3793         well defined. */
3794      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));
3795
3796      if (ty == Ity_I64)
3797         assign(rot_amt, mkexpr(rot_amt64));
3798      else
3799         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));
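      /* Example (hypothetical count): "rolb $17, %al" gives
         rot_amt64 = 17 & 31 = 17, which is nonzero, so the flag
         thunk is updated below; but rot_amt = 17 & 7 = 1, so the
         byte itself rotates by just one position.  The two masks
         differ deliberately. */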
3800
3801      if (left) {
3802
3803         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
3804         assign(dst1,
3805            binop( mkSizedOp(ty,Iop_Or8),
3806                   binop( mkSizedOp(ty,Iop_Shl8),
3807                          mkexpr(dst0),
3808                          mkexpr(rot_amt)
3809                   ),
3810                   binop( mkSizedOp(ty,Iop_Shr8),
3811                          mkexpr(dst0),
3812                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
3813                   )
3814            )
3815         );
3816         ccOp += AMD64G_CC_OP_ROLB;
3817
3818      } else { /* right */
3819
3820         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
3821         assign(dst1,
3822            binop( mkSizedOp(ty,Iop_Or8),
3823                   binop( mkSizedOp(ty,Iop_Shr8),
3824                          mkexpr(dst0),
3825                          mkexpr(rot_amt)
3826                   ),
3827                   binop( mkSizedOp(ty,Iop_Shl8),
3828                          mkexpr(dst0),
3829                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
3830                   )
3831            )
3832         );
3833         ccOp += AMD64G_CC_OP_RORB;
3834
3835      }
3836
3837      /* dst1 now holds the rotated value.  Build flag thunk.  We
3838         need the resulting value for this, and the previous flags.
3839         Except don't set it if the rotate count is zero. */
3840
3841      assign(oldFlags, mk_amd64g_calculate_rflags_all());
3842
3843      /* rot_amt64 :: Ity_I8.  We need to convert it to I1. */
3844      IRTemp rot_amt64b = newTemp(Ity_I1);
3845      assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );
3846
3847      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
3848      stmt( IRStmt_Put( OFFB_CC_OP,
3849                        IRExpr_ITE( mkexpr(rot_amt64b),
3850                                    mkU64(ccOp),
3851                                    IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
3852      stmt( IRStmt_Put( OFFB_CC_DEP1,
3853                        IRExpr_ITE( mkexpr(rot_amt64b),
3854                                    widenUto64(mkexpr(dst1)),
3855                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
3856      stmt( IRStmt_Put( OFFB_CC_DEP2,
3857                        IRExpr_ITE( mkexpr(rot_amt64b),
3858                                    mkU64(0),
3859                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
3860      stmt( IRStmt_Put( OFFB_CC_NDEP,
3861                        IRExpr_ITE( mkexpr(rot_amt64b),
3862                                    mkexpr(oldFlags),
3863                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
3864   } /* if (isRotate) */
3865
3866   /* Save result, and finish up. */
3867   if (epartIsReg(modrm)) {
3868      putIRegE(sz, pfx, modrm, mkexpr(dst1));
3869      if (vex_traceflags & VEX_TRACE_FE) {
3870         vex_printf("%s%c ",
3871                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
3872         if (shift_expr_txt)
3873            vex_printf("%s", shift_expr_txt);
3874         else
3875            ppIRExpr(shift_expr);
3876         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
3877      }
3878   } else {
3879      storeLE(mkexpr(addr), mkexpr(dst1));
3880      if (vex_traceflags & VEX_TRACE_FE) {
3881         vex_printf("%s%c ",
3882                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
3883         if (shift_expr_txt)
3884            vex_printf("%s", shift_expr_txt);
3885         else
3886            ppIRExpr(shift_expr);
3887         vex_printf(", %s\n", dis_buf);
3888      }
3889   }
3890   return delta;
3891}
3892
3893
3894/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
3895static
3896ULong dis_Grp8_Imm ( const VexAbiInfo* vbi,
3897                     Prefix pfx,
3898                     Long delta, UChar modrm,
3899                     Int am_sz, Int sz, ULong src_val,
3900                     Bool* decode_OK )
3901{
3902   /* src_val denotes a d8.
3903      And delta on entry points at the modrm byte. */
3904
3905   IRType ty     = szToITy(sz);
3906   IRTemp t2     = newTemp(Ity_I64);
3907   IRTemp t2m    = newTemp(Ity_I64);
3908   IRTemp t_addr = IRTemp_INVALID;
3909   HChar  dis_buf[50];
3910   ULong  mask;
3911
3912   /* we're optimists :-) */
3913   *decode_OK = True;
3914
3915   /* Check whether F2 or F3 are acceptable. */
3916   if (epartIsReg(modrm)) {
3917      /* F2 or F3 are not allowed in the register case. */
3918      if (haveF2orF3(pfx)) {
3919         *decode_OK = False;
3920         return delta;
3921      }
3922   } else {
3923      /* F2 or F3 (but not both) are allowable provided LOCK is also
3924         present. */
3925      if (haveF2orF3(pfx)) {
3926         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
3927            *decode_OK = False;
3928            return delta;
3929         }
3930      }
3931   }
3932
3933   /* Limit src_val -- the bit offset -- to something within a word.
3934      The Intel docs say that literal offsets larger than a word are
3935      masked in this way. */
3936   switch (sz) {
3937      case 2:  src_val &= 15; break;
3938      case 4:  src_val &= 31; break;
3939      case 8:  src_val &= 63; break;
3940      default: *decode_OK = False; return delta;
3941   }
3942
3943   /* Invent a mask suitable for the operation. */
3944   switch (gregLO3ofRM(modrm)) {
3945      case 4: /* BT */  mask = 0;                  break;
3946      case 5: /* BTS */ mask = 1ULL << src_val;    break;
3947      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
3948      case 7: /* BTC */ mask = 1ULL << src_val;    break;
3949         /* If this needs to be extended, probably simplest to make a
3950            new function to handle the other cases (0 .. 3).  The
3951            Intel docs, however, do not indicate any use for 0 .. 3, so
3952            we don't expect this to happen. */
3953      default: *decode_OK = False; return delta;
3954   }
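   /* Example (illustrative): "btsq $70, E" masks the offset to
      70 & 63 = 6 and builds mask = 1 << 6 = 0x40; a BTR of the same
      bit would instead use ~0x40, so the AND further down clears
      exactly that one bit. */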
3955
3956   /* Fetch the value to be tested and modified into t2, which is
3957      64-bits wide regardless of sz. */
3958   if (epartIsReg(modrm)) {
3959      vassert(am_sz == 1);
3960      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
3961      delta += (am_sz + 1);
3962      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
3963                                nameISize(sz),
3964                                src_val, nameIRegE(sz,pfx,modrm));
3965   } else {
3966      Int len;
3967      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
3968      delta  += (len+1);
3969      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
3970      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
3971                                nameISize(sz),
3972                                src_val, dis_buf);
3973   }
3974
3975   /* Compute the new value into t2m, if non-BT. */
3976   switch (gregLO3ofRM(modrm)) {
3977      case 4: /* BT */
3978         break;
3979      case 5: /* BTS */
3980         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
3981         break;
3982      case 6: /* BTR */
3983         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
3984         break;
3985      case 7: /* BTC */
3986         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
3987         break;
3988      default:
3989         /*NOTREACHED*/ /*the previous switch guards this*/
3990         vassert(0);
3991   }
3992
3993   /* Write the result back, if non-BT. */
3994   if (gregLO3ofRM(modrm) != 4 /* BT */) {
3995      if (epartIsReg(modrm)) {
3996         putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
3997      } else {
3998         if (haveLOCK(pfx)) {
3999            casLE( mkexpr(t_addr),
4000                   narrowTo(ty, mkexpr(t2))/*expd*/,
4001                   narrowTo(ty, mkexpr(t2m))/*new*/,
4002                   guest_RIP_curr_instr );
4003         } else {
4004            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
4005         }
4006      }
4007   }
4008
4009   /* Copy relevant bit from t2 into the carry flag. */
4010   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
4011   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
4012   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
4013   stmt( IRStmt_Put(
4014            OFFB_CC_DEP1,
4015            binop(Iop_And64,
4016                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
4017                  mkU64(1))
4018       ));
4019   /* Set NDEP even though it isn't used.  This makes redundant-PUT
4020      elimination of previous stores to this field work better. */
4021   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
4022
4023   return delta;
4024}
4025
4026
4027/* Signed/unsigned widening multiply.  Generate IR to multiply the
4028   value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
4029   RDX:RAX/EDX:EAX/DX:AX/AX.
4030*/
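/* Example (illustrative values): an 8-bit "mulb" with AL = 0x80 and
   an operand of 0x02 produces the 16-bit product 0x0100, which lands
   whole in AX -- the one case below where the result is not split
   across a register pair. */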
4031static void codegen_mulL_A_D ( Int sz, Bool syned,
4032                               IRTemp tmp, const HChar* tmp_txt )
4033{
4034   IRType ty = szToITy(sz);
4035   IRTemp t1 = newTemp(ty);
4036
4037   assign( t1, getIRegRAX(sz) );
4038
4039   switch (ty) {
4040      case Ity_I64: {
4041         IRTemp res128  = newTemp(Ity_I128);
4042         IRTemp resHi   = newTemp(Ity_I64);
4043         IRTemp resLo   = newTemp(Ity_I64);
4044         IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
4045         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
4046         setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
4047         assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
4048         assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
4049         assign( resLo, unop(Iop_128to64,mkexpr(res128)));
4050         putIReg64(R_RDX, mkexpr(resHi));
4051         putIReg64(R_RAX, mkexpr(resLo));
4052         break;
4053      }
4054      case Ity_I32: {
4055         IRTemp res64   = newTemp(Ity_I64);
4056         IRTemp resHi   = newTemp(Ity_I32);
4057         IRTemp resLo   = newTemp(Ity_I32);
4058         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
4059         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
4060         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
4061         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
4062         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
4063         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
4064         putIRegRDX(4, mkexpr(resHi));
4065         putIRegRAX(4, mkexpr(resLo));
4066         break;
4067      }
4068      case Ity_I16: {
4069         IRTemp res32   = newTemp(Ity_I32);
4070         IRTemp resHi   = newTemp(Ity_I16);
4071         IRTemp resLo   = newTemp(Ity_I16);
4072         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
4073         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
4074         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
4075         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
4076         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
4077         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
4078         putIRegRDX(2, mkexpr(resHi));
4079         putIRegRAX(2, mkexpr(resLo));
4080         break;
4081      }
4082      case Ity_I8: {
4083         IRTemp res16   = newTemp(Ity_I16);
4084         IRTemp resHi   = newTemp(Ity_I8);
4085         IRTemp resLo   = newTemp(Ity_I8);
4086         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
4087         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
4088         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
4089         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
4090         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
4091         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
4092         putIRegRAX(2, mkexpr(res16));
4093         break;
4094      }
4095      default:
4096         ppIRType(ty);
4097         vpanic("codegen_mulL_A_D(amd64)");
4098   }
4099   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
4100}
4101
4102
4103/* Group 3 extended opcodes.  We have to decide here whether F2 and F3
4104   might be valid. */
4105static
4106ULong dis_Grp3 ( const VexAbiInfo* vbi,
4107                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
4108{
4109   Long    d64;
4110   UChar   modrm;
4111   HChar   dis_buf[50];
4112   Int     len;
4113   IRTemp  addr;
4114   IRType  ty = szToITy(sz);
4115   IRTemp  t1 = newTemp(ty);
4116   IRTemp dst1, src, dst0;
4117   *decode_OK = True;
4118   modrm = getUChar(delta);
4119   if (epartIsReg(modrm)) {
4120      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
4121      if (haveF2orF3(pfx)) goto unhandled;
4122      switch (gregLO3ofRM(modrm)) {
4123         case 0: { /* TEST */
4124            delta++;
4125            d64 = getSDisp(imin(4,sz), delta);
4126            delta += imin(4,sz);
4127            dst1 = newTemp(ty);
4128            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
4129                               getIRegE(sz,pfx,modrm),
4130                               mkU(ty, d64 & mkSizeMask(sz))));
4131            setFlags_DEP1( Iop_And8, dst1, ty );
4132            DIP("test%c $%lld, %s\n",
4133                nameISize(sz), d64,
4134                nameIRegE(sz, pfx, modrm));
4135            break;
4136         }
4137         case 1:
4138            *decode_OK = False;
4139            return delta;
4140         case 2: /* NOT */
4141            delta++;
4142            putIRegE(sz, pfx, modrm,
4143                              unop(mkSizedOp(ty,Iop_Not8),
4144                                   getIRegE(sz, pfx, modrm)));
4145            DIP("not%c %s\n", nameISize(sz),
4146                              nameIRegE(sz, pfx, modrm));
4147            break;
4148         case 3: /* NEG */
4149            delta++;
4150            dst0 = newTemp(ty);
4151            src  = newTemp(ty);
4152            dst1 = newTemp(ty);
4153            assign(dst0, mkU(ty,0));
4154            assign(src,  getIRegE(sz, pfx, modrm));
4155            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
4156                                                       mkexpr(src)));
4157            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
4158            putIRegE(sz, pfx, modrm, mkexpr(dst1));
4159            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
4160            break;
4161         case 4: /* MUL (unsigned widening) */
4162            delta++;
4163            src = newTemp(ty);
4164            assign(src, getIRegE(sz,pfx,modrm));
4165            codegen_mulL_A_D ( sz, False, src,
4166                               nameIRegE(sz,pfx,modrm) );
4167            break;
4168         case 5: /* IMUL (signed widening) */
4169            delta++;
4170            src = newTemp(ty);
4171            assign(src, getIRegE(sz,pfx,modrm));
4172            codegen_mulL_A_D ( sz, True, src,
4173                               nameIRegE(sz,pfx,modrm) );
4174            break;
4175         case 6: /* DIV */
4176            delta++;
4177            assign( t1, getIRegE(sz, pfx, modrm) );
4178            codegen_div ( sz, t1, False );
4179            DIP("div%c %s\n", nameISize(sz),
4180                              nameIRegE(sz, pfx, modrm));
4181            break;
4182         case 7: /* IDIV */
4183            delta++;
4184            assign( t1, getIRegE(sz, pfx, modrm) );
4185            codegen_div ( sz, t1, True );
4186            DIP("idiv%c %s\n", nameISize(sz),
4187                               nameIRegE(sz, pfx, modrm));
4188            break;
4189         default:
4190            /*NOTREACHED*/
4191            vpanic("Grp3(amd64,R)");
4192      }
4193   } else {
4194      /* Decide if F2/XACQ or F3/XREL might be valid. */
4195      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
4196      if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
4197          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
4198         validF2orF3 = True;
4199      }
4200      if (!validF2orF3) goto unhandled;
4201      /* */
4202      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
4203                        /* we have to inform disAMode of any immediate
4204                           bytes used */
4205                        gregLO3ofRM(modrm)==0/*TEST*/
4206                           ? imin(4,sz)
4207                           : 0
4208                      );
4209      t1   = newTemp(ty);
4210      delta += len;
4211      assign(t1, loadLE(ty,mkexpr(addr)));
4212      switch (gregLO3ofRM(modrm)) {
4213         case 0: { /* TEST */
4214            d64 = getSDisp(imin(4,sz), delta);
4215            delta += imin(4,sz);
4216            dst1 = newTemp(ty);
4217            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
4218                               mkexpr(t1),
4219                               mkU(ty, d64 & mkSizeMask(sz))));
4220            setFlags_DEP1( Iop_And8, dst1, ty );
4221            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
4222            break;
4223         }
4224         case 1:
4225            *decode_OK = False;
4226            return delta;
4227         case 2: /* NOT */
4228            dst1 = newTemp(ty);
4229            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
4230            if (haveLOCK(pfx)) {
4231               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
4232                                    guest_RIP_curr_instr );
4233            } else {
4234               storeLE( mkexpr(addr), mkexpr(dst1) );
4235            }
4236            DIP("not%c %s\n", nameISize(sz), dis_buf);
4237            break;
4238         case 3: /* NEG */
4239            dst0 = newTemp(ty);
4240            src  = newTemp(ty);
4241            dst1 = newTemp(ty);
4242            assign(dst0, mkU(ty,0));
4243            assign(src,  mkexpr(t1));
4244            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
4245                                                       mkexpr(src)));
4246            if (haveLOCK(pfx)) {
4247               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
4248                                    guest_RIP_curr_instr );
4249            } else {
4250               storeLE( mkexpr(addr), mkexpr(dst1) );
4251            }
4252            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
4253            DIP("neg%c %s\n", nameISize(sz), dis_buf);
4254            break;
4255         case 4: /* MUL (unsigned widening) */
4256            codegen_mulL_A_D ( sz, False, t1, dis_buf );
4257            break;
4258         case 5: /* IMUL */
4259            codegen_mulL_A_D ( sz, True, t1, dis_buf );
4260            break;
4261         case 6: /* DIV */
4262            codegen_div ( sz, t1, False );
4263            DIP("div%c %s\n", nameISize(sz), dis_buf);
4264            break;
4265         case 7: /* IDIV */
4266            codegen_div ( sz, t1, True );
4267            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
4268            break;
4269         default:
4270            /*NOTREACHED*/
4271            vpanic("Grp3(amd64,M)");
4272      }
4273   }
4274   return delta;
4275  unhandled:
4276   *decode_OK = False;
4277   return delta;
4278}
4279
4280
4281/* Group 4 extended opcodes.  We have to decide here whether F2 and F3
4282   might be valid. */
4283static
4284ULong dis_Grp4 ( const VexAbiInfo* vbi,
4285                 Prefix pfx, Long delta, Bool* decode_OK )
4286{
4287   Int   alen;
4288   UChar modrm;
4289   HChar dis_buf[50];
4290   IRType ty = Ity_I8;
4291   IRTemp t1 = newTemp(ty);
4292   IRTemp t2 = newTemp(ty);
4293
4294   *decode_OK = True;
4295
4296   modrm = getUChar(delta);
4297   if (epartIsReg(modrm)) {
4298      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
4299      if (haveF2orF3(pfx)) goto unhandled;
4300      assign(t1, getIRegE(1, pfx, modrm));
4301      switch (gregLO3ofRM(modrm)) {
4302         case 0: /* INC */
4303            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
4304            putIRegE(1, pfx, modrm, mkexpr(t2));
4305            setFlags_INC_DEC( True, t2, ty );
4306            break;
4307         case 1: /* DEC */
4308            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
4309            putIRegE(1, pfx, modrm, mkexpr(t2));
4310            setFlags_INC_DEC( False, t2, ty );
4311            break;
4312         default:
4313            *decode_OK = False;
4314            return delta;
4315      }
4316      delta++;
4317      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
4318                      nameIRegE(1, pfx, modrm));
4319   } else {
4320      /* Decide if F2/XACQ or F3/XREL might be valid. */
4321      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
4322      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
4323          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
4324         validF2orF3 = True;
4325      }
4326      if (!validF2orF3) goto unhandled;
4327      /* */
4328      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
4329      assign( t1, loadLE(ty, mkexpr(addr)) );
4330      switch (gregLO3ofRM(modrm)) {
4331         case 0: /* INC */
4332            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
4333            if (haveLOCK(pfx)) {
4334               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
4335                      guest_RIP_curr_instr );
4336            } else {
4337               storeLE( mkexpr(addr), mkexpr(t2) );
4338            }
4339            setFlags_INC_DEC( True, t2, ty );
4340            break;
4341         case 1: /* DEC */
4342            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
4343            if (haveLOCK(pfx)) {
4344               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
4345                      guest_RIP_curr_instr );
4346            } else {
4347               storeLE( mkexpr(addr), mkexpr(t2) );
4348            }
4349            setFlags_INC_DEC( False, t2, ty );
4350            break;
4351         default:
4352            *decode_OK = False;
4353            return delta;
4354      }
4355      delta += alen;
4356      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
4357   }
4358   return delta;
4359  unhandled:
4360   *decode_OK = False;
4361   return delta;
4362}
4363
4364
4365/* Group 5 extended opcodes.  We have to decide here whether F2 and F3
4366   might be valid. */
4367static
4368ULong dis_Grp5 ( const VexAbiInfo* vbi,
4369                 Prefix pfx, Int sz, Long delta,
4370                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
4371{
4372   Int     len;
4373   UChar   modrm;
4374   HChar   dis_buf[50];
4375   IRTemp  addr = IRTemp_INVALID;
4376   IRType  ty = szToITy(sz);
4377   IRTemp  t1 = newTemp(ty);
4378   IRTemp  t2 = IRTemp_INVALID;
4379   IRTemp  t3 = IRTemp_INVALID;
4380   Bool    showSz = True;
4381
4382   *decode_OK = True;
4383
4384   modrm = getUChar(delta);
4385   if (epartIsReg(modrm)) {
4386      /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
4387         F2/CALL and F2/JMP may have bnd prefix. */
4388     if (haveF2orF3(pfx)
4389         && ! (haveF2(pfx)
4390               && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
4391        goto unhandledR;
4392      assign(t1, getIRegE(sz,pfx,modrm));
4393      switch (gregLO3ofRM(modrm)) {
4394         case 0: /* INC */
4395            t2 = newTemp(ty);
4396            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
4397                             mkexpr(t1), mkU(ty,1)));
4398            setFlags_INC_DEC( True, t2, ty );
4399            putIRegE(sz,pfx,modrm, mkexpr(t2));
4400            break;
4401         case 1: /* DEC */
4402            t2 = newTemp(ty);
4403            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
4404                             mkexpr(t1), mkU(ty,1)));
4405            setFlags_INC_DEC( False, t2, ty );
4406            putIRegE(sz,pfx,modrm, mkexpr(t2));
4407            break;
4408         case 2: /* call Ev */
4409            /* Ignore any sz value and operate as if sz==8. */
4410            if (!(sz == 4 || sz == 8)) goto unhandledR;
4411            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
4412            sz = 8;
4413            t3 = newTemp(Ity_I64);
4414            assign(t3, getIRegE(sz,pfx,modrm));
4415            t2 = newTemp(Ity_I64);
4416            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
4417            putIReg64(R_RSP, mkexpr(t2));
4418            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
4419            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
4420            jmp_treg(dres, Ijk_Call, t3);
4421            vassert(dres->whatNext == Dis_StopHere);
4422            showSz = False;
4423            break;
4424         case 4: /* jmp Ev */
4425            /* Ignore any sz value and operate as if sz==8. */
4426            if (!(sz == 4 || sz == 8)) goto unhandledR;
4427            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
4428            sz = 8;
4429            t3 = newTemp(Ity_I64);
4430            assign(t3, getIRegE(sz,pfx,modrm));
4431            jmp_treg(dres, Ijk_Boring, t3);
4432            vassert(dres->whatNext == Dis_StopHere);
4433            showSz = False;
4434            break;
4435         case 6: /* PUSH Ev */
4436            /* There is no encoding for 32-bit operand size; hence ... */
4437            if (sz == 4) sz = 8;
4438            if (sz == 8 || sz == 2) {
4439               ty = szToITy(sz); /* redo it, since sz might have changed */
4440               t3 = newTemp(ty);
4441               assign(t3, getIRegE(sz,pfx,modrm));
4442               t2 = newTemp(Ity_I64);
4443               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
4444               putIReg64(R_RSP, mkexpr(t2) );
4445               storeLE( mkexpr(t2), mkexpr(t3) );
4446               break;
4447            } else {
4448               goto unhandledR; /* awaiting test case */
4449            }
4450         default:
4451         unhandledR:
4452            *decode_OK = False;
4453            return delta;
4454      }
4455      delta++;
4456      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
4457                       showSz ? nameISize(sz) : ' ',
4458                       nameIRegE(sz, pfx, modrm));
4459   } else {
4460      /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
4461      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
4462      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
4463          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
4464         validF2orF3 = True;
4465      } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
4466                 && (haveF2(pfx) && !haveF3(pfx))) {
4467         validF2orF3 = True;
4468      }
4469      if (!validF2orF3) goto unhandledM;
4470      /* */
4471      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
4472      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
4473                                  && gregLO3ofRM(modrm) != 6) {
4474         assign(t1, loadLE(ty,mkexpr(addr)));
4475      }
4476      switch (gregLO3ofRM(modrm)) {
4477         case 0: /* INC */
4478            t2 = newTemp(ty);
4479            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
4480                             mkexpr(t1), mkU(ty,1)));
4481            if (haveLOCK(pfx)) {
4482               casLE( mkexpr(addr),
4483                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
4484            } else {
4485               storeLE(mkexpr(addr),mkexpr(t2));
4486            }
4487            setFlags_INC_DEC( True, t2, ty );
4488            break;
4489         case 1: /* DEC */
4490            t2 = newTemp(ty);
4491            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
4492                             mkexpr(t1), mkU(ty,1)));
4493            if (haveLOCK(pfx)) {
4494               casLE( mkexpr(addr),
4495                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
4496            } else {
4497               storeLE(mkexpr(addr),mkexpr(t2));
4498            }
4499            setFlags_INC_DEC( False, t2, ty );
4500            break;
4501         case 2: /* call Ev */
4502            /* Ignore any sz value and operate as if sz==8. */
4503            if (!(sz == 4 || sz == 8)) goto unhandledM;
4504            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
4505            sz = 8;
4506            t3 = newTemp(Ity_I64);
4507            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
4508            t2 = newTemp(Ity_I64);
4509            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
4510            putIReg64(R_RSP, mkexpr(t2));
4511            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
4512            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
4513            jmp_treg(dres, Ijk_Call, t3);
4514            vassert(dres->whatNext == Dis_StopHere);
4515            showSz = False;
4516            break;
4517         case 4: /* JMP Ev */
4518            /* Ignore any sz value and operate as if sz==8. */
4519            if (!(sz == 4 || sz == 8)) goto unhandledM;
4520            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
4521            sz = 8;
4522            t3 = newTemp(Ity_I64);
4523            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
4524            jmp_treg(dres, Ijk_Boring, t3);
4525            vassert(dres->whatNext == Dis_StopHere);
4526            showSz = False;
4527            break;
4528         case 6: /* PUSH Ev */
4529            /* There is no encoding for 32-bit operand size; hence ... */
4530            if (sz == 4) sz = 8;
4531            if (sz == 8 || sz == 2) {
4532               ty = szToITy(sz); /* redo it, since sz might have changed */
4533               t3 = newTemp(ty);
4534               assign(t3, loadLE(ty,mkexpr(addr)));
4535               t2 = newTemp(Ity_I64);
4536               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
4537               putIReg64(R_RSP, mkexpr(t2) );
4538               storeLE( mkexpr(t2), mkexpr(t3) );
4539               break;
4540            } else {
4541               goto unhandledM; /* awaiting test case */
4542            }
4543         default:
4544         unhandledM:
4545            *decode_OK = False;
4546            return delta;
4547      }
4548      delta += len;
4549      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
4550                       showSz ? nameISize(sz) : ' ',
4551                       dis_buf);
4552   }
4553   return delta;
4554}
4555
4556
4557/*------------------------------------------------------------*/
4558/*--- Disassembling string ops (including REP prefixes)    ---*/
4559/*------------------------------------------------------------*/
4560
4561/* Code shared by all the string ops */
4562static
4563void dis_string_op_increment ( Int sz, IRTemp t_inc )
4564{
4565   UChar logSz;
4566   if (sz == 8 || sz == 4 || sz == 2) {
4567      logSz = 1;
4568      if (sz == 4) logSz = 2;
4569      if (sz == 8) logSz = 3;
4570      assign( t_inc,
4571              binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
4572                               mkU8(logSz) ) );
4573   } else {
4574      assign( t_inc,
4575              IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
4576   }
4577}
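
/* The guest DFLAG field holds +1 or -1 rather than the raw EFLAGS.DF
   bit, so the shift above yields the signed step directly.  For
   example (illustrative): a dword-sized op with DF set sees
   DFLAG = -1 and logSz = 2, and -1 << 2 == -4, stepping RSI/RDI
   downwards by one element. */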
4578
4579static
4580void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
4581                    Int sz, const HChar* name, Prefix pfx )
4582{
4583   IRTemp t_inc = newTemp(Ity_I64);
4584   /* Really we ought to inspect the override prefixes, but we don't.
4585      The following assertion catches any resulting silliness. */
4586   vassert(pfx == clearSegBits(pfx));
4587   dis_string_op_increment(sz, t_inc);
4588   dis_OP( sz, t_inc, pfx );
4589   DIP("%s%c\n", name, nameISize(sz));
4590}
4591
4592static
4593void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
4594{
4595   IRType ty = szToITy(sz);
4596   IRTemp td = newTemp(Ity_I64);   /* RDI */
4597   IRTemp ts = newTemp(Ity_I64);   /* RSI */
4598   IRExpr *incd, *incs;
4599
4600   if (haveASO(pfx)) {
4601      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4602      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4603   } else {
4604      assign( td, getIReg64(R_RDI) );
4605      assign( ts, getIReg64(R_RSI) );
4606   }
4607
4608   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
4609
4610   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4611   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4612   if (haveASO(pfx)) {
4613      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4614      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4615   }
4616   putIReg64( R_RDI, incd );
4617   putIReg64( R_RSI, incs );
4618}
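
/* A note on the ASO case above (our reading of the intended
   semantics, not stated in the code): the increments are computed at
   64 bits and then truncated to 32 and zero-extended again, so
   ESI/EDI wrap at 2^32 rather than carrying into the upper halves,
   matching the 32-bit address arithmetic that the 0x67 prefix
   requests.  The same pattern recurs in dis_LODS, dis_STOS, dis_CMPS
   and dis_SCAS below. */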
4619
4620static
4621void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
4622{
4623   IRType ty = szToITy(sz);
4624   IRTemp ts = newTemp(Ity_I64);   /* RSI */
4625   IRExpr *incs;
4626
4627   if (haveASO(pfx))
4628      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4629   else
4630      assign( ts, getIReg64(R_RSI) );
4631
4632   putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
4633
4634   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4635   if (haveASO(pfx))
4636      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4637   putIReg64( R_RSI, incs );
4638}
4639
4640static
4641void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
4642{
4643   IRType ty = szToITy(sz);
4644   IRTemp ta = newTemp(ty);        /* rAX */
4645   IRTemp td = newTemp(Ity_I64);   /* RDI */
4646   IRExpr *incd;
4647
4648   assign( ta, getIRegRAX(sz) );
4649
4650   if (haveASO(pfx))
4651      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4652   else
4653      assign( td, getIReg64(R_RDI) );
4654
4655   storeLE( mkexpr(td), mkexpr(ta) );
4656
4657   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4658   if (haveASO(pfx))
4659      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4660   putIReg64( R_RDI, incd );
4661}
4662
4663static
4664void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
4665{
4666   IRType ty  = szToITy(sz);
4667   IRTemp tdv = newTemp(ty);      /* (RDI) */
4668   IRTemp tsv = newTemp(ty);      /* (RSI) */
4669   IRTemp td  = newTemp(Ity_I64); /*  RDI  */
4670   IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
4671   IRExpr *incd, *incs;
4672
4673   if (haveASO(pfx)) {
4674      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4675      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4676   } else {
4677      assign( td, getIReg64(R_RDI) );
4678      assign( ts, getIReg64(R_RSI) );
4679   }
4680
4681   assign( tdv, loadLE(ty,mkexpr(td)) );
4682
4683   assign( tsv, loadLE(ty,mkexpr(ts)) );
4684
4685   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
4686
4687   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4688   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4689   if (haveASO(pfx)) {
4690      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4691      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4692   }
4693   putIReg64( R_RDI, incd );
4694   putIReg64( R_RSI, incs );
4695}
4696
4697static
4698void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
4699{
4700   IRType ty  = szToITy(sz);
4701   IRTemp ta  = newTemp(ty);       /*  rAX  */
4702   IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
4703   IRTemp tdv = newTemp(ty);       /* (RDI) */
4704   IRExpr *incd;
4705
4706   assign( ta, getIRegRAX(sz) );
4707
4708   if (haveASO(pfx))
4709      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4710   else
4711      assign( td, getIReg64(R_RDI) );
4712
4713   assign( tdv, loadLE(ty,mkexpr(td)) );
4714
4715   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
4716
4717   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4718   if (haveASO(pfx))
4719      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4720   putIReg64( R_RDI, incd );
4721}
4722
4723
4724/* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
4725   the insn is the last one in the basic block, and so emit a jump to
4726   the next insn, rather than just falling through. */
4727static
4728void dis_REP_op ( /*MOD*/DisResult* dres,
4729                  AMD64Condcode cond,
4730                  void (*dis_OP)(Int, IRTemp, Prefix),
4731                  Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
4732                  Prefix pfx )
4733{
4734   IRTemp t_inc = newTemp(Ity_I64);
4735   IRTemp tc;
4736   IRExpr* cmp;
4737
4738   /* Really we ought to inspect the override prefixes, but we don't.
4739      The following assertion catches any resulting silliness. */
4740   vassert(pfx == clearSegBits(pfx));
4741
4742   if (haveASO(pfx)) {
4743      tc = newTemp(Ity_I32);  /*  ECX  */
4744      assign( tc, getIReg32(R_RCX) );
4745      cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
4746   } else {
4747      tc = newTemp(Ity_I64);  /*  RCX  */
4748      assign( tc, getIReg64(R_RCX) );
4749      cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
4750   }
4751
4752   stmt( IRStmt_Exit( cmp, Ijk_Boring,
4753                      IRConst_U64(rip_next), OFFB_RIP ) );
4754
4755   if (haveASO(pfx))
4756      putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
4757   else
4758      putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );
4759
4760   dis_string_op_increment(sz, t_inc);
4761   dis_OP (sz, t_inc, pfx);
4762
4763   if (cond == AMD64CondAlways) {
4764      jmp_lit(dres, Ijk_Boring, rip);
4765      vassert(dres->whatNext == Dis_StopHere);
4766   } else {
4767      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
4768                         Ijk_Boring,
4769                         IRConst_U64(rip),
4770                         OFFB_RIP ) );
4771      jmp_lit(dres, Ijk_Boring, rip_next);
4772      vassert(dres->whatNext == Dis_StopHere);
4773   }
4774   DIP("%s%c\n", name, nameISize(sz));
4775}
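
/* Roughly, the IR built above corresponds to this per-iteration
   sketch (illustrative pseudo-code only):

      if (xCX == 0) goto rip_next;        // counter exhausted
      xCX = xCX - 1;
      <do one unit of the string op>;
      if (cond == AMD64CondAlways)        // plain REP
         goto rip;                        // loop again
      else if (condition_holds(%rflags))  // REPE/REPNE
         goto rip;
      else
         goto rip_next;

   where xCX is ECX under an address-size override and RCX otherwise.
   Each iteration is therefore one trip around the translated block,
   re-entered at rip. */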
4776
4777
4778/*------------------------------------------------------------*/
4779/*--- Arithmetic, etc.                                     ---*/
4780/*------------------------------------------------------------*/
4781
4782/* IMUL E, G.  Supplied rip points to the modR/M byte. */
4783static
4784ULong dis_mul_E_G ( const VexAbiInfo* vbi,
4785                    Prefix      pfx,
4786                    Int         size,
4787                    Long        delta0 )
4788{
4789   Int    alen;
4790   HChar  dis_buf[50];
4791   UChar  rm = getUChar(delta0);
4792   IRType ty = szToITy(size);
4793   IRTemp te = newTemp(ty);
4794   IRTemp tg = newTemp(ty);
4795   IRTemp resLo = newTemp(ty);
4796
4797   assign( tg, getIRegG(size, pfx, rm) );
4798   if (epartIsReg(rm)) {
4799      assign( te, getIRegE(size, pfx, rm) );
4800   } else {
4801      IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
4802      assign( te, loadLE(ty,mkexpr(addr)) );
4803   }
4804
4805   setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );
4806
4807   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
4808
4809   putIRegG(size, pfx, rm, mkexpr(resLo) );
4810
4811   if (epartIsReg(rm)) {
4812      DIP("imul%c %s, %s\n", nameISize(size),
4813                             nameIRegE(size,pfx,rm),
4814                             nameIRegG(size,pfx,rm));
4815      return 1+delta0;
4816   } else {
4817      DIP("imul%c %s, %s\n", nameISize(size),
4818                             dis_buf,
4819                             nameIRegG(size,pfx,rm));
4820      return alen+delta0;
4821   }
4822}
4823
4824
4825/* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
4826static
4827ULong dis_imul_I_E_G ( const VexAbiInfo* vbi,
4828                       Prefix      pfx,
4829                       Int         size,
4830                       Long        delta,
4831                       Int         litsize )
4832{
4833   Long   d64;
4834   Int    alen;
4835   HChar  dis_buf[50];
4836   UChar  rm = getUChar(delta);
4837   IRType ty = szToITy(size);
4838   IRTemp te = newTemp(ty);
4839   IRTemp tl = newTemp(ty);
4840   IRTemp resLo = newTemp(ty);
4841
4842   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);
4843
4844   if (epartIsReg(rm)) {
4845      assign(te, getIRegE(size, pfx, rm));
4846      delta++;
4847   } else {
4848      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
4849                                     imin(4,litsize) );
4850      assign(te, loadLE(ty, mkexpr(addr)));
4851      delta += alen;
4852   }
4853   d64 = getSDisp(imin(4,litsize),delta);
4854   delta += imin(4,litsize);
4855
4856   d64 &= mkSizeMask(size);
4857   assign(tl, mkU(ty,d64));
4858
4859   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
4860
4861   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );
4862
4863   putIRegG(size, pfx, rm, mkexpr(resLo));
4864
4865   DIP("imul%c $%lld, %s, %s\n",
4866       nameISize(size), d64,
4867       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
4868       nameIRegG(size,pfx,rm) );
4869   return delta;
4870}
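
/* Illustrative note: even for the 8-byte form, the encoding carries
   at most a 4-byte immediate (hence the imin(4,litsize) above), which
   getSDisp sign-extends.  So, for example, "imulq $-2, (%rax), %rbx"
   fetches the 32-bit immediate 0xFFFFFFFE and widens it to
   0xFFFFFFFFFFFFFFFE before the multiply. */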
4871
4872
4873/* Generate an IR sequence to do a popcount operation on the supplied
4874   IRTemp, and return a new IRTemp holding the result.  'ty' may be
4875   Ity_I16, Ity_I32 or Ity_I64 only. */
4876static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
4877{
4878   Int i;
4879   if (ty == Ity_I16) {
4880      IRTemp old = IRTemp_INVALID;
4881      IRTemp nyu = IRTemp_INVALID;
4882      IRTemp mask[4]; Int shift[4];
4883      for (i = 0; i < 4; i++) {
4884         mask[i]  = newTemp(ty);
4885         shift[i] = 1 << i;
4886      }
4887      assign(mask[0], mkU16(0x5555));
4888      assign(mask[1], mkU16(0x3333));
4889      assign(mask[2], mkU16(0x0F0F));
4890      assign(mask[3], mkU16(0x00FF));
4891      old = src;
4892      for (i = 0; i < 4; i++) {
4893         nyu = newTemp(ty);
4894         assign(nyu,
4895                binop(Iop_Add16,
4896                      binop(Iop_And16,
4897                            mkexpr(old),
4898                            mkexpr(mask[i])),
4899                      binop(Iop_And16,
4900                            binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
4901                            mkexpr(mask[i]))));
4902         old = nyu;
4903      }
4904      return nyu;
4905   }
4906   if (ty == Ity_I32) {
4907      IRTemp old = IRTemp_INVALID;
4908      IRTemp nyu = IRTemp_INVALID;
4909      IRTemp mask[5]; Int shift[5];
4910      for (i = 0; i < 5; i++) {
4911         mask[i]  = newTemp(ty);
4912         shift[i] = 1 << i;
4913      }
4914      assign(mask[0], mkU32(0x55555555));
4915      assign(mask[1], mkU32(0x33333333));
4916      assign(mask[2], mkU32(0x0F0F0F0F));
4917      assign(mask[3], mkU32(0x00FF00FF));
4918      assign(mask[4], mkU32(0x0000FFFF));
4919      old = src;
4920      for (i = 0; i < 5; i++) {
4921         nyu = newTemp(ty);
4922         assign(nyu,
4923                binop(Iop_Add32,
4924                      binop(Iop_And32,
4925                            mkexpr(old),
4926                            mkexpr(mask[i])),
4927                      binop(Iop_And32,
4928                            binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
4929                            mkexpr(mask[i]))));
4930         old = nyu;
4931      }
4932      return nyu;
4933   }
4934   if (ty == Ity_I64) {
4935      IRTemp old = IRTemp_INVALID;
4936      IRTemp nyu = IRTemp_INVALID;
4937      IRTemp mask[6]; Int shift[6];
4938      for (i = 0; i < 6; i++) {
4939         mask[i]  = newTemp(ty);
4940         shift[i] = 1 << i;
4941      }
4942      assign(mask[0], mkU64(0x5555555555555555ULL));
4943      assign(mask[1], mkU64(0x3333333333333333ULL));
4944      assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
4945      assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
4946      assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
4947      assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
4948      old = src;
4949      for (i = 0; i < 6; i++) {
4950         nyu = newTemp(ty);
4951         assign(nyu,
4952                binop(Iop_Add64,
4953                      binop(Iop_And64,
4954                            mkexpr(old),
4955                            mkexpr(mask[i])),
4956                      binop(Iop_And64,
4957                            binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
4958                            mkexpr(mask[i]))));
4959         old = nyu;
4960      }
4961      return nyu;
4962   }
4963   /*NOTREACHED*/
4964   vassert(0);
4965}
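
/* Worked example for the 16-bit case above (illustrative only), for
   src == 0x000D, which has three bits set:
      pass 0 (mask 0x5555, shift 1): 0x000D -> 0x0009
         -- each 2-bit field now holds the bit count of its pair
      pass 1 (mask 0x3333, shift 2): 0x0009 -> 0x0003
         -- each 4-bit field now holds the count of its nibble
      passes 2 and 3 (masks 0x0F0F, 0x00FF) just propagate the 3
   giving popcount(0x000D) == 3.  The 32- and 64-bit variants are the
   same scheme with one and two extra halving passes respectively. */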
4966
4967
4968/* Generate an IR sequence to do a count-leading-zeroes operation on
4969   the supplied IRTemp, and return a new IRTemp holding the result.
4970   'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
4971   the argument is zero, return the number of bits in the word (the
4972   natural semantics). */
4973static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
4974{
4975   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
4976
4977   IRTemp src64 = newTemp(Ity_I64);
4978   assign(src64, widenUto64( mkexpr(src) ));
4979
4980   IRTemp src64x = newTemp(Ity_I64);
4981   assign(src64x,
4982          binop(Iop_Shl64, mkexpr(src64),
4983                           mkU8(64 - 8 * sizeofIRType(ty))));
4984
4985   // Clz64 has undefined semantics when its input is zero, so
4986   // special-case around that.
4987   IRTemp res64 = newTemp(Ity_I64);
4988   assign(res64,
4989          IRExpr_ITE(
4990             binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
4991             mkU64(8 * sizeofIRType(ty)),
4992             unop(Iop_Clz64, mkexpr(src64x))
4993   ));
4994
4995   IRTemp res = newTemp(ty);
4996   assign(res, narrowTo(ty, mkexpr(res64)));
4997   return res;
4998}
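
/* Illustrative check: lzcnt16(0x0010).  Widening and shifting left by
   64 - 16 == 48 puts the 16-bit MSB at bit 63, so
   Clz64(0x0010 << 48) == Clz64(2^52) == 11, which is the correct
   16-bit leading-zero count.  A zero input instead takes the ITE
   branch and yields the operand width, 16. */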
4999
5000
5001/* Generate an IR sequence to do a count-trailing-zeroes operation on
5002   the supplied IRTemp, and return a new IRTemp holding the result.
5003   'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
5004   the argument is zero, return the number of bits in the word (the
5005   natural semantics). */
5006static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
5007{
5008   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);
5009
5010   IRTemp src64 = newTemp(Ity_I64);
5011   assign(src64, widenUto64( mkexpr(src) ));
5012
5013   // Ctz64 has undefined semantics when its input is zero, so
5014   // special-case around that.
5015   IRTemp res64 = newTemp(Ity_I64);
5016   assign(res64,
5017          IRExpr_ITE(
5018             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
5019             mkU64(8 * sizeofIRType(ty)),
5020             unop(Iop_Ctz64, mkexpr(src64))
5021   ));
5022
5023   IRTemp res = newTemp(ty);
5024   assign(res, narrowTo(ty, mkexpr(res64)));
5025   return res;
5026}
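
/* Illustrative check: tzcnt16(0x0008) == Ctz64(0x0008) == 3.  No
   pre-shift is needed here, since zero-extension leaves a nonzero
   value's trailing-zero count unchanged; a zero input takes the ITE
   branch and yields the operand width instead. */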
5027
5028
5029/*------------------------------------------------------------*/
5030/*---                                                      ---*/
5031/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
5032/*---                                                      ---*/
5033/*------------------------------------------------------------*/
5034
5035/* --- Helper functions for dealing with the register stack. --- */
5036
5037/* --- Set the emulation-warning pseudo-register. --- */
5038
5039static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
5040{
5041   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
5042   stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
5043}
5044
5045/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
5046
5047static IRExpr* mkQNaN64 ( void )
5048{
5049  /* QNaN bit layout: sign 0, exponent 2047 (0b11111111111),
5050     mantissa MSB 1, then 51 zero bits
5051     == 0x7FF8 0000 0000 0000
5052   */
5053   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
5054}
5055
5056/* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */
5057
5058static IRExpr* get_ftop ( void )
5059{
5060   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
5061}
5062
5063static void put_ftop ( IRExpr* e )
5064{
5065   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
5066   stmt( IRStmt_Put( OFFB_FTOP, e ) );
5067}
5068
5069/* --------- Get/put the C3210 bits. --------- */
5070
5071static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
5072{
5073   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
5074}
5075
5076static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
5077{
5078   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
5079   stmt( IRStmt_Put( OFFB_FC3210, e ) );
5080}
5081
5082/* --------- Get/put the FPU rounding mode. --------- */
5083static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
5084{
5085   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
5086}
5087
5088static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
5089{
5090   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
5091   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
5092}
5093
5094
5095/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
5096/* Produces a value in 0 .. 3, which is encoded as per the type
5097   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
5098   per IRRoundingMode, we merely need to get it and mask it for
5099   safety.
5100*/
5101static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
5102{
5103   return binop( Iop_And32, get_fpround(), mkU32(3) );
5104}
5105
5106static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
5107{
5108   return mkU32(Irrm_NEAREST);
5109}
5110
5111
5112/* --------- Get/set FP register tag bytes. --------- */
5113
5114/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
5115
5116static void put_ST_TAG ( Int i, IRExpr* value )
5117{
5118   IRRegArray* descr;
5119   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
5120   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5121   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
5122}
5123
5124/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
5125   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
5126
5127static IRExpr* get_ST_TAG ( Int i )
5128{
5129   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5130   return IRExpr_GetI( descr, get_ftop(), i );
5131}
5132
5133
5134/* --------- Get/set FP registers. --------- */
5135
5136/* Given i, and some expression e, emit 'ST(i) = e' and set the
5137   register's tag to indicate the register is full.  The previous
5138   state of the register is not checked. */
5139
5140static void put_ST_UNCHECKED ( Int i, IRExpr* value )
5141{
5142   IRRegArray* descr;
5143   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
5144   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
5145   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
5146   /* Mark the register as in-use. */
5147   put_ST_TAG(i, mkU8(1));
5148}
5149
5150/* Given i, and some expression e, emit
5151      ST(i) = is_full(i) ? NaN : e
5152   and set the tag accordingly.
5153*/
5154
5155static void put_ST ( Int i, IRExpr* value )
5156{
5157   put_ST_UNCHECKED(
5158      i,
5159      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
5160                  /* non-0 means full */
5161                  mkQNaN64(),
5162                  /* 0 means empty */
5163                  value
5164      )
5165   );
5166}
5167
5168
5169/* Given i, generate an expression yielding 'ST(i)'. */
5170
5171static IRExpr* get_ST_UNCHECKED ( Int i )
5172{
5173   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
5174   return IRExpr_GetI( descr, get_ftop(), i );
5175}
5176
5177
5178/* Given i, generate an expression yielding
5179  is_full(i) ? ST(i) : NaN
5180*/
5181
5182static IRExpr* get_ST ( Int i )
5183{
5184   return
5185      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
5186                  /* non-0 means full */
5187                  get_ST_UNCHECKED(i),
5188                  /* 0 means empty */
5189                  mkQNaN64());
5190}
5191
5192
5193/* Given i, and some expression e, and a condition cond, generate IR
5194   which has the same effect as put_ST(i,e) when cond is true and has
5195   no effect when cond is false.  Given the lack of proper
5196   if-then-else in the IR, this is pretty tricky.
5197*/
5198
5199static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
5200{
5201   // new_tag = if cond then FULL else old_tag
5202   // new_val = if cond then (if old_tag==FULL then NaN else val)
5203   //                   else old_val
5204
5205   IRTemp old_tag = newTemp(Ity_I8);
5206   assign(old_tag, get_ST_TAG(i));
5207   IRTemp new_tag = newTemp(Ity_I8);
5208   assign(new_tag,
5209          IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));
5210
5211   IRTemp old_val = newTemp(Ity_F64);
5212   assign(old_val, get_ST_UNCHECKED(i));
5213   IRTemp new_val = newTemp(Ity_F64);
5214   assign(new_val,
5215          IRExpr_ITE(mkexpr(cond),
5216                     IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
5217                                /* non-0 means full */
5218                                mkQNaN64(),
5219                                /* 0 means empty */
5220                                value),
5221                     mkexpr(old_val)));
5222
5223   put_ST_UNCHECKED(i, mkexpr(new_val));
5224   // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
5225   // now set it to new_tag instead.
5226   put_ST_TAG(i, mkexpr(new_tag));
5227}
5228
5229/* Adjust FTOP downwards by one register. */
5230
5231static void fp_push ( void )
5232{
5233   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
5234}
5235
5236/* Adjust FTOP downwards by one register when COND is 1:I1.  Else
5237   don't change it. */
5238
5239static void maybe_fp_push ( IRTemp cond )
5240{
5241   put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
5242}
5243
5244/* Adjust FTOP upwards by one register, and mark the vacated register
5245   as empty.  */
5246
5247static void fp_pop ( void )
5248{
5249   put_ST_TAG(0, mkU8(0));
5250   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
5251}
5252
5253/* Set the C2 bit of the FPU status register to e[0].  Assumes that
5254   e[63:1] == 0.
5255*/
5256static void set_C2 ( IRExpr* e )
5257{
5258   IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
5259   put_C3210( binop(Iop_Or64,
5260                    cleared,
5261                    binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
5262}
5263
5264/* Generate code to check that abs(d64) < 2^63 and that d64 is finite.
5265   This is used to do the range checks for FSIN, FCOS, FSINCOS and
5266   FPTAN.  The test is simple, but the derivation of it is not so simple.
5267
5268   The exponent field for an IEEE754 double is 11 bits.  That means it
5269   can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
5270   the number is either a NaN or an Infinity and so is not finite.
5271   Furthermore, a finite value of exactly 2^63 is the smallest value
5272   that has exponent value 0x43E.  Hence, what we need to do is
5273   extract the exponent, ignoring the sign bit and mantissa, and check
5274   it is < 0x43E, or <= 0x43D.
5275
5276   To make this easily applicable to 32- and 64-bit targets, a
5277   roundabout approach is used.  First the number is converted to I64,
5278   then the top 32 bits are taken.  Shifting them right by 20 bits
5279   places the sign bit and exponent in the bottom 12 bits.  Anding
5280   with 0x7FF gets rid of the sign bit, leaving just the exponent
5281   available for comparison.
5282*/
5283static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
5284{
5285   IRTemp i64 = newTemp(Ity_I64);
5286   assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
5287   IRTemp exponent = newTemp(Ity_I32);
5288   assign(exponent,
5289          binop(Iop_And32,
5290                binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
5291                mkU32(0x7FF)));
5292   IRTemp in_range_and_finite = newTemp(Ity_I1);
5293   assign(in_range_and_finite,
5294          binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
5295   return in_range_and_finite;
5296}
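
/* Illustrative checks of the above: for d64 == 1.0 (bits
   0x3FF0000000000000) the extracted exponent is 0x3FF <= 0x43D, so
   the result is 1 (finite and in range).  For d64 == 2^63 (bits
   0x43E0000000000000) the exponent is 0x43E, and for a NaN or
   infinity it is 0x7FF; both fail the comparison and give 0. */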
5297
5298/* Invent a plausible-looking FPU status word value:
5299      ((ftop & 7) << 11) | (c3210 & 0x4700)
5300 */
5301static IRExpr* get_FPU_sw ( void )
5302{
5303   return
5304      unop(Iop_32to16,
5305           binop(Iop_Or32,
5306                 binop(Iop_Shl32,
5307                       binop(Iop_And32, get_ftop(), mkU32(7)),
5308                       mkU8(11)),
5309                 binop(Iop_And32, unop(Iop_64to32, get_C3210()),
5310                       mkU32(0x4700))
5311      ));
5312}
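
/* Illustrative only, assuming the usual FC3210 layout implied by the
   0x4700 mask (C3 at bit 14, C2/C1/C0 at bits 10/9/8): with FTOP == 5
   and just C3 set, the above produces ((5 & 7) << 11) | 0x4000
   == 0x6800 -- TOP in bits 13:11 and the condition codes in their
   architectural positions. */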
5313
5314
5315/* Generate a dirty helper call that initialises the x87 state a la
5316   FINIT.  If |guard| is NULL, it is done unconditionally.  Otherwise
5317   |guard| is used as a guarding condition.
5318*/
5319static void gen_FINIT_SEQUENCE ( IRExpr* guard )
5320{
5321   /* Uses dirty helper:
5322         void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
5323   IRDirty* d  = unsafeIRDirty_0_N (
5324                    0/*regparms*/,
5325                    "amd64g_dirtyhelper_FINIT",
5326                    &amd64g_dirtyhelper_FINIT,
5327                    mkIRExprVec_1( IRExpr_GSPTR() )
5328                 );
5329
5330   /* declare we're writing guest state */
5331   d->nFxState = 5;
5332   vex_bzero(&d->fxState, sizeof(d->fxState));
5333
5334   d->fxState[0].fx     = Ifx_Write;
5335   d->fxState[0].offset = OFFB_FTOP;
5336   d->fxState[0].size   = sizeof(UInt);
5337
5338   d->fxState[1].fx     = Ifx_Write;
5339   d->fxState[1].offset = OFFB_FPREGS;
5340   d->fxState[1].size   = 8 * sizeof(ULong);
5341
5342   d->fxState[2].fx     = Ifx_Write;
5343   d->fxState[2].offset = OFFB_FPTAGS;
5344   d->fxState[2].size   = 8 * sizeof(UChar);
5345
5346   d->fxState[3].fx     = Ifx_Write;
5347   d->fxState[3].offset = OFFB_FPROUND;
5348   d->fxState[3].size   = sizeof(ULong);
5349
5350   d->fxState[4].fx     = Ifx_Write;
5351   d->fxState[4].offset = OFFB_FC3210;
5352   d->fxState[4].size   = sizeof(ULong);
5353
5354   if (guard)
5355      d->guard = guard;
5356
5357   stmt( IRStmt_Dirty(d) );
5358}
5359
5360
5361/* ------------------------------------------------------- */
5362/* Given all that stack-mangling junk, we can now go ahead
5363   and describe FP instructions.
5364*/
5365
5366/* ST(0) = ST(0) `op` mem64/32(addr)
5367   Need to check ST(0)'s tag on read, but not on write.
5368*/
5369static
5370void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
5371                         IROp op, Bool dbl )
5372{
5373   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
5374   if (dbl) {
5375      put_ST_UNCHECKED(0,
5376         triop( op,
5377                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5378                get_ST(0),
5379                loadLE(Ity_F64,mkexpr(addr))
5380         ));
5381   } else {
5382      put_ST_UNCHECKED(0,
5383         triop( op,
5384                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5385                get_ST(0),
5386                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
5387         ));
5388   }
5389}
5390
5391
5392/* ST(0) = mem64/32(addr) `op` ST(0)
5393   Need to check ST(0)'s tag on read, but not on write.
5394*/
5395static
5396void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
5397                            IROp op, Bool dbl )
5398{
5399   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
5400   if (dbl) {
5401      put_ST_UNCHECKED(0,
5402         triop( op,
5403                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5404                loadLE(Ity_F64,mkexpr(addr)),
5405                get_ST(0)
5406         ));
5407   } else {
5408      put_ST_UNCHECKED(0,
5409         triop( op,
5410                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5411                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
5412                get_ST(0)
5413         ));
5414   }
5415}
5416
5417
5418/* ST(dst) = ST(dst) `op` ST(src).
5419   Check dst and src tags when reading but not on write.
5420*/
5421static
5422void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
5423                      Bool pop_after )
5424{
5425   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
5426   put_ST_UNCHECKED(
5427      st_dst,
5428      triop( op,
5429             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5430             get_ST(st_dst),
5431             get_ST(st_src) )
5432   );
5433   if (pop_after)
5434      fp_pop();
5435}
5436
5437/* ST(dst) = ST(src) `op` ST(dst).
5438   Check dst and src tags when reading but not on write.
5439*/
5440static
5441void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
5442                         Bool pop_after )
5443{
5444   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
5445   put_ST_UNCHECKED(
5446      st_dst,
5447      triop( op,
5448             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5449             get_ST(st_src),
5450             get_ST(st_dst) )
5451   );
5452   if (pop_after)
5453      fp_pop();
5454}
5455
5456/* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
5457static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
5458{
5459   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
5460   /* This is a bit of a hack (and isn't really right).  It sets
5461      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
5462      documentation implies A and S are unchanged.
5463   */
5464   /* It's also fishy in that it is used both for COMIP and
5465      UCOMIP, and they aren't the same (although similar). */
5466   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
5467   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
5468   stmt( IRStmt_Put(
5469            OFFB_CC_DEP1,
5470            binop( Iop_And64,
5471                   unop( Iop_32Uto64,
5472                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
5473                   mkU64(0x45)
5474        )));
5475   if (pop_after)
5476      fp_pop();
5477}
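
/* A note on the 0x45 mask above: Iop_CmpF64 produces 0x00 (GT), 0x01
   (LT), 0x40 (EQ) or 0x45 (unordered) -- values chosen in the IR
   definition to line up with CF (bit 0), PF (bit 2) and ZF (bit 6) of
   %rflags.  Masking with 0x45 == CF|PF|ZF therefore keeps exactly
   those three bits and, in the unordered case, sets all of them, as
   the architecture requires. */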
5478
5479
5480/* returns
5481   32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
5482*/
5483static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
5484{
5485   IRTemp t32 = newTemp(Ity_I32);
5486   assign( t32, e32 );
5487   return
5488      IRExpr_ITE(
5489         binop(Iop_CmpLT64U,
5490               unop(Iop_32Uto64,
5491                    binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
5492               mkU64(65536)),
5493         unop(Iop_32to16, mkexpr(t32)),
5494         mkU16( 0x8000 ) );
5495}
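
/* Illustrative only: adding 32768 maps the representable interval
   [-32768 .. 32767] onto [0 .. 65535], so one unsigned compare
   against 65536 performs both range checks.  E.g. e32 == -5 gives
   32763 < 65536 and narrows to 0xFFFB, while e32 == 40000 gives
   72768 >= 65536 and saturates to 0x8000. */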
5496
5497
5498static
5499ULong dis_FPU ( /*OUT*/Bool* decode_ok,
5500                const VexAbiInfo* vbi, Prefix pfx, Long delta )
5501{
5502   Int    len;
5503   UInt   r_src, r_dst;
5504   HChar  dis_buf[50];
5505   IRTemp t1, t2;
5506
5507   /* On entry, delta points at the second byte of the insn (the modrm
5508      byte).*/
5509   UChar first_opcode = getUChar(delta-1);
5510   UChar modrm        = getUChar(delta+0);
5511
5512   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
5513
5514   if (first_opcode == 0xD8) {
5515      if (modrm < 0xC0) {
5516
5517         /* bits 5,4,3 are an opcode extension, and the modRM also
5518            specifies an address. */
5519         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5520         delta += len;
5521
5522         switch (gregLO3ofRM(modrm)) {
5523
5524            case 0: /* FADD single-real */
5525               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
5526               break;
5527
5528            case 1: /* FMUL single-real */
5529               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
5530               break;
5531
5532            case 2: /* FCOM single-real */
5533               DIP("fcoms %s\n", dis_buf);
5534               /* This forces C1 to zero; the AMD documentation
5535                  suggests that forcing C1 to zero is in fact
5536                  correct (Eliot Moss). */
5537               put_C3210(
5538                   unop( Iop_32Uto64,
5539                       binop( Iop_And32,
5540                              binop(Iop_Shl32,
5541                                    binop(Iop_CmpF64,
5542                                          get_ST(0),
5543                                          unop(Iop_F32toF64,
5544                                               loadLE(Ity_F32,mkexpr(addr)))),
5545                                    mkU8(8)),
5546                              mkU32(0x4500)
5547                   )));
5548               break;
5549
5550            case 3: /* FCOMP single-real */
5551               DIP("fcomps %s\n", dis_buf);
5552               /* This forces C1 to zero; the AMD documentation
5553                  suggests that forcing C1 to zero is in fact
5554                  correct (Eliot Moss). */
5555               put_C3210(
5556                   unop( Iop_32Uto64,
5557                       binop( Iop_And32,
5558                              binop(Iop_Shl32,
5559                                    binop(Iop_CmpF64,
5560                                          get_ST(0),
5561                                          unop(Iop_F32toF64,
5562                                               loadLE(Ity_F32,mkexpr(addr)))),
5563                                    mkU8(8)),
5564                              mkU32(0x4500)
5565                   )));
5566               fp_pop();
5567               break;
5568
5569            case 4: /* FSUB single-real */
5570               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
5571               break;
5572
5573            case 5: /* FSUBR single-real */
5574               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
5575               break;
5576
5577            case 6: /* FDIV single-real */
5578               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
5579               break;
5580
5581            case 7: /* FDIVR single-real */
5582               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
5583               break;
5584
5585            default:
5586               vex_printf("unhandled opc_aux = 0x%02x\n",
5587                          (UInt)gregLO3ofRM(modrm));
5588               vex_printf("first_opcode == 0xD8\n");
5589               goto decode_fail;
5590         }
5591      } else {
5592         delta++;
5593         switch (modrm) {
5594
5595            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
5596               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
5597               break;
5598
5599            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
5600               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
5601               break;
5602
5603            /* Dunno if this is right */
5604            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
5605               r_dst = (UInt)modrm - 0xD0;
5606               DIP("fcom %%st(0),%%st(%u)\n", r_dst);
5607               /* This forces C1 to zero, which isn't right. */
5608               put_C3210(
5609                   unop(Iop_32Uto64,
5610                   binop( Iop_And32,
5611                          binop(Iop_Shl32,
5612                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5613                                mkU8(8)),
5614                          mkU32(0x4500)
5615                   )));
5616               break;
5617
5618            /* Dunno if this is right */
5619            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
5620               r_dst = (UInt)modrm - 0xD8;
5621               DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
5622               /* This forces C1 to zero, which isn't right. */
5623               put_C3210(
5624                   unop(Iop_32Uto64,
5625                   binop( Iop_And32,
5626                          binop(Iop_Shl32,
5627                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5628                                mkU8(8)),
5629                          mkU32(0x4500)
5630                   )));
5631               fp_pop();
5632               break;
5633
5634            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
5635               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
5636               break;
5637
5638            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
5639               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
5640               break;
5641
5642            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
5643               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
5644               break;
5645
5646            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
5647               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
5648               break;
5649
5650            default:
5651               goto decode_fail;
5652         }
5653      }
5654   }
5655
5656   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
5657   else
5658   if (first_opcode == 0xD9) {
5659      if (modrm < 0xC0) {
5660
5661         /* bits 5,4,3 are an opcode extension, and the modRM also
5662            specifies an address. */
5663         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
5664         delta += len;
5665
5666         switch (gregLO3ofRM(modrm)) {
5667
5668            case 0: /* FLD single-real */
5669               DIP("flds %s\n", dis_buf);
5670               fp_push();
5671               put_ST(0, unop(Iop_F32toF64,
5672                              loadLE(Ity_F32, mkexpr(addr))));
5673               break;
5674
5675            case 2: /* FST single-real */
5676               DIP("fsts %s\n", dis_buf);
5677               storeLE(mkexpr(addr),
5678                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
5679               break;
5680
5681            case 3: /* FSTP single-real */
5682               DIP("fstps %s\n", dis_buf);
5683               storeLE(mkexpr(addr),
5684                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
5685               fp_pop();
5686               break;
5687
5688            case 4: { /* FLDENV m28 */
5689               /* Uses dirty helper:
5690                     ULong amd64g_dirtyhelper_FLDENV ( VexGuestAMD64State*, HWord ) */
5691               IRTemp    ew = newTemp(Ity_I32);
5692               IRTemp   w64 = newTemp(Ity_I64);
5693               IRDirty*   d = unsafeIRDirty_0_N (
5694                                 0/*regparms*/,
5695                                 "amd64g_dirtyhelper_FLDENV",
5696                                 &amd64g_dirtyhelper_FLDENV,
5697                                 mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
5698                              );
5699               d->tmp       = w64;
5700               /* declare we're reading memory */
5701               d->mFx   = Ifx_Read;
5702               d->mAddr = mkexpr(addr);
5703               d->mSize = 28;
5704
5705               /* declare we're writing guest state */
5706               d->nFxState = 4;
5707               vex_bzero(&d->fxState, sizeof(d->fxState));
5708
5709               d->fxState[0].fx     = Ifx_Write;
5710               d->fxState[0].offset = OFFB_FTOP;
5711               d->fxState[0].size   = sizeof(UInt);
5712
5713               d->fxState[1].fx     = Ifx_Write;
5714               d->fxState[1].offset = OFFB_FPTAGS;
5715               d->fxState[1].size   = 8 * sizeof(UChar);
5716
5717               d->fxState[2].fx     = Ifx_Write;
5718               d->fxState[2].offset = OFFB_FPROUND;
5719               d->fxState[2].size   = sizeof(ULong);
5720
5721               d->fxState[3].fx     = Ifx_Write;
5722               d->fxState[3].offset = OFFB_FC3210;
5723               d->fxState[3].size   = sizeof(ULong);
5724
5725               stmt( IRStmt_Dirty(d) );
5726
5727               /* ew contains any emulation warning we may need to
5728                  issue.  If needed, side-exit to the next insn,
5729                  reporting the warning, so that Valgrind's dispatcher
5730                  sees the warning. */
5731               assign(ew, unop(Iop_64to32,mkexpr(w64)) );
5732               put_emwarn( mkexpr(ew) );
5733               stmt(
5734                  IRStmt_Exit(
5735                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
5736                     Ijk_EmWarn,
5737                     IRConst_U64( guest_RIP_bbstart+delta ),
5738                     OFFB_RIP
5739                  )
5740               );
5741
5742               DIP("fldenv %s\n", dis_buf);
5743               break;
5744            }
5745
5746            case 5: {/* FLDCW */
5747               /* The only thing we observe in the control word is the
5748                  rounding mode.  Therefore, pass the 16-bit value
5749                  (x87 native-format control word) to a clean helper,
5750                  getting back a 64-bit value, the lower half of which
5751                  is the FPROUND value to store, and the upper half of
5752                  which is the emulation-warning token which may be
5753                  generated.
5754               */
5755               /* ULong amd64g_check_fldcw ( ULong ); */
5756               IRTemp t64 = newTemp(Ity_I64);
5757               IRTemp ew = newTemp(Ity_I32);
5758               DIP("fldcw %s\n", dis_buf);
5759               assign( t64, mkIRExprCCall(
5760                               Ity_I64, 0/*regparms*/,
5761                               "amd64g_check_fldcw",
5762                               &amd64g_check_fldcw,
5763                               mkIRExprVec_1(
5764                                  unop( Iop_16Uto64,
5765                                        loadLE(Ity_I16, mkexpr(addr)))
5766                               )
5767                            )
5768                     );
5769
5770               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
5771               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
5772               put_emwarn( mkexpr(ew) );
5773               /* Finally, if an emulation warning was reported,
5774                  side-exit to the next insn, reporting the warning,
5775                  so that Valgrind's dispatcher sees the warning. */
5776               stmt(
5777                  IRStmt_Exit(
5778                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
5779                     Ijk_EmWarn,
5780                     IRConst_U64( guest_RIP_bbstart+delta ),
5781                     OFFB_RIP
5782                  )
5783               );
5784               break;
5785            }
5786
5787            case 6: { /* FNSTENV m28 */
5788               /* Uses dirty helper:
5789                     void amd64g_dirtyhelper_FSTENV ( VexGuestAMD64State*, HWord ) */
5790               IRDirty* d = unsafeIRDirty_0_N (
5791                               0/*regparms*/,
5792                               "amd64g_dirtyhelper_FSTENV",
5793                               &amd64g_dirtyhelper_FSTENV,
5794                               mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
5795                            );
5796               /* declare we're writing memory */
5797               d->mFx   = Ifx_Write;
5798               d->mAddr = mkexpr(addr);
5799               d->mSize = 28;
5800
5801               /* declare we're reading guest state */
5802               d->nFxState = 4;
5803               vex_bzero(&d->fxState, sizeof(d->fxState));
5804
5805               d->fxState[0].fx     = Ifx_Read;
5806               d->fxState[0].offset = OFFB_FTOP;
5807               d->fxState[0].size   = sizeof(UInt);
5808
5809               d->fxState[1].fx     = Ifx_Read;
5810               d->fxState[1].offset = OFFB_FPTAGS;
5811               d->fxState[1].size   = 8 * sizeof(UChar);
5812
5813               d->fxState[2].fx     = Ifx_Read;
5814               d->fxState[2].offset = OFFB_FPROUND;
5815               d->fxState[2].size   = sizeof(ULong);
5816
5817               d->fxState[3].fx     = Ifx_Read;
5818               d->fxState[3].offset = OFFB_FC3210;
5819               d->fxState[3].size   = sizeof(ULong);
5820
5821               stmt( IRStmt_Dirty(d) );
5822
5823               DIP("fnstenv %s\n", dis_buf);
5824               break;
5825            }
5826
5827            case 7: /* FNSTCW */
5828               /* Fake up a native x87 FPU control word.  The only
5829                  thing it depends on is FPROUND[1:0], so call a clean
5830                  helper to cook it up. */
5831               /* ULong amd64g_create_fpucw ( ULong fpround ) */
5832               DIP("fnstcw %s\n", dis_buf);
5833               storeLE(
5834                  mkexpr(addr),
5835                  unop( Iop_64to16,
5836                        mkIRExprCCall(
5837                           Ity_I64, 0/*regp*/,
5838                           "amd64g_create_fpucw", &amd64g_create_fpucw,
5839                           mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
5840                        )
5841                  )
5842               );
5843               break;
5844
5845            default:
5846               vex_printf("unhandled opc_aux = 0x%02x\n",
5847                          (UInt)gregLO3ofRM(modrm));
5848               vex_printf("first_opcode == 0xD9\n");
5849               goto decode_fail;
5850         }
5851
5852      } else {
5853         delta++;
5854         switch (modrm) {
5855
5856            case 0xC0 ... 0xC7: /* FLD %st(?) */
5857               r_src = (UInt)modrm - 0xC0;
5858               DIP("fld %%st(%u)\n", r_src);
5859               t1 = newTemp(Ity_F64);
5860               assign(t1, get_ST(r_src));
5861               fp_push();
5862               put_ST(0, mkexpr(t1));
5863               break;
5864
5865            case 0xC8 ... 0xCF: /* FXCH %st(?) */
5866               r_src = (UInt)modrm - 0xC8;
5867               DIP("fxch %%st(%u)\n", r_src);
5868               t1 = newTemp(Ity_F64);
5869               t2 = newTemp(Ity_F64);
5870               assign(t1, get_ST(0));
5871               assign(t2, get_ST(r_src));
5872               put_ST_UNCHECKED(0, mkexpr(t2));
5873               put_ST_UNCHECKED(r_src, mkexpr(t1));
5874               break;
5875
5876            case 0xE0: /* FCHS */
5877               DIP("fchs\n");
5878               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
5879               break;
5880
5881            case 0xE1: /* FABS */
5882               DIP("fabs\n");
5883               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
5884               break;
5885
5886            case 0xE5: { /* FXAM */
5887               /* This is an interesting one.  It examines %st(0),
5888                  regardless of whether the tag says it's empty or not.
5889                  Here, just pass both the tag (in our format) and the
5890                  value (as a double, actually a ULong) to a helper
5891                  function. */
5892               IRExpr** args
5893                  = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
5894                                   unop(Iop_ReinterpF64asI64,
5895                                        get_ST_UNCHECKED(0)) );
5896               put_C3210(mkIRExprCCall(
5897                            Ity_I64,
5898                            0/*regparm*/,
5899                            "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
5900                            args
5901                        ));
5902               DIP("fxam\n");
5903               break;
5904            }
5905
5906            case 0xE8: /* FLD1 */
5907               DIP("fld1\n");
5908               fp_push();
5909               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
5910               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
5911               break;
5912
5913            case 0xE9: /* FLDL2T */
5914               DIP("fldl2t\n");
5915               fp_push();
5916               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
5917               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
5918               break;
5919
5920            case 0xEA: /* FLDL2E */
5921               DIP("fldl2e\n");
5922               fp_push();
5923               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
5924               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
5925               break;
5926
5927            case 0xEB: /* FLDPI */
5928               DIP("fldpi\n");
5929               fp_push();
5930               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
5931               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
5932               break;
5933
5934            case 0xEC: /* FLDLG2 */
5935               DIP("fldlg2\n");
5936               fp_push();
5937               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
5938               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
5939               break;
5940
5941            case 0xED: /* FLDLN2 */
5942               DIP("fldln2\n");
5943               fp_push();
5944               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
5945               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
5946               break;
5947
5948            case 0xEE: /* FLDZ */
5949               DIP("fldz\n");
5950               fp_push();
5951               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
5952               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
5953               break;
5954
5955            case 0xF0: /* F2XM1 */
5956               DIP("f2xm1\n");
5957               put_ST_UNCHECKED(0,
5958                  binop(Iop_2xm1F64,
5959                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5960                        get_ST(0)));
5961               break;
5962
5963            case 0xF1: /* FYL2X */
5964               DIP("fyl2x\n");
5965               put_ST_UNCHECKED(1,
5966                  triop(Iop_Yl2xF64,
5967                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5968                        get_ST(1),
5969                        get_ST(0)));
5970               fp_pop();
5971               break;
5972
5973            case 0xF2: { /* FPTAN */
5974               DIP("fptan\n");
5975               IRTemp argD = newTemp(Ity_F64);
5976               assign(argD, get_ST(0));
5977               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
5978               IRTemp resD = newTemp(Ity_F64);
5979               assign(resD,
5980                  IRExpr_ITE(
5981                     mkexpr(argOK),
5982                     binop(Iop_TanF64,
5983                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5984                           mkexpr(argD)),
5985                     mkexpr(argD))
5986               );
5987               put_ST_UNCHECKED(0, mkexpr(resD));
5988               /* Conditionally push 1.0 on the stack, if the arg is
5989                  in range */
5990               maybe_fp_push(argOK);
5991               maybe_put_ST(argOK, 0,
5992                            IRExpr_Const(IRConst_F64(1.0)));
5993               set_C2( binop(Iop_Xor64,
5994                             unop(Iop_1Uto64, mkexpr(argOK)),
5995                             mkU64(1)) );
5996               break;
5997            }
5998
5999            case 0xF3: /* FPATAN */
6000               DIP("fpatan\n");
6001               put_ST_UNCHECKED(1,
6002                  triop(Iop_AtanF64,
6003                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6004                        get_ST(1),
6005                        get_ST(0)));
6006               fp_pop();
6007               break;
6008
6009            case 0xF4: { /* FXTRACT */
6010               IRTemp argF = newTemp(Ity_F64);
6011               IRTemp sigF = newTemp(Ity_F64);
6012               IRTemp expF = newTemp(Ity_F64);
6013               IRTemp argI = newTemp(Ity_I64);
6014               IRTemp sigI = newTemp(Ity_I64);
6015               IRTemp expI = newTemp(Ity_I64);
6016               DIP("fxtract\n");
6017               assign( argF, get_ST(0) );
6018               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
6019               assign( sigI,
6020                       mkIRExprCCall(
6021                          Ity_I64, 0/*regparms*/,
6022                          "x86amd64g_calculate_FXTRACT",
6023                          &x86amd64g_calculate_FXTRACT,
6024                          mkIRExprVec_2( mkexpr(argI),
6025                                         mkIRExpr_HWord(0)/*sig*/ ))
6026               );
6027               assign( expI,
6028                       mkIRExprCCall(
6029                          Ity_I64, 0/*regparms*/,
6030                          "x86amd64g_calculate_FXTRACT",
6031                          &x86amd64g_calculate_FXTRACT,
6032                          mkIRExprVec_2( mkexpr(argI),
6033                                         mkIRExpr_HWord(1)/*exp*/ ))
6034               );
6035               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
6036               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
6037               /* exponent */
6038               put_ST_UNCHECKED(0, mkexpr(expF) );
6039               fp_push();
6040               /* significand */
6041               put_ST(0, mkexpr(sigF) );
6042               break;
6043            }
6044
6045            case 0xF5: { /* FPREM1 -- IEEE compliant */
6046               IRTemp a1 = newTemp(Ity_F64);
6047               IRTemp a2 = newTemp(Ity_F64);
6048               DIP("fprem1\n");
6049               /* Do FPREM1 twice, once to get the remainder, and once
6050                  to get the C3210 flag values. */
6051               assign( a1, get_ST(0) );
6052               assign( a2, get_ST(1) );
6053               put_ST_UNCHECKED(0,
6054                  triop(Iop_PRem1F64,
6055                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6056                        mkexpr(a1),
6057                        mkexpr(a2)));
6058               put_C3210(
6059                  unop(Iop_32Uto64,
6060                  triop(Iop_PRem1C3210F64,
6061                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6062                        mkexpr(a1),
6063                        mkexpr(a2)) ));
6064               break;
6065            }
6066
6067            case 0xF7: /* FINCSTP */
6068               DIP("fincstp\n");
6069               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
6070               break;
6071
6072            case 0xF8: { /* FPREM -- not IEEE compliant */
6073               IRTemp a1 = newTemp(Ity_F64);
6074               IRTemp a2 = newTemp(Ity_F64);
6075               DIP("fprem\n");
6076               /* Do FPREM twice, once to get the remainder, and once
6077                  to get the C3210 flag values. */
6078               assign( a1, get_ST(0) );
6079               assign( a2, get_ST(1) );
6080               put_ST_UNCHECKED(0,
6081                  triop(Iop_PRemF64,
6082                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6083                        mkexpr(a1),
6084                        mkexpr(a2)));
6085               put_C3210(
6086                  unop(Iop_32Uto64,
6087                  triop(Iop_PRemC3210F64,
6088                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6089                        mkexpr(a1),
6090                        mkexpr(a2)) ));
6091               break;
6092            }
6093
6094            case 0xF9: /* FYL2XP1 */
6095               DIP("fyl2xp1\n");
6096               put_ST_UNCHECKED(1,
6097                  triop(Iop_Yl2xp1F64,
6098                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6099                        get_ST(1),
6100                        get_ST(0)));
6101               fp_pop();
6102               break;
6103
6104            case 0xFA: /* FSQRT */
6105               DIP("fsqrt\n");
6106               put_ST_UNCHECKED(0,
6107                  binop(Iop_SqrtF64,
6108                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6109                        get_ST(0)));
6110               break;
6111
6112            case 0xFB: { /* FSINCOS */
6113               DIP("fsincos\n");
6114               IRTemp argD = newTemp(Ity_F64);
6115               assign(argD, get_ST(0));
6116               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
6117               IRTemp resD = newTemp(Ity_F64);
6118               assign(resD,
6119                  IRExpr_ITE(
6120                     mkexpr(argOK),
6121                     binop(Iop_SinF64,
6122                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6123                           mkexpr(argD)),
6124                     mkexpr(argD))
6125               );
6126               put_ST_UNCHECKED(0, mkexpr(resD));
6127               /* Conditionally push the cos value on the stack, if
6128                  the arg is in range */
6129               maybe_fp_push(argOK);
6130               maybe_put_ST(argOK, 0,
6131                  binop(Iop_CosF64,
6132                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6133                        mkexpr(argD)));
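               /* C2 is the complement of argOK: it is set when
                  |arg| >= 2^63, in which case the hardware leaves
                  ST(0) unchanged, as modelled by the ITE above. */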
6134               set_C2( binop(Iop_Xor64,
6135                             unop(Iop_1Uto64, mkexpr(argOK)),
6136                             mkU64(1)) );
6137               break;
6138            }
6139
6140            case 0xFC: /* FRNDINT */
6141               DIP("frndint\n");
6142               put_ST_UNCHECKED(0,
6143                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
6144               break;
6145
6146            case 0xFD: /* FSCALE */
6147               DIP("fscale\n");
6148               put_ST_UNCHECKED(0,
6149                  triop(Iop_ScaleF64,
6150                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6151                        get_ST(0),
6152                        get_ST(1)));
6153               break;
6154
6155            case 0xFE:   /* FSIN */
6156            case 0xFF: { /* FCOS */
6157               Bool isSIN = modrm == 0xFE;
6158               DIP("%s\n", isSIN ? "fsin" : "fcos");
6159               IRTemp argD = newTemp(Ity_F64);
6160               assign(argD, get_ST(0));
6161               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
6162               IRTemp resD = newTemp(Ity_F64);
6163               assign(resD,
6164                  IRExpr_ITE(
6165                     mkexpr(argOK),
6166                     binop(isSIN ? Iop_SinF64 : Iop_CosF64,
6167                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6168                           mkexpr(argD)),
6169                     mkexpr(argD))
6170               );
6171               put_ST_UNCHECKED(0, mkexpr(resD));
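               /* As for FSINCOS: C2 = !argOK signals an out-of-range
                  argument that has been left unchanged. */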
6172               set_C2( binop(Iop_Xor64,
6173                             unop(Iop_1Uto64, mkexpr(argOK)),
6174                             mkU64(1)) );
6175               break;
6176            }
6177
6178            default:
6179               goto decode_fail;
6180         }
6181      }
6182   }
6183
6184   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
6185   else
6186   if (first_opcode == 0xDA) {
6187
6188      if (modrm < 0xC0) {
6189
6190         /* bits 5,4,3 are an opcode extension, and the modRM also
6191            specifies an address. */
6192         IROp   fop;
6193         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6194         delta += len;
6195         switch (gregLO3ofRM(modrm)) {
6196
6197            case 0: /* FIADD m32int */ /* ST(0) += m32int */
6198               DIP("fiaddl %s\n", dis_buf);
6199               fop = Iop_AddF64;
6200               goto do_fop_m32;
6201
6202            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
6203               DIP("fimull %s\n", dis_buf);
6204               fop = Iop_MulF64;
6205               goto do_fop_m32;
6206
6207            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
6208               DIP("fisubl %s\n", dis_buf);
6209               fop = Iop_SubF64;
6210               goto do_fop_m32;
6211
6212            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
6213               DIP("fisubrl %s\n", dis_buf);
6214               fop = Iop_SubF64;
6215               goto do_foprev_m32;
6216
6217            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
6218               DIP("fisubl %s\n", dis_buf);
6219               fop = Iop_DivF64;
6220               goto do_fop_m32;
6221
6222            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
6223               DIP("fidivrl %s\n", dis_buf);
6224               fop = Iop_DivF64;
6225               goto do_foprev_m32;
6226
6227            do_fop_m32:
6228               put_ST_UNCHECKED(0,
6229                  triop(fop,
6230                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6231                        get_ST(0),
6232                        unop(Iop_I32StoF64,
6233                             loadLE(Ity_I32, mkexpr(addr)))));
6234               break;
6235
6236            do_foprev_m32:
6237               put_ST_UNCHECKED(0,
6238                  triop(fop,
6239                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6240                        unop(Iop_I32StoF64,
6241                             loadLE(Ity_I32, mkexpr(addr))),
6242                        get_ST(0)));
6243               break;
6244
6245            default:
6246               vex_printf("unhandled opc_aux = 0x%2x\n",
6247                          (UInt)gregLO3ofRM(modrm));
6248               vex_printf("first_opcode == 0xDA\n");
6249               goto decode_fail;
6250         }
6251
6252      } else {
6253
6254         delta++;
6255         switch (modrm) {
6256
6257            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
6258               r_src = (UInt)modrm - 0xC0;
6259               DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
6260               put_ST_UNCHECKED(0,
6261                                IRExpr_ITE(
6262                                    mk_amd64g_calculate_condition(AMD64CondB),
6263                                    get_ST(r_src), get_ST(0)) );
6264               break;
6265
6266            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
6267               r_src = (UInt)modrm - 0xC8;
6268               DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
6269               put_ST_UNCHECKED(0,
6270                                IRExpr_ITE(
6271                                    mk_amd64g_calculate_condition(AMD64CondZ),
6272                                    get_ST(r_src), get_ST(0)) );
6273               break;
6274
6275            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
6276               r_src = (UInt)modrm - 0xD0;
6277               DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
6278               put_ST_UNCHECKED(0,
6279                                IRExpr_ITE(
6280                                    mk_amd64g_calculate_condition(AMD64CondBE),
6281                                    get_ST(r_src), get_ST(0)) );
6282               break;
6283
6284            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
6285               r_src = (UInt)modrm - 0xD8;
6286               DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
6287               put_ST_UNCHECKED(0,
6288                                IRExpr_ITE(
6289                                    mk_amd64g_calculate_condition(AMD64CondP),
6290                                    get_ST(r_src), get_ST(0)) );
6291               break;
6292
6293            case 0xE9: /* FUCOMPP %st(0),%st(1) */
6294               DIP("fucompp %%st(0),%%st(1)\n");
6295               /* This forces C1 to zero, which isn't right. */
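               /* Iop_CmpF64 yields an IRCmpF64Result (UN=0x45,
                  LT=0x01, GT=0x00, EQ=0x40); shifting it left 8
                  places those bits at the x87 C0 (bit 8), C2 (bit 10)
                  and C3 (bit 14) positions, and 0x4500 masks exactly
                  those three bits. */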
6296               put_C3210(
6297                   unop(Iop_32Uto64,
6298                   binop( Iop_And32,
6299                          binop(Iop_Shl32,
6300                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
6301                                mkU8(8)),
6302                          mkU32(0x4500)
6303                   )));
6304               fp_pop();
6305               fp_pop();
6306               break;
6307
6308            default:
6309               goto decode_fail;
6310         }
6311
6312      }
6313   }
6314
6315   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
6316   else
6317   if (first_opcode == 0xDB) {
6318      if (modrm < 0xC0) {
6319
6320         /* bits 5,4,3 are an opcode extension, and the modRM also
6321            specifies an address. */
6322         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6323         delta += len;
6324
6325         switch (gregLO3ofRM(modrm)) {
6326
6327            case 0: /* FILD m32int */
6328               DIP("fildl %s\n", dis_buf);
6329               fp_push();
6330               put_ST(0, unop(Iop_I32StoF64,
6331                              loadLE(Ity_I32, mkexpr(addr))));
6332               break;
6333
6334            case 1: /* FISTTPL m32 (SSE3) */
6335               DIP("fisttpl %s\n", dis_buf);
6336               storeLE( mkexpr(addr),
6337                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
6338               fp_pop();
6339               break;
6340
6341            case 2: /* FIST m32 */
6342               DIP("fistl %s\n", dis_buf);
6343               storeLE( mkexpr(addr),
6344                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
6345               break;
6346
6347            case 3: /* FISTP m32 */
6348               DIP("fistpl %s\n", dis_buf);
6349               storeLE( mkexpr(addr),
6350                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
6351               fp_pop();
6352               break;
6353
6354            case 5: { /* FLD extended-real */
6355               /* Uses dirty helper:
6356                     ULong amd64g_loadF80le ( ULong )
6357                  addr holds the address.  First, do a dirty call to
6358                  get hold of the data. */
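               /* NB: the helper rounds the 80-bit value to a 64-bit
                  double, since this model does all x87 arithmetic at
                  64 bits. */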
6359               IRTemp   val  = newTemp(Ity_I64);
6360               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
6361
6362               IRDirty* d = unsafeIRDirty_1_N (
6363                               val,
6364                               0/*regparms*/,
6365                               "amd64g_dirtyhelper_loadF80le",
6366                               &amd64g_dirtyhelper_loadF80le,
6367                               args
6368                            );
6369               /* declare that we're reading memory */
6370               d->mFx   = Ifx_Read;
6371               d->mAddr = mkexpr(addr);
6372               d->mSize = 10;
6373
6374               /* execute the dirty call, dumping the result in val. */
6375               stmt( IRStmt_Dirty(d) );
6376               fp_push();
6377               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
6378
6379               DIP("fldt %s\n", dis_buf);
6380               break;
6381            }
6382
6383            case 7: { /* FSTP extended-real */
6384               /* Uses dirty helper:
6385                     void amd64g_storeF80le ( ULong addr, ULong data )
6386               */
6387               IRExpr** args
6388                  = mkIRExprVec_2( mkexpr(addr),
6389                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );
6390
6391               IRDirty* d = unsafeIRDirty_0_N (
6392                               0/*regparms*/,
6393                               "amd64g_dirtyhelper_storeF80le",
6394                               &amd64g_dirtyhelper_storeF80le,
6395                               args
6396                            );
6397               /* declare we're writing memory */
6398               d->mFx   = Ifx_Write;
6399               d->mAddr = mkexpr(addr);
6400               d->mSize = 10;
6401
6402               /* execute the dirty call. */
6403               stmt( IRStmt_Dirty(d) );
6404               fp_pop();
6405
6406               DIP("fstpt\n %s", dis_buf);
6407               break;
6408            }
6409
6410            default:
6411               vex_printf("unhandled opc_aux = 0x%2x\n",
6412                          (UInt)gregLO3ofRM(modrm));
6413               vex_printf("first_opcode == 0xDB\n");
6414               goto decode_fail;
6415         }
6416
6417      } else {
6418
6419         delta++;
6420         switch (modrm) {
6421
6422            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
6423               r_src = (UInt)modrm - 0xC0;
6424               DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
6425               put_ST_UNCHECKED(0,
6426                                IRExpr_ITE(
6427                                    mk_amd64g_calculate_condition(AMD64CondNB),
6428                                    get_ST(r_src), get_ST(0)) );
6429               break;
6430
6431            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
6432               r_src = (UInt)modrm - 0xC8;
6433               DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
6434               put_ST_UNCHECKED(
6435                  0,
6436                  IRExpr_ITE(
6437                     mk_amd64g_calculate_condition(AMD64CondNZ),
6438                     get_ST(r_src),
6439                     get_ST(0)
6440                  )
6441               );
6442               break;
6443
6444            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
6445               r_src = (UInt)modrm - 0xD0;
6446               DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
6447               put_ST_UNCHECKED(
6448                  0,
6449                  IRExpr_ITE(
6450                     mk_amd64g_calculate_condition(AMD64CondNBE),
6451                     get_ST(r_src),
6452                     get_ST(0)
6453                  )
6454               );
6455               break;
6456
6457            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
6458               r_src = (UInt)modrm - 0xD8;
6459               DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
6460               put_ST_UNCHECKED(
6461                  0,
6462                  IRExpr_ITE(
6463                     mk_amd64g_calculate_condition(AMD64CondNP),
6464                     get_ST(r_src),
6465                     get_ST(0)
6466                  )
6467               );
6468               break;
6469
6470            case 0xE2:
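               /* FNCLEX.  FP exceptions aren't modelled, so there
                  are no pending exception flags to clear: treat as a
                  no-op. */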
6471               DIP("fnclex\n");
6472               break;
6473
6474            case 0xE3: {
6475               gen_FINIT_SEQUENCE(NULL/*no guarding condition*/);
6476               DIP("fninit\n");
6477               break;
6478            }
6479
6480            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
6481               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
6482               break;
6483
6484            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
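               /* not really right since COMI != UCOMI */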
6485               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
6486               break;
6487
6488            default:
6489               goto decode_fail;
6490         }
6491      }
6492   }
6493
6494   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
6495   else
6496   if (first_opcode == 0xDC) {
6497      if (modrm < 0xC0) {
6498
6499         /* bits 5,4,3 are an opcode extension, and the modRM also
6500            specifies an address. */
6501         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6502         delta += len;
6503
6504         switch (gregLO3ofRM(modrm)) {
6505
6506            case 0: /* FADD double-real */
6507               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
6508               break;
6509
6510            case 1: /* FMUL double-real */
6511               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
6512               break;
6513
6514            case 2: /* FCOM double-real */
6515               DIP("fcoml %s\n", dis_buf);
6516               /* This forces C1 to zero, which isn't right. */
6517               put_C3210(
6518                   unop(Iop_32Uto64,
6519                   binop( Iop_And32,
6520                          binop(Iop_Shl32,
6521                                binop(Iop_CmpF64,
6522                                      get_ST(0),
6523                                      loadLE(Ity_F64,mkexpr(addr))),
6524                                mkU8(8)),
6525                          mkU32(0x4500)
6526                   )));
6527               break;
6528
6529            case 3: /* FCOMP double-real */
6530               DIP("fcompl %s\n", dis_buf);
6531               /* This forces C1 to zero, which isn't right. */
6532               put_C3210(
6533                   unop(Iop_32Uto64,
6534                   binop( Iop_And32,
6535                          binop(Iop_Shl32,
6536                                binop(Iop_CmpF64,
6537                                      get_ST(0),
6538                                      loadLE(Ity_F64,mkexpr(addr))),
6539                                mkU8(8)),
6540                          mkU32(0x4500)
6541                   )));
6542               fp_pop();
6543               break;
6544
6545            case 4: /* FSUB double-real */
6546               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
6547               break;
6548
6549            case 5: /* FSUBR double-real */
6550               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
6551               break;
6552
6553            case 6: /* FDIV double-real */
6554               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
6555               break;
6556
6557            case 7: /* FDIVR double-real */
6558               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
6559               break;
6560
6561            default:
6562               vex_printf("unhandled opc_aux = 0x%2x\n",
6563                          (UInt)gregLO3ofRM(modrm));
6564               vex_printf("first_opcode == 0xDC\n");
6565               goto decode_fail;
6566         }
6567
6568      } else {
6569
6570         delta++;
6571         switch (modrm) {
6572
6573            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
6574               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
6575               break;
6576
6577            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
6578               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
6579               break;
6580
6581            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
6582               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
6583               break;
6584
6585            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
6586               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
6587               break;
6588
6589            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
6590               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
6591               break;
6592
6593            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
6594               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
6595               break;
6596
6597            default:
6598               goto decode_fail;
6599         }
6600
6601      }
6602   }
6603
6604   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
6605   else
6606   if (first_opcode == 0xDD) {
6607
6608      if (modrm < 0xC0) {
6609
6610         /* bits 5,4,3 are an opcode extension, and the modRM also
6611            specifies an address. */
6612         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6613         delta += len;
6614
6615         switch (gregLO3ofRM(modrm)) {
6616
6617            case 0: /* FLD double-real */
6618               DIP("fldl %s\n", dis_buf);
6619               fp_push();
6620               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
6621               break;
6622
6623            case 1: /* FISTTPQ m64 (SSE3) */
6624               DIP("fistppll %s\n", dis_buf);
6625               storeLE( mkexpr(addr),
6626                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
6627               fp_pop();
6628               break;
6629
6630            case 2: /* FST double-real */
6631               DIP("fstl %s\n", dis_buf);
6632               storeLE(mkexpr(addr), get_ST(0));
6633               break;
6634
6635            case 3: /* FSTP double-real */
6636               DIP("fstpl %s\n", dis_buf);
6637               storeLE(mkexpr(addr), get_ST(0));
6638               fp_pop();
6639               break;
6640
6641            case 4: { /* FRSTOR m94/m108 */
6642               IRTemp   ew = newTemp(Ity_I32);
6643               IRTemp  w64 = newTemp(Ity_I64);
6644               IRDirty*  d;
6645               if ( have66(pfx) ) {
6646                  /* Uses dirty helper:
6647                     VexEmNote amd64g_dirtyhelper_FRSTORS
6648                                  ( VexGuestAMD64State*, HWord ) */
                  d = unsafeIRDirty_1_N (
                         w64,
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FRSTORS",
                         &amd64g_dirtyhelper_FRSTORS,
                         mkIRExprVec_1( mkexpr(addr) )
                      );
6655                  d->mSize = 94;
6656               } else {
6657                  /* Uses dirty helper:
6658                     VexEmNote amd64g_dirtyhelper_FRSTOR
6659                                  ( VexGuestAMD64State*, HWord ) */
                  d = unsafeIRDirty_1_N (
                         w64,
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FRSTOR",
                         &amd64g_dirtyhelper_FRSTOR,
                         mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
                      );
6666                  d->mSize = 108;
6667               }
6668
6670               /* declare we're reading memory */
6671               d->mFx   = Ifx_Read;
6672               d->mAddr = mkexpr(addr);
6673               /* d->mSize set above */
6674
6675               /* declare we're writing guest state */
6676               d->nFxState = 5;
6677               vex_bzero(&d->fxState, sizeof(d->fxState));
6678
6679               d->fxState[0].fx     = Ifx_Write;
6680               d->fxState[0].offset = OFFB_FTOP;
6681               d->fxState[0].size   = sizeof(UInt);
6682
6683               d->fxState[1].fx     = Ifx_Write;
6684               d->fxState[1].offset = OFFB_FPREGS;
6685               d->fxState[1].size   = 8 * sizeof(ULong);
6686
6687               d->fxState[2].fx     = Ifx_Write;
6688               d->fxState[2].offset = OFFB_FPTAGS;
6689               d->fxState[2].size   = 8 * sizeof(UChar);
6690
6691               d->fxState[3].fx     = Ifx_Write;
6692               d->fxState[3].offset = OFFB_FPROUND;
6693               d->fxState[3].size   = sizeof(ULong);
6694
6695               d->fxState[4].fx     = Ifx_Write;
6696               d->fxState[4].offset = OFFB_FC3210;
6697               d->fxState[4].size   = sizeof(ULong);
6698
6699               stmt( IRStmt_Dirty(d) );
6700
6701               /* ew contains any emulation warning we may need to
6702                  issue.  If needed, side-exit to the next insn,
6703                  reporting the warning, so that Valgrind's dispatcher
6704                  sees the warning. */
6705               assign(ew, unop(Iop_64to32,mkexpr(w64)) );
6706               put_emwarn( mkexpr(ew) );
6707               stmt(
6708                  IRStmt_Exit(
6709                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
6710                     Ijk_EmWarn,
6711                     IRConst_U64( guest_RIP_bbstart+delta ),
6712                     OFFB_RIP
6713                  )
6714               );
6715
6716               if ( have66(pfx) ) {
6717                  DIP("frstors %s\n", dis_buf);
6718               } else {
6719                  DIP("frstor %s\n", dis_buf);
6720               }
6721               break;
6722            }
6723
6724            case 6: { /* FNSAVE m94/m108 */
6725               IRDirty *d;
6726               if ( have66(pfx) ) {
6727                 /* Uses dirty helper:
6728                    void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
6729                                                      HWord ) */
6730                  d = unsafeIRDirty_0_N (
6731                         0/*regparms*/,
6732                         "amd64g_dirtyhelper_FNSAVES",
6733                         &amd64g_dirtyhelper_FNSAVES,
6734                         mkIRExprVec_1( mkexpr(addr) )
6735                         );
6736                  d->mSize = 94;
6737               } else {
6738                 /* Uses dirty helper:
6739                    void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
6740                                                     HWord ) */
6741                  d = unsafeIRDirty_0_N (
6742                         0/*regparms*/,
6743                         "amd64g_dirtyhelper_FNSAVE",
6744                         &amd64g_dirtyhelper_FNSAVE,
6745                         mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
6746                      );
6747                  d->mSize = 108;
6748               }
6749
6750               /* declare we're writing memory */
6751               d->mFx   = Ifx_Write;
6752               d->mAddr = mkexpr(addr);
6753               /* d->mSize set above */
6754
6755               /* declare we're reading guest state */
6756               d->nFxState = 5;
6757               vex_bzero(&d->fxState, sizeof(d->fxState));
6758
6759               d->fxState[0].fx     = Ifx_Read;
6760               d->fxState[0].offset = OFFB_FTOP;
6761               d->fxState[0].size   = sizeof(UInt);
6762
6763               d->fxState[1].fx     = Ifx_Read;
6764               d->fxState[1].offset = OFFB_FPREGS;
6765               d->fxState[1].size   = 8 * sizeof(ULong);
6766
6767               d->fxState[2].fx     = Ifx_Read;
6768               d->fxState[2].offset = OFFB_FPTAGS;
6769               d->fxState[2].size   = 8 * sizeof(UChar);
6770
6771               d->fxState[3].fx     = Ifx_Read;
6772               d->fxState[3].offset = OFFB_FPROUND;
6773               d->fxState[3].size   = sizeof(ULong);
6774
6775               d->fxState[4].fx     = Ifx_Read;
6776               d->fxState[4].offset = OFFB_FC3210;
6777               d->fxState[4].size   = sizeof(ULong);
6778
6779               stmt( IRStmt_Dirty(d) );
6780
6781               if ( have66(pfx) ) {
6782                 DIP("fnsaves %s\n", dis_buf);
6783               } else {
6784                 DIP("fnsave %s\n", dis_buf);
6785               }
6786               break;
6787            }
6788
6789            case 7: { /* FNSTSW m16 */
6790               IRExpr* sw = get_FPU_sw();
6791               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
6792               storeLE( mkexpr(addr), sw );
6793               DIP("fnstsw %s\n", dis_buf);
6794               break;
6795            }
6796
6797            default:
6798               vex_printf("unhandled opc_aux = 0x%2x\n",
6799                          (UInt)gregLO3ofRM(modrm));
6800               vex_printf("first_opcode == 0xDD\n");
6801               goto decode_fail;
6802         }
6803      } else {
6804         delta++;
6805         switch (modrm) {
6806
6807            case 0xC0 ... 0xC7: /* FFREE %st(?) */
6808               r_dst = (UInt)modrm - 0xC0;
6809               DIP("ffree %%st(%u)\n", r_dst);
6810               put_ST_TAG ( r_dst, mkU8(0) );
6811               break;
6812
6813            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
6814               r_dst = (UInt)modrm - 0xD0;
6815               DIP("fst %%st(0),%%st(%u)\n", r_dst);
6816               /* P4 manual says: "If the destination operand is a
6817                  non-empty register, the invalid-operation exception
6818                  is not generated.  Hence put_ST_UNCHECKED. */
6819               put_ST_UNCHECKED(r_dst, get_ST(0));
6820               break;
6821
6822            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
6823               r_dst = (UInt)modrm - 0xD8;
6824               DIP("fstp %%st(0),%%st(%u)\n", r_dst);
6825               /* P4 manual says: "If the destination operand is a
6826                  non-empty register, the invalid-operation exception
6827                  is not generated.  Hence put_ST_UNCHECKED. */
6828               put_ST_UNCHECKED(r_dst, get_ST(0));
6829               fp_pop();
6830               break;
6831
6832            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
6833               r_dst = (UInt)modrm - 0xE0;
6834               DIP("fucom %%st(0),%%st(%u)\n", r_dst);
6835               /* This forces C1 to zero, which isn't right. */
6836               put_C3210(
6837                   unop(Iop_32Uto64,
6838                   binop( Iop_And32,
6839                          binop(Iop_Shl32,
6840                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
6841                                mkU8(8)),
6842                          mkU32(0x4500)
6843                   )));
6844               break;
6845
6846            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
6847               r_dst = (UInt)modrm - 0xE8;
6848               DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
6849               /* This forces C1 to zero, which isn't right. */
6850               put_C3210(
6851                   unop(Iop_32Uto64,
6852                   binop( Iop_And32,
6853                          binop(Iop_Shl32,
6854                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
6855                                mkU8(8)),
6856                          mkU32(0x4500)
6857                   )));
6858               fp_pop();
6859               break;
6860
6861            default:
6862               goto decode_fail;
6863         }
6864      }
6865   }
6866
6867   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
6868   else
6869   if (first_opcode == 0xDE) {
6870
6871      if (modrm < 0xC0) {
6872
6873         /* bits 5,4,3 are an opcode extension, and the modRM also
6874            specifies an address. */
6875         IROp   fop;
6876         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6877         delta += len;
6878
6879         switch (gregLO3ofRM(modrm)) {
6880
6881            case 0: /* FIADD m16int */ /* ST(0) += m16int */
6882               DIP("fiaddw %s\n", dis_buf);
6883               fop = Iop_AddF64;
6884               goto do_fop_m16;
6885
6886            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
6887               DIP("fimulw %s\n", dis_buf);
6888               fop = Iop_MulF64;
6889               goto do_fop_m16;
6890
6891            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
6892               DIP("fisubw %s\n", dis_buf);
6893               fop = Iop_SubF64;
6894               goto do_fop_m16;
6895
6896            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
6897               DIP("fisubrw %s\n", dis_buf);
6898               fop = Iop_SubF64;
6899               goto do_foprev_m16;
6900
6901            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
6902               DIP("fisubw %s\n", dis_buf);
6903               fop = Iop_DivF64;
6904               goto do_fop_m16;
6905
6906            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
6907               DIP("fidivrw %s\n", dis_buf);
6908               fop = Iop_DivF64;
6909               goto do_foprev_m16;
6910
6911            do_fop_m16:
6912               put_ST_UNCHECKED(0,
6913                  triop(fop,
6914                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6915                        get_ST(0),
6916                        unop(Iop_I32StoF64,
6917                             unop(Iop_16Sto32,
6918                                  loadLE(Ity_I16, mkexpr(addr))))));
6919               break;
6920
6921            do_foprev_m16:
6922               put_ST_UNCHECKED(0,
6923                  triop(fop,
6924                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6925                        unop(Iop_I32StoF64,
6926                             unop(Iop_16Sto32,
6927                                  loadLE(Ity_I16, mkexpr(addr)))),
6928                        get_ST(0)));
6929               break;
6930
6931            default:
6932               vex_printf("unhandled opc_aux = 0x%2x\n",
6933                          (UInt)gregLO3ofRM(modrm));
6934               vex_printf("first_opcode == 0xDE\n");
6935               goto decode_fail;
6936         }
6937
6938      } else {
6939
6940         delta++;
6941         switch (modrm) {
6942
6943            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
6944               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
6945               break;
6946
6947            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
6948               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
6949               break;
6950
6951            case 0xD9: /* FCOMPP %st(0),%st(1) */
6952               DIP("fcompp %%st(0),%%st(1)\n");
6953               /* This forces C1 to zero, which isn't right. */
6954               put_C3210(
6955                   unop(Iop_32Uto64,
6956                   binop( Iop_And32,
6957                          binop(Iop_Shl32,
6958                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
6959                                mkU8(8)),
6960                          mkU32(0x4500)
6961                   )));
6962               fp_pop();
6963               fp_pop();
6964               break;
6965
6966            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
6967               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
6968               break;
6969
6970            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
6971               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
6972               break;
6973
6974            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
6975               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
6976               break;
6977
6978            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
6979               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
6980               break;
6981
6982            default:
6983               goto decode_fail;
6984         }
6985
6986      }
6987   }
6988
6989   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
6990   else
6991   if (first_opcode == 0xDF) {
6992
6993      if (modrm < 0xC0) {
6994
6995         /* bits 5,4,3 are an opcode extension, and the modRM also
6996            specifies an address. */
6997         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6998         delta += len;
6999
7000         switch (gregLO3ofRM(modrm)) {
7001
7002            case 0: /* FILD m16int */
7003               DIP("fildw %s\n", dis_buf);
7004               fp_push();
7005               put_ST(0, unop(Iop_I32StoF64,
7006                              unop(Iop_16Sto32,
7007                                   loadLE(Ity_I16, mkexpr(addr)))));
7008               break;
7009
7010            case 1: /* FISTTPS m16 (SSE3) */
7011               DIP("fisttps %s\n", dis_buf);
7012               storeLE( mkexpr(addr),
7013                        x87ishly_qnarrow_32_to_16(
7014                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
7015               fp_pop();
7016               break;
7017
7018            case 2: /* FIST m16 */
7019               DIP("fists %s\n", dis_buf);
7020               storeLE( mkexpr(addr),
7021                        x87ishly_qnarrow_32_to_16(
7022                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
7023               break;
7024
7025            case 3: /* FISTP m16 */
7026               DIP("fistps %s\n", dis_buf);
7027               storeLE( mkexpr(addr),
7028                        x87ishly_qnarrow_32_to_16(
7029                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
7030               fp_pop();
7031               break;
7032
7033            case 5: /* FILD m64 */
7034               DIP("fildll %s\n", dis_buf);
7035               fp_push();
7036               put_ST(0, binop(Iop_I64StoF64,
7037                               get_roundingmode(),
7038                               loadLE(Ity_I64, mkexpr(addr))));
7039               break;
7040
7041            case 7: /* FISTP m64 */
7042               DIP("fistpll %s\n", dis_buf);
7043               storeLE( mkexpr(addr),
7044                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
7045               fp_pop();
7046               break;
7047
7048            default:
7049               vex_printf("unhandled opc_aux = 0x%2x\n",
7050                          (UInt)gregLO3ofRM(modrm));
7051               vex_printf("first_opcode == 0xDF\n");
7052               goto decode_fail;
7053         }
7054
7055      } else {
7056
7057         delta++;
7058         switch (modrm) {
7059
7060            case 0xC0: /* FFREEP %st(0) */
7061               DIP("ffreep %%st(%d)\n", 0);
7062               put_ST_TAG ( 0, mkU8(0) );
7063               fp_pop();
7064               break;
7065
7066            case 0xE0: /* FNSTSW %ax */
7067               DIP("fnstsw %%ax\n");
7068               /* Invent a plausible-looking FPU status word value and
7069                  dump it in %AX:
7070                     ((ftop & 7) << 11) | (c3210 & 0x4700)
7071               */
7072               putIRegRAX(
7073                  2,
7074                  unop(Iop_32to16,
7075                       binop(Iop_Or32,
7076                             binop(Iop_Shl32,
7077                                   binop(Iop_And32, get_ftop(), mkU32(7)),
7078                                   mkU8(11)),
7079                             binop(Iop_And32,
7080                                   unop(Iop_64to32, get_C3210()),
7081                                   mkU32(0x4700))
7082               )));
7083               break;
7084
7085            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
7086               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
7087               break;
7088
7089            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
7090               /* not really right since COMIP != UCOMIP */
7091               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
7092               break;
7093
7094            default:
7095               goto decode_fail;
7096         }
7097      }
7098
7099   }
7100
7101   else
7102      goto decode_fail;
7103
7104   *decode_ok = True;
7105   return delta;
7106
7107  decode_fail:
7108   *decode_ok = False;
7109   return delta;
7110}
7111
7112
7113/*------------------------------------------------------------*/
7114/*---                                                      ---*/
7115/*--- MMX INSTRUCTIONS                                     ---*/
7116/*---                                                      ---*/
7117/*------------------------------------------------------------*/
7118
7119/* Effect of MMX insns on x87 FPU state (table 11-2 of
7120   IA32 arch manual, volume 3):
7121
7122   Read from, or write to MMX register (viz, any insn except EMMS):
7123   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
7124   * FP stack pointer set to zero
7125
7126   EMMS:
7127   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
7128   * FP stack pointer set to zero
7129*/
7130
7131static void do_MMX_preamble ( void )
7132{
7133   Int         i;
7134   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
7135   IRExpr*     zero  = mkU32(0);
7136   IRExpr*     tag1  = mkU8(1);
7137   put_ftop(zero);
7138   for (i = 0; i < 8; i++)
7139      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
7140}
7141
7142static void do_EMMS_preamble ( void )
7143{
7144   Int         i;
7145   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
7146   IRExpr*     zero  = mkU32(0);
7147   IRExpr*     tag0  = mkU8(0);
7148   put_ftop(zero);
7149   for (i = 0; i < 8; i++)
7150      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
7151}
7152
7153
7154static IRExpr* getMMXReg ( UInt archreg )
7155{
7156   vassert(archreg < 8);
7157   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
7158}
7159
7160
7161static void putMMXReg ( UInt archreg, IRExpr* e )
7162{
7163   vassert(archreg < 8);
7164   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
7165   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
7166}
7167
7168
7169/* Helper for non-shift MMX insns.  Note this is incomplete in the
7170   sense that it does not first call do_MMX_preamble() -- that is the
7171   responsibility of its caller. */
7172
7173static
7174ULong dis_MMXop_regmem_to_reg ( const VexAbiInfo* vbi,
7175                                Prefix      pfx,
7176                                Long        delta,
7177                                UChar       opc,
7178                                const HChar* name,
7179                                Bool        show_granularity )
7180{
7181   HChar   dis_buf[50];
7182   UChar   modrm = getUChar(delta);
7183   Bool    isReg = epartIsReg(modrm);
7184   IRExpr* argL  = NULL;
7185   IRExpr* argR  = NULL;
7186   IRExpr* argG  = NULL;
7187   IRExpr* argE  = NULL;
7188   IRTemp  res   = newTemp(Ity_I64);
7189
7190   Bool    invG  = False;
7191   IROp    op    = Iop_INVALID;
7192   void*   hAddr = NULL;
7193   const HChar*  hName = NULL;
7194   Bool    eLeft = False;
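
   /* XXX(f) records clean-helper f as the fallback implementation
      for opcodes which have no single-IROp equivalent. */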
7195
7196#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
7197
7198   switch (opc) {
7199      /* Original MMX ones */
7200      case 0xFC: op = Iop_Add8x8; break;
7201      case 0xFD: op = Iop_Add16x4; break;
7202      case 0xFE: op = Iop_Add32x2; break;
7203
7204      case 0xEC: op = Iop_QAdd8Sx8; break;
7205      case 0xED: op = Iop_QAdd16Sx4; break;
7206
7207      case 0xDC: op = Iop_QAdd8Ux8; break;
7208      case 0xDD: op = Iop_QAdd16Ux4; break;
7209
7210      case 0xF8: op = Iop_Sub8x8;  break;
7211      case 0xF9: op = Iop_Sub16x4; break;
7212      case 0xFA: op = Iop_Sub32x2; break;
7213
7214      case 0xE8: op = Iop_QSub8Sx8; break;
7215      case 0xE9: op = Iop_QSub16Sx4; break;
7216
7217      case 0xD8: op = Iop_QSub8Ux8; break;
7218      case 0xD9: op = Iop_QSub16Ux4; break;
7219
7220      case 0xE5: op = Iop_MulHi16Sx4; break;
7221      case 0xD5: op = Iop_Mul16x4; break;
7222      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;
7223
7224      case 0x74: op = Iop_CmpEQ8x8; break;
7225      case 0x75: op = Iop_CmpEQ16x4; break;
7226      case 0x76: op = Iop_CmpEQ32x2; break;
7227
7228      case 0x64: op = Iop_CmpGT8Sx8; break;
7229      case 0x65: op = Iop_CmpGT16Sx4; break;
7230      case 0x66: op = Iop_CmpGT32Sx2; break;
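
      /* For the pack and interleave families, operand order matters:
         the E (r/m) operand supplies the more significant half of
         each result pair, so it must appear as the left argument;
         hence eLeft. */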
7231
7232      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
7233      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
7234      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
7235
7236      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
7237      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
7238      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
7239
7240      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
7241      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
7242      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
7243
7244      case 0xDB: op = Iop_And64; break;
7245      case 0xDF: op = Iop_And64; invG = True; break;
7246      case 0xEB: op = Iop_Or64; break;
7247      case 0xEF: /* Possibly do better here if argL and argR are the
7248                    same reg */
7249                 op = Iop_Xor64; break;
7250
7251      /* Introduced in SSE1 */
7252      case 0xE0: op = Iop_Avg8Ux8;    break;
7253      case 0xE3: op = Iop_Avg16Ux4;   break;
7254      case 0xEE: op = Iop_Max16Sx4;   break;
7255      case 0xDE: op = Iop_Max8Ux8;    break;
7256      case 0xEA: op = Iop_Min16Sx4;   break;
7257      case 0xDA: op = Iop_Min8Ux8;    break;
7258      case 0xE4: op = Iop_MulHi16Ux4; break;
7259      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;
7260
7261      /* Introduced in SSE2 */
7262      case 0xD4: op = Iop_Add64; break;
7263      case 0xFB: op = Iop_Sub64; break;
7264
7265      default:
7266         vex_printf("\n0x%x\n", (UInt)opc);
7267         vpanic("dis_MMXop_regmem_to_reg");
7268   }
7269
7270#  undef XXX
7271
7272   argG = getMMXReg(gregLO3ofRM(modrm));
7273   if (invG)
7274      argG = unop(Iop_Not64, argG);
7275
7276   if (isReg) {
7277      delta++;
7278      argE = getMMXReg(eregLO3ofRM(modrm));
7279   } else {
7280      Int    len;
7281      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7282      delta += len;
7283      argE = loadLE(Ity_I64, mkexpr(addr));
7284   }
7285
7286   if (eLeft) {
7287      argL = argE;
7288      argR = argG;
7289   } else {
7290      argL = argG;
7291      argR = argE;
7292   }
7293
7294   if (op != Iop_INVALID) {
7295      vassert(hName == NULL);
7296      vassert(hAddr == NULL);
7297      assign(res, binop(op, argL, argR));
7298   } else {
7299      vassert(hName != NULL);
7300      vassert(hAddr != NULL);
7301      assign( res,
7302              mkIRExprCCall(
7303                 Ity_I64,
7304                 0/*regparms*/, hName, hAddr,
7305                 mkIRExprVec_2( argL, argR )
7306              )
7307            );
7308   }
7309
7310   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
7311
7312   DIP("%s%s %s, %s\n",
7313       name, show_granularity ? nameMMXGran(opc & 3) : "",
7314       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
7315       nameMMXReg(gregLO3ofRM(modrm)) );
7316
7317   return delta;
7318}
7319
7320
7321/* Vector by scalar shift of G by the amount specified at the bottom
7322   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
7323
7324static ULong dis_MMX_shiftG_byE ( const VexAbiInfo* vbi,
7325                                  Prefix pfx, Long delta,
7326                                  const HChar* opname, IROp op )
7327{
7328   HChar   dis_buf[50];
7329   Int     alen, size;
7330   IRTemp  addr;
7331   Bool    shl, shr, sar;
7332   UChar   rm   = getUChar(delta);
7333   IRTemp  g0   = newTemp(Ity_I64);
7334   IRTemp  g1   = newTemp(Ity_I64);
7335   IRTemp  amt  = newTemp(Ity_I64);
7336   IRTemp  amt8 = newTemp(Ity_I8);
7337
7338   if (epartIsReg(rm)) {
7339      assign( amt, getMMXReg(eregLO3ofRM(rm)) );
7340      DIP("%s %s,%s\n", opname,
7341                        nameMMXReg(eregLO3ofRM(rm)),
7342                        nameMMXReg(gregLO3ofRM(rm)) );
7343      delta++;
7344   } else {
7345      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
7346      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
7347      DIP("%s %s,%s\n", opname,
7348                        dis_buf,
7349                        nameMMXReg(gregLO3ofRM(rm)) );
7350      delta += alen;
7351   }
7352   assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
7353   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
7354
7355   shl = shr = sar = False;
7356   size = 0;
7357   switch (op) {
      case Iop_ShlN16x4: shl = True; size = 16; break;
7359      case Iop_ShlN32x2: shl = True; size = 32; break;
7360      case Iop_Shl64:    shl = True; size = 64; break;
7361      case Iop_ShrN16x4: shr = True; size = 16; break;
7362      case Iop_ShrN32x2: shr = True; size = 32; break;
7363      case Iop_Shr64:    shr = True; size = 64; break;
7364      case Iop_SarN16x4: sar = True; size = 16; break;
7365      case Iop_SarN32x2: sar = True; size = 32; break;
7366      default: vassert(0);
7367   }
7368
7369   if (shl || shr) {
7370     assign(
7371        g1,
7372        IRExpr_ITE(
7373           binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
7374           binop(op, mkexpr(g0), mkexpr(amt8)),
7375           mkU64(0)
7376        )
7377     );
7378   } else
7379   if (sar) {
7380     assign(
7381        g1,
7382        IRExpr_ITE(
7383           binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
7384           binop(op, mkexpr(g0), mkexpr(amt8)),
7385           binop(op, mkexpr(g0), mkU8(size-1))
7386        )
7387     );
7388   } else {
7389      vassert(0);
7390   }
7391
7392   putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
7393   return delta;
7394}
7395
7396
7397/* Vector by scalar shift of E by an immediate byte.  This is a
7398   straight copy of dis_SSE_shiftE_imm. */
7399
7400static
7401ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
7402{
7403   Bool    shl, shr, sar;
7404   UChar   rm   = getUChar(delta);
7405   IRTemp  e0   = newTemp(Ity_I64);
7406   IRTemp  e1   = newTemp(Ity_I64);
7407   UChar   amt, size;
7408   vassert(epartIsReg(rm));
7409   vassert(gregLO3ofRM(rm) == 2
7410           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
7411   amt = getUChar(delta+1);
7412   delta += 2;
7413   DIP("%s $%d,%s\n", opname,
7414                      (Int)amt,
7415                      nameMMXReg(eregLO3ofRM(rm)) );
7416
7417   assign( e0, getMMXReg(eregLO3ofRM(rm)) );
7418
7419   shl = shr = sar = False;
7420   size = 0;
7421   switch (op) {
7422      case Iop_ShlN16x4: shl = True; size = 16; break;
7423      case Iop_ShlN32x2: shl = True; size = 32; break;
7424      case Iop_Shl64:    shl = True; size = 64; break;
7425      case Iop_SarN16x4: sar = True; size = 16; break;
7426      case Iop_SarN32x2: sar = True; size = 32; break;
7427      case Iop_ShrN16x4: shr = True; size = 16; break;
7428      case Iop_ShrN32x2: shr = True; size = 32; break;
7429      case Iop_Shr64:    shr = True; size = 64; break;
7430      default: vassert(0);
7431   }
7432
7433   if (shl || shr) {
7434     assign( e1, amt >= size
7435                    ? mkU64(0)
7436                    : binop(op, mkexpr(e0), mkU8(amt))
7437     );
7438   } else
7439   if (sar) {
7440     assign( e1, amt >= size
7441                    ? binop(op, mkexpr(e0), mkU8(size-1))
7442                    : binop(op, mkexpr(e0), mkU8(amt))
7443     );
7444   } else {
7445      vassert(0);
7446   }
7447
7448   putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
7449   return delta;
7450}
7451
7452
7453/* Completely handle all MMX instructions except emms. */
7454
7455static
7456ULong dis_MMX ( Bool* decode_ok,
7457                const VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
7458{
7459   Int   len;
7460   UChar modrm;
7461   HChar dis_buf[50];
7462   UChar opc = getUChar(delta);
7463   delta++;
7464
7465   /* dis_MMX handles all insns except emms. */
7466   do_MMX_preamble();
7467
7468   switch (opc) {
7469
7470      case 0x6E:
7471         if (sz == 4) {
7472            /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
7473            modrm = getUChar(delta);
7474            if (epartIsReg(modrm)) {
7475               delta++;
7476               putMMXReg(
7477                  gregLO3ofRM(modrm),
7478                  binop( Iop_32HLto64,
7479                         mkU32(0),
7480                         getIReg32(eregOfRexRM(pfx,modrm)) ) );
7481               DIP("movd %s, %s\n",
7482                   nameIReg32(eregOfRexRM(pfx,modrm)),
7483                   nameMMXReg(gregLO3ofRM(modrm)));
7484            } else {
7485               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7486               delta += len;
7487               putMMXReg(
7488                  gregLO3ofRM(modrm),
7489                  binop( Iop_32HLto64,
7490                         mkU32(0),
7491                         loadLE(Ity_I32, mkexpr(addr)) ) );
7492               DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7493            }
7494         }
7495         else
7496         if (sz == 8) {
7497            /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
7498            modrm = getUChar(delta);
7499            if (epartIsReg(modrm)) {
7500               delta++;
7501               putMMXReg( gregLO3ofRM(modrm),
7502                          getIReg64(eregOfRexRM(pfx,modrm)) );
7503               DIP("movd %s, %s\n",
7504                   nameIReg64(eregOfRexRM(pfx,modrm)),
7505                   nameMMXReg(gregLO3ofRM(modrm)));
7506            } else {
7507               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7508               delta += len;
7509               putMMXReg( gregLO3ofRM(modrm),
7510                          loadLE(Ity_I64, mkexpr(addr)) );
7511               DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7512            }
7513         }
7514         else {
7515            goto mmx_decode_failure;
7516         }
7517         break;
7518
7519      case 0x7E:
7520         if (sz == 4) {
7521            /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
7522            modrm = getUChar(delta);
7523            if (epartIsReg(modrm)) {
7524               delta++;
7525               putIReg32( eregOfRexRM(pfx,modrm),
7526                          unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
7527               DIP("movd %s, %s\n",
7528                   nameMMXReg(gregLO3ofRM(modrm)),
7529                   nameIReg32(eregOfRexRM(pfx,modrm)));
7530            } else {
7531               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7532               delta += len;
7533               storeLE( mkexpr(addr),
7534                        unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
7535               DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7536            }
7537         }
7538         else
7539         if (sz == 8) {
7540            /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
7541            modrm = getUChar(delta);
7542            if (epartIsReg(modrm)) {
7543               delta++;
7544               putIReg64( eregOfRexRM(pfx,modrm),
7545                          getMMXReg(gregLO3ofRM(modrm)) );
7546               DIP("movd %s, %s\n",
7547                   nameMMXReg(gregLO3ofRM(modrm)),
7548                   nameIReg64(eregOfRexRM(pfx,modrm)));
7549            } else {
7550               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7551               delta += len;
7552               storeLE( mkexpr(addr),
7553                       getMMXReg(gregLO3ofRM(modrm)) );
7554               DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7555            }
7556         } else {
7557            goto mmx_decode_failure;
7558         }
7559         break;
7560
7561      case 0x6F:
7562         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
7563         if (sz != 4
7564             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7565            goto mmx_decode_failure;
7566         modrm = getUChar(delta);
7567         if (epartIsReg(modrm)) {
7568            delta++;
7569            putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
7570            DIP("movq %s, %s\n",
7571                nameMMXReg(eregLO3ofRM(modrm)),
7572                nameMMXReg(gregLO3ofRM(modrm)));
7573         } else {
7574            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7575            delta += len;
7576            putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
7577            DIP("movq %s, %s\n",
7578                dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7579         }
7580         break;
7581
7582      case 0x7F:
7583         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
7584         if (sz != 4
7585             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7586            goto mmx_decode_failure;
7587         modrm = getUChar(delta);
7588         if (epartIsReg(modrm)) {
7589            delta++;
7590            putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
7591            DIP("movq %s, %s\n",
7592                nameMMXReg(gregLO3ofRM(modrm)),
7593                nameMMXReg(eregLO3ofRM(modrm)));
7594         } else {
7595            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7596            delta += len;
7597            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
7598            DIP("mov(nt)q %s, %s\n",
7599                nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7600         }
7601         break;
7602
7603      case 0xFC:
7604      case 0xFD:
7605      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
7606         if (sz != 4)
7607            goto mmx_decode_failure;
7608         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
7609         break;
7610
7611      case 0xEC:
7612      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
7613         if (sz != 4
7614             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7615            goto mmx_decode_failure;
7616         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
7617         break;
7618
7619      case 0xDC:
7620      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
7621         if (sz != 4)
7622            goto mmx_decode_failure;
7623         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
7624         break;
7625
7626      case 0xF8:
7627      case 0xF9:
7628      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
7629         if (sz != 4)
7630            goto mmx_decode_failure;
7631         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
7632         break;
7633
7634      case 0xE8:
7635      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
7636         if (sz != 4)
7637            goto mmx_decode_failure;
7638         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
7639         break;
7640
7641      case 0xD8:
7642      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
7643         if (sz != 4)
7644            goto mmx_decode_failure;
7645         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
7646         break;
7647
7648      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
7649         if (sz != 4)
7650            goto mmx_decode_failure;
7651         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
7652         break;
7653
7654      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
7655         if (sz != 4)
7656            goto mmx_decode_failure;
7657         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
7658         break;
7659
7660      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
         if (sz != 4)
            goto mmx_decode_failure;
7662         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
7663         break;
7664
7665      case 0x74:
7666      case 0x75:
7667      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
7668         if (sz != 4)
7669            goto mmx_decode_failure;
7670         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
7671         break;
7672
7673      case 0x64:
7674      case 0x65:
7675      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
7676         if (sz != 4)
7677            goto mmx_decode_failure;
7678         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
7679         break;
7680
7681      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
7682         if (sz != 4)
7683            goto mmx_decode_failure;
7684         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
7685         break;
7686
7687      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
7688         if (sz != 4)
7689            goto mmx_decode_failure;
7690         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
7691         break;
7692
7693      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
7694         if (sz != 4)
7695            goto mmx_decode_failure;
7696         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
7697         break;
7698
7699      case 0x68:
7700      case 0x69:
7701      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
7702         if (sz != 4
7703             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7704            goto mmx_decode_failure;
7705         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
7706         break;
7707
7708      case 0x60:
7709      case 0x61:
7710      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
7711         if (sz != 4
7712             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7713            goto mmx_decode_failure;
7714         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
7715         break;
7716
7717      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
7718         if (sz != 4)
7719            goto mmx_decode_failure;
7720         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
7721         break;
7722
7723      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
7724         if (sz != 4)
7725            goto mmx_decode_failure;
7726         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
7727         break;
7728
7729      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
7730         if (sz != 4)
7731            goto mmx_decode_failure;
7732         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
7733         break;
7734
7735      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
7736         if (sz != 4)
7737            goto mmx_decode_failure;
7738         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
7739         break;
7740
7741#     define SHIFT_BY_REG(_name,_op)                                     \
7742                delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
7743                break;
7744
7745      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
7746      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
7747      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
7748      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
7749
7750      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
7751      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
7752      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
7753      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
7754
7755      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
7756      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
7757      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
7758
7759#     undef SHIFT_BY_REG
7760
7761      case 0x71:
7762      case 0x72:
7763      case 0x73: {
7764         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
7765         UChar byte2, subopc;
7766         if (sz != 4)
7767            goto mmx_decode_failure;
7768         byte2  = getUChar(delta);      /* amode / sub-opcode */
7769         subopc = toUChar( (byte2 >> 3) & 7 );
7770
7771#        define SHIFT_BY_IMM(_name,_op)                        \
7772            do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
7773            } while (0)
7774
         if      (subopc == 2 /*SRL*/ && opc == 0x71)
                 SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
         else if (subopc == 2 /*SRL*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
         else if (subopc == 2 /*SRL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psrlq", Iop_Shr64);

         else if (subopc == 4 /*SAR*/ && opc == 0x71)
                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
         else if (subopc == 4 /*SAR*/ && opc == 0x72)
                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);

         else if (subopc == 6 /*SHL*/ && opc == 0x71)
                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
         else if (subopc == 6 /*SHL*/ && opc == 0x72)
                 SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
         else if (subopc == 6 /*SHL*/ && opc == 0x73)
                 SHIFT_BY_IMM("psllq", Iop_Shl64);
7793
7794         else goto mmx_decode_failure;
7795
7796#        undef SHIFT_BY_IMM
7797         break;
7798      }
7799
7800      case 0xF7: {
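         /* MASKMOVQ (src)mmxreg, (dst)mmxreg -- byte-granularity
            conditional store of G to [RDI].  For each of the 8 bytes,
            the MSB of the corresponding byte of E selects whether the
            byte of G (MSB set) or the original memory byte (MSB
            clear) ends up in memory. */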
7801         IRTemp addr    = newTemp(Ity_I64);
7802         IRTemp regD    = newTemp(Ity_I64);
7803         IRTemp regM    = newTemp(Ity_I64);
7804         IRTemp mask    = newTemp(Ity_I64);
7805         IRTemp olddata = newTemp(Ity_I64);
7806         IRTemp newdata = newTemp(Ity_I64);
7807
7808         modrm = getUChar(delta);
7809         if (sz != 4 || (!epartIsReg(modrm)))
7810            goto mmx_decode_failure;
7811         delta++;
7812
7813         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
7814         assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
7815         assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
7816         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
7817         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
7818         assign( newdata,
7819                 binop(Iop_Or64,
7820                       binop(Iop_And64,
7821                             mkexpr(regD),
7822                             mkexpr(mask) ),
7823                       binop(Iop_And64,
7824                             mkexpr(olddata),
7825                             unop(Iop_Not64, mkexpr(mask)))) );
7826         storeLE( mkexpr(addr), mkexpr(newdata) );
7827         DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
7828                                 nameMMXReg( gregLO3ofRM(modrm) ) );
7829         break;
7830      }
7831
7832      /* --- MMX decode failure --- */
7833      default:
7834      mmx_decode_failure:
7835         *decode_ok = False;
7836         return delta; /* ignored */
7837
7838   }
7839
7840   *decode_ok = True;
7841   return delta;
7842}
7843
7844
7845/*------------------------------------------------------------*/
7846/*--- More misc arithmetic and other obscure insns.        ---*/
7847/*------------------------------------------------------------*/
7848
7849/* Generate base << amt with vacated places filled with stuff
7850   from xtra.  amt guaranteed in 0 .. 63. */
7851static
7852IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
7853{
7854   /* if   amt == 0
7855      then base
7856      else (base << amt) | (xtra >>u (64-amt))
7857   */
7858   return
7859      IRExpr_ITE(
7860         binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
7861         binop(Iop_Or64,
7862               binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
7863               binop(Iop_Shr64, mkexpr(xtra),
7864                                binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
7865               ),
7866         mkexpr(base)
7867      );
7868}
7869
7870/* Generate base >>u amt with vacated places filled with stuff
7871   from xtra.  amt guaranteed in 0 .. 63. */
7872static
7873IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
7874{
7875   /* if   amt == 0
7876      then base
7877      else (base >>u amt) | (xtra << (64-amt))
7878   */
7879   return
7880      IRExpr_ITE(
7881         binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
7882         binop(Iop_Or64,
7883               binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
7884               binop(Iop_Shl64, mkexpr(xtra),
7885                                binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
7886               ),
7887         mkexpr(base)
7888      );
7889}
7890
7891/* Double length left and right shifts.  Apparently only required in
7892   v-size (no b- variant). */
7893static
7894ULong dis_SHLRD_Gv_Ev ( const VexAbiInfo* vbi,
7895                        Prefix pfx,
7896                        Long delta, UChar modrm,
7897                        Int sz,
7898                        IRExpr* shift_amt,
7899                        Bool amt_is_literal,
7900                        const HChar* shift_amt_txt,
7901                        Bool left_shift )
7902{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is
      used for printing it.  delta on entry points at the modrm byte. */
7905   Int len;
7906   HChar dis_buf[50];
7907
7908   IRType ty     = szToITy(sz);
7909   IRTemp gsrc   = newTemp(ty);
7910   IRTemp esrc   = newTemp(ty);
7911   IRTemp addr   = IRTemp_INVALID;
7912   IRTemp tmpSH  = newTemp(Ity_I8);
7913   IRTemp tmpSS  = newTemp(Ity_I8);
7914   IRTemp tmp64  = IRTemp_INVALID;
7915   IRTemp res64  = IRTemp_INVALID;
7916   IRTemp rss64  = IRTemp_INVALID;
7917   IRTemp resTy  = IRTemp_INVALID;
7918   IRTemp rssTy  = IRTemp_INVALID;
7919   Int    mask   = sz==8 ? 63 : 31;
7920
7921   vassert(sz == 2 || sz == 4 || sz == 8);
7922
7923   /* The E-part is the destination; this is shifted.  The G-part
7924      supplies bits to be shifted into the E-part, but is not
7925      changed.
7926
7927      If shifting left, form a double-length word with E at the top
7928      and G at the bottom, and shift this left.  The result is then in
7929      the high part.
7930
7931      If shifting right, form a double-length word with G at the top
7932      and E at the bottom, and shift this right.  The result is then
7933      at the bottom.  */
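   /* e.g. for sz==4, left shift, with esrc = 0xAABBCCDD,
      gsrc = 0x11223344 and a shift amount of 8: tmp64 is
      0xAABBCCDD11223344, and (tmp64 << 8) >>u 32 = 0xBBCCDD11,
      which is esrc shifted left 8 with the top byte of gsrc
      shifted in at the bottom. */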
7934
7935   /* Fetch the operands. */
7936
7937   assign( gsrc, getIRegG(sz, pfx, modrm) );
7938
7939   if (epartIsReg(modrm)) {
7940      delta++;
7941      assign( esrc, getIRegE(sz, pfx, modrm) );
7942      DIP("sh%cd%c %s, %s, %s\n",
7943          ( left_shift ? 'l' : 'r' ), nameISize(sz),
7944          shift_amt_txt,
7945          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
7946   } else {
7947      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
7948                        /* # bytes following amode */
7949                        amt_is_literal ? 1 : 0 );
7950      delta += len;
7951      assign( esrc, loadLE(ty, mkexpr(addr)) );
7952      DIP("sh%cd%c %s, %s, %s\n",
7953          ( left_shift ? 'l' : 'r' ), nameISize(sz),
7954          shift_amt_txt,
7955          nameIRegG(sz, pfx, modrm), dis_buf);
7956   }
7957
7958   /* Calculate the masked shift amount (tmpSH), the masked subshift
7959      amount (tmpSS), the shifted value (res64) and the subshifted
7960      value (rss64). */
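   /* The subshift is the shift by one place less (tmpSH-1, masked).
      setFlags_DEP1_DEP2_shift below uses it to recover the last bit
      shifted out, which becomes the new carry flag. */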
7961
7962   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
7963   assign( tmpSS, binop(Iop_And8,
7964                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
7965                        mkU8(mask)));
7966
7967   tmp64 = newTemp(Ity_I64);
7968   res64 = newTemp(Ity_I64);
7969   rss64 = newTemp(Ity_I64);
7970
7971   if (sz == 2 || sz == 4) {
7972
7973      /* G is xtra; E is data */
7974      /* what a freaking nightmare: */
7975      if (sz == 4 && left_shift) {
7976         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
7977         assign( res64,
7978                 binop(Iop_Shr64,
7979                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
7980                       mkU8(32)) );
7981         assign( rss64,
7982                 binop(Iop_Shr64,
7983                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
7984                       mkU8(32)) );
7985      }
7986      else
7987      if (sz == 4 && !left_shift) {
7988         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
7989         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
7990         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
7991      }
7992      else
7993      if (sz == 2 && left_shift) {
7994         assign( tmp64,
7995                 binop(Iop_32HLto64,
7996                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
7997                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
7998         ));
7999         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
8000         assign( res64,
8001                 binop(Iop_Shr64,
8002                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
8003                       mkU8(48)) );
8004         /* subshift formed by shifting [esrc'0000'0000'0000] */
8005         assign( rss64,
8006                 binop(Iop_Shr64,
8007                       binop(Iop_Shl64,
8008                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
8009                                              mkU8(48)),
8010                             mkexpr(tmpSS)),
8011                       mkU8(48)) );
8012      }
8013      else
8014      if (sz == 2 && !left_shift) {
8015         assign( tmp64,
8016                 binop(Iop_32HLto64,
8017                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
8018                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
8019         ));
8020         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
8021         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
8022         /* subshift formed by shifting [0000'0000'0000'esrc] */
8023         assign( rss64, binop(Iop_Shr64,
8024                              unop(Iop_16Uto64, mkexpr(esrc)),
8025                              mkexpr(tmpSS)) );
8026      }
8027
8028   } else {
8029
8030      vassert(sz == 8);
8031      if (left_shift) {
8032         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
8033         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
8034      } else {
8035         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
8036         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
8037      }
8038
8039   }
8040
8041   resTy = newTemp(ty);
8042   rssTy = newTemp(ty);
8043   assign( resTy, narrowTo(ty, mkexpr(res64)) );
8044   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );
8045
8046   /* Put result back and write the flags thunk. */
8047   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
8048                              resTy, rssTy, ty, tmpSH );
8049
8050   if (epartIsReg(modrm)) {
8051      putIRegE(sz, pfx, modrm, mkexpr(resTy));
8052   } else {
8053      storeLE( mkexpr(addr), mkexpr(resTy) );
8054   }
8055
8056   if (amt_is_literal) delta++;
8057   return delta;
8058}
8059
8060
8061/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
8062   required. */
8063
8064typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
8065
8066static const HChar* nameBtOp ( BtOp op )
8067{
8068   switch (op) {
8069      case BtOpNone:  return "";
8070      case BtOpSet:   return "s";
8071      case BtOpReset: return "r";
8072      case BtOpComp:  return "c";
8073      default: vpanic("nameBtOp(amd64)");
8074   }
8075}
8076
8077
8078static
8079ULong dis_bt_G_E ( const VexAbiInfo* vbi,
8080                   Prefix pfx, Int sz, Long delta, BtOp op,
8081                   /*OUT*/Bool* decode_OK )
8082{
8083   HChar  dis_buf[50];
8084   UChar  modrm;
8085   Int    len;
8086   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
8087          t_addr1, t_rsp, t_mask, t_new;
8088
8089   vassert(sz == 2 || sz == 4 || sz == 8);
8090
8091   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
8092             = t_addr0 = t_addr1 = t_rsp
8093             = t_mask = t_new = IRTemp_INVALID;
8094
8095   t_fetched = newTemp(Ity_I8);
8096   t_new     = newTemp(Ity_I8);
8097   t_bitno0  = newTemp(Ity_I64);
8098   t_bitno1  = newTemp(Ity_I64);
8099   t_bitno2  = newTemp(Ity_I8);
8100   t_addr1   = newTemp(Ity_I64);
8101   modrm     = getUChar(delta);
8102
8103   *decode_OK = True;
8104   if (epartIsReg(modrm)) {
8105      /* F2 and F3 are never acceptable. */
8106      if (haveF2orF3(pfx)) {
8107         *decode_OK = False;
8108         return delta;
8109      }
8110   } else {
8111      /* F2 or F3 (but not both) are allowed, provided LOCK is also
8112         present, and only for the BTC/BTS/BTR cases (not BT). */
8113      if (haveF2orF3(pfx)) {
8114         if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
8115            *decode_OK = False;
8116            return delta;
8117         }
8118      }
8119   }
8120
8121   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );
8122
8123   if (epartIsReg(modrm)) {
8124      delta++;
8125      /* Get it onto the client's stack.  Oh, this is a horrible
8126         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
8127         Because of the ELF ABI stack redzone, there may be live data
8128         up to 128 bytes below %RSP.  So we can't just push it on the
8129         stack, else we may wind up trashing live data, and causing
8130         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP by at least 128 before
8132         pushing it.  That unfortunately means hitting Memcheck's
8133         fast-case painting code.  Ideally we should drop more than
8134         128, to reduce the chances of breaking buggy programs that
8135         have live data below -128(%RSP).  Memcheck fast-cases moves
8136         of 288 bytes due to the need to handle ppc64-linux quickly,
8137         so let's use 288.  Of course the real fix is to get rid of
8138         this kludge entirely.  */
8139      t_rsp = newTemp(Ity_I64);
8140      t_addr0 = newTemp(Ity_I64);
8141
8142      vassert(vbi->guest_stack_redzone_size == 128);
8143      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
8144      putIReg64(R_RSP, mkexpr(t_rsp));
8145
8146      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );
8147
8148      /* Make t_addr0 point at it. */
8149      assign( t_addr0, mkexpr(t_rsp) );
8150
      /* Mask out upper bits of the bit number, since we're doing a
         reg. */
8153      assign( t_bitno1, binop(Iop_And64,
8154                              mkexpr(t_bitno0),
8155                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );
8156
8157   } else {
8158      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
8159      delta += len;
8160      assign( t_bitno1, mkexpr(t_bitno0) );
8161   }
8162
8163   /* At this point: t_addr0 is the address being operated on.  If it
8164      was a reg, we will have pushed it onto the client's stack.
8165      t_bitno1 is the bit number, suitably masked in the case of a
8166      reg.  */
8167
8168   /* Now the main sequence. */
8169   assign( t_addr1,
8170           binop(Iop_Add64,
8171                 mkexpr(t_addr0),
8172                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );
8173
8174   /* t_addr1 now holds effective address */
8175
8176   assign( t_bitno2,
8177           unop(Iop_64to8,
8178                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );
8179
8180   /* t_bitno2 contains offset of bit within byte */
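   /* e.g. a bit number of 19 gives t_addr1 = t_addr0 + 2 and
      t_bitno2 = 3, ie bit 3 of the byte at offset 2.  Note the use
      of Sar64 rather than Shr64 above: for the memory case the bit
      number is signed (hence widenSto64 earlier), so negative bit
      numbers address bytes below t_addr0, as the architecture
      requires. */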
8181
8182   if (op != BtOpNone) {
8183      t_mask = newTemp(Ity_I8);
8184      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
8185   }
8186
8187   /* t_mask is now a suitable byte mask */
8188
8189   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
8190
8191   if (op != BtOpNone) {
8192      switch (op) {
8193         case BtOpSet:
8194            assign( t_new,
8195                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
8196            break;
8197         case BtOpComp:
8198            assign( t_new,
8199                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
8200            break;
8201         case BtOpReset:
8202            assign( t_new,
8203                    binop(Iop_And8, mkexpr(t_fetched),
8204                                    unop(Iop_Not8, mkexpr(t_mask))) );
8205            break;
8206         default:
8207            vpanic("dis_bt_G_E(amd64)");
8208      }
8209      if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
8210         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
8211                                 mkexpr(t_new)/*new*/,
8212                                 guest_RIP_curr_instr );
8213      } else {
8214         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
8215      }
8216   }
8217
8218   /* Side effect done; now get selected bit into Carry flag.  The Intel docs
8219      (as of 2015, at least) say that C holds the result, Z is unchanged, and
8220      O,S,A and P are undefined.  However, on Skylake it appears that O,S,A,P
8221      are also unchanged, so let's do that. */
8222   const ULong maskC     = AMD64G_CC_MASK_C;
8223   const ULong maskOSZAP = AMD64G_CC_MASK_O | AMD64G_CC_MASK_S
8224                           | AMD64G_CC_MASK_Z | AMD64G_CC_MASK_A
8225                           | AMD64G_CC_MASK_P;
8226
8227   IRTemp old_rflags = newTemp(Ity_I64);
8228   assign(old_rflags, mk_amd64g_calculate_rflags_all());
8229
8230   IRTemp new_rflags = newTemp(Ity_I64);
8231   assign(new_rflags,
8232          binop(Iop_Or64,
8233                binop(Iop_And64, mkexpr(old_rflags), mkU64(maskOSZAP)),
8234                binop(Iop_And64,
8235                      binop(Iop_Shr64,
8236                            unop(Iop_8Uto64, mkexpr(t_fetched)),
8237                            mkexpr(t_bitno2)),
8238                      mkU64(maskC))));
8239
8240   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
8241   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8242   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
8243   /* Set NDEP even though it isn't used.  This makes redundant-PUT
8244      elimination of previous stores to this field work better. */
8245   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8246
8247   /* Move reg operand from stack back to reg */
8248   if (epartIsReg(modrm)) {
8249      /* t_rsp still points at it. */
8250      /* only write the reg if actually modifying it; doing otherwise
8251         zeroes the top half erroneously when doing btl due to
8252         standard zero-extend rule */
8253      if (op != BtOpNone)
8254         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
8255      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
8256   }
8257
8258   DIP("bt%s%c %s, %s\n",
8259       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
8260       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );
8261
8262   return delta;
8263}
8264
8265
8266
8267/* Handle BSF/BSR.  Only v-size seems necessary. */
8268static
8269ULong dis_bs_E_G ( const VexAbiInfo* vbi,
8270                   Prefix pfx, Int sz, Long delta, Bool fwds )
8271{
8272   Bool   isReg;
8273   UChar  modrm;
8274   HChar  dis_buf[50];
8275
8276   IRType ty    = szToITy(sz);
8277   IRTemp src   = newTemp(ty);
8278   IRTemp dst   = newTemp(ty);
8279   IRTemp src64 = newTemp(Ity_I64);
8280   IRTemp dst64 = newTemp(Ity_I64);
8281   IRTemp srcB  = newTemp(Ity_I1);
8282
8283   vassert(sz == 8 || sz == 4 || sz == 2);
8284
8285   modrm = getUChar(delta);
8286   isReg = epartIsReg(modrm);
8287   if (isReg) {
8288      delta++;
8289      assign( src, getIRegE(sz, pfx, modrm) );
8290   } else {
8291      Int    len;
8292      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
8293      delta += len;
8294      assign( src, loadLE(ty, mkexpr(addr)) );
8295   }
8296
8297   DIP("bs%c%c %s, %s\n",
8298       fwds ? 'f' : 'r', nameISize(sz),
8299       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
8300       nameIRegG(sz, pfx, modrm));
8301
8302   /* First, widen src to 64 bits if it is not already. */
8303   assign( src64, widenUto64(mkexpr(src)) );
8304
8305   /* Generate a bool expression which is zero iff the original is
8306      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
8307      instrumented by Memcheck, is instrumented expensively, since
8308      this may be used on the output of a preceding movmskb insn,
8309      which has been known to be partially defined, and in need of
8310      careful handling. */
8311   assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );
8312
8313   /* Flags: Z is 1 iff source value is zero.  All others
8314      are undefined -- we force them to zero. */
8315   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
8316   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8317   stmt( IRStmt_Put(
8318            OFFB_CC_DEP1,
8319            IRExpr_ITE( mkexpr(srcB),
8320                        /* src!=0 */
8321                        mkU64(0),
8322                        /* src==0 */
8323                        mkU64(AMD64G_CC_MASK_Z)
8324                        )
8325       ));
8326   /* Set NDEP even though it isn't used.  This makes redundant-PUT
8327      elimination of previous stores to this field work better. */
8328   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8329
8330   /* Result: iff source value is zero, we can't use
8331      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
8332      But anyway, amd64 semantics say the result is undefined in
8333      such situations.  Hence handle the zero case specially. */
8334
8335   /* Bleh.  What we compute:
8336
8337          bsf64:  if src == 0 then {dst is unchanged}
8338                              else Ctz64(src)
8339
8340          bsr64:  if src == 0 then {dst is unchanged}
8341                              else 63 - Clz64(src)
8342
8343          bsf32:  if src == 0 then {dst is unchanged}
8344                              else Ctz64(32Uto64(src))
8345
8346          bsr32:  if src == 0 then {dst is unchanged}
8347                              else 63 - Clz64(32Uto64(src))
8348
8349          bsf16:  if src == 0 then {dst is unchanged}
8350                              else Ctz64(32Uto64(16Uto32(src)))
8351
8352          bsr16:  if src == 0 then {dst is unchanged}
8353                              else 63 - Clz64(32Uto64(16Uto32(src)))
8354   */
8355
8356   /* The main computation, guarding against zero. */
8357   assign( dst64,
8358           IRExpr_ITE(
8359              mkexpr(srcB),
8360              /* src != 0 */
8361              fwds ? unop(Iop_Ctz64, mkexpr(src64))
8362                   : binop(Iop_Sub64,
8363                           mkU64(63),
8364                           unop(Iop_Clz64, mkexpr(src64))),
8365              /* src == 0 -- leave dst unchanged */
8366              widenUto64( getIRegG( sz, pfx, modrm ) )
8367           )
8368         );
8369
8370   if (sz == 2)
8371      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
8372   else
8373   if (sz == 4)
8374      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
8375   else
8376      assign( dst, mkexpr(dst64) );
8377
8378   /* dump result back */
8379   putIRegG( sz, pfx, modrm, mkexpr(dst) );
8380
8381   return delta;
8382}
8383
8384
8385/* swap rAX with the reg specified by reg and REX.B */
8386static
8387void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
8388{
8389   IRType ty = szToITy(sz);
8390   IRTemp t1 = newTemp(ty);
8391   IRTemp t2 = newTemp(ty);
8392   vassert(sz == 2 || sz == 4 || sz == 8);
8393   vassert(regLo3 < 8);
8394   if (sz == 8) {
8395      assign( t1, getIReg64(R_RAX) );
8396      assign( t2, getIRegRexB(8, pfx, regLo3) );
8397      putIReg64( R_RAX, mkexpr(t2) );
8398      putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
8399   } else if (sz == 4) {
8400      assign( t1, getIReg32(R_RAX) );
8401      assign( t2, getIRegRexB(4, pfx, regLo3) );
8402      putIReg32( R_RAX, mkexpr(t2) );
8403      putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
8404   } else {
8405      assign( t1, getIReg16(R_RAX) );
8406      assign( t2, getIRegRexB(2, pfx, regLo3) );
8407      putIReg16( R_RAX, mkexpr(t2) );
8408      putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
8409   }
8410   DIP("xchg%c %s, %s\n",
8411       nameISize(sz), nameIRegRAX(sz),
8412                      nameIRegRexB(sz,pfx, regLo3));
8413}
8414
8415
8416static
8417void codegen_SAHF ( void )
8418{
8419   /* Set the flags to:
8420      (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
8421                                    -- retain the old O flag
8422      | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
8423                |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C)
8424   */
8425   ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
8426                       |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
8427   IRTemp oldflags   = newTemp(Ity_I64);
8428   assign( oldflags, mk_amd64g_calculate_rflags_all() );
8429   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
8430   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8431   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8432   stmt( IRStmt_Put( OFFB_CC_DEP1,
8433         binop(Iop_Or64,
8434               binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
8435               binop(Iop_And64,
8436                     binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
8437                     mkU64(mask_SZACP))
8438              )
8439   ));
8440}
8441
8442
8443static
void codegen_LAHF ( void )
8445{
8446   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
8447   IRExpr* rax_with_hole;
8448   IRExpr* new_byte;
8449   IRExpr* new_rax;
8450   ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
8451                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
8452
8453   IRTemp  flags = newTemp(Ity_I64);
8454   assign( flags, mk_amd64g_calculate_rflags_all() );
8455
8456   rax_with_hole
8457      = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
8458   new_byte
8459      = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
8460                        mkU64(1<<1));
8461   new_rax
8462      = binop(Iop_Or64, rax_with_hole,
8463                        binop(Iop_Shl64, new_byte, mkU8(8)));
8464   putIReg64(R_RAX, new_rax);
8465}
8466
8467
8468static
8469ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
8470                        const VexAbiInfo*  vbi,
8471                        Prefix       pfx,
8472                        Int          size,
8473                        Long         delta0 )
8474{
8475   HChar dis_buf[50];
8476   Int   len;
8477
8478   IRType ty    = szToITy(size);
8479   IRTemp acc   = newTemp(ty);
8480   IRTemp src   = newTemp(ty);
8481   IRTemp dest  = newTemp(ty);
8482   IRTemp dest2 = newTemp(ty);
8483   IRTemp acc2  = newTemp(ty);
8484   IRTemp cond  = newTemp(Ity_I1);
8485   IRTemp addr  = IRTemp_INVALID;
8486   UChar  rm    = getUChar(delta0);
8487
8488   /* There are 3 cases to consider:
8489
8490      reg-reg: ignore any lock prefix, generate sequence based
8491               on ITE
8492
8493      reg-mem, not locked: ignore any lock prefix, generate sequence
8494                           based on ITE
8495
8496      reg-mem, locked: use IRCAS
8497   */
8498
8499   /* Decide whether F2 or F3 are acceptable.  Never for register
8500      case, but for the memory case, one or the other is OK provided
8501      LOCK is also present. */
8502   if (epartIsReg(rm)) {
8503      if (haveF2orF3(pfx)) {
8504         *ok = False;
8505         return delta0;
8506      }
8507   } else {
8508      if (haveF2orF3(pfx)) {
8509         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
8510            *ok = False;
8511            return delta0;
8512         }
8513      }
8514   }
8515
8516   if (epartIsReg(rm)) {
8517      /* case 1 */
8518      assign( dest, getIRegE(size, pfx, rm) );
8519      delta0++;
8520      assign( src, getIRegG(size, pfx, rm) );
8521      assign( acc, getIRegRAX(size) );
8522      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8523      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8524      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
8525      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8526      putIRegRAX(size, mkexpr(acc2));
8527      putIRegE(size, pfx, rm, mkexpr(dest2));
8528      DIP("cmpxchg%c %s,%s\n", nameISize(size),
8529                               nameIRegG(size,pfx,rm),
8530                               nameIRegE(size,pfx,rm) );
8531   }
8532   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
8533      /* case 2 */
8534      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8535      assign( dest, loadLE(ty, mkexpr(addr)) );
8536      delta0 += len;
8537      assign( src, getIRegG(size, pfx, rm) );
8538      assign( acc, getIRegRAX(size) );
8539      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8540      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8541      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
8542      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8543      putIRegRAX(size, mkexpr(acc2));
8544      storeLE( mkexpr(addr), mkexpr(dest2) );
8545      DIP("cmpxchg%c %s,%s\n", nameISize(size),
8546                               nameIRegG(size,pfx,rm), dis_buf);
8547   }
8548   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
8549      /* case 3 */
8550      /* src is new value.  acc is expected value.  dest is old value.
8551         Compute success from the output of the IRCAS, and steer the
8552         new value for RAX accordingly: in case of success, RAX is
8553         unchanged. */
8554      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8555      delta0 += len;
8556      assign( src, getIRegG(size, pfx, rm) );
8557      assign( acc, getIRegRAX(size) );
8558      stmt( IRStmt_CAS(
8559         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
8560                  NULL, mkexpr(acc), NULL, mkexpr(src) )
8561      ));
8562      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8563      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8564      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8565      putIRegRAX(size, mkexpr(acc2));
8566      DIP("cmpxchg%c %s,%s\n", nameISize(size),
8567                               nameIRegG(size,pfx,rm), dis_buf);
8568   }
8569   else vassert(0);
8570
8571   *ok = True;
8572   return delta0;
8573}
8574
8575
8576/* Handle conditional move instructions of the form
8577      cmovcc E(reg-or-mem), G(reg)
8578
8579   E(src) is reg-or-mem
8580   G(dst) is reg.
8581
8582   If E is reg, -->    GET %E, tmps
8583                       GET %G, tmpd
8584                       CMOVcc tmps, tmpd
8585                       PUT tmpd, %G
8586
8587   If E is mem  -->    (getAddr E) -> tmpa
8588                       LD (tmpa), tmps
8589                       GET %G, tmpd
8590                       CMOVcc tmps, tmpd
8591                       PUT tmpd, %G
8592*/
8593static
8594ULong dis_cmov_E_G ( const VexAbiInfo* vbi,
8595                     Prefix        pfx,
8596                     Int           sz,
8597                     AMD64Condcode cond,
8598                     Long          delta0 )
8599{
8600   UChar rm  = getUChar(delta0);
8601   HChar dis_buf[50];
8602   Int   len;
8603
8604   IRType ty   = szToITy(sz);
8605   IRTemp tmps = newTemp(ty);
8606   IRTemp tmpd = newTemp(ty);
8607
8608   if (epartIsReg(rm)) {
8609      assign( tmps, getIRegE(sz, pfx, rm) );
8610      assign( tmpd, getIRegG(sz, pfx, rm) );
8611
8612      putIRegG( sz, pfx, rm,
8613                IRExpr_ITE( mk_amd64g_calculate_condition(cond),
8614                            mkexpr(tmps),
8615                            mkexpr(tmpd) )
8616              );
8617      DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
8618                            nameIRegE(sz,pfx,rm),
8619                            nameIRegG(sz,pfx,rm));
8620      return 1+delta0;
8621   }
8622
8623   /* E refers to memory */
8624   {
8625      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8626      assign( tmps, loadLE(ty, mkexpr(addr)) );
8627      assign( tmpd, getIRegG(sz, pfx, rm) );
8628
8629      putIRegG( sz, pfx, rm,
8630                IRExpr_ITE( mk_amd64g_calculate_condition(cond),
8631                            mkexpr(tmps),
8632                            mkexpr(tmpd) )
8633              );
8634
8635      DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
8636                            dis_buf,
8637                            nameIRegG(sz,pfx,rm));
8638      return len+delta0;
8639   }
8640}
8641
8642
8643static
8644ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
8645                     const VexAbiInfo* vbi,
8646                     Prefix pfx, Int sz, Long delta0 )
8647{
8648   Int   len;
8649   UChar rm = getUChar(delta0);
8650   HChar dis_buf[50];
8651
8652   IRType ty    = szToITy(sz);
8653   IRTemp tmpd  = newTemp(ty);
8654   IRTemp tmpt0 = newTemp(ty);
8655   IRTemp tmpt1 = newTemp(ty);
8656
8657   /* There are 3 cases to consider:
8658
8659      reg-reg: ignore any lock prefix,
8660               generate 'naive' (non-atomic) sequence
8661
8662      reg-mem, not locked: ignore any lock prefix, generate 'naive'
8663                           (non-atomic) sequence
8664
8665      reg-mem, locked: use IRCAS
8666   */
8667
8668   if (epartIsReg(rm)) {
8669      /* case 1 */
8670      assign( tmpd, getIRegE(sz, pfx, rm) );
8671      assign( tmpt0, getIRegG(sz, pfx, rm) );
8672      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8673                           mkexpr(tmpd), mkexpr(tmpt0)) );
8674      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8675      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8676      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
8677      DIP("xadd%c %s, %s\n",
8678          nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
8679      *decode_ok = True;
8680      return 1+delta0;
8681   }
8682   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
8683      /* case 2 */
8684      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8685      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
8686      assign( tmpt0, getIRegG(sz, pfx, rm) );
8687      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8688                           mkexpr(tmpd), mkexpr(tmpt0)) );
8689      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8690      storeLE( mkexpr(addr), mkexpr(tmpt1) );
8691      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8692      DIP("xadd%c %s, %s\n",
8693          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
8694      *decode_ok = True;
8695      return len+delta0;
8696   }
8697   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
8698      /* case 3 */
8699      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8700      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
8701      assign( tmpt0, getIRegG(sz, pfx, rm) );
8702      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8703                           mkexpr(tmpd), mkexpr(tmpt0)) );
8704      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
8705                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
8706      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8707      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8708      DIP("xadd%c %s, %s\n",
8709          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
8710      *decode_ok = True;
8711      return len+delta0;
8712   }
8713   /*UNREACHED*/
8714   vassert(0);
8715}
8716
8717//.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
8718//..
8719//.. static
8720//.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
8721//.. {
8722//..    Int    len;
8723//..    IRTemp addr;
8724//..    UChar  rm  = getUChar(delta0);
8725//..    HChar  dis_buf[50];
8726//..
8727//..    if (epartIsReg(rm)) {
8728//..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
8729//..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
8730//..       return 1+delta0;
8731//..    } else {
8732//..       addr = disAMode ( &len, sorb, delta0, dis_buf );
8733//..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
8734//..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
8735//..       return len+delta0;
8736//..    }
8737//.. }
8738//..
8739//.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
8740//..    dst is ireg and sz==4, zero out top half of it.  */
8741//..
8742//.. static
8743//.. UInt dis_mov_Sw_Ew ( UChar sorb,
8744//..                      Int   sz,
8745//..                      UInt  delta0 )
8746//.. {
8747//..    Int    len;
8748//..    IRTemp addr;
8749//..    UChar  rm  = getUChar(delta0);
8750//..    HChar  dis_buf[50];
8751//..
8752//..    vassert(sz == 2 || sz == 4);
8753//..
8754//..    if (epartIsReg(rm)) {
8755//..       if (sz == 4)
8756//..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
8757//..       else
8758//..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
8759//..
8760//..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
8761//..       return 1+delta0;
8762//..    } else {
8763//..       addr = disAMode ( &len, sorb, delta0, dis_buf );
8764//..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
8765//..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
8766//..       return len+delta0;
8767//..    }
8768//.. }
8769
8770/* Handle move instructions of the form
8771      mov S, E  meaning
8772      mov sreg, reg-or-mem
   Is passed a ptr to the modRM byte, and the data size.  Returns
8774   the address advanced completely over this instruction.
8775
8776   VEX does not currently simulate segment registers on AMD64 which means that
8777   instead of moving a value of a segment register, zero is moved to the
8778   destination.  The zero value represents a null (unused) selector.  This is
8779   not correct (especially for the %cs, %fs and %gs registers) but it seems to
8780   provide a sufficient simulation for currently seen programs that use this
8781   instruction.  If some program actually decides to use the obtained segment
8782   selector for something meaningful then the zero value should be a clear
8783   indicator that there is some problem.
8784
8785   S(src) is sreg.
8786   E(dst) is reg-or-mem
8787
8788   If E is reg, -->    PUT $0, %E
8789
8790   If E is mem, -->    (getAddr E) -> tmpa
8791                       ST $0, (tmpa)
8792*/
8793static
8794ULong dis_mov_S_E ( const VexAbiInfo* vbi,
8795                    Prefix      pfx,
8796                    Int         size,
8797                    Long        delta0 )
8798{
8799   Int   len;
8800   UChar rm = getUChar(delta0);
8801   HChar dis_buf[50];
8802
8803   if (epartIsReg(rm)) {
8804      putIRegE(size, pfx, rm, mkU(szToITy(size), 0));
8805      DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
8806                         nameIRegE(size, pfx, rm));
8807      return 1+delta0;
8808   }
8809
8810   /* E refers to memory */
8811   {
8812      IRTemp addr = disAMode(&len, vbi, pfx, delta0, dis_buf, 0);
8813      storeLE(mkexpr(addr), mkU16(0));
8814      DIP("mov %s,%s\n", nameSReg(gregOfRexRM(pfx, rm)),
8815                         dis_buf);
8816      return len+delta0;
8817   }
8818}
8819
8820//.. static
8821//.. void dis_push_segreg ( UInt sreg, Int sz )
8822//.. {
8823//..     IRTemp t1 = newTemp(Ity_I16);
8824//..     IRTemp ta = newTemp(Ity_I32);
8825//..     vassert(sz == 2 || sz == 4);
8826//..
8827//..     assign( t1, getSReg(sreg) );
8828//..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
8829//..     putIReg(4, R_ESP, mkexpr(ta));
8830//..     storeLE( mkexpr(ta), mkexpr(t1) );
8831//..
8832//..     DIP("pushw %s\n", nameSReg(sreg));
8833//.. }
8834//..
8835//.. static
8836//.. void dis_pop_segreg ( UInt sreg, Int sz )
8837//.. {
8838//..     IRTemp t1 = newTemp(Ity_I16);
8839//..     IRTemp ta = newTemp(Ity_I32);
8840//..     vassert(sz == 2 || sz == 4);
8841//..
8842//..     assign( ta, getIReg(4, R_ESP) );
8843//..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
8844//..
8845//..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
8846//..     putSReg( sreg, mkexpr(t1) );
8847//..     DIP("pop %s\n", nameSReg(sreg));
8848//.. }
8849
8850static
8851void dis_ret ( /*MOD*/DisResult* dres, const VexAbiInfo* vbi, ULong d64 )
8852{
8853   IRTemp t1 = newTemp(Ity_I64);
8854   IRTemp t2 = newTemp(Ity_I64);
8855   IRTemp t3 = newTemp(Ity_I64);
8856   assign(t1, getIReg64(R_RSP));
8857   assign(t2, loadLE(Ity_I64,mkexpr(t1)));
8858   assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
8859   putIReg64(R_RSP, mkexpr(t3));
8860   make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
8861   jmp_treg(dres, Ijk_Ret, t2);
8862   vassert(dres->whatNext == Dis_StopHere);
8863}
8864
8865
8866/*------------------------------------------------------------*/
8867/*--- SSE/SSE2/SSE3 helpers                                ---*/
8868/*------------------------------------------------------------*/
8869
8870/* Indicates whether the op requires a rounding-mode argument.  Note
8871   that this covers only vector floating point arithmetic ops, and
8872   omits the scalar ones that need rounding modes.  Note also that
8873   inconsistencies here will get picked up later by the IR sanity
8874   checker, so this isn't correctness-critical. */
8875static Bool requiresRMode ( IROp op )
8876{
8877   switch (op) {
8878      /* 128 bit ops */
8879      case Iop_Add32Fx4: case Iop_Sub32Fx4:
8880      case Iop_Mul32Fx4: case Iop_Div32Fx4:
8881      case Iop_Add64Fx2: case Iop_Sub64Fx2:
8882      case Iop_Mul64Fx2: case Iop_Div64Fx2:
8883      /* 256 bit ops */
8884      case Iop_Add32Fx8: case Iop_Sub32Fx8:
8885      case Iop_Mul32Fx8: case Iop_Div32Fx8:
8886      case Iop_Add64Fx4: case Iop_Sub64Fx4:
8887      case Iop_Mul64Fx4: case Iop_Div64Fx4:
8888         return True;
8889      default:
8890         break;
8891   }
8892   return False;
8893}
8894
8895
8896/* Worker function; do not call directly.
8897   Handles full width G = G `op` E   and   G = (not G) `op` E.
8898*/
8899
8900static ULong dis_SSE_E_to_G_all_wrk (
8901                const VexAbiInfo* vbi,
8902                Prefix pfx, Long delta,
8903                const HChar* opname, IROp op,
8904                Bool   invertG
8905             )
8906{
8907   HChar   dis_buf[50];
8908   Int     alen;
8909   IRTemp  addr;
8910   UChar   rm = getUChar(delta);
8911   Bool    needsRMode = requiresRMode(op);
8912   IRExpr* gpart
8913      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
8914                : getXMMReg(gregOfRexRM(pfx,rm));
8915   if (epartIsReg(rm)) {
8916      putXMMReg(
8917         gregOfRexRM(pfx,rm),
8918         needsRMode
8919            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
8920                        gpart,
8921                        getXMMReg(eregOfRexRM(pfx,rm)))
8922            : binop(op, gpart,
8923                        getXMMReg(eregOfRexRM(pfx,rm)))
8924      );
8925      DIP("%s %s,%s\n", opname,
8926                        nameXMMReg(eregOfRexRM(pfx,rm)),
8927                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8928      return delta+1;
8929   } else {
8930      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8931      putXMMReg(
8932         gregOfRexRM(pfx,rm),
8933         needsRMode
8934            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
8935                        gpart,
8936                        loadLE(Ity_V128, mkexpr(addr)))
8937            : binop(op, gpart,
8938                        loadLE(Ity_V128, mkexpr(addr)))
8939      );
8940      DIP("%s %s,%s\n", opname,
8941                        dis_buf,
8942                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8943      return delta+alen;
8944   }
8945}
8946
8947
8948/* All lanes SSE binary operation, G = G `op` E. */
8949
8950static
8951ULong dis_SSE_E_to_G_all ( const VexAbiInfo* vbi,
8952                           Prefix pfx, Long delta,
8953                           const HChar* opname, IROp op )
8954{
8955   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
8956}
8957
8958/* All lanes SSE binary operation, G = (not G) `op` E. */
8959
8960static
8961ULong dis_SSE_E_to_G_all_invG ( const VexAbiInfo* vbi,
8962                                Prefix pfx, Long delta,
8963                                const HChar* opname, IROp op )
8964{
8965   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
8966}
8967
8968
8969/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
8970
8971static ULong dis_SSE_E_to_G_lo32 ( const VexAbiInfo* vbi,
8972                                   Prefix pfx, Long delta,
8973                                   const HChar* opname, IROp op )
8974{
8975   HChar   dis_buf[50];
8976   Int     alen;
8977   IRTemp  addr;
8978   UChar   rm = getUChar(delta);
8979   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
8980   if (epartIsReg(rm)) {
8981      putXMMReg( gregOfRexRM(pfx,rm),
8982                 binop(op, gpart,
8983                           getXMMReg(eregOfRexRM(pfx,rm))) );
8984      DIP("%s %s,%s\n", opname,
8985                        nameXMMReg(eregOfRexRM(pfx,rm)),
8986                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8987      return delta+1;
8988   } else {
8989      /* We can only do a 32-bit memory read, so the upper 3/4 of the
8990         E operand needs to be made simply of zeroes. */
8991      IRTemp epart = newTemp(Ity_V128);
8992      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8993      assign( epart, unop( Iop_32UtoV128,
8994                           loadLE(Ity_I32, mkexpr(addr))) );
8995      putXMMReg( gregOfRexRM(pfx,rm),
8996                 binop(op, gpart, mkexpr(epart)) );
8997      DIP("%s %s,%s\n", opname,
8998                        dis_buf,
8999                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9000      return delta+alen;
9001   }
9002}
9003
9004
9005/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
9006
9007static ULong dis_SSE_E_to_G_lo64 ( const VexAbiInfo* vbi,
9008                                   Prefix pfx, Long delta,
9009                                   const HChar* opname, IROp op )
9010{
9011   HChar   dis_buf[50];
9012   Int     alen;
9013   IRTemp  addr;
9014   UChar   rm = getUChar(delta);
9015   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
9016   if (epartIsReg(rm)) {
9017      putXMMReg( gregOfRexRM(pfx,rm),
9018                 binop(op, gpart,
9019                           getXMMReg(eregOfRexRM(pfx,rm))) );
9020      DIP("%s %s,%s\n", opname,
9021                        nameXMMReg(eregOfRexRM(pfx,rm)),
9022                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9023      return delta+1;
9024   } else {
9025      /* We can only do a 64-bit memory read, so the upper half of the
9026         E operand needs to be made simply of zeroes. */
9027      IRTemp epart = newTemp(Ity_V128);
9028      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9029      assign( epart, unop( Iop_64UtoV128,
9030                           loadLE(Ity_I64, mkexpr(addr))) );
9031      putXMMReg( gregOfRexRM(pfx,rm),
9032                 binop(op, gpart, mkexpr(epart)) );
9033      DIP("%s %s,%s\n", opname,
9034                        dis_buf,
9035                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9036      return delta+alen;
9037   }
9038}
9039
9040
9041/* All lanes unary SSE operation, G = op(E). */
9042
9043static ULong dis_SSE_E_to_G_unary_all (
9044                const VexAbiInfo* vbi,
9045                Prefix pfx, Long delta,
9046                const HChar* opname, IROp op
9047             )
9048{
9049   HChar   dis_buf[50];
9050   Int     alen;
9051   IRTemp  addr;
9052   UChar   rm = getUChar(delta);
9053   // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
9054   // up in the usual way.
9055   Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
9056   if (epartIsReg(rm)) {
9057      IRExpr* src = getXMMReg(eregOfRexRM(pfx,rm));
9058      /* XXXROUNDINGFIXME */
9059      IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
9060                              : unop(op, src);
9061      putXMMReg( gregOfRexRM(pfx,rm), res );
9062      DIP("%s %s,%s\n", opname,
9063                        nameXMMReg(eregOfRexRM(pfx,rm)),
9064                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9065      return delta+1;
9066   } else {
9067      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9068      IRExpr* src = loadLE(Ity_V128, mkexpr(addr));
9069      /* XXXROUNDINGFIXME */
9070      IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), src)
9071                              : unop(op, src);
9072      putXMMReg( gregOfRexRM(pfx,rm), res );
9073      DIP("%s %s,%s\n", opname,
9074                        dis_buf,
9075                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9076      return delta+alen;
9077   }
9078}
9079
9080
9081/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
9082
9083static ULong dis_SSE_E_to_G_unary_lo32 (
9084                const VexAbiInfo* vbi,
9085                Prefix pfx, Long delta,
9086                const HChar* opname, IROp op
9087             )
9088{
9089   /* First we need to get the old G value and patch the low 32 bits
9090      of the E operand into it.  Then apply op and write back to G. */
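   /* (The ops used here are expected to be the lowest-lane-only
      ("F0x4") variants, which pass the upper lanes of their argument
      through unchanged.  Hence lane 0 of the result is op applied to
      E's lane 0, and lanes 1..3 keep the old G values, matching the
      scalar insn semantics.) */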
9091   HChar   dis_buf[50];
9092   Int     alen;
9093   IRTemp  addr;
9094   UChar   rm = getUChar(delta);
9095   IRTemp  oldG0 = newTemp(Ity_V128);
9096   IRTemp  oldG1 = newTemp(Ity_V128);
9097
9098   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
9099
9100   if (epartIsReg(rm)) {
9101      assign( oldG1,
9102              binop( Iop_SetV128lo32,
9103                     mkexpr(oldG0),
9104                     getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
9105      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
9106      DIP("%s %s,%s\n", opname,
9107                        nameXMMReg(eregOfRexRM(pfx,rm)),
9108                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9109      return delta+1;
9110   } else {
9111      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9112      assign( oldG1,
9113              binop( Iop_SetV128lo32,
9114                     mkexpr(oldG0),
9115                     loadLE(Ity_I32, mkexpr(addr)) ));
9116      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
9117      DIP("%s %s,%s\n", opname,
9118                        dis_buf,
9119                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9120      return delta+alen;
9121   }
9122}
9123
9124
9125/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
9126
9127static ULong dis_SSE_E_to_G_unary_lo64 (
9128                const VexAbiInfo* vbi,
9129                Prefix pfx, Long delta,
9130                const HChar* opname, IROp op
9131             )
9132{
9133   /* First we need to get the old G value and patch the low 64 bits
9134      of the E operand into it.  Then apply op and write back to G. */
9135   HChar   dis_buf[50];
9136   Int     alen;
9137   IRTemp  addr;
9138   UChar   rm = getUChar(delta);
9139   IRTemp  oldG0 = newTemp(Ity_V128);
9140   IRTemp  oldG1 = newTemp(Ity_V128);
9141
9142   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
9143
9144   if (epartIsReg(rm)) {
9145      assign( oldG1,
9146              binop( Iop_SetV128lo64,
9147                     mkexpr(oldG0),
9148                     getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
9149      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
9150      DIP("%s %s,%s\n", opname,
9151                        nameXMMReg(eregOfRexRM(pfx,rm)),
9152                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9153      return delta+1;
9154   } else {
9155      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9156      assign( oldG1,
9157              binop( Iop_SetV128lo64,
9158                     mkexpr(oldG0),
9159                     loadLE(Ity_I64, mkexpr(addr)) ));
9160      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
9161      DIP("%s %s,%s\n", opname,
9162                        dis_buf,
9163                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9164      return delta+alen;
9165   }
9166}
9167
9168
9169/* SSE integer binary operation:
9170      G = G `op` E   (eLeft == False)
9171      G = E `op` G   (eLeft == True)
9172*/
9173static ULong dis_SSEint_E_to_G(
9174                const VexAbiInfo* vbi,
9175                Prefix pfx, Long delta,
9176                const HChar* opname, IROp op,
9177                Bool   eLeft
9178             )
9179{
9180   HChar   dis_buf[50];
9181   Int     alen;
9182   IRTemp  addr;
9183   UChar   rm = getUChar(delta);
9184   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
9185   IRExpr* epart = NULL;
9186   if (epartIsReg(rm)) {
9187      epart = getXMMReg(eregOfRexRM(pfx,rm));
9188      DIP("%s %s,%s\n", opname,
9189                        nameXMMReg(eregOfRexRM(pfx,rm)),
9190                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9191      delta += 1;
9192   } else {
9193      addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9194      epart = loadLE(Ity_V128, mkexpr(addr));
9195      DIP("%s %s,%s\n", opname,
9196                        dis_buf,
9197                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9198      delta += alen;
9199   }
9200   putXMMReg( gregOfRexRM(pfx,rm),
9201              eLeft ? binop(op, epart, gpart)
9202                    : binop(op, gpart, epart) );
9203   return delta;
9204}
9205
9206
9207/* Helper for doing SSE FP comparisons.  False return ==> unhandled.
9208   This is all a bit of a kludge in that it ignores the subtleties of
9209   ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
9210   spec. */
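/* For example (illustrative): imm8 0x5 (NLT_US) produces the triple
   (preSwap=False, Iop_CmpLT32Fx4, postNot=True), computing !(a < b),
   whilst imm8 0xD (GE_OS) produces (True, Iop_CmpLE32Fx4, False),
   computing a >= b as b <= a. */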
9211static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
9212                           /*OUT*/IROp* opP,
9213                           /*OUT*/Bool* postNotP,
9214                           UInt imm8, Bool all_lanes, Int sz )
9215{
9216   if (imm8 >= 32) return False;
9217
9218   /* First, compute a (preSwap, op, postNot) triple from
9219      the supplied imm8. */
9220   Bool pre = False;
9221   IROp op  = Iop_INVALID;
9222   Bool not = False;
9223
9224#  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
9225   // If you add a case here, add a corresponding test for both VCMPSD_128
9226   // and VCMPSS_128 in avx-1.c.
   // Cases 0x8 and above are
   //    "Enhanced Comparison Predicate[s] for VEX-Encoded [insns]"
9229   switch (imm8) {
9230      // "O" = ordered, "U" = unordered
9231      // "Q" = non-signalling (quiet), "S" = signalling
9232      //
9233      //             swap operands?
9234      //             |
9235      //             |      cmp op          invert after?
9236      //             |      |               |
9237      //             v      v               v
9238      case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
9239      case 0x8:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
9240      case 0x10: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OS
9241      case 0x18: XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_US
9242      //
9243      case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
9244      case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
9245      //
9246      case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
9247      case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
9248      //
9249      case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
9250      case 0x13: XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_S
9251      //
9252      // 0xC: this isn't really right because it returns all-1s when
9253      // either operand is a NaN, and it should return all-0s.
9254      case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
9255      case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
9256      case 0x14: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_US
9257      case 0x1C: XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OS
9258      //
9259      case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
9260      case 0x15: XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_UQ
9261      //
9262      case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
9263      case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
9264      //
9265      case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
9266      case 0x17: XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_S
9267      //
9268      case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
9269      case 0x19: XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_UQ
9270      //
9271      case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
9272      case 0x1A: XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_UQ
9273      //
9274      case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
9275      case 0x1D: XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OQ
9276      //
9277      case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
9278      case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
9279      // Unhandled:
9280      // 0xB  FALSE_OQ
9281      // 0xF  TRUE_UQ
9282      // 0x1B  FALSE_OS
9283      // 0x1F  TRUE_US
9284      /* Don't forget to add test cases to VCMPSS_128_<imm8> in
9285         avx-1.c if new cases turn up. */
9286      default: break;
9287   }
9288#  undef XXX
9289   if (op == Iop_INVALID) return False;
9290
9291   /* Now convert the op into one with the same arithmetic but that is
9292      correct for the width and laneage requirements. */
9293
9294   /**/ if (sz == 4 && all_lanes) {
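      /* The 4x32F cases below are no-ops -- op is already in that
         form -- but are kept for symmetry with the other clauses. */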
9295      switch (op) {
9296         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
9297         case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
9298         case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
9299         case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
9300         default: vassert(0);
9301      }
9302   }
9303   else if (sz == 4 && !all_lanes) {
9304      switch (op) {
9305         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
9306         case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
9307         case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
9308         case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
9309         default: vassert(0);
9310      }
9311   }
9312   else if (sz == 8 && all_lanes) {
9313      switch (op) {
9314         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
9315         case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
9316         case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
9317         case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
9318         default: vassert(0);
9319      }
9320   }
9321   else if (sz == 8 && !all_lanes) {
9322      switch (op) {
9323         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
9324         case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
9325         case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
9326         case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
9327         default: vassert(0);
9328      }
9329   }
9330   else {
9331      vpanic("findSSECmpOp(amd64,guest)");
9332   }
9333
9334   *preSwapP = pre; *opP = op; *postNotP = not;
9335   return True;
9336}
9337
9338
9339/* Handles SSE 32F/64F comparisons.  It can fail, in which case it
9340   returns the original delta to indicate failure. */
9341
9342static Long dis_SSE_cmp_E_to_G ( const VexAbiInfo* vbi,
9343                                 Prefix pfx, Long delta,
9344                                 const HChar* opname, Bool all_lanes, Int sz )
9345{
9346   Long    delta0 = delta;
9347   HChar   dis_buf[50];
9348   Int     alen;
9349   UInt    imm8;
9350   IRTemp  addr;
9351   Bool    preSwap = False;
9352   IROp    op      = Iop_INVALID;
9353   Bool    postNot = False;
9354   IRTemp  plain   = newTemp(Ity_V128);
9355   UChar   rm      = getUChar(delta);
9356   UShort  mask    = 0;
9357   vassert(sz == 4 || sz == 8);
9358   if (epartIsReg(rm)) {
9359      imm8 = getUChar(delta+1);
9360      if (imm8 >= 8) return delta0; /* FAIL */
9361      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
9362      if (!ok) return delta0; /* FAIL */
9363      vassert(!preSwap); /* never needed for imm8 < 8 */
9364      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
9365                               getXMMReg(eregOfRexRM(pfx,rm))) );
9366      delta += 2;
9367      DIP("%s $%u,%s,%s\n", opname,
9368                            imm8,
9369                            nameXMMReg(eregOfRexRM(pfx,rm)),
9370                            nameXMMReg(gregOfRexRM(pfx,rm)) );
9371   } else {
9372      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
9373      imm8 = getUChar(delta+alen);
9374      if (imm8 >= 8) return delta0; /* FAIL */
9375      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
9376      if (!ok) return delta0; /* FAIL */
9377      vassert(!preSwap); /* never needed for imm8 < 8 */
9378      assign( plain,
9379              binop(
9380                 op,
9381                 getXMMReg(gregOfRexRM(pfx,rm)),
9382                   all_lanes
9383                      ? loadLE(Ity_V128, mkexpr(addr))
9384                   : sz == 8
9385                      ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
9386                   : /*sz==4*/
9387                      unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
9388              )
9389      );
9390      delta += alen+1;
9391      DIP("%s $%u,%s,%s\n", opname,
9392                            imm8,
9393                            dis_buf,
9394                            nameXMMReg(gregOfRexRM(pfx,rm)) );
9395   }
9396
9397   if (postNot && all_lanes) {
9398      putXMMReg( gregOfRexRM(pfx,rm),
9399                 unop(Iop_NotV128, mkexpr(plain)) );
9400   }
9401   else
9402   if (postNot && !all_lanes) {
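      /* mkV128 expands each of its 16 bits to a whole byte of the
         result, so 0x000F selects the low 32-bit lane and 0x00FF the
         low 64-bit lane. */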
9403      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
9404      putXMMReg( gregOfRexRM(pfx,rm),
9405                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
9406   }
9407   else {
9408      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
9409   }
9410
9411   return delta;
9412}
9413
9414
9415/* Vector by scalar shift of G by the amount specified at the bottom
9416   of E. */
9417
9418static ULong dis_SSE_shiftG_byE ( const VexAbiInfo* vbi,
9419                                  Prefix pfx, Long delta,
9420                                  const HChar* opname, IROp op )
9421{
9422   HChar   dis_buf[50];
9423   Int     alen, size;
9424   IRTemp  addr;
9425   Bool    shl, shr, sar;
9426   UChar   rm   = getUChar(delta);
9427   IRTemp  g0   = newTemp(Ity_V128);
9428   IRTemp  g1   = newTemp(Ity_V128);
9429   IRTemp  amt  = newTemp(Ity_I64);
9430   IRTemp  amt8 = newTemp(Ity_I8);
9431   if (epartIsReg(rm)) {
9432      assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
9433      DIP("%s %s,%s\n", opname,
9434                        nameXMMReg(eregOfRexRM(pfx,rm)),
9435                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9436      delta++;
9437   } else {
9438      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
9439      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
9440      DIP("%s %s,%s\n", opname,
9441                        dis_buf,
9442                        nameXMMReg(gregOfRexRM(pfx,rm)) );
9443      delta += alen;
9444   }
9445   assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
9446   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
9447
9448   shl = shr = sar = False;
9449   size = 0;
9450   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
9452      case Iop_ShlN32x4: shl = True; size = 32; break;
9453      case Iop_ShlN64x2: shl = True; size = 64; break;
9454      case Iop_SarN16x8: sar = True; size = 16; break;
9455      case Iop_SarN32x4: sar = True; size = 32; break;
9456      case Iop_ShrN16x8: shr = True; size = 16; break;
9457      case Iop_ShrN32x4: shr = True; size = 32; break;
9458      case Iop_ShrN64x2: shr = True; size = 64; break;
9459      default: vassert(0);
9460   }
9461
9462   if (shl || shr) {
9463     assign(
9464        g1,
9465        IRExpr_ITE(
9466           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
9467           binop(op, mkexpr(g0), mkexpr(amt8)),
9468           mkV128(0x0000)
9469        )
9470     );
9471   } else
9472   if (sar) {
9473     assign(
9474        g1,
9475        IRExpr_ITE(
9476           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
9477           binop(op, mkexpr(g0), mkexpr(amt8)),
9478           binop(op, mkexpr(g0), mkU8(size-1))
9479        )
9480     );
9481   } else {
9482      vassert(0);
9483   }
9484
9485   putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
9486   return delta;
9487}
9488
9489
9490/* Vector by scalar shift of E by an immediate byte. */
9491
9492static
9493ULong dis_SSE_shiftE_imm ( Prefix pfx,
9494                           Long delta, const HChar* opname, IROp op )
9495{
9496   Bool    shl, shr, sar;
9497   UChar   rm   = getUChar(delta);
9498   IRTemp  e0   = newTemp(Ity_V128);
9499   IRTemp  e1   = newTemp(Ity_V128);
9500   UChar   amt, size;
9501   vassert(epartIsReg(rm));
9502   vassert(gregLO3ofRM(rm) == 2
9503           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
9504   amt = getUChar(delta+1);
9505   delta += 2;
9506   DIP("%s $%d,%s\n", opname,
9507                      (Int)amt,
9508                      nameXMMReg(eregOfRexRM(pfx,rm)) );
9509   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
9510
9511   shl = shr = sar = False;
9512   size = 0;
9513   switch (op) {
9514      case Iop_ShlN16x8: shl = True; size = 16; break;
9515      case Iop_ShlN32x4: shl = True; size = 32; break;
9516      case Iop_ShlN64x2: shl = True; size = 64; break;
9517      case Iop_SarN16x8: sar = True; size = 16; break;
9518      case Iop_SarN32x4: sar = True; size = 32; break;
9519      case Iop_ShrN16x8: shr = True; size = 16; break;
9520      case Iop_ShrN32x4: shr = True; size = 32; break;
9521      case Iop_ShrN64x2: shr = True; size = 64; break;
9522      default: vassert(0);
9523   }
9524
9525   if (shl || shr) {
9526     assign( e1, amt >= size
9527                    ? mkV128(0x0000)
9528                    : binop(op, mkexpr(e0), mkU8(amt))
9529     );
9530   } else
9531   if (sar) {
9532     assign( e1, amt >= size
9533                    ? binop(op, mkexpr(e0), mkU8(size-1))
9534                    : binop(op, mkexpr(e0), mkU8(amt))
9535     );
9536   } else {
9537      vassert(0);
9538   }
9539
9540   putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
9541   return delta;
9542}
9543
9544
9545/* Get the current SSE rounding mode. */
9546
9547static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
9548{
9549   return
9550      unop( Iop_64to32,
9551            binop( Iop_And64,
9552                   IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
9553                   mkU64(3) ));
9554}
9555
9556static void put_sse_roundingmode ( IRExpr* sseround )
9557{
9558   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
9559   stmt( IRStmt_Put( OFFB_SSEROUND,
9560                     unop(Iop_32Uto64,sseround) ) );
9561}
9562
9563/* Break a V128-bit value up into four 32-bit ints. */
9564
9565static void breakupV128to32s ( IRTemp t128,
9566                               /*OUTs*/
9567                               IRTemp* t3, IRTemp* t2,
9568                               IRTemp* t1, IRTemp* t0 )
9569{
9570   IRTemp hi64 = newTemp(Ity_I64);
9571   IRTemp lo64 = newTemp(Ity_I64);
9572   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
9573   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
9574
9575   vassert(t0 && *t0 == IRTemp_INVALID);
9576   vassert(t1 && *t1 == IRTemp_INVALID);
9577   vassert(t2 && *t2 == IRTemp_INVALID);
9578   vassert(t3 && *t3 == IRTemp_INVALID);
9579
9580   *t0 = newTemp(Ity_I32);
9581   *t1 = newTemp(Ity_I32);
9582   *t2 = newTemp(Ity_I32);
9583   *t3 = newTemp(Ity_I32);
9584   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
9585   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
9586   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
9587   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
9588}
9589
9590/* Construct a V128-bit value from four 32-bit ints. */
9591
9592static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
9593                               IRTemp t1, IRTemp t0 )
9594{
9595   return
9596      binop( Iop_64HLtoV128,
9597             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
9598             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
9599   );
9600}
9601
9602/* Break a 64-bit value up into four 16-bit ints. */
9603
9604static void breakup64to16s ( IRTemp t64,
9605                             /*OUTs*/
9606                             IRTemp* t3, IRTemp* t2,
9607                             IRTemp* t1, IRTemp* t0 )
9608{
9609   IRTemp hi32 = newTemp(Ity_I32);
9610   IRTemp lo32 = newTemp(Ity_I32);
9611   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
9612   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
9613
9614   vassert(t0 && *t0 == IRTemp_INVALID);
9615   vassert(t1 && *t1 == IRTemp_INVALID);
9616   vassert(t2 && *t2 == IRTemp_INVALID);
9617   vassert(t3 && *t3 == IRTemp_INVALID);
9618
9619   *t0 = newTemp(Ity_I16);
9620   *t1 = newTemp(Ity_I16);
9621   *t2 = newTemp(Ity_I16);
9622   *t3 = newTemp(Ity_I16);
9623   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
9624   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
9625   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
9626   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
9627}
9628
9629/* Construct a 64-bit value from four 16-bit ints. */
9630
9631static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
9632                             IRTemp t1, IRTemp t0 )
9633{
9634   return
9635      binop( Iop_32HLto64,
9636             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
9637             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
9638   );
9639}
9640
9641/* Break a V256-bit value up into four 64-bit ints. */
9642
9643static void breakupV256to64s ( IRTemp t256,
9644                               /*OUTs*/
9645                               IRTemp* t3, IRTemp* t2,
9646                               IRTemp* t1, IRTemp* t0 )
9647{
9648   vassert(t0 && *t0 == IRTemp_INVALID);
9649   vassert(t1 && *t1 == IRTemp_INVALID);
9650   vassert(t2 && *t2 == IRTemp_INVALID);
9651   vassert(t3 && *t3 == IRTemp_INVALID);
9652   *t0 = newTemp(Ity_I64);
9653   *t1 = newTemp(Ity_I64);
9654   *t2 = newTemp(Ity_I64);
9655   *t3 = newTemp(Ity_I64);
9656   assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
9657   assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
9658   assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
9659   assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
9660}
9661
9662/* Break a V256-bit value up into two V128s. */
9663
9664static void breakupV256toV128s ( IRTemp t256,
9665                                 /*OUTs*/
9666                                 IRTemp* t1, IRTemp* t0 )
9667{
9668   vassert(t0 && *t0 == IRTemp_INVALID);
9669   vassert(t1 && *t1 == IRTemp_INVALID);
9670   *t0 = newTemp(Ity_V128);
9671   *t1 = newTemp(Ity_V128);
9672   assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
9673   assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
9674}
9675
9676/* Break a V256-bit value up into eight 32-bit ints.  */
9677
9678static void breakupV256to32s ( IRTemp t256,
9679                               /*OUTs*/
9680                               IRTemp* t7, IRTemp* t6,
9681                               IRTemp* t5, IRTemp* t4,
9682                               IRTemp* t3, IRTemp* t2,
9683                               IRTemp* t1, IRTemp* t0 )
9684{
9685   IRTemp t128_1 = IRTemp_INVALID;
9686   IRTemp t128_0 = IRTemp_INVALID;
9687   breakupV256toV128s( t256, &t128_1, &t128_0 );
9688   breakupV128to32s( t128_1, t7, t6, t5, t4 );
9689   breakupV128to32s( t128_0, t3, t2, t1, t0 );
9690}
9691
9692/* Break a V128-bit value up into two 64-bit ints. */
9693
9694static void breakupV128to64s ( IRTemp t128,
9695                               /*OUTs*/
9696                               IRTemp* t1, IRTemp* t0 )
9697{
9698   vassert(t0 && *t0 == IRTemp_INVALID);
9699   vassert(t1 && *t1 == IRTemp_INVALID);
9700   *t0 = newTemp(Ity_I64);
9701   *t1 = newTemp(Ity_I64);
9702   assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
9703   assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
9704}
9705
9706/* Construct a V256-bit value from eight 32-bit ints. */
9707
9708static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
9709                               IRTemp t5, IRTemp t4,
9710                               IRTemp t3, IRTemp t2,
9711                               IRTemp t1, IRTemp t0 )
9712{
9713   return
9714      binop( Iop_V128HLtoV256,
9715             binop( Iop_64HLtoV128,
9716                    binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
9717                    binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
9718             binop( Iop_64HLtoV128,
9719                    binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
9720                    binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
9721   );
9722}
9723
9724/* Construct a V256-bit value from four 64-bit ints. */
9725
9726static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
9727                               IRTemp t1, IRTemp t0 )
9728{
9729   return
9730      binop( Iop_V128HLtoV256,
9731             binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
9732             binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
9733   );
9734}
9735
9736/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
9737   values (aa,bb), computes, for each of the 4 16-bit lanes:
9738
9739   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
9740*/
9741static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
9742{
9743   IRTemp aa      = newTemp(Ity_I64);
9744   IRTemp bb      = newTemp(Ity_I64);
9745   IRTemp aahi32s = newTemp(Ity_I64);
9746   IRTemp aalo32s = newTemp(Ity_I64);
9747   IRTemp bbhi32s = newTemp(Ity_I64);
9748   IRTemp bblo32s = newTemp(Ity_I64);
9749   IRTemp rHi     = newTemp(Ity_I64);
9750   IRTemp rLo     = newTemp(Ity_I64);
9751   IRTemp one32x2 = newTemp(Ity_I64);
9752   assign(aa, aax);
9753   assign(bb, bbx);
9754   assign( aahi32s,
9755           binop(Iop_SarN32x2,
9756                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
9757                 mkU8(16) ));
9758   assign( aalo32s,
9759           binop(Iop_SarN32x2,
9760                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
9761                 mkU8(16) ));
9762   assign( bbhi32s,
9763           binop(Iop_SarN32x2,
9764                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
9765                 mkU8(16) ));
9766   assign( bblo32s,
9767           binop(Iop_SarN32x2,
9768                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
9769                 mkU8(16) ));
9770   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
9771   assign(
9772      rHi,
9773      binop(
9774         Iop_ShrN32x2,
9775         binop(
9776            Iop_Add32x2,
9777            binop(
9778               Iop_ShrN32x2,
9779               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
9780               mkU8(14)
9781            ),
9782            mkexpr(one32x2)
9783         ),
9784         mkU8(1)
9785      )
9786   );
9787   assign(
9788      rLo,
9789      binop(
9790         Iop_ShrN32x2,
9791         binop(
9792            Iop_Add32x2,
9793            binop(
9794               Iop_ShrN32x2,
9795               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
9796               mkU8(14)
9797            ),
9798            mkexpr(one32x2)
9799         ),
9800         mkU8(1)
9801      )
9802   );
9803   return
9804      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
9805}
9806
9807/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
9808   values (aa,bb), computes, for each lane:
9809
9810          if aa_lane < 0 then - bb_lane
9811     else if aa_lane > 0 then bb_lane
9812     else 0
9813*/
9814static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
9815{
9816   IRTemp aa       = newTemp(Ity_I64);
9817   IRTemp bb       = newTemp(Ity_I64);
9818   IRTemp zero     = newTemp(Ity_I64);
9819   IRTemp bbNeg    = newTemp(Ity_I64);
9820   IRTemp negMask  = newTemp(Ity_I64);
9821   IRTemp posMask  = newTemp(Ity_I64);
9822   IROp   opSub    = Iop_INVALID;
9823   IROp   opCmpGTS = Iop_INVALID;
9824
9825   switch (laneszB) {
9826      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
9827      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
9828      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
9829      default: vassert(0);
9830   }
9831
9832   assign( aa,      aax );
9833   assign( bb,      bbx );
9834   assign( zero,    mkU64(0) );
9835   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
9836   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
9837   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
9838
9839   return
9840      binop(Iop_Or64,
9841            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
}
9845
9846
9847/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
9848   value aa, computes, for each lane
9849
9850   if aa < 0 then -aa else aa
9851
9852   Note that the result is interpreted as unsigned, so that the
9853   absolute value of the most negative signed input can be
9854   represented.
9855*/
9856static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
9857{
9858   IRTemp res     = newTemp(Ity_I64);
9859   IRTemp zero    = newTemp(Ity_I64);
9860   IRTemp aaNeg   = newTemp(Ity_I64);
9861   IRTemp negMask = newTemp(Ity_I64);
9862   IRTemp posMask = newTemp(Ity_I64);
9863   IROp   opSub   = Iop_INVALID;
9864   IROp   opSarN  = Iop_INVALID;
9865
9866   switch (laneszB) {
9867      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
9868      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
9869      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
9870      default: vassert(0);
9871   }
9872
9873   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
9874   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
9875   assign( zero,    mkU64(0) );
9876   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
9877   assign( res,
9878           binop(Iop_Or64,
9879                 binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
9880                 binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
9881   return res;
9882}
9883
9884/* XMM version of math_PABS_MMX. */
9885static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
9886{
9887   IRTemp res  = newTemp(Ity_V128);
9888   IRTemp aaHi = newTemp(Ity_I64);
9889   IRTemp aaLo = newTemp(Ity_I64);
9890   assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
9891   assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
9892   assign(res, binop(Iop_64HLtoV128,
9893                     mkexpr(math_PABS_MMX(aaHi, laneszB)),
9894                     mkexpr(math_PABS_MMX(aaLo, laneszB))));
9895   return res;
9896}
9897
9898/* Specialisations of math_PABS_XMM, since there's no easy way to do
9899   partial applications in C :-( */
9900static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
9901   return math_PABS_XMM(aa, 4);
9902}
9903
9904static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
9905   return math_PABS_XMM(aa, 2);
9906}
9907
9908static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
9909   return math_PABS_XMM(aa, 1);
9910}
9911
9912/* YMM version of math_PABS_XMM. */
9913static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
9914{
9915   IRTemp res  = newTemp(Ity_V256);
9916   IRTemp aaHi = IRTemp_INVALID;
9917   IRTemp aaLo = IRTemp_INVALID;
9918   breakupV256toV128s(aa, &aaHi, &aaLo);
9919   assign(res, binop(Iop_V128HLtoV256,
9920                     mkexpr(math_PABS_XMM(aaHi, laneszB)),
9921                     mkexpr(math_PABS_XMM(aaLo, laneszB))));
9922   return res;
9923}
9924
9925static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
9926   return math_PABS_YMM(aa, 4);
9927}
9928
9929static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
9930   return math_PABS_YMM(aa, 2);
9931}
9932
9933static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
9934   return math_PABS_YMM(aa, 1);
9935}
9936
9937static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
9938                                        IRTemp lo64, Long byteShift )
9939{
9940   vassert(byteShift >= 1 && byteShift <= 7);
9941   return
9942      binop(Iop_Or64,
9943            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
9944            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
9945      );
9946}
9947
9948static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
9949{
9950   IRTemp res = newTemp(Ity_V128);
9951   IRTemp sHi = newTemp(Ity_I64);
9952   IRTemp sLo = newTemp(Ity_I64);
9953   IRTemp dHi = newTemp(Ity_I64);
9954   IRTemp dLo = newTemp(Ity_I64);
9955   IRTemp rHi = newTemp(Ity_I64);
9956   IRTemp rLo = newTemp(Ity_I64);
9957
9958   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
9959   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
9960   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
9961   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
9962
9963   if (imm8 == 0) {
9964      assign( rHi, mkexpr(sHi) );
9965      assign( rLo, mkexpr(sLo) );
9966   }
9967   else if (imm8 >= 1 && imm8 <= 7) {
9968      assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
9969      assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
9970   }
9971   else if (imm8 == 8) {
9972      assign( rHi, mkexpr(dLo) );
9973      assign( rLo, mkexpr(sHi) );
9974   }
9975   else if (imm8 >= 9 && imm8 <= 15) {
9976      assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
9977      assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
9978   }
9979   else if (imm8 == 16) {
9980      assign( rHi, mkexpr(dHi) );
9981      assign( rLo, mkexpr(dLo) );
9982   }
9983   else if (imm8 >= 17 && imm8 <= 23) {
9984      assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
9985      assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
9986   }
9987   else if (imm8 == 24) {
9988      assign( rHi, mkU64(0) );
9989      assign( rLo, mkexpr(dHi) );
9990   }
9991   else if (imm8 >= 25 && imm8 <= 31) {
9992      assign( rHi, mkU64(0) );
9993      assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
9994   }
9995   else if (imm8 >= 32 && imm8 <= 255) {
9996      assign( rHi, mkU64(0) );
9997      assign( rLo, mkU64(0) );
9998   }
9999   else
10000      vassert(0);
10001
10002   assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
10003   return res;
10004}
10005
10006
/* Generate a SIGSEGV followed by a restart of the current instruction
   if effective_addr is not aligned as required, where mask is the
   required alignment minus 1 (16-1, 32-1 or 64-1 below).  This is
   required behaviour for some SSE3 instructions and all 128-bit
   SSSE3 instructions.  This assumes that guest_RIP_curr_instr is set
   correctly! */
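/* E.g. with mask == 16-1, the generated exit fires iff
   (effective_addr & 0xF) != 0, i.e. iff the address is not
   16-aligned. */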
10011static
10012void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
10013{
10014   stmt(
10015      IRStmt_Exit(
10016         binop(Iop_CmpNE64,
10017               binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
10018               mkU64(0)),
10019         Ijk_SigSEGV,
10020         IRConst_U64(guest_RIP_curr_instr),
10021         OFFB_RIP
10022      )
10023   );
10024}
10025
10026static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
10027   gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
10028}
10029
10030static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
10031   gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
10032}
10033
10034static void gen_SEGV_if_not_64_aligned ( IRTemp effective_addr ) {
10035   gen_SEGV_if_not_XX_aligned(effective_addr, 64-1);
10036}
10037
10038/* Helper for deciding whether a given insn (starting at the opcode
10039   byte) may validly be used with a LOCK prefix.  The following insns
10040   may be used with LOCK when their destination operand is in memory.
10041   AFAICS this is exactly the same for both 32-bit and 64-bit mode.
10042
10043   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
10044   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
10045   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
10047   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
10048   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
10049   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
10050
10051   DEC        FE /1,  FF /1
10052   INC        FE /0,  FF /0
10053
10054   NEG        F6 /3,  F7 /3
10055   NOT        F6 /2,  F7 /2
10056
10057   XCHG       86, 87
10058
10059   BTC        0F BB,  0F BA /7
10060   BTR        0F B3,  0F BA /6
10061   BTS        0F AB,  0F BA /5
10062
10063   CMPXCHG    0F B0,  0F B1
10064   CMPXCHG8B  0F C7 /1
10065
10066   XADD       0F C0,  0F C1
10067
10068   ------------------------------
10069
10070   80 /0  =  addb $imm8,  rm8
10071   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
10072   82 /0  =  addb $imm8,  rm8
10073   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
10074
10075   00     =  addb r8,  rm8
10076   01     =  addl r32, rm32  and  addw r16, rm16
10077
10078   Same for ADD OR ADC SBB AND SUB XOR
10079
10080   FE /1  = dec rm8
10081   FF /1  = dec rm32  and  dec rm16
10082
10083   FE /0  = inc rm8
10084   FF /0  = inc rm32  and  inc rm16
10085
10086   F6 /3  = neg rm8
10087   F7 /3  = neg rm32  and  neg rm16
10088
10089   F6 /2  = not rm8
10090   F7 /2  = not rm32  and  not rm16
10091
10092   0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
10094
10095   Same for BTS, BTR
10096*/
10097static Bool can_be_used_with_LOCK_prefix ( const UChar* opc )
10098{
10099   switch (opc[0]) {
10100      case 0x00: case 0x01: case 0x08: case 0x09:
10101      case 0x10: case 0x11: case 0x18: case 0x19:
10102      case 0x20: case 0x21: case 0x28: case 0x29:
10103      case 0x30: case 0x31:
10104         if (!epartIsReg(opc[1]))
10105            return True;
10106         break;
10107
10108      case 0x80: case 0x81: case 0x82: case 0x83:
10109         if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
10110             && !epartIsReg(opc[1]))
10111            return True;
10112         break;
10113
10114      case 0xFE: case 0xFF:
10115         if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
10116             && !epartIsReg(opc[1]))
10117            return True;
10118         break;
10119
10120      case 0xF6: case 0xF7:
10121         if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
10122             && !epartIsReg(opc[1]))
10123            return True;
10124         break;
10125
10126      case 0x86: case 0x87:
10127         if (!epartIsReg(opc[1]))
10128            return True;
10129         break;
10130
10131      case 0x0F: {
10132         switch (opc[1]) {
10133            case 0xBB: case 0xB3: case 0xAB:
10134               if (!epartIsReg(opc[2]))
10135                  return True;
10136               break;
10137            case 0xBA:
10138               if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
10139                   && !epartIsReg(opc[2]))
10140                  return True;
10141               break;
10142            case 0xB0: case 0xB1:
10143               if (!epartIsReg(opc[2]))
10144                  return True;
10145               break;
10146            case 0xC7:
10147               if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
10148                  return True;
10149               break;
10150            case 0xC0: case 0xC1:
10151               if (!epartIsReg(opc[2]))
10152                  return True;
10153               break;
10154            default:
10155               break;
10156         } /* switch (opc[1]) */
10157         break;
10158      }
10159
10160      default:
10161         break;
10162   } /* switch (opc[0]) */
10163
10164   return False;
10165}
10166
10167
10168/*------------------------------------------------------------*/
10169/*---                                                      ---*/
10170/*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
10171/*---                                                      ---*/
10172/*------------------------------------------------------------*/
10173
10174static Long dis_COMISD ( const VexAbiInfo* vbi, Prefix pfx,
10175                         Long delta, Bool isAvx, UChar opc )
10176{
10177   vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
10178   Int    alen  = 0;
10179   HChar  dis_buf[50];
10180   IRTemp argL  = newTemp(Ity_F64);
10181   IRTemp argR  = newTemp(Ity_F64);
10182   UChar  modrm = getUChar(delta);
10183   IRTemp addr  = IRTemp_INVALID;
10184   if (epartIsReg(modrm)) {
10185      assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
10186                                      0/*lowest lane*/ ) );
10187      delta += 1;
10188      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
10189                                opc==0x2E ? "u" : "",
10190                                nameXMMReg(eregOfRexRM(pfx,modrm)),
10191                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
10192   } else {
10193      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10194      assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
10195      delta += alen;
10196      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
10197                                opc==0x2E ? "u" : "",
10198                                dis_buf,
10199                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
10200   }
10201   assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
10202                                   0/*lowest lane*/ ) );
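
   /* 0x45 retains exactly the CF (0x01), PF (0x04) and ZF (0x40) bit
      positions; the IRCmpF64Result values produced by Iop_CmpF64
      (0x00 GT, 0x01 LT, 0x40 EQ, 0x45 UN) are laid out so that they
      drop straight into those eflags positions. */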
10203
10204   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
10205   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
10206   stmt( IRStmt_Put(
10207            OFFB_CC_DEP1,
10208            binop( Iop_And64,
10209                   unop( Iop_32Uto64,
10210                         binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
10211                   mkU64(0x45)
10212       )));
10213   return delta;
10214}
10215
10216
10217static Long dis_COMISS ( const VexAbiInfo* vbi, Prefix pfx,
10218                         Long delta, Bool isAvx, UChar opc )
10219{
10220   vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
10221   Int    alen  = 0;
10222   HChar  dis_buf[50];
10223   IRTemp argL  = newTemp(Ity_F32);
10224   IRTemp argR  = newTemp(Ity_F32);
10225   UChar  modrm = getUChar(delta);
10226   IRTemp addr  = IRTemp_INVALID;
10227   if (epartIsReg(modrm)) {
10228      assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
10229                                      0/*lowest lane*/ ) );
10230      delta += 1;
10231      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
10232                                opc==0x2E ? "u" : "",
10233                                nameXMMReg(eregOfRexRM(pfx,modrm)),
10234                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
10235   } else {
10236      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10237      assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
10238      delta += alen;
10239      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
10240                                opc==0x2E ? "u" : "",
10241                                dis_buf,
10242                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
10243   }
10244   assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
10245                                   0/*lowest lane*/ ) );
10246
10247   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
10248   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
10249   stmt( IRStmt_Put(
10250            OFFB_CC_DEP1,
10251            binop( Iop_And64,
10252                   unop( Iop_32Uto64,
10253                         binop(Iop_CmpF64,
10254                               unop(Iop_F32toF64,mkexpr(argL)),
10255                               unop(Iop_F32toF64,mkexpr(argR)))),
10256                   mkU64(0x45)
10257       )));
10258   return delta;
10259}
10260
10261
10262static Long dis_PSHUFD_32x4 ( const VexAbiInfo* vbi, Prefix pfx,
10263                              Long delta, Bool writesYmm )
10264{
10265   Int    order;
10266   Int    alen  = 0;
10267   HChar  dis_buf[50];
10268   IRTemp sV    = newTemp(Ity_V128);
10269   UChar  modrm = getUChar(delta);
10270   const HChar* strV  = writesYmm ? "v" : "";
10271   IRTemp addr  = IRTemp_INVALID;
10272   if (epartIsReg(modrm)) {
10273      assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
10274      order = (Int)getUChar(delta+1);
10275      delta += 1+1;
10276      DIP("%spshufd $%d,%s,%s\n", strV, order,
10277                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
10278                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
10279   } else {
10280      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
10281                        1/*byte after the amode*/ );
10282      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10283      order = (Int)getUChar(delta+alen);
10284      delta += alen+1;
10285      DIP("%spshufd $%d,%s,%s\n", strV, order,
10286                                 dis_buf,
10287                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
10288   }
10289
10290   IRTemp s3, s2, s1, s0;
10291   s3 = s2 = s1 = s0 = IRTemp_INVALID;
10292   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10293
10294#  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
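   /* E.g. (illustrative) order == 0x1B selects s0,s1,s2,s3 into lanes
      3..0 respectively, reversing the four 32-bit lanes. */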
10295   IRTemp dV = newTemp(Ity_V128);
10296   assign(dV,
10297          mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
10298                         SEL((order>>2)&3), SEL((order>>0)&3) )
10299   );
10300#  undef SEL
10301
10302   (writesYmm ? putYMMRegLoAndZU : putXMMReg)
10303      (gregOfRexRM(pfx,modrm), mkexpr(dV));
10304   return delta;
10305}
10306
10307
10308static Long dis_PSHUFD_32x8 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
10309{
10310   Int    order;
10311   Int    alen  = 0;
10312   HChar  dis_buf[50];
10313   IRTemp sV    = newTemp(Ity_V256);
10314   UChar  modrm = getUChar(delta);
10315   IRTemp addr  = IRTemp_INVALID;
10316   UInt   rG    = gregOfRexRM(pfx,modrm);
10317   if (epartIsReg(modrm)) {
10318      UInt rE = eregOfRexRM(pfx,modrm);
10319      assign( sV, getYMMReg(rE) );
10320      order = (Int)getUChar(delta+1);
10321      delta += 1+1;
10322      DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
10323   } else {
10324      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
10325                        1/*byte after the amode*/ );
10326      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
10327      order = (Int)getUChar(delta+alen);
10328      delta += alen+1;
10329      DIP("vpshufd $%d,%s,%s\n", order,  dis_buf, nameYMMReg(rG));
10330   }
10331
10332   IRTemp s[8];
10333   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
10334   breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
10335                         &s[3], &s[2], &s[1], &s[0] );
10336
10337   putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
10338                                 s[4 + ((order>>4)&3)],
10339                                 s[4 + ((order>>2)&3)],
10340                                 s[4 + ((order>>0)&3)],
10341                                 s[0 + ((order>>6)&3)],
10342                                 s[0 + ((order>>4)&3)],
10343                                 s[0 + ((order>>2)&3)],
10344                                 s[0 + ((order>>0)&3)] ) );
10345   return delta;
10346}
10347
10348
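/* Right-shift the V128 value in sV by imm bytes (PSRLDQ semantics),
   shifting in zeroes; e.g. (illustrative) imm == 8 moves the high 64
   bits into the low half and zeroes the high half. */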
10349static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
10350{
10351   IRTemp dV    = newTemp(Ity_V128);
10352   IRTemp hi64  = newTemp(Ity_I64);
10353   IRTemp lo64  = newTemp(Ity_I64);
10354   IRTemp hi64r = newTemp(Ity_I64);
10355   IRTemp lo64r = newTemp(Ity_I64);
10356
10357   vassert(imm >= 0 && imm <= 255);
10358   if (imm >= 16) {
10359      assign(dV, mkV128(0x0000));
10360      return dV;
10361   }
10362
10363   assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
10364   assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
10365
10366   if (imm == 0) {
10367      assign( lo64r, mkexpr(lo64) );
10368      assign( hi64r, mkexpr(hi64) );
10369   }
10370   else
10371   if (imm == 8) {
10372      assign( hi64r, mkU64(0) );
10373      assign( lo64r, mkexpr(hi64) );
10374   }
10375   else
10376   if (imm > 8) {
10377      assign( hi64r, mkU64(0) );
10378      assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
10379   } else {
10380      assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
10381      assign( lo64r,
10382              binop( Iop_Or64,
10383                     binop(Iop_Shr64, mkexpr(lo64),
10384                           mkU8(8 * imm)),
10385                     binop(Iop_Shl64, mkexpr(hi64),
10386                           mkU8(8 * (8 - imm)) )
10387                     )
10388              );
10389   }
10390
10391   assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
10392   return dV;
10393}
10394
10395
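/* Left-shift the V128 value in sV by imm bytes (PSLLDQ semantics),
   shifting in zeroes -- the mirror image of math_PSRLDQ above. */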
10396static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
10397{
10398   IRTemp       dV    = newTemp(Ity_V128);
10399   IRTemp       hi64  = newTemp(Ity_I64);
10400   IRTemp       lo64  = newTemp(Ity_I64);
10401   IRTemp       hi64r = newTemp(Ity_I64);
10402   IRTemp       lo64r = newTemp(Ity_I64);
10403
10404   vassert(imm >= 0 && imm <= 255);
10405   if (imm >= 16) {
10406      assign(dV, mkV128(0x0000));
10407      return dV;
10408   }
10409
10410   assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
10411   assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
10412
10413   if (imm == 0) {
10414      assign( lo64r, mkexpr(lo64) );
10415      assign( hi64r, mkexpr(hi64) );
10416   }
10417   else
10418   if (imm == 8) {
10419      assign( lo64r, mkU64(0) );
10420      assign( hi64r, mkexpr(lo64) );
10421   }
10422   else
10423   if (imm > 8) {
10424      assign( lo64r, mkU64(0) );
10425      assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
10426   } else {
10427      assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
10428      assign( hi64r,
10429              binop( Iop_Or64,
10430                     binop(Iop_Shl64, mkexpr(hi64),
10431                           mkU8(8 * imm)),
10432                     binop(Iop_Shr64, mkexpr(lo64),
10433                           mkU8(8 * (8 - imm)) )
10434                     )
10435              );
10436   }
10437
10438   assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
10439   return dV;
10440}
10441
10442
10443static Long dis_CVTxSD2SI ( const VexAbiInfo* vbi, Prefix pfx,
10444                            Long delta, Bool isAvx, UChar opc, Int sz )
10445{
10446   vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
10447   HChar  dis_buf[50];
10448   Int    alen   = 0;
10449   UChar  modrm  = getUChar(delta);
10450   IRTemp addr   = IRTemp_INVALID;
10451   IRTemp rmode  = newTemp(Ity_I32);
10452   IRTemp f64lo  = newTemp(Ity_F64);
10453   Bool   r2zero = toBool(opc == 0x2C);
10454
10455   if (epartIsReg(modrm)) {
10456      delta += 1;
10457      assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
10458      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
10459                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
10460                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
10461                                           False));
10462   } else {
10463      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10464      assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
10465      delta += alen;
10466      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
10467                                  dis_buf,
10468                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
10469                                           False));
10470   }
10471
10472   if (r2zero) {
10473      assign( rmode, mkU32((UInt)Irrm_ZERO) );
10474   } else {
10475      assign( rmode, get_sse_roundingmode() );
10476   }
10477
10478   if (sz == 4) {
10479      putIReg32( gregOfRexRM(pfx,modrm),
10480                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
10481   } else {
10482      vassert(sz == 8);
10483      putIReg64( gregOfRexRM(pfx,modrm),
10484                 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
10485   }
10486
10487   return delta;
10488}
10489
10490
10491static Long dis_CVTxSS2SI ( const VexAbiInfo* vbi, Prefix pfx,
10492                            Long delta, Bool isAvx, UChar opc, Int sz )
10493{
10494   vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
10495   HChar  dis_buf[50];
10496   Int    alen   = 0;
10497   UChar  modrm  = getUChar(delta);
10498   IRTemp addr   = IRTemp_INVALID;
10499   IRTemp rmode  = newTemp(Ity_I32);
10500   IRTemp f32lo  = newTemp(Ity_F32);
10501   Bool   r2zero = toBool(opc == 0x2C);
10502
10503   if (epartIsReg(modrm)) {
10504      delta += 1;
10505      assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
10506      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
10507                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
10508                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
10509                                           False));
10510   } else {
10511      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10512      assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
10513      delta += alen;
10514      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
10515                                  dis_buf,
10516                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
10517                                           False));
10518   }
10519
10520   if (r2zero) {
10521      assign( rmode, mkU32((UInt)Irrm_ZERO) );
10522   } else {
10523      assign( rmode, get_sse_roundingmode() );
10524   }
10525
10526   if (sz == 4) {
10527      putIReg32( gregOfRexRM(pfx,modrm),
10528                 binop( Iop_F64toI32S,
10529                        mkexpr(rmode),
10530                        unop(Iop_F32toF64, mkexpr(f32lo))) );
10531   } else {
10532      vassert(sz == 8);
10533      putIReg64( gregOfRexRM(pfx,modrm),
10534                 binop( Iop_F64toI64S,
10535                        mkexpr(rmode),
10536                        unop(Iop_F32toF64, mkexpr(f32lo))) );
10537   }
10538
10539   return delta;
10540}
10541
10542
10543static Long dis_CVTPS2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
10544                               Long delta, Bool isAvx )
10545{
10546   IRTemp addr  = IRTemp_INVALID;
10547   Int    alen  = 0;
10548   HChar  dis_buf[50];
10549   IRTemp f32lo = newTemp(Ity_F32);
10550   IRTemp f32hi = newTemp(Ity_F32);
10551   UChar  modrm = getUChar(delta);
10552   UInt   rG    = gregOfRexRM(pfx,modrm);
10553   if (epartIsReg(modrm)) {
10554      UInt rE = eregOfRexRM(pfx,modrm);
10555      assign( f32lo, getXMMRegLane32F(rE, 0) );
10556      assign( f32hi, getXMMRegLane32F(rE, 1) );
10557      delta += 1;
10558      DIP("%scvtps2pd %s,%s\n",
10559          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
10560   } else {
10561      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10562      assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
10563      assign( f32hi, loadLE(Ity_F32,
10564                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
10565      delta += alen;
10566      DIP("%scvtps2pd %s,%s\n",
10567          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
10568   }
10569
10570   putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
10571   putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
10572   if (isAvx)
10573      putYMMRegLane128( rG, 1, mkV128(0));
10574   return delta;
10575}
10576
10577
10578static Long dis_CVTPS2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
10579                               Long delta )
10580{
10581   IRTemp addr  = IRTemp_INVALID;
10582   Int    alen  = 0;
10583   HChar  dis_buf[50];
10584   IRTemp f32_0 = newTemp(Ity_F32);
10585   IRTemp f32_1 = newTemp(Ity_F32);
10586   IRTemp f32_2 = newTemp(Ity_F32);
10587   IRTemp f32_3 = newTemp(Ity_F32);
10588   UChar  modrm = getUChar(delta);
10589   UInt   rG    = gregOfRexRM(pfx,modrm);
10590   if (epartIsReg(modrm)) {
10591      UInt rE = eregOfRexRM(pfx,modrm);
10592      assign( f32_0, getXMMRegLane32F(rE, 0) );
10593      assign( f32_1, getXMMRegLane32F(rE, 1) );
10594      assign( f32_2, getXMMRegLane32F(rE, 2) );
10595      assign( f32_3, getXMMRegLane32F(rE, 3) );
10596      delta += 1;
10597      DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
10598   } else {
10599      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10600      assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
10601      assign( f32_1, loadLE(Ity_F32,
10602                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
10603      assign( f32_2, loadLE(Ity_F32,
10604                            binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
10605      assign( f32_3, loadLE(Ity_F32,
10606                            binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
10607      delta += alen;
10608      DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
10609   }
10610
10611   putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
10612   putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
10613   putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
10614   putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
10615   return delta;
10616}
10617
10618
10619static Long dis_CVTPD2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
10620                               Long delta, Bool isAvx )
10621{
10622   IRTemp addr  = IRTemp_INVALID;
10623   Int    alen  = 0;
10624   HChar  dis_buf[50];
10625   UChar  modrm = getUChar(delta);
10626   UInt   rG    = gregOfRexRM(pfx,modrm);
10627   IRTemp argV  = newTemp(Ity_V128);
10628   IRTemp rmode = newTemp(Ity_I32);
10629   if (epartIsReg(modrm)) {
10630      UInt rE = eregOfRexRM(pfx,modrm);
10631      assign( argV, getXMMReg(rE) );
10632      delta += 1;
10633      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
10634          nameXMMReg(rE), nameXMMReg(rG));
10635   } else {
10636      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10637      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10638      delta += alen;
10639      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
10640          dis_buf, nameXMMReg(rG) );
10641   }
10642
10643   assign( rmode, get_sse_roundingmode() );
10644   IRTemp t0 = newTemp(Ity_F64);
10645   IRTemp t1 = newTemp(Ity_F64);
10646   assign( t0, unop(Iop_ReinterpI64asF64,
10647                    unop(Iop_V128to64, mkexpr(argV))) );
10648   assign( t1, unop(Iop_ReinterpI64asF64,
10649                    unop(Iop_V128HIto64, mkexpr(argV))) );
10650
10651#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
10652   putXMMRegLane32(  rG, 3, mkU32(0) );
10653   putXMMRegLane32(  rG, 2, mkU32(0) );
10654   putXMMRegLane32F( rG, 1, CVT(t1) );
10655   putXMMRegLane32F( rG, 0, CVT(t0) );
10656#  undef CVT
10657   if (isAvx)
10658      putYMMRegLane128( rG, 1, mkV128(0) );
10659
10660   return delta;
10661}
10662
10663
10664static Long dis_CVTxPS2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
10665                                Long delta, Bool isAvx, Bool r2zero )
10666{
10667   IRTemp addr  = IRTemp_INVALID;
10668   Int    alen  = 0;
10669   HChar  dis_buf[50];
10670   UChar  modrm = getUChar(delta);
10671   IRTemp argV  = newTemp(Ity_V128);
10672   IRTemp rmode = newTemp(Ity_I32);
10673   UInt   rG    = gregOfRexRM(pfx,modrm);
10674   IRTemp t0, t1, t2, t3;
10675
10676   if (epartIsReg(modrm)) {
10677      UInt rE = eregOfRexRM(pfx,modrm);
10678      assign( argV, getXMMReg(rE) );
10679      delta += 1;
10680      DIP("%scvt%sps2dq %s,%s\n",
10681          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
10682   } else {
10683      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10684      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10685      delta += alen;
10686      DIP("%scvt%sps2dq %s,%s\n",
10687          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
10688   }
10689
10690   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
10691                         : get_sse_roundingmode() );
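   /* Note: the trailing-'t' forms (CVTTPS2DQ) always truncate towards
      zero, irrespective of MXCSR.RC; the non-'t' forms honour the
      current SSE rounding mode.  Hence the Irrm_ZERO selection just
      above. */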
10692   t0 = t1 = t2 = t3 = IRTemp_INVALID;
10693   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
10694   /* This is less than ideal.  If it turns out to be a performance
10695      bottleneck it can be improved. */
10696#  define CVT(_t)                             \
10697      binop( Iop_F64toI32S,                   \
10698             mkexpr(rmode),                   \
10699             unop( Iop_F32toF64,              \
10700                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
10701
10702   putXMMRegLane32( rG, 3, CVT(t3) );
10703   putXMMRegLane32( rG, 2, CVT(t2) );
10704   putXMMRegLane32( rG, 1, CVT(t1) );
10705   putXMMRegLane32( rG, 0, CVT(t0) );
10706#  undef CVT
10707   if (isAvx)
10708      putYMMRegLane128( rG, 1, mkV128(0) );
10709
10710   return delta;
10711}
10712
10713
10714static Long dis_CVTxPS2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
10715                                Long delta, Bool r2zero )
10716{
10717   IRTemp addr  = IRTemp_INVALID;
10718   Int    alen  = 0;
10719   HChar  dis_buf[50];
10720   UChar  modrm = getUChar(delta);
10721   IRTemp argV  = newTemp(Ity_V256);
10722   IRTemp rmode = newTemp(Ity_I32);
10723   UInt   rG    = gregOfRexRM(pfx,modrm);
10724   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
10725
10726   if (epartIsReg(modrm)) {
10727      UInt rE = eregOfRexRM(pfx,modrm);
10728      assign( argV, getYMMReg(rE) );
10729      delta += 1;
10730      DIP("vcvt%sps2dq %s,%s\n",
10731          r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
10732   } else {
10733      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10734      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
10735      delta += alen;
10736      DIP("vcvt%sps2dq %s,%s\n",
10737          r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
10738   }
10739
10740   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
10741                         : get_sse_roundingmode() );
10742   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
10743   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
10744   /* This is less than ideal.  If it turns out to be a performance
10745      bottleneck it can be improved. */
10746#  define CVT(_t)                             \
10747      binop( Iop_F64toI32S,                   \
10748             mkexpr(rmode),                   \
10749             unop( Iop_F32toF64,              \
10750                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
10751
10752   putYMMRegLane32( rG, 7, CVT(t7) );
10753   putYMMRegLane32( rG, 6, CVT(t6) );
10754   putYMMRegLane32( rG, 5, CVT(t5) );
10755   putYMMRegLane32( rG, 4, CVT(t4) );
10756   putYMMRegLane32( rG, 3, CVT(t3) );
10757   putYMMRegLane32( rG, 2, CVT(t2) );
10758   putYMMRegLane32( rG, 1, CVT(t1) );
10759   putYMMRegLane32( rG, 0, CVT(t0) );
10760#  undef CVT
10761
10762   return delta;
10763}
10764
10765
10766static Long dis_CVTxPD2DQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
10767                                Long delta, Bool isAvx, Bool r2zero )
10768{
10769   IRTemp addr  = IRTemp_INVALID;
10770   Int    alen  = 0;
10771   HChar  dis_buf[50];
10772   UChar  modrm = getUChar(delta);
10773   IRTemp argV  = newTemp(Ity_V128);
10774   IRTemp rmode = newTemp(Ity_I32);
10775   UInt   rG    = gregOfRexRM(pfx,modrm);
10776   IRTemp t0, t1;
10777
10778   if (epartIsReg(modrm)) {
10779      UInt rE = eregOfRexRM(pfx,modrm);
10780      assign( argV, getXMMReg(rE) );
10781      delta += 1;
10782      DIP("%scvt%spd2dq %s,%s\n",
10783          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
10784   } else {
10785      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10786      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10787      delta += alen;
10788      DIP("%scvt%spd2dqx %s,%s\n",
10789          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
10790   }
10791
10792   if (r2zero) {
10793      assign(rmode, mkU32((UInt)Irrm_ZERO) );
10794   } else {
10795      assign( rmode, get_sse_roundingmode() );
10796   }
10797
10798   t0 = newTemp(Ity_F64);
10799   t1 = newTemp(Ity_F64);
10800   assign( t0, unop(Iop_ReinterpI64asF64,
10801                    unop(Iop_V128to64, mkexpr(argV))) );
10802   assign( t1, unop(Iop_ReinterpI64asF64,
10803                    unop(Iop_V128HIto64, mkexpr(argV))) );
10804
10805#  define CVT(_t)  binop( Iop_F64toI32S,                   \
10806                          mkexpr(rmode),                   \
10807                          mkexpr(_t) )
10808
10809   putXMMRegLane32( rG, 3, mkU32(0) );
10810   putXMMRegLane32( rG, 2, mkU32(0) );
10811   putXMMRegLane32( rG, 1, CVT(t1) );
10812   putXMMRegLane32( rG, 0, CVT(t0) );
10813#  undef CVT
10814   if (isAvx)
10815      putYMMRegLane128( rG, 1, mkV128(0) );
10816
10817   return delta;
10818}
10819
10820
10821static Long dis_CVTxPD2DQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
10822                                Long delta, Bool r2zero )
10823{
10824   IRTemp addr  = IRTemp_INVALID;
10825   Int    alen  = 0;
10826   HChar  dis_buf[50];
10827   UChar  modrm = getUChar(delta);
10828   IRTemp argV  = newTemp(Ity_V256);
10829   IRTemp rmode = newTemp(Ity_I32);
10830   UInt   rG    = gregOfRexRM(pfx,modrm);
10831   IRTemp t0, t1, t2, t3;
10832
10833   if (epartIsReg(modrm)) {
10834      UInt rE = eregOfRexRM(pfx,modrm);
10835      assign( argV, getYMMReg(rE) );
10836      delta += 1;
10837      DIP("vcvt%spd2dq %s,%s\n",
10838          r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
10839   } else {
10840      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10841      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
10842      delta += alen;
10843      DIP("vcvt%spd2dqy %s,%s\n",
10844          r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
10845   }
10846
10847   if (r2zero) {
10848      assign(rmode, mkU32((UInt)Irrm_ZERO) );
10849   } else {
10850      assign( rmode, get_sse_roundingmode() );
10851   }
10852
10853   t0 = IRTemp_INVALID;
10854   t1 = IRTemp_INVALID;
10855   t2 = IRTemp_INVALID;
10856   t3 = IRTemp_INVALID;
10857   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
10858
10859#  define CVT(_t)  binop( Iop_F64toI32S,                   \
10860                          mkexpr(rmode),                   \
10861                          unop( Iop_ReinterpI64asF64,      \
10862                                mkexpr(_t) ) )
10863
10864   putXMMRegLane32( rG, 3, CVT(t3) );
10865   putXMMRegLane32( rG, 2, CVT(t2) );
10866   putXMMRegLane32( rG, 1, CVT(t1) );
10867   putXMMRegLane32( rG, 0, CVT(t0) );
10868#  undef CVT
10869   putYMMRegLane128( rG, 1, mkV128(0) );
10870
10871   return delta;
10872}
10873
10874
10875static Long dis_CVTDQ2PS_128 ( const VexAbiInfo* vbi, Prefix pfx,
10876                               Long delta, Bool isAvx )
10877{
10878   IRTemp addr  = IRTemp_INVALID;
10879   Int    alen  = 0;
10880   HChar  dis_buf[50];
10881   UChar  modrm = getUChar(delta);
10882   IRTemp argV  = newTemp(Ity_V128);
10883   IRTemp rmode = newTemp(Ity_I32);
10884   UInt   rG    = gregOfRexRM(pfx,modrm);
10885   IRTemp t0, t1, t2, t3;
10886
10887   if (epartIsReg(modrm)) {
10888      UInt rE = eregOfRexRM(pfx,modrm);
10889      assign( argV, getXMMReg(rE) );
10890      delta += 1;
10891      DIP("%scvtdq2ps %s,%s\n",
10892          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
10893   } else {
10894      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10895      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
10896      delta += alen;
10897      DIP("%scvtdq2ps %s,%s\n",
10898          isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
10899   }
10900
10901   assign( rmode, get_sse_roundingmode() );
10902   t0 = IRTemp_INVALID;
10903   t1 = IRTemp_INVALID;
10904   t2 = IRTemp_INVALID;
10905   t3 = IRTemp_INVALID;
10906   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
10907
10908#  define CVT(_t)  binop( Iop_F64toF32,                    \
10909                          mkexpr(rmode),                   \
10910                          unop(Iop_I32StoF64,mkexpr(_t)))
10911
10912   putXMMRegLane32F( rG, 3, CVT(t3) );
10913   putXMMRegLane32F( rG, 2, CVT(t2) );
10914   putXMMRegLane32F( rG, 1, CVT(t1) );
10915   putXMMRegLane32F( rG, 0, CVT(t0) );
10916#  undef CVT
10917   if (isAvx)
10918      putYMMRegLane128( rG, 1, mkV128(0) );
10919
10920   return delta;
10921}
10922
10923static Long dis_CVTDQ2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
10924                               Long delta )
10925{
10926   IRTemp addr   = IRTemp_INVALID;
10927   Int    alen   = 0;
10928   HChar  dis_buf[50];
10929   UChar  modrm  = getUChar(delta);
10930   IRTemp argV   = newTemp(Ity_V256);
10931   IRTemp rmode  = newTemp(Ity_I32);
10932   UInt   rG     = gregOfRexRM(pfx,modrm);
10933   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;
10934
10935   if (epartIsReg(modrm)) {
10936      UInt rE = eregOfRexRM(pfx,modrm);
10937      assign( argV, getYMMReg(rE) );
10938      delta += 1;
10939      DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
10940   } else {
10941      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
10942      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
10943      delta += alen;
10944      DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
10945   }
10946
10947   assign( rmode, get_sse_roundingmode() );
10948   t0 = IRTemp_INVALID;
10949   t1 = IRTemp_INVALID;
10950   t2 = IRTemp_INVALID;
10951   t3 = IRTemp_INVALID;
10952   t4 = IRTemp_INVALID;
10953   t5 = IRTemp_INVALID;
10954   t6 = IRTemp_INVALID;
10955   t7 = IRTemp_INVALID;
10956   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
10957
10958#  define CVT(_t)  binop( Iop_F64toF32,                    \
10959                          mkexpr(rmode),                   \
10960                          unop(Iop_I32StoF64,mkexpr(_t)))
10961
10962   putYMMRegLane32F( rG, 7, CVT(t7) );
10963   putYMMRegLane32F( rG, 6, CVT(t6) );
10964   putYMMRegLane32F( rG, 5, CVT(t5) );
10965   putYMMRegLane32F( rG, 4, CVT(t4) );
10966   putYMMRegLane32F( rG, 3, CVT(t3) );
10967   putYMMRegLane32F( rG, 2, CVT(t2) );
10968   putYMMRegLane32F( rG, 1, CVT(t1) );
10969   putYMMRegLane32F( rG, 0, CVT(t0) );
10970#  undef CVT
10971
10972   return delta;
10973}
10974
10975
10976static Long dis_PMOVMSKB_128 ( const VexAbiInfo* vbi, Prefix pfx,
10977                               Long delta, Bool isAvx )
10978{
10979   UChar modrm = getUChar(delta);
10980   vassert(epartIsReg(modrm)); /* ensured by caller */
10981   UInt   rE = eregOfRexRM(pfx,modrm);
10982   UInt   rG = gregOfRexRM(pfx,modrm);
10983   IRTemp t0 = newTemp(Ity_V128);
10984   IRTemp t1 = newTemp(Ity_I32);
10985   assign(t0, getXMMReg(rE));
10986   assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
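   /* Iop_GetMSBs8x16 gathers the top bit of each of the 16 bytes into
      a 16-bit value, which is exactly pmovmskb's semantics; it is then
      zero-widened to 32 bits for the integer register write. */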
10987   putIReg32(rG, mkexpr(t1));
10988   DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
10989       nameIReg32(rG));
10990   delta += 1;
10991   return delta;
10992}
10993
10994
10995static Long dis_PMOVMSKB_256 ( const VexAbiInfo* vbi, Prefix pfx,
10996                               Long delta  )
10997{
10998   UChar modrm = getUChar(delta);
10999   vassert(epartIsReg(modrm)); /* ensured by caller */
11000   UInt   rE = eregOfRexRM(pfx,modrm);
11001   UInt   rG = gregOfRexRM(pfx,modrm);
11002   IRTemp t0 = newTemp(Ity_V128);
11003   IRTemp t1 = newTemp(Ity_V128);
11004   IRTemp t2 = newTemp(Ity_I16);
11005   IRTemp t3 = newTemp(Ity_I16);
11006   assign(t0, getYMMRegLane128(rE, 0));
11007   assign(t1, getYMMRegLane128(rE, 1));
11008   assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
11009   assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
11010   putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
11011   DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
11012   delta += 1;
11013   return delta;
11014}
11015
11016
11017/* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
11018   relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
11019/* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
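/* Example (a sketch): with dV = d3:d2:d1:d0 and sV = s3:s2:s1:s0 as
   32-bit lanes, high to low, UNPCKLPS (xIsH == False) yields
   s1:d1:s0:d0 and UNPCKHPS (xIsH == True) yields s3:d3:s2:d2. */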
11020static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
11021{
11022   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11023   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11024   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11025   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11026   IRTemp res = newTemp(Ity_V128);
11027   assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
11028                     : mkV128from32s( s1, d1, s0, d0 ));
11029   return res;
11030}
11031
11032
11033/* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
11034/* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
11035static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
11036{
11037   IRTemp s1 = newTemp(Ity_I64);
11038   IRTemp s0 = newTemp(Ity_I64);
11039   IRTemp d1 = newTemp(Ity_I64);
11040   IRTemp d0 = newTemp(Ity_I64);
11041   assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
11042   assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
11043   assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
11044   assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
11045   IRTemp res = newTemp(Ity_V128);
11046   assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
11047                    : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
11048   return res;
11049}
11050
11051
11052/* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
11053   Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
11054   or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
11055   way. */
11056static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
11057{
11058   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11059   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11060   breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
11061   breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
11062   IRTemp res = newTemp(Ity_V256);
11063   assign(res, xIsH
11064               ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
11065                                            mkexpr(s1), mkexpr(d1))
11066               : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
11067                                            mkexpr(s0), mkexpr(d0)));
11068   return res;
11069}
11070
11071
11072/* FIXME: this is really bad.  Surely can do something better here?
11073   One observation is that the steering in the upper and lower 128 bit
11074   halves is the same as with math_UNPCKxPS_128, so we simply split
11075   into two halves, and use that.  Consequently any improvement in
11076   math_UNPCKxPS_128 (probably, to use interleave-style primops)
11077   benefits this too. */
11078static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
11079{
11080   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11081   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11082   breakupV256toV128s( sV, &sVhi, &sVlo );
11083   breakupV256toV128s( dV, &dVhi, &dVlo );
11084   IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
11085   IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
11086   IRTemp rV   = newTemp(Ity_V256);
11087   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11088   return rV;
11089}
11090
11091
11092static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11093{
11094   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11095   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11096   vassert(imm8 < 256);
11097
11098   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11099   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11100
11101#  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
11102#  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11103   IRTemp res = newTemp(Ity_V128);
11104   assign(res,
11105          mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
11106                         SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
11107#  undef SELD
11108#  undef SELS
11109   return res;
11110}
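
/* A worked example for the above (a sketch): SHUFPS with imm8 = 0x4E
   (binary 01 00 11 10) gives lane0 = d2 and lane1 = d3 (the low two
   imm8 fields select from dV), and lane2 = s0 and lane3 = s1 (the
   high two fields select from sV). */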
11111
11112
11113/* 256-bit SHUFPS appears to steer each of the 128-bit halves
11114   identically.  Hence do the clueless thing and use math_SHUFPS_128
11115   twice. */
11116static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11117{
11118   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11119   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11120   breakupV256toV128s( sV, &sVhi, &sVlo );
11121   breakupV256toV128s( dV, &dVhi, &dVlo );
11122   IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
11123   IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
11124   IRTemp rV   = newTemp(Ity_V256);
11125   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11126   return rV;
11127}
11128
11129
11130static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11131{
11132   IRTemp s1 = newTemp(Ity_I64);
11133   IRTemp s0 = newTemp(Ity_I64);
11134   IRTemp d1 = newTemp(Ity_I64);
11135   IRTemp d0 = newTemp(Ity_I64);
11136
11137   assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
11138   assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
11139   assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
11140   assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
11141
11142#  define SELD(n) mkexpr((n)==0 ? d0 : d1)
11143#  define SELS(n) mkexpr((n)==0 ? s0 : s1)
11144
11145   IRTemp res = newTemp(Ity_V128);
11146   assign(res, binop( Iop_64HLtoV128,
11147                      SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
11148
11149#  undef SELD
11150#  undef SELS
11151   return res;
11152}
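
/* For instance (a sketch): imm8 = 1 selects d1 for the low lane and
   s0 for the high lane, so res = s0:d1. */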
11153
11154
11155static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11156{
11157   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11158   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11159   breakupV256toV128s( sV, &sVhi, &sVlo );
11160   breakupV256toV128s( dV, &dVhi, &dVlo );
11161   IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
11162   IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
11163   IRTemp rV   = newTemp(Ity_V256);
11164   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11165   return rV;
11166}
11167
11168
11169static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11170{
11171   UShort imm8_mask_16;
11172   IRTemp imm8_mask = newTemp(Ity_V128);
11173
11174   switch( imm8 & 3 ) {
11175      case 0:  imm8_mask_16 = 0x0000; break;
11176      case 1:  imm8_mask_16 = 0x00FF; break;
11177      case 2:  imm8_mask_16 = 0xFF00; break;
11178      case 3:  imm8_mask_16 = 0xFFFF; break;
11179      default: vassert(0);            break;
11180   }
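   /* Note: mkV128's 16-bit immediate is a byte-level mask -- each bit
      expands to one byte of the 128-bit constant -- so 0x00FF selects
      the low 64-bit lane and 0xFF00 the high one. */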
11181   assign( imm8_mask, mkV128( imm8_mask_16 ) );
11182
11183   IRTemp res = newTemp(Ity_V128);
11184   assign ( res, binop( Iop_OrV128,
11185                        binop( Iop_AndV128, mkexpr(sV),
11186                                            mkexpr(imm8_mask) ),
11187                        binop( Iop_AndV128, mkexpr(dV),
11188                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
11189   return res;
11190}
11191
11192
11193static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11194{
11195   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11196   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11197   breakupV256toV128s( sV, &sVhi, &sVlo );
11198   breakupV256toV128s( dV, &dVhi, &dVlo );
11199   IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
11200   IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
11201   IRTemp rV   = newTemp(Ity_V256);
11202   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11203   return rV;
11204}
11205
11206
11207static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11208{
11209   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
11210                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
11211                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
11212                             0xFFFF };
11213   IRTemp imm8_mask = newTemp(Ity_V128);
11214   assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
11215
11216   IRTemp res = newTemp(Ity_V128);
11217   assign ( res, binop( Iop_OrV128,
11218                        binop( Iop_AndV128, mkexpr(sV),
11219                                            mkexpr(imm8_mask) ),
11220                        binop( Iop_AndV128, mkexpr(dV),
11221                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
11222   return res;
11223}
11224
11225
11226static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11227{
11228   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11229   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11230   breakupV256toV128s( sV, &sVhi, &sVlo );
11231   breakupV256toV128s( dV, &dVhi, &dVlo );
11232   IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
11233   IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
11234   IRTemp rV   = newTemp(Ity_V256);
11235   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11236   return rV;
11237}
11238
11239
11240static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11241{
   /* Make imm16 be a 16-bit version of imm8, formed by duplicating
      each bit in imm8. */
11244   Int i;
11245   UShort imm16 = 0;
11246   for (i = 0; i < 8; i++) {
11247      if (imm8 & (1 << i))
11248         imm16 |= (3 << (2*i));
11249   }
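   /* A worked example (sketch): imm8 == 0xA5 (10100101b) yields
      imm16 == 0xCC33, i.e. words 0, 2, 5 and 7 are taken from sV. */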
11250   IRTemp imm16_mask = newTemp(Ity_V128);
11251   assign( imm16_mask, mkV128( imm16 ));
11252
11253   IRTemp res = newTemp(Ity_V128);
11254   assign ( res, binop( Iop_OrV128,
11255                        binop( Iop_AndV128, mkexpr(sV),
11256                                            mkexpr(imm16_mask) ),
11257                        binop( Iop_AndV128, mkexpr(dV),
11258                               unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
11259   return res;
11260}
11261
11262
11263static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
11264{
11265   /* This is a really poor translation -- could be improved if
11266      performance critical */
11267   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11268   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11269   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11270   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11271   IRTemp res = newTemp(Ity_V128);
11272   assign(res, binop(Iop_64HLtoV128,
11273                     binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
11274                     binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
11275   return res;
11276}
11277
11278
11279static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
11280{
11281   /* This is a really poor translation -- could be improved if
11282      performance critical */
11283   IRTemp sHi, sLo, dHi, dLo;
11284   sHi = sLo = dHi = dLo = IRTemp_INVALID;
11285   breakupV256toV128s( dV, &dHi, &dLo);
11286   breakupV256toV128s( sV, &sHi, &sLo);
11287   IRTemp res = newTemp(Ity_V256);
11288   assign(res, binop(Iop_V128HLtoV256,
11289                     mkexpr(math_PMULUDQ_128(sHi, dHi)),
11290                     mkexpr(math_PMULUDQ_128(sLo, dLo))));
11291   return res;
11292}
11293
11294
11295static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
11296{
11297   /* This is a really poor translation -- could be improved if
11298      performance critical */
11299   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11300   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11301   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11302   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11303   IRTemp res = newTemp(Ity_V128);
11304   assign(res, binop(Iop_64HLtoV128,
11305                     binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
11306                     binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
11307   return res;
11308}
11309
11310
11311static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
11312{
11313   /* This is a really poor translation -- could be improved if
11314      performance critical */
11315   IRTemp sHi, sLo, dHi, dLo;
11316   sHi = sLo = dHi = dLo = IRTemp_INVALID;
11317   breakupV256toV128s( dV, &dHi, &dLo);
11318   breakupV256toV128s( sV, &sHi, &sLo);
11319   IRTemp res = newTemp(Ity_V256);
11320   assign(res, binop(Iop_V128HLtoV256,
11321                     mkexpr(math_PMULDQ_128(sHi, dHi)),
11322                     mkexpr(math_PMULDQ_128(sLo, dLo))));
11323   return res;
11324}
11325
11326
11327static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
11328{
11329   IRTemp sVhi, sVlo, dVhi, dVlo;
11330   IRTemp resHi = newTemp(Ity_I64);
11331   IRTemp resLo = newTemp(Ity_I64);
11332   sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
11333   breakupV128to64s( sV, &sVhi, &sVlo );
11334   breakupV128to64s( dV, &dVhi, &dVlo );
11335   assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
11336                                "amd64g_calculate_mmx_pmaddwd",
11337                                &amd64g_calculate_mmx_pmaddwd,
11338                                mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
11339   assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
11340                                "amd64g_calculate_mmx_pmaddwd",
11341                                &amd64g_calculate_mmx_pmaddwd,
11342                                mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
11343   IRTemp res = newTemp(Ity_V128);
   assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)) );
11345   return res;
11346}
11347
11348
11349static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
11350{
11351   IRTemp sHi, sLo, dHi, dLo;
11352   sHi = sLo = dHi = dLo = IRTemp_INVALID;
11353   breakupV256toV128s( dV, &dHi, &dLo);
11354   breakupV256toV128s( sV, &sHi, &sLo);
11355   IRTemp res = newTemp(Ity_V256);
11356   assign(res, binop(Iop_V128HLtoV256,
11357                     mkexpr(math_PMADDWD_128(dHi, sHi)),
11358                     mkexpr(math_PMADDWD_128(dLo, sLo))));
11359   return res;
11360}
11361
11362
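/* ADDSUBPD/ADDSUBPS semantics (a sketch): even-numbered lanes get
   d - s and odd-numbered lanes get d + s, hence the recombination of
   alternating lanes from the full add and sub vectors below. */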
11363static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
11364{
11365   IRTemp addV = newTemp(Ity_V128);
11366   IRTemp subV = newTemp(Ity_V128);
11367   IRTemp a1   = newTemp(Ity_I64);
11368   IRTemp s0   = newTemp(Ity_I64);
11369   IRTemp rm   = newTemp(Ity_I32);
11370
11371   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11372   assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11373   assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11374
11375   assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
11376   assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
11377
11378   IRTemp res = newTemp(Ity_V128);
11379   assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
11380   return res;
11381}
11382
11383
11384static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
11385{
11386   IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11387   IRTemp addV = newTemp(Ity_V256);
11388   IRTemp subV = newTemp(Ity_V256);
11389   IRTemp rm   = newTemp(Ity_I32);
11390   a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11391
11392   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11393   assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11394   assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11395
11396   breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
11397   breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
11398
11399   IRTemp res = newTemp(Ity_V256);
11400   assign( res, mkV256from64s( a3, s2, a1, s0 ) );
11401   return res;
11402}
11403
11404
11405static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
11406{
11407   IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11408   IRTemp addV = newTemp(Ity_V128);
11409   IRTemp subV = newTemp(Ity_V128);
11410   IRTemp rm   = newTemp(Ity_I32);
11411   a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11412
11413   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11414   assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11415   assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11416
11417   breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
11418   breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
11419
11420   IRTemp res = newTemp(Ity_V128);
11421   assign( res, mkV128from32s( a3, s2, a1, s0 ) );
11422   return res;
11423}
11424
11425
11426static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
11427{
11428   IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
11429   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
11430   IRTemp addV = newTemp(Ity_V256);
11431   IRTemp subV = newTemp(Ity_V256);
11432   IRTemp rm   = newTemp(Ity_I32);
11433   a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
11434   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11435
11436   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11437   assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11438   assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11439
11440   breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
11441   breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
11442
11443   IRTemp res = newTemp(Ity_V256);
11444   assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
11445   return res;
11446}
11447
11448
11449/* Handle 128 bit PSHUFLW and PSHUFHW. */
11450static Long dis_PSHUFxW_128 ( const VexAbiInfo* vbi, Prefix pfx,
11451                              Long delta, Bool isAvx, Bool xIsH )
11452{
11453   IRTemp addr  = IRTemp_INVALID;
11454   Int    alen  = 0;
11455   HChar  dis_buf[50];
11456   UChar  modrm = getUChar(delta);
11457   UInt   rG = gregOfRexRM(pfx,modrm);
11458   UInt   imm8;
11459   IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
11460   s3 = s2 = s1 = s0 = IRTemp_INVALID;
11461   sV    = newTemp(Ity_V128);
11462   dV    = newTemp(Ity_V128);
11463   sVmut = newTemp(Ity_I64);
11464   dVmut = newTemp(Ity_I64);
11465   sVcon = newTemp(Ity_I64);
11466   if (epartIsReg(modrm)) {
11467      UInt rE = eregOfRexRM(pfx,modrm);
11468      assign( sV, getXMMReg(rE) );
11469      imm8 = (UInt)getUChar(delta+1);
11470      delta += 1+1;
11471      DIP("%spshuf%cw $%u,%s,%s\n",
11472          isAvx ? "v" : "", xIsH ? 'h' : 'l',
11473          imm8, nameXMMReg(rE), nameXMMReg(rG));
11474   } else {
11475      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
11476      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11477      imm8 = (UInt)getUChar(delta+alen);
11478      delta += alen+1;
11479      DIP("%spshuf%cw $%u,%s,%s\n",
11480          isAvx ? "v" : "", xIsH ? 'h' : 'l',
11481          imm8, dis_buf, nameXMMReg(rG));
11482   }
11483
11484   /* Get the to-be-changed (mut) and unchanging (con) bits of the
11485      source. */
11486   assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
11487   assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );
11488
11489   breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
11490#  define SEL(n) \
11491             ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11492   assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
11493                              SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
11494#  undef SEL
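   /* Worked example (a sketch): imm8 == 0x1B (00 01 10 11b) reverses
      the four 16-bit words of the mutable half, since imm8 fields
      3..0 select source words 0,1,2,3 respectively. */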
11495
11496   assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
11497                   : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );
11498
11499   (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
11500   return delta;
11501}
11502
11503
11504/* Handle 256 bit PSHUFLW and PSHUFHW. */
11505static Long dis_PSHUFxW_256 ( const VexAbiInfo* vbi, Prefix pfx,
11506                              Long delta, Bool xIsH )
11507{
11508   IRTemp addr  = IRTemp_INVALID;
11509   Int    alen  = 0;
11510   HChar  dis_buf[50];
11511   UChar  modrm = getUChar(delta);
11512   UInt   rG = gregOfRexRM(pfx,modrm);
11513   UInt   imm8;
11514   IRTemp sV, s[8], sV64[4], dVhi, dVlo;
11515   sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
11516   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
11517   sV    = newTemp(Ity_V256);
11518   dVhi  = newTemp(Ity_I64);
11519   dVlo  = newTemp(Ity_I64);
11520   if (epartIsReg(modrm)) {
11521      UInt rE = eregOfRexRM(pfx,modrm);
11522      assign( sV, getYMMReg(rE) );
11523      imm8 = (UInt)getUChar(delta+1);
11524      delta += 1+1;
11525      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
11526          imm8, nameYMMReg(rE), nameYMMReg(rG));
11527   } else {
11528      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
11529      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
11530      imm8 = (UInt)getUChar(delta+alen);
11531      delta += alen+1;
11532      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
11533          imm8, dis_buf, nameYMMReg(rG));
11534   }
11535
11536   breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
11537   breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
11538   breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );
11539
11540   assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
11541                              s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
11542   assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
11543                              s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
11544   putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
11545                                 xIsH ? sV64[2] : dVhi,
11546                                 xIsH ? dVlo : sV64[1],
11547                                 xIsH ? sV64[0] : dVlo ) );
11548   return delta;
11549}
11550
11551
11552static Long dis_PEXTRW_128_EregOnly_toG ( const VexAbiInfo* vbi, Prefix pfx,
11553                                          Long delta, Bool isAvx )
11554{
11555   Long   deltaIN = delta;
11556   UChar  modrm   = getUChar(delta);
11557   UInt   rG      = gregOfRexRM(pfx,modrm);
11558   IRTemp sV      = newTemp(Ity_V128);
11559   IRTemp d16     = newTemp(Ity_I16);
11560   UInt   imm8;
11561   IRTemp s0, s1, s2, s3;
11562   if (epartIsReg(modrm)) {
11563      UInt rE = eregOfRexRM(pfx,modrm);
11564      assign(sV, getXMMReg(rE));
11565      imm8 = getUChar(delta+1) & 7;
11566      delta += 1+1;
11567      DIP("%spextrw $%u,%s,%s\n", isAvx ? "v" : "",
11568          imm8, nameXMMReg(rE), nameIReg32(rG));
11569   } else {
11570      /* The memory case is disallowed, apparently. */
11571      return deltaIN; /* FAIL */
11572   }
11573   s3 = s2 = s1 = s0 = IRTemp_INVALID;
11574   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11575   switch (imm8) {
11576      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
11577      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
11578      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
11579      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
11580      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
11581      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
11582      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
11583      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
11584      default: vassert(0);
11585   }
11586   putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
11587   return delta;
11588}
11589
11590
11591static Long dis_CVTDQ2PD_128 ( const VexAbiInfo* vbi, Prefix pfx,
11592                               Long delta, Bool isAvx )
11593{
11594   IRTemp addr  = IRTemp_INVALID;
11595   Int    alen  = 0;
11596   HChar  dis_buf[50];
11597   UChar  modrm = getUChar(delta);
11598   IRTemp arg64 = newTemp(Ity_I64);
11599   UInt   rG    = gregOfRexRM(pfx,modrm);
11600   const HChar* mbV   = isAvx ? "v" : "";
11601   if (epartIsReg(modrm)) {
11602      UInt rE = eregOfRexRM(pfx,modrm);
11603      assign( arg64, getXMMRegLane64(rE, 0) );
11604      delta += 1;
11605      DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
11606   } else {
11607      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11608      assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
11609      delta += alen;
11610      DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
11611   }
11612   putXMMRegLane64F(
11613      rG, 0,
11614      unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
11615   );
11616   putXMMRegLane64F(
11617      rG, 1,
11618      unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
11619   );
11620   if (isAvx)
11621      putYMMRegLane128(rG, 1, mkV128(0));
11622   return delta;
11623}
11624
11625
11626static Long dis_STMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
11627                          Long delta, Bool isAvx )
11628{
11629   IRTemp addr  = IRTemp_INVALID;
11630   Int    alen  = 0;
11631   HChar  dis_buf[50];
11632   UChar  modrm = getUChar(delta);
11633   vassert(!epartIsReg(modrm)); /* ensured by caller */
11634   vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */
11635
11636   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11637   delta += alen;
11638
11639   /* Fake up a native SSE mxcsr word.  The only thing it depends on
11640      is SSEROUND[1:0], so call a clean helper to cook it up.
11641   */
   /* ULong amd64g_create_mxcsr ( ULong sseround ) */
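   /* (In the native format, the rounding-control field occupies MXCSR
      bits 14:13; presumably the helper folds SSEROUND into that field
      and gives the remaining fields their reset values.) */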
11643   DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
11644   storeLE(
11645      mkexpr(addr),
11646      unop(Iop_64to32,
11647           mkIRExprCCall(
11648              Ity_I64, 0/*regp*/,
11649              "amd64g_create_mxcsr", &amd64g_create_mxcsr,
11650              mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
11651           )
11652      )
11653   );
11654   return delta;
11655}
11656
11657
11658static Long dis_LDMXCSR ( const VexAbiInfo* vbi, Prefix pfx,
11659                          Long delta, Bool isAvx )
11660{
11661   IRTemp addr  = IRTemp_INVALID;
11662   Int    alen  = 0;
11663   HChar  dis_buf[50];
11664   UChar  modrm = getUChar(delta);
11665   vassert(!epartIsReg(modrm)); /* ensured by caller */
11666   vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */
11667
11668   IRTemp t64 = newTemp(Ity_I64);
11669   IRTemp ew  = newTemp(Ity_I32);
11670
11671   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11672   delta += alen;
11673   DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
11674
11675   /* The only thing we observe in %mxcsr is the rounding mode.
11676      Therefore, pass the 32-bit value (SSE native-format control
11677      word) to a clean helper, getting back a 64-bit value, the
11678      lower half of which is the SSEROUND value to store, and the
11679      upper half of which is the emulation-warning token which may
11680      be generated.
11681   */
   /* ULong amd64g_check_ldmxcsr ( ULong ); */
11683   assign( t64, mkIRExprCCall(
11684                   Ity_I64, 0/*regparms*/,
11685                   "amd64g_check_ldmxcsr",
11686                   &amd64g_check_ldmxcsr,
11687                   mkIRExprVec_1(
11688                      unop(Iop_32Uto64,
11689                           loadLE(Ity_I32, mkexpr(addr))
11690                      )
11691                   )
11692                )
11693         );
11694
11695   put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
11696   assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
11697   put_emwarn( mkexpr(ew) );
11698   /* Finally, if an emulation warning was reported, side-exit to
11699      the next insn, reporting the warning, so that Valgrind's
11700      dispatcher sees the warning. */
11701   stmt(
11702      IRStmt_Exit(
11703         binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
11704         Ijk_EmWarn,
11705         IRConst_U64(guest_RIP_bbstart+delta),
11706         OFFB_RIP
11707      )
11708   );
11709   return delta;
11710}
11711
11712
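/* A sketch of the (standard-format) XSAVE area layout, as assumed by
   the code below:

      bytes   0 .. 159 : x87 control/status and FP register state,
                         with MXCSR and MXCSR_MASK at bytes 24 .. 31
      bytes 160 .. 415 : XMM0 .. XMM15, 16 bytes each
      bytes 512 .. 575 : XSAVE header -- XSTATE_BV at 512 .. 519,
                         XCOMP_BV at 520 .. 527
      bytes 576 .. 831 : the upper 128-bit halves of YMM0 .. YMM15
*/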
11713static void gen_XSAVE_SEQUENCE ( IRTemp addr, IRTemp rfbm )
11714{
11715   /* ------ rfbm[0] gates the x87 state ------ */
11716
11717   /* Uses dirty helper:
11718         void amd64g_do_XSAVE_COMPONENT_0 ( VexGuestAMD64State*, ULong )
11719   */
11720   IRDirty* d0 = unsafeIRDirty_0_N (
11721                    0/*regparms*/,
11722                    "amd64g_dirtyhelper_XSAVE_COMPONENT_0",
11723                    &amd64g_dirtyhelper_XSAVE_COMPONENT_0,
11724                    mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
11725                 );
11726   d0->guard = binop(Iop_CmpEQ64, binop(Iop_And64, mkexpr(rfbm), mkU64(1)),
11727                     mkU64(1));
11728
11729   /* Declare we're writing memory.  Really, bytes 24 through 31
11730      (MXCSR and MXCSR_MASK) aren't written, but we can't express more
11731      than 1 memory area here, so just mark the whole thing as
11732      written. */
11733   d0->mFx   = Ifx_Write;
11734   d0->mAddr = mkexpr(addr);
11735   d0->mSize = 160;
11736
11737   /* declare we're reading guest state */
11738   d0->nFxState = 5;
11739   vex_bzero(&d0->fxState, sizeof(d0->fxState));
11740
11741   d0->fxState[0].fx     = Ifx_Read;
11742   d0->fxState[0].offset = OFFB_FTOP;
11743   d0->fxState[0].size   = sizeof(UInt);
11744
11745   d0->fxState[1].fx     = Ifx_Read;
11746   d0->fxState[1].offset = OFFB_FPREGS;
11747   d0->fxState[1].size   = 8 * sizeof(ULong);
11748
11749   d0->fxState[2].fx     = Ifx_Read;
11750   d0->fxState[2].offset = OFFB_FPTAGS;
11751   d0->fxState[2].size   = 8 * sizeof(UChar);
11752
11753   d0->fxState[3].fx     = Ifx_Read;
11754   d0->fxState[3].offset = OFFB_FPROUND;
11755   d0->fxState[3].size   = sizeof(ULong);
11756
11757   d0->fxState[4].fx     = Ifx_Read;
11758   d0->fxState[4].offset = OFFB_FC3210;
11759   d0->fxState[4].size   = sizeof(ULong);
11760
11761   stmt( IRStmt_Dirty(d0) );
11762
11763   /* ------ rfbm[1] gates the SSE state ------ */
11764
11765   IRTemp rfbm_1    = newTemp(Ity_I64);
11766   IRTemp rfbm_1or2 = newTemp(Ity_I64);
11767   assign(rfbm_1,    binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
11768   assign(rfbm_1or2, binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
11769
11770   IRExpr* guard_1    = binop(Iop_CmpEQ64, mkexpr(rfbm_1),    mkU64(2));
11771   IRExpr* guard_1or2 = binop(Iop_CmpNE64, mkexpr(rfbm_1or2), mkU64(0));
11772
11773   /* Uses dirty helper:
11774         void amd64g_do_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS
11775                 ( VexGuestAMD64State*, ULong )
      This creates only MXCSR and MXCSR_MASK.  We need to do this if
      either component 1 (SSE) or component 2 (AVX) is requested.
      Hence the guard condition is a bit more complex.
11779   */
11780   IRDirty* d1 = unsafeIRDirty_0_N (
11781                    0/*regparms*/,
11782                    "amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS",
11783                    &amd64g_dirtyhelper_XSAVE_COMPONENT_1_EXCLUDING_XMMREGS,
11784                    mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
11785                 );
11786   d1->guard = guard_1or2;
11787
   /* Declare we're writing memory: MXCSR and MXCSR_MASK.  Note that
      the code for rfbm[0] just above claims a write of 0 .. 159, so
      this duplicates it.  But it at least correctly connects bytes
      24 .. 31 to the MXCSR guest state representation (the SSEROUND
      field). */
11792   d1->mFx   = Ifx_Write;
11793   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
11794   d1->mSize = 8;
11795
11796   /* declare we're reading guest state */
11797   d1->nFxState = 1;
11798   vex_bzero(&d1->fxState, sizeof(d1->fxState));
11799
11800   d1->fxState[0].fx     = Ifx_Read;
11801   d1->fxState[0].offset = OFFB_SSEROUND;
11802   d1->fxState[0].size   = sizeof(ULong);
11803
11804   /* Call the helper.  This creates MXCSR and MXCSR_MASK but nothing
11805      else.  We do the actual register array, XMM[0..15], separately,
11806      in order that any undefinedness in the XMM registers is tracked
11807      separately by Memcheck and does not "infect" the in-memory
11808      shadow for the other parts of the image. */
11809   stmt( IRStmt_Dirty(d1) );
11810
11811   /* And now the XMMs themselves. */
11812   UInt reg;
11813   for (reg = 0; reg < 16; reg++) {
11814      stmt( IRStmt_StoreG(
11815               Iend_LE,
11816               binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16)),
11817               getXMMReg(reg),
11818               guard_1
11819      ));
11820   }
11821
11822   /* ------ rfbm[2] gates the AVX state ------ */
11823   /* Component 2 is just a bunch of register saves, so we'll do it
11824      inline, just to be simple and to be Memcheck friendly. */
11825
11826   IRTemp rfbm_2 = newTemp(Ity_I64);
11827   assign(rfbm_2, binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
11828
11829   IRExpr* guard_2 = binop(Iop_CmpEQ64, mkexpr(rfbm_2), mkU64(4));
11830
11831   for (reg = 0; reg < 16; reg++) {
11832      stmt( IRStmt_StoreG(
11833               Iend_LE,
11834               binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16)),
11835               getYMMRegLane128(reg,1),
11836               guard_2
11837      ));
11838   }
11839}
11840
11841
11842static Long dis_XSAVE ( const VexAbiInfo* vbi,
11843                        Prefix pfx, Long delta, Int sz )
11844{
11845   /* Note that the presence or absence of REX.W (indicated here by
11846      |sz|) slightly affects the written format: whether the saved FPU
11847      IP and DP pointers are 64 or 32 bits.  But the helper function
11848      we call simply writes zero bits in the relevant fields, which
11849      are 64 bits regardless of what REX.W is, and so it's good enough
11850      (iow, equally broken) in both cases. */
11851   IRTemp addr  = IRTemp_INVALID;
11852   Int    alen  = 0;
11853   HChar  dis_buf[50];
11854   UChar  modrm = getUChar(delta);
11855   vassert(!epartIsReg(modrm)); /* ensured by caller */
11856   vassert(sz == 4 || sz == 8); /* ditto */
11857
11858   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11859   delta += alen;
11860   gen_SEGV_if_not_64_aligned(addr);
11861
11862   DIP("%sxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
11863
11864   /* VEX's caller is assumed to have checked this. */
11865   const ULong aSSUMED_XCR0_VALUE = 7;
11866
11867   IRTemp rfbm = newTemp(Ity_I64);
11868   assign(rfbm,
11869          binop(Iop_And64,
11870                binop(Iop_Or64,
11871                      binop(Iop_Shl64,
11872                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
11873                      unop(Iop_32Uto64, getIRegRAX(4))),
11874                mkU64(aSSUMED_XCR0_VALUE)));
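   /* That is, RFBM = (RDX:RAX) & XCR0.  For example, xsave with
      edx:eax == 0:7 requests all of the x87, SSE and AVX state; any
      bits outside the assumed XCR0 value of 7 are simply ignored. */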
11875
11876   gen_XSAVE_SEQUENCE(addr, rfbm);
11877
11878   /* Finally, we need to update XSTATE_BV in the XSAVE header area, by
11879      OR-ing the RFBM value into it. */
11880   IRTemp addr_plus_512 = newTemp(Ity_I64);
11881   assign(addr_plus_512, binop(Iop_Add64, mkexpr(addr), mkU64(512)));
11882   storeLE( mkexpr(addr_plus_512),
11883            binop(Iop_Or8,
11884                  unop(Iop_64to8, mkexpr(rfbm)),
11885                  loadLE(Ity_I8, mkexpr(addr_plus_512))) );
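   /* An 8-bit OR suffices here: rfbm was masked down to 0 .. 7 above,
      so only the lowest byte of XSTATE_BV can ever change. */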
11886
11887   return delta;
11888}
11889
11890
11891static Long dis_FXSAVE ( const VexAbiInfo* vbi,
11892                         Prefix pfx, Long delta, Int sz )
11893{
11894   /* See comment in dis_XSAVE about the significance of REX.W. */
11895   IRTemp addr  = IRTemp_INVALID;
11896   Int    alen  = 0;
11897   HChar  dis_buf[50];
11898   UChar  modrm = getUChar(delta);
11899   vassert(!epartIsReg(modrm)); /* ensured by caller */
11900   vassert(sz == 4 || sz == 8); /* ditto */
11901
11902   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11903   delta += alen;
11904   gen_SEGV_if_not_16_aligned(addr);
11905
11906   DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
11907
   /* FXSAVE is just XSAVE with components 0 and 1 selected.  Set rfbm
      to 0b011, generate the XSAVE sequence accordingly, and let iropt
      fold out the unused (AVX) parts. */
11911   IRTemp rfbm = newTemp(Ity_I64);
11912   assign(rfbm, mkU64(3));
11913   gen_XSAVE_SEQUENCE(addr, rfbm);
11914
11915   return delta;
11916}
11917
11918
11919static void gen_XRSTOR_SEQUENCE ( IRTemp addr, IRTemp xstate_bv, IRTemp rfbm )
11920{
11921   /* ------ rfbm[0] gates the x87 state ------ */
11922
11923   /* If rfbm[0] == 1, we have to write the x87 state.  If
11924      xstate_bv[0] == 1, we will read it from the memory image, else
11925      we'll set it to initial values.  Doing this with a helper
11926      function and getting the definedness flow annotations correct is
11927      too difficult, so generate stupid but simple code: first set the
11928      registers to initial values, regardless of xstate_bv[0].  Then,
11929      conditionally restore from the memory image. */
11930
11931   IRTemp rfbm_0       = newTemp(Ity_I64);
11932   IRTemp xstate_bv_0  = newTemp(Ity_I64);
11933   IRTemp restore_0    = newTemp(Ity_I64);
11934   assign(rfbm_0,      binop(Iop_And64, mkexpr(rfbm), mkU64(1)));
11935   assign(xstate_bv_0, binop(Iop_And64, mkexpr(xstate_bv), mkU64(1)));
11936   assign(restore_0,   binop(Iop_And64, mkexpr(rfbm_0), mkexpr(xstate_bv_0)));
11937
11938   gen_FINIT_SEQUENCE( binop(Iop_CmpNE64, mkexpr(rfbm_0), mkU64(0)) );
11939
11940   /* Uses dirty helper:
11941         void amd64g_do_XRSTOR_COMPONENT_0 ( VexGuestAMD64State*, ULong )
11942   */
11943   IRDirty* d0 = unsafeIRDirty_0_N (
11944                    0/*regparms*/,
11945                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_0",
11946                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_0,
11947                    mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
11948                 );
11949   d0->guard = binop(Iop_CmpNE64, mkexpr(restore_0), mkU64(0));
11950
11951   /* Declare we're reading memory.  Really, bytes 24 through 31
11952      (MXCSR and MXCSR_MASK) aren't read, but we can't express more
11953      than 1 memory area here, so just mark the whole thing as
11954      read. */
11955   d0->mFx   = Ifx_Read;
11956   d0->mAddr = mkexpr(addr);
11957   d0->mSize = 160;
11958
11959   /* declare we're writing guest state */
11960   d0->nFxState = 5;
11961   vex_bzero(&d0->fxState, sizeof(d0->fxState));
11962
11963   d0->fxState[0].fx     = Ifx_Write;
11964   d0->fxState[0].offset = OFFB_FTOP;
11965   d0->fxState[0].size   = sizeof(UInt);
11966
11967   d0->fxState[1].fx     = Ifx_Write;
11968   d0->fxState[1].offset = OFFB_FPREGS;
11969   d0->fxState[1].size   = 8 * sizeof(ULong);
11970
11971   d0->fxState[2].fx     = Ifx_Write;
11972   d0->fxState[2].offset = OFFB_FPTAGS;
11973   d0->fxState[2].size   = 8 * sizeof(UChar);
11974
11975   d0->fxState[3].fx     = Ifx_Write;
11976   d0->fxState[3].offset = OFFB_FPROUND;
11977   d0->fxState[3].size   = sizeof(ULong);
11978
11979   d0->fxState[4].fx     = Ifx_Write;
11980   d0->fxState[4].offset = OFFB_FC3210;
11981   d0->fxState[4].size   = sizeof(ULong);
11982
11983   stmt( IRStmt_Dirty(d0) );
11984
11985   /* ------ rfbm[1] gates the SSE state ------ */
11986
11987   /* Same scheme as component 0: first zero it out, and then possibly
11988      restore from the memory area. */
11989   IRTemp rfbm_1       = newTemp(Ity_I64);
11990   IRTemp xstate_bv_1  = newTemp(Ity_I64);
11991   IRTemp restore_1    = newTemp(Ity_I64);
11992   assign(rfbm_1,      binop(Iop_And64, mkexpr(rfbm), mkU64(2)));
11993   assign(xstate_bv_1, binop(Iop_And64, mkexpr(xstate_bv), mkU64(2)));
11994   assign(restore_1,   binop(Iop_And64, mkexpr(rfbm_1), mkexpr(xstate_bv_1)));
11995   IRExpr* rfbm_1e     = binop(Iop_CmpNE64, mkexpr(rfbm_1),    mkU64(0));
11996   IRExpr* restore_1e  = binop(Iop_CmpNE64, mkexpr(restore_1), mkU64(0));
11997
11998   IRTemp rfbm_1or2       = newTemp(Ity_I64);
11999   IRTemp xstate_bv_1or2  = newTemp(Ity_I64);
12000   IRTemp restore_1or2    = newTemp(Ity_I64);
12001   assign(rfbm_1or2,      binop(Iop_And64, mkexpr(rfbm), mkU64(6)));
12002   assign(xstate_bv_1or2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(6)));
12003   assign(restore_1or2,   binop(Iop_And64, mkexpr(rfbm_1or2),
12004                                           mkexpr(xstate_bv_1or2)));
12005   IRExpr* rfbm_1or2e     = binop(Iop_CmpNE64, mkexpr(rfbm_1or2),    mkU64(0));
12006   IRExpr* restore_1or2e  = binop(Iop_CmpNE64, mkexpr(restore_1or2), mkU64(0));
12007
12008   /* The areas in question are: SSEROUND, and the XMM register array. */
12009   putGuarded(OFFB_SSEROUND, rfbm_1or2e, mkU64(Irrm_NEAREST));
12010
12011   UInt reg;
12012   for (reg = 0; reg < 16; reg++) {
12013      putGuarded(xmmGuestRegOffset(reg), rfbm_1e, mkV128(0));
12014   }
12015
12016   /* And now possibly restore from MXCSR/MXCSR_MASK */
12017   /* Uses dirty helper:
12018         void amd64g_do_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS
12019                 ( VexGuestAMD64State*, ULong )
      This restores from only MXCSR and MXCSR_MASK.  We need to do
      this if either component 1 (SSE) or component 2 (AVX) is
      requested.  Hence the guard condition is a bit more complex.
12023   */
12024   IRDirty* d1 = unsafeIRDirty_0_N (
12025                    0/*regparms*/,
12026                    "amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS",
12027                    &amd64g_dirtyhelper_XRSTOR_COMPONENT_1_EXCLUDING_XMMREGS,
12028                    mkIRExprVec_2( IRExpr_GSPTR(), mkexpr(addr) )
                 );
12030   d1->guard = restore_1or2e;
12031
   /* Declare we're reading memory: MXCSR and MXCSR_MASK.  Note that
      the code for rfbm[0] just above claims a read of 0 .. 159, so
      this duplicates it.  But it at least correctly connects bytes
      24 .. 31 to the MXCSR guest state representation (the SSEROUND
      field). */
12036   d1->mFx   = Ifx_Read;
12037   d1->mAddr = binop(Iop_Add64, mkexpr(addr), mkU64(24));
12038   d1->mSize = 8;
12039
12040   /* declare we're writing guest state */
12041   d1->nFxState = 1;
12042   vex_bzero(&d1->fxState, sizeof(d1->fxState));
12043
12044   d1->fxState[0].fx     = Ifx_Write;
12045   d1->fxState[0].offset = OFFB_SSEROUND;
12046   d1->fxState[0].size   = sizeof(ULong);
12047
12048   /* Call the helper.  This creates SSEROUND but nothing
12049      else.  We do the actual register array, XMM[0..15], separately,
12050      in order that any undefinedness in the XMM registers is tracked
12051      separately by Memcheck and is not "infected" by the in-memory
12052      shadow for the other parts of the image. */
12053   stmt( IRStmt_Dirty(d1) );
12054
   /* And now the XMMs themselves.  For each register, we PUT either
      its old value, or the value loaded from memory.  One convenient
      way to do that is with a conditional load whose default value is
      the old value of the register. */
12059   for (reg = 0; reg < 16; reg++) {
12060      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(160 + reg * 16));
12061      IRExpr* alt = getXMMReg(reg);
12062      IRTemp  loadedValue = newTemp(Ity_V128);
12063      stmt( IRStmt_LoadG(Iend_LE,
12064                         ILGop_IdentV128,
12065                         loadedValue, ea, alt, restore_1e) );
12066      putXMMReg(reg, mkexpr(loadedValue));
12067   }
12068
12069   /* ------ rfbm[2] gates the AVX state ------ */
12070   /* Component 2 is just a bunch of register loads, so we'll do it
12071      inline, just to be simple and to be Memcheck friendly. */
12072
12073   /* Same scheme as component 0: first zero it out, and then possibly
12074      restore from the memory area. */
12075   IRTemp rfbm_2      = newTemp(Ity_I64);
12076   IRTemp xstate_bv_2 = newTemp(Ity_I64);
12077   IRTemp restore_2   = newTemp(Ity_I64);
12078   assign(rfbm_2,      binop(Iop_And64, mkexpr(rfbm), mkU64(4)));
12079   assign(xstate_bv_2, binop(Iop_And64, mkexpr(xstate_bv), mkU64(4)));
12080   assign(restore_2,   binop(Iop_And64, mkexpr(rfbm_2), mkexpr(xstate_bv_2)));
12081
12082   IRExpr* rfbm_2e    = binop(Iop_CmpNE64, mkexpr(rfbm_2),    mkU64(0));
12083   IRExpr* restore_2e = binop(Iop_CmpNE64, mkexpr(restore_2), mkU64(0));
12084
12085   for (reg = 0; reg < 16; reg++) {
12086      putGuarded(ymmGuestRegLane128offset(reg, 1), rfbm_2e, mkV128(0));
12087   }
12088
12089   for (reg = 0; reg < 16; reg++) {
12090      IRExpr* ea  = binop(Iop_Add64, mkexpr(addr), mkU64(576 + reg * 16));
12091      IRExpr* alt = getYMMRegLane128(reg, 1);
12092      IRTemp  loadedValue = newTemp(Ity_V128);
12093      stmt( IRStmt_LoadG(Iend_LE,
12094                         ILGop_IdentV128,
12095                         loadedValue, ea, alt, restore_2e) );
12096      putYMMRegLane128(reg, 1, mkexpr(loadedValue));
12097   }
12098}
12099
12100
12101static Long dis_XRSTOR ( const VexAbiInfo* vbi,
12102                         Prefix pfx, Long delta, Int sz )
12103{
   /* As with XSAVE above, we ignore the value of REX.W since we're
      not bothering with the FPU DP and IP fields. */
12106   IRTemp addr  = IRTemp_INVALID;
12107   Int    alen  = 0;
12108   HChar  dis_buf[50];
12109   UChar  modrm = getUChar(delta);
12110   vassert(!epartIsReg(modrm)); /* ensured by caller */
12111   vassert(sz == 4 || sz == 8); /* ditto */
12112
12113   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12114   delta += alen;
12115   gen_SEGV_if_not_64_aligned(addr);
12116
12117   DIP("%sxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
12118
12119   /* VEX's caller is assumed to have checked this. */
12120   const ULong aSSUMED_XCR0_VALUE = 7;
12121
12122   IRTemp rfbm = newTemp(Ity_I64);
12123   assign(rfbm,
12124          binop(Iop_And64,
12125                binop(Iop_Or64,
12126                      binop(Iop_Shl64,
12127                            unop(Iop_32Uto64, getIRegRDX(4)), mkU8(32)),
12128                      unop(Iop_32Uto64, getIRegRAX(4))),
12129                mkU64(aSSUMED_XCR0_VALUE)));
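   /* That is, rfbm = ((RDX << 32) | RAX) & XCR0, with XCR0 assumed to
      be 7 (x87, SSE and AVX enabled, nothing else). */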
12130
12131   IRTemp xstate_bv = newTemp(Ity_I64);
12132   assign(xstate_bv, loadLE(Ity_I64,
12133                            binop(Iop_Add64, mkexpr(addr), mkU64(512+0))));
12134
12135   IRTemp xcomp_bv = newTemp(Ity_I64);
12136   assign(xcomp_bv, loadLE(Ity_I64,
12137                           binop(Iop_Add64, mkexpr(addr), mkU64(512+8))));
12138
12139   IRTemp xsavehdr_23_16 = newTemp(Ity_I64);
12140   assign( xsavehdr_23_16,
12141           loadLE(Ity_I64,
12142                  binop(Iop_Add64, mkexpr(addr), mkU64(512+16))));
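   /* Layout of the (standard-format, non-compacted) XSAVE area as
      assumed here:
         bytes   0 .. 159   legacy x87/SSE region (MXCSR and
                            MXCSR_MASK at bytes 24 .. 31)
         bytes 160 .. 415   XMM0 .. XMM15
         bytes 512 .. 519   XSTATE_BV
         bytes 520 .. 527   XCOMP_BV
         bytes 528 .. 575   header padding, must be zero
         bytes 576 .. 831   upper 128-bit halves of YMM0 .. YMM15 */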
12143
12144   /* We must fault if
12145      * xcomp_bv[63] == 1, since this simulated CPU does not support
12146        the compaction extension.
12147      * xstate_bv sets a bit outside of XCR0 (which we assume to be 7).
12148      * any of the xsave header bytes 23 .. 8 are nonzero.  This seems to
12149        imply that xcomp_bv must be zero.
12150      xcomp_bv is header bytes 15 .. 8 and xstate_bv is header bytes 7 .. 0
12151   */
12152   IRTemp fault_if_nonzero = newTemp(Ity_I64);
12153   assign(fault_if_nonzero,
12154          binop(Iop_Or64,
12155                binop(Iop_And64, mkexpr(xstate_bv), mkU64(~aSSUMED_XCR0_VALUE)),
12156                binop(Iop_Or64, mkexpr(xcomp_bv), mkexpr(xsavehdr_23_16))));
12157   stmt( IRStmt_Exit(binop(Iop_CmpNE64, mkexpr(fault_if_nonzero), mkU64(0)),
12158                     Ijk_SigSEGV,
12159                     IRConst_U64(guest_RIP_curr_instr),
12160                     OFFB_RIP
12161   ));
12162
12163   /* We are guaranteed now that both xstate_bv and rfbm are in the
12164      range 0 .. 7.  Generate the restore sequence proper. */
12165   gen_XRSTOR_SEQUENCE(addr, xstate_bv, rfbm);
12166
12167   return delta;
12168}
12169
12170
12171static Long dis_FXRSTOR ( const VexAbiInfo* vbi,
12172                          Prefix pfx, Long delta, Int sz )
12173{
12174   /* As with FXSAVE above we ignore the value of REX.W since we're
12175      not bothering with the FPU DP and IP fields. */
12176   IRTemp addr  = IRTemp_INVALID;
12177   Int    alen  = 0;
12178   HChar  dis_buf[50];
12179   UChar  modrm = getUChar(delta);
12180   vassert(!epartIsReg(modrm)); /* ensured by caller */
12181   vassert(sz == 4 || sz == 8); /* ditto */
12182
12183   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12184   delta += alen;
12185   gen_SEGV_if_not_16_aligned(addr);
12186
12187   DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
12188
   /* FXRSTOR is just XRSTOR with components 0 and 1 selected, and also
      as if components 0 and 1 are marked as present in XSTATE_BV in
      the XSAVE header.  So set both rfbm and xstate_bv to 0b011,
      generate the XRSTOR sequence accordingly, and let iropt fold out
      the unused (AVX) parts. */
12194   IRTemp three = newTemp(Ity_I64);
12195   assign(three, mkU64(3));
12196   gen_XRSTOR_SEQUENCE(addr, three/*xstate_bv*/, three/*rfbm*/);
12197
12198   return delta;
12199}
12200
12201
12202static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
12203{
   vassert(imm8 <= 7); // imm8 is unsigned, so no lower-bound check needed
12205
12206   // Create a V128 value which has the selected word in the
12207   // specified lane, and zeroes everywhere else.
12208   IRTemp tmp128    = newTemp(Ity_V128);
12209   IRTemp halfshift = newTemp(Ity_I64);
12210   assign(halfshift, binop(Iop_Shl64,
12211                           unop(Iop_16Uto64, mkexpr(u16)),
12212                           mkU8(16 * (imm8 & 3))));
12213   if (imm8 < 4) {
12214      assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
12215   } else {
12216      assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
12217   }
12218
12219   UShort mask = ~(3 << (imm8 * 2));
12220   IRTemp res  = newTemp(Ity_V128);
12221   assign( res, binop(Iop_OrV128,
12222                      mkexpr(tmp128),
12223                      binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
12224   return res;
12225}
12226
12227
12228static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
12229{
12230   IRTemp s1, s0, d1, d0;
12231   s1 = s0 = d1 = d0 = IRTemp_INVALID;
12232
12233   breakupV128to64s( sV, &s1, &s0 );
12234   breakupV128to64s( dV, &d1, &d0 );
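   /* Each 64-bit half is computed by a clean helper call implementing
      the MMX PSADBW semantics: the sum of absolute differences of the
      eight byte pairs, zero-extended from 16 to 64 bits. */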
12235
12236   IRTemp res = newTemp(Ity_V128);
12237   assign( res,
12238           binop(Iop_64HLtoV128,
12239                 mkIRExprCCall(Ity_I64, 0/*regparms*/,
12240                               "amd64g_calculate_mmx_psadbw",
12241                               &amd64g_calculate_mmx_psadbw,
12242                               mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
12243                 mkIRExprCCall(Ity_I64, 0/*regparms*/,
12244                               "amd64g_calculate_mmx_psadbw",
12245                               &amd64g_calculate_mmx_psadbw,
12246                               mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
12247   return res;
12248}
12249
12250
12251static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
12252{
12253   IRTemp sHi, sLo, dHi, dLo;
12254   sHi = sLo = dHi = dLo = IRTemp_INVALID;
12255   breakupV256toV128s( dV, &dHi, &dLo);
12256   breakupV256toV128s( sV, &sHi, &sLo);
12257   IRTemp res = newTemp(Ity_V256);
12258   assign(res, binop(Iop_V128HLtoV256,
12259                     mkexpr(math_PSADBW_128(dHi, sHi)),
12260                     mkexpr(math_PSADBW_128(dLo, sLo))));
12261   return res;
12262}
12263
12264
12265static Long dis_MASKMOVDQU ( const VexAbiInfo* vbi, Prefix pfx,
12266                             Long delta, Bool isAvx )
12267{
12268   IRTemp regD    = newTemp(Ity_V128);
12269   IRTemp mask    = newTemp(Ity_V128);
12270   IRTemp olddata = newTemp(Ity_V128);
12271   IRTemp newdata = newTemp(Ity_V128);
12272   IRTemp addr    = newTemp(Ity_I64);
12273   UChar  modrm   = getUChar(delta);
12274   UInt   rG      = gregOfRexRM(pfx,modrm);
12275   UInt   rE      = eregOfRexRM(pfx,modrm);
12276
12277   assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
12278   assign( regD, getXMMReg( rG ));
12279
12280   /* Unfortunately can't do the obvious thing with SarN8x16
12281      here since that can't be re-emitted as SSE2 code - no such
12282      insn. */
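   /* Instead, an arithmetic right-shift of each 64-bit half by 7 via
      SarN8x8 replicates every byte's sign bit across that byte,
      yielding an all-zeroes/all-ones per-byte mask. */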
12283   assign( mask,
12284           binop(Iop_64HLtoV128,
12285                 binop(Iop_SarN8x8,
12286                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ),
12287                       mkU8(7) ),
12288                 binop(Iop_SarN8x8,
12289                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ),
12290                       mkU8(7) ) ));
12291   assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
12292   assign( newdata, binop(Iop_OrV128,
12293                          binop(Iop_AndV128,
12294                                mkexpr(regD),
12295                                mkexpr(mask) ),
12296                          binop(Iop_AndV128,
12297                                mkexpr(olddata),
12298                                unop(Iop_NotV128, mkexpr(mask)))) );
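   /* Merge as newdata = (regD & mask) | (olddata & ~mask): the
      selected bytes come from the source register, the rest from the
      old memory contents, and the full 16 bytes are stored back.  So
      this is a read-modify-write, not a true masked store. */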
12299   storeLE( mkexpr(addr), mkexpr(newdata) );
12300
12301   delta += 1;
12302   DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
12303       nameXMMReg(rE), nameXMMReg(rG) );
12304   return delta;
12305}
12306
12307
12308static Long dis_MOVMSKPS_128 ( const VexAbiInfo* vbi, Prefix pfx,
12309                               Long delta, Bool isAvx )
12310{
12311   UChar modrm = getUChar(delta);
12312   UInt   rG   = gregOfRexRM(pfx,modrm);
12313   UInt   rE   = eregOfRexRM(pfx,modrm);
12314   IRTemp t0   = newTemp(Ity_I32);
12315   IRTemp t1   = newTemp(Ity_I32);
12316   IRTemp t2   = newTemp(Ity_I32);
12317   IRTemp t3   = newTemp(Ity_I32);
12318   delta += 1;
12319   assign( t0, binop( Iop_And32,
12320                      binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
12321                      mkU32(1) ));
12322   assign( t1, binop( Iop_And32,
12323                      binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
12324                      mkU32(2) ));
12325   assign( t2, binop( Iop_And32,
12326                      binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
12327                      mkU32(4) ));
12328   assign( t3, binop( Iop_And32,
12329                      binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
12330                      mkU32(8) ));
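   /* Lane i's sign bit (bit 31) is shifted down to bit position i
      (a shift of 31-i) and isolated with the mask 1<<i; the ORs below
      then assemble the 4-bit result directly. */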
12331   putIReg32( rG, binop(Iop_Or32,
12332                        binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
12333                        binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
12334   DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
12335       nameXMMReg(rE), nameIReg32(rG));
12336   return delta;
12337}
12338
12339
12340static Long dis_MOVMSKPS_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
12341{
12342   UChar modrm = getUChar(delta);
12343   UInt   rG   = gregOfRexRM(pfx,modrm);
12344   UInt   rE   = eregOfRexRM(pfx,modrm);
12345   IRTemp t0   = newTemp(Ity_I32);
12346   IRTemp t1   = newTemp(Ity_I32);
12347   IRTemp t2   = newTemp(Ity_I32);
12348   IRTemp t3   = newTemp(Ity_I32);
12349   IRTemp t4   = newTemp(Ity_I32);
12350   IRTemp t5   = newTemp(Ity_I32);
12351   IRTemp t6   = newTemp(Ity_I32);
12352   IRTemp t7   = newTemp(Ity_I32);
12353   delta += 1;
12354   assign( t0, binop( Iop_And32,
12355                      binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
12356                      mkU32(1) ));
12357   assign( t1, binop( Iop_And32,
12358                      binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
12359                      mkU32(2) ));
12360   assign( t2, binop( Iop_And32,
12361                      binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
12362                      mkU32(4) ));
12363   assign( t3, binop( Iop_And32,
12364                      binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
12365                      mkU32(8) ));
12366   assign( t4, binop( Iop_And32,
12367                      binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
12368                      mkU32(16) ));
12369   assign( t5, binop( Iop_And32,
12370                      binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
12371                      mkU32(32) ));
12372   assign( t6, binop( Iop_And32,
12373                      binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
12374                      mkU32(64) ));
12375   assign( t7, binop( Iop_And32,
12376                      binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
12377                      mkU32(128) ));
12378   putIReg32( rG, binop(Iop_Or32,
12379                        binop(Iop_Or32,
12380                              binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
12381                              binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
12382                        binop(Iop_Or32,
12383                              binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
12384                              binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
12385   DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
12386   return delta;
12387}
12388
12389
12390static Long dis_MOVMSKPD_128 ( const VexAbiInfo* vbi, Prefix pfx,
12391                               Long delta, Bool isAvx )
12392{
12393   UChar modrm = getUChar(delta);
12394   UInt   rG   = gregOfRexRM(pfx,modrm);
12395   UInt   rE   = eregOfRexRM(pfx,modrm);
12396   IRTemp t0   = newTemp(Ity_I32);
12397   IRTemp t1   = newTemp(Ity_I32);
12398   delta += 1;
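   /* The two F64 sign bits sit at bits 63 and 127 of xmm(E), that is,
      at bit 31 of 32-bit lanes 1 and 3, which is why those lanes are
      read below. */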
12399   assign( t0, binop( Iop_And32,
12400                      binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
12401                      mkU32(1) ));
12402   assign( t1, binop( Iop_And32,
12403                      binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
12404                      mkU32(2) ));
12405   putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
12406   DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
12407       nameXMMReg(rE), nameIReg32(rG));
12408   return delta;
12409}
12410
12411
12412static Long dis_MOVMSKPD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
12413{
12414   UChar modrm = getUChar(delta);
12415   UInt   rG   = gregOfRexRM(pfx,modrm);
12416   UInt   rE   = eregOfRexRM(pfx,modrm);
12417   IRTemp t0   = newTemp(Ity_I32);
12418   IRTemp t1   = newTemp(Ity_I32);
12419   IRTemp t2   = newTemp(Ity_I32);
12420   IRTemp t3   = newTemp(Ity_I32);
12421   delta += 1;
12422   assign( t0, binop( Iop_And32,
12423                      binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
12424                      mkU32(1) ));
12425   assign( t1, binop( Iop_And32,
12426                      binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
12427                      mkU32(2) ));
12428   assign( t2, binop( Iop_And32,
12429                      binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
12430                      mkU32(4) ));
12431   assign( t3, binop( Iop_And32,
12432                      binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
12433                      mkU32(8) ));
12434   putIReg32( rG, binop(Iop_Or32,
12435                        binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
12436                        binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
12437   DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
12438   return delta;
12439}
12440
12441
12442/* Note, this also handles SSE(1) insns. */
12443__attribute__((noinline))
12444static
12445Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
12446                        const VexArchInfo* archinfo,
12447                        const VexAbiInfo* vbi,
12448                        Prefix pfx, Int sz, Long deltaIN,
12449                        DisResult* dres )
12450{
12451   IRTemp addr  = IRTemp_INVALID;
12452   IRTemp t0    = IRTemp_INVALID;
12453   IRTemp t1    = IRTemp_INVALID;
12454   IRTemp t2    = IRTemp_INVALID;
12455   IRTemp t3    = IRTemp_INVALID;
12456   IRTemp t4    = IRTemp_INVALID;
12457   IRTemp t5    = IRTemp_INVALID;
12458   IRTemp t6    = IRTemp_INVALID;
12459   UChar  modrm = 0;
12460   Int    alen  = 0;
12461   HChar  dis_buf[50];
12462
12463   *decode_OK = False;
12464
12465   Long   delta = deltaIN;
12466   UChar  opc   = getUChar(delta);
12467   delta++;
12468   switch (opc) {
12469
12470   case 0x10:
12471      if (have66noF2noF3(pfx)
12472          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12473         /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
12474         modrm = getUChar(delta);
12475         if (epartIsReg(modrm)) {
12476            putXMMReg( gregOfRexRM(pfx,modrm),
12477                       getXMMReg( eregOfRexRM(pfx,modrm) ));
12478            DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12479                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12480            delta += 1;
12481         } else {
12482            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12483            putXMMReg( gregOfRexRM(pfx,modrm),
12484                       loadLE(Ity_V128, mkexpr(addr)) );
12485            DIP("movupd %s,%s\n", dis_buf,
12486                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12487            delta += alen;
12488         }
12489         goto decode_success;
12490      }
12491      /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
12492         G (lo half xmm).  If E is mem, upper half of G is zeroed out.
12493         If E is reg, upper half of G is unchanged. */
12494      if (haveF2no66noF3(pfx)
12495          && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
12496         modrm = getUChar(delta);
12497         if (epartIsReg(modrm)) {
12498            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
12499                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
12500            DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12501                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
12502            delta += 1;
12503         } else {
12504            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12505            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
12506            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
12507                             loadLE(Ity_I64, mkexpr(addr)) );
12508            DIP("movsd %s,%s\n", dis_buf,
12509                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
12510            delta += alen;
12511         }
12512         goto decode_success;
12513      }
12514      /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
12515         (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
12516      if (haveF3no66noF2(pfx)
12517          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12518         modrm = getUChar(delta);
12519         if (epartIsReg(modrm)) {
12520            putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
12521                             getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
12522            DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12523                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
12524            delta += 1;
12525         } else {
12526            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12527            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
12528            putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
12529                             loadLE(Ity_I32, mkexpr(addr)) );
12530            DIP("movss %s,%s\n", dis_buf,
12531                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
12532            delta += alen;
12533         }
12534         goto decode_success;
12535      }
12536      /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
12537      if (haveNo66noF2noF3(pfx)
12538          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12539         modrm = getUChar(delta);
12540         if (epartIsReg(modrm)) {
12541            putXMMReg( gregOfRexRM(pfx,modrm),
12542                       getXMMReg( eregOfRexRM(pfx,modrm) ));
12543            DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12544                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12545            delta += 1;
12546         } else {
12547            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12548            putXMMReg( gregOfRexRM(pfx,modrm),
12549                       loadLE(Ity_V128, mkexpr(addr)) );
12550            DIP("movups %s,%s\n", dis_buf,
12551                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
12552            delta += alen;
12553         }
12554         goto decode_success;
12555      }
12556      break;
12557
12558   case 0x11:
12559      /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
12560         or lo half xmm). */
12561      if (haveF2no66noF3(pfx)
12562          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12563         modrm = getUChar(delta);
12564         if (epartIsReg(modrm)) {
12565            putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
12566                             getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
12567            DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12568                                 nameXMMReg(eregOfRexRM(pfx,modrm)));
12569            delta += 1;
12570         } else {
12571            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12572            storeLE( mkexpr(addr),
12573                     getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
12574            DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12575                                 dis_buf);
12576            delta += alen;
12577         }
12578         goto decode_success;
12579      }
12580      /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
12581         or lo 1/4 xmm). */
12582      if (haveF3no66noF2(pfx) && sz == 4) {
12583         modrm = getUChar(delta);
12584         if (epartIsReg(modrm)) {
12585            /* fall through, we don't yet have a test case */
12586         } else {
12587            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12588            storeLE( mkexpr(addr),
12589                     getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
12590            DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12591                                 dis_buf);
12592            delta += alen;
12593            goto decode_success;
12594         }
12595      }
12596      /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
12597      if (have66noF2noF3(pfx)
12598          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12599         modrm = getUChar(delta);
12600         if (epartIsReg(modrm)) {
12601            putXMMReg( eregOfRexRM(pfx,modrm),
12602                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
12603            DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12604                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
12605            delta += 1;
12606         } else {
12607            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12608            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12609            DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12610                                  dis_buf );
12611            delta += alen;
12612         }
12613         goto decode_success;
12614      }
12615      /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
12616      if (haveNo66noF2noF3(pfx)
12617          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12618         modrm = getUChar(delta);
12619         if (epartIsReg(modrm)) {
12620            /* fall through; awaiting test case */
12621         } else {
12622            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12623            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12624            DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12625                                  dis_buf );
12626            delta += alen;
12627            goto decode_success;
12628         }
12629      }
12630      break;
12631
12632   case 0x12:
12633      /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
12634      /* Identical to MOVLPS ? */
12635      if (have66noF2noF3(pfx)
12636          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12637         modrm = getUChar(delta);
12638         if (epartIsReg(modrm)) {
12639            /* fall through; apparently reg-reg is not possible */
12640         } else {
12641            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12642            delta += alen;
12643            putXMMRegLane64( gregOfRexRM(pfx,modrm),
12644                             0/*lower lane*/,
12645                             loadLE(Ity_I64, mkexpr(addr)) );
12646            DIP("movlpd %s, %s\n",
12647                dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
12648            goto decode_success;
12649         }
12650      }
12651      /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
      /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
12653      if (haveNo66noF2noF3(pfx)
12654          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12655         modrm = getUChar(delta);
12656         if (epartIsReg(modrm)) {
12657            delta += 1;
12658            putXMMRegLane64( gregOfRexRM(pfx,modrm),
12659                             0/*lower lane*/,
12660                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
12661            DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12662                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12663         } else {
12664            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12665            delta += alen;
12666            putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
12667                             loadLE(Ity_I64, mkexpr(addr)) );
12668            DIP("movlps %s, %s\n",
12669                dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
12670         }
12671         goto decode_success;
12672      }
12673      break;
12674
12675   case 0x13:
12676      /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
12677      if (haveNo66noF2noF3(pfx)
12678          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12679         modrm = getUChar(delta);
12680         if (!epartIsReg(modrm)) {
12681            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12682            delta += alen;
12683            storeLE( mkexpr(addr),
12684                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
12685                                      0/*lower lane*/ ) );
12686            DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12687                                   dis_buf);
12688            goto decode_success;
12689         }
12690         /* else fall through */
12691      }
12692      /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
12693      /* Identical to MOVLPS ? */
12694      if (have66noF2noF3(pfx)
12695          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12696         modrm = getUChar(delta);
12697         if (!epartIsReg(modrm)) {
12698            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12699            delta += alen;
12700            storeLE( mkexpr(addr),
12701                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
12702                                      0/*lower lane*/ ) );
12703            DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12704                                   dis_buf);
12705            goto decode_success;
12706         }
12707         /* else fall through */
12708      }
12709      break;
12710
12711   case 0x14:
12712   case 0x15:
12713      /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
12714      /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
12715      /* These just appear to be special cases of SHUFPS */
12716      if (haveNo66noF2noF3(pfx) && sz == 4) {
12717         Bool   hi = toBool(opc == 0x15);
12718         IRTemp sV = newTemp(Ity_V128);
12719         IRTemp dV = newTemp(Ity_V128);
12720         modrm = getUChar(delta);
12721         UInt   rG = gregOfRexRM(pfx,modrm);
12722         assign( dV, getXMMReg(rG) );
12723         if (epartIsReg(modrm)) {
12724            UInt rE = eregOfRexRM(pfx,modrm);
12725            assign( sV, getXMMReg(rE) );
12726            delta += 1;
12727            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12728                nameXMMReg(rE), nameXMMReg(rG));
12729         } else {
12730            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12731            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12732            delta += alen;
12733            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12734                dis_buf, nameXMMReg(rG));
12735         }
12736         IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
12737         putXMMReg( rG, mkexpr(res) );
12738         goto decode_success;
12739      }
12740      /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
12741      /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
      /* These just appear to be special cases of SHUFPD */
12743      if (have66noF2noF3(pfx)
12744          && sz == 2 /* could be 8 if rex also present */) {
12745         Bool   hi = toBool(opc == 0x15);
12746         IRTemp sV = newTemp(Ity_V128);
12747         IRTemp dV = newTemp(Ity_V128);
12748         modrm = getUChar(delta);
12749         UInt   rG = gregOfRexRM(pfx,modrm);
12750         assign( dV, getXMMReg(rG) );
12751         if (epartIsReg(modrm)) {
12752            UInt rE = eregOfRexRM(pfx,modrm);
12753            assign( sV, getXMMReg(rE) );
12754            delta += 1;
12755            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12756                nameXMMReg(rE), nameXMMReg(rG));
12757         } else {
12758            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12759            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12760            delta += alen;
12761            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12762                dis_buf, nameXMMReg(rG));
12763         }
12764         IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
12765         putXMMReg( rG, mkexpr(res) );
12766         goto decode_success;
12767      }
12768      break;
12769
12770   case 0x16:
12771      /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
      /* This seems identical to MOVHPS.  This instruction encoding is
         completely crazy. */
12774      if (have66noF2noF3(pfx)
12775          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12776         modrm = getUChar(delta);
12777         if (epartIsReg(modrm)) {
12778            /* fall through; apparently reg-reg is not possible */
12779         } else {
12780            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12781            delta += alen;
12782            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12783                             loadLE(Ity_I64, mkexpr(addr)) );
12784            DIP("movhpd %s,%s\n", dis_buf,
12785                                  nameXMMReg( gregOfRexRM(pfx,modrm) ));
12786            goto decode_success;
12787         }
12788      }
12789      /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
12790      /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
12791      if (haveNo66noF2noF3(pfx)
12792          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12793         modrm = getUChar(delta);
12794         if (epartIsReg(modrm)) {
12795            delta += 1;
12796            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12797                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
12798            DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12799                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12800         } else {
12801            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12802            delta += alen;
12803            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12804                             loadLE(Ity_I64, mkexpr(addr)) );
12805            DIP("movhps %s,%s\n", dis_buf,
12806                                  nameXMMReg( gregOfRexRM(pfx,modrm) ));
12807         }
12808         goto decode_success;
12809      }
12810      break;
12811
12812   case 0x17:
12813      /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
12814      if (haveNo66noF2noF3(pfx)
12815          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12816         modrm = getUChar(delta);
12817         if (!epartIsReg(modrm)) {
12818            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12819            delta += alen;
12820            storeLE( mkexpr(addr),
12821                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
12822                                      1/*upper lane*/ ) );
12823            DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12824                                  dis_buf);
12825            goto decode_success;
12826         }
12827         /* else fall through */
12828      }
12829      /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
12830      /* Again, this seems identical to MOVHPS. */
12831      if (have66noF2noF3(pfx)
12832          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12833         modrm = getUChar(delta);
12834         if (!epartIsReg(modrm)) {
12835            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12836            delta += alen;
12837            storeLE( mkexpr(addr),
12838                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
12839                                      1/*upper lane*/ ) );
12840            DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12841                                  dis_buf);
12842            goto decode_success;
12843         }
12844         /* else fall through */
12845      }
12846      break;
12847
12848   case 0x18:
      /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
      /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
      /* 0F 18 /2 = PREFETCHT1 */
      /* 0F 18 /3 = PREFETCHT2 */
12853      if (haveNo66noF2noF3(pfx)
12854          && !epartIsReg(getUChar(delta))
12855          && gregLO3ofRM(getUChar(delta)) >= 0
12856          && gregLO3ofRM(getUChar(delta)) <= 3) {
12857         const HChar* hintstr = "??";
12858
12859         modrm = getUChar(delta);
12860         vassert(!epartIsReg(modrm));
12861
12862         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12863         delta += alen;
12864
12865         switch (gregLO3ofRM(modrm)) {
12866            case 0: hintstr = "nta"; break;
12867            case 1: hintstr = "t0"; break;
12868            case 2: hintstr = "t1"; break;
12869            case 3: hintstr = "t2"; break;
12870            default: vassert(0);
12871         }
12872
12873         DIP("prefetch%s %s\n", hintstr, dis_buf);
12874         goto decode_success;
12875      }
12876      break;
12877
12878   case 0x28:
12879      /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
12880      if (have66noF2noF3(pfx)
12881          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12882         modrm = getUChar(delta);
12883         if (epartIsReg(modrm)) {
12884            putXMMReg( gregOfRexRM(pfx,modrm),
12885                       getXMMReg( eregOfRexRM(pfx,modrm) ));
12886            DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12887                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12888            delta += 1;
12889         } else {
12890            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12891            gen_SEGV_if_not_16_aligned( addr );
12892            putXMMReg( gregOfRexRM(pfx,modrm),
12893                       loadLE(Ity_V128, mkexpr(addr)) );
12894            DIP("movapd %s,%s\n", dis_buf,
12895                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12896            delta += alen;
12897         }
12898         goto decode_success;
12899      }
12900      /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
12901      if (haveNo66noF2noF3(pfx)
12902          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12903         modrm = getUChar(delta);
12904         if (epartIsReg(modrm)) {
12905            putXMMReg( gregOfRexRM(pfx,modrm),
12906                       getXMMReg( eregOfRexRM(pfx,modrm) ));
12907            DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12908                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12909            delta += 1;
12910         } else {
12911            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12912            gen_SEGV_if_not_16_aligned( addr );
12913            putXMMReg( gregOfRexRM(pfx,modrm),
12914                       loadLE(Ity_V128, mkexpr(addr)) );
12915            DIP("movaps %s,%s\n", dis_buf,
12916                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12917            delta += alen;
12918         }
12919         goto decode_success;
12920      }
12921      break;
12922
12923   case 0x29:
12924      /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
12925      if (haveNo66noF2noF3(pfx)
12926          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12927         modrm = getUChar(delta);
12928         if (epartIsReg(modrm)) {
12929            putXMMReg( eregOfRexRM(pfx,modrm),
12930                       getXMMReg( gregOfRexRM(pfx,modrm) ));
12931            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12932                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
12933            delta += 1;
12934         } else {
12935            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12936            gen_SEGV_if_not_16_aligned( addr );
12937            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12938            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12939                                  dis_buf );
12940            delta += alen;
12941         }
12942         goto decode_success;
12943      }
12944      /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
12945      if (have66noF2noF3(pfx)
12946          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12947         modrm = getUChar(delta);
12948         if (epartIsReg(modrm)) {
12949            putXMMReg( eregOfRexRM(pfx,modrm),
12950                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
12951            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12952                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
12953            delta += 1;
12954         } else {
12955            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12956            gen_SEGV_if_not_16_aligned( addr );
12957            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12958            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12959                                  dis_buf );
12960            delta += alen;
12961         }
12962         goto decode_success;
12963      }
12964      break;
12965
12966   case 0x2A:
12967      /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
12968         half xmm */
12969      if (haveNo66noF2noF3(pfx) && sz == 4) {
12970         IRTemp arg64 = newTemp(Ity_I64);
12971         IRTemp rmode = newTemp(Ity_I32);
12972
12973         modrm = getUChar(delta);
12974         if (epartIsReg(modrm)) {
12975            /* Only switch to MMX mode if the source is a MMX register.
12976               See comments on CVTPI2PD for details.  Fixes #357059. */
12977            do_MMX_preamble();
12978            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
12979            delta += 1;
12980            DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
12981                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12982         } else {
12983            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12984            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
12985            delta += alen;
12986            DIP("cvtpi2ps %s,%s\n", dis_buf,
12987                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
12988         }
12989
12990         assign( rmode, get_sse_roundingmode() );
12991
12992         putXMMRegLane32F(
12993            gregOfRexRM(pfx,modrm), 0,
12994            binop(Iop_F64toF32,
12995                  mkexpr(rmode),
12996                  unop(Iop_I32StoF64,
12997                       unop(Iop_64to32, mkexpr(arg64)) )) );
12998
12999         putXMMRegLane32F(
13000            gregOfRexRM(pfx,modrm), 1,
13001            binop(Iop_F64toF32,
13002                  mkexpr(rmode),
13003                  unop(Iop_I32StoF64,
13004                       unop(Iop_64HIto32, mkexpr(arg64)) )) );
13005
13006         goto decode_success;
13007      }
13008      /* F3 0F 2A = CVTSI2SS
13009         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
13010         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
13011      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
13012         IRTemp rmode = newTemp(Ity_I32);
13013         assign( rmode, get_sse_roundingmode() );
13014         modrm = getUChar(delta);
13015         if (sz == 4) {
13016            IRTemp arg32 = newTemp(Ity_I32);
13017            if (epartIsReg(modrm)) {
13018               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
13019               delta += 1;
13020               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
13021                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
13022            } else {
13023               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13024               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
13025               delta += alen;
13026               DIP("cvtsi2ss %s,%s\n", dis_buf,
13027                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
13028            }
13029            putXMMRegLane32F(
13030               gregOfRexRM(pfx,modrm), 0,
13031               binop(Iop_F64toF32,
13032                     mkexpr(rmode),
13033                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
13034         } else {
13035            /* sz == 8 */
13036            IRTemp arg64 = newTemp(Ity_I64);
13037            if (epartIsReg(modrm)) {
13038               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
13039               delta += 1;
13040               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
13041                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
13042            } else {
13043               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13044               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
13045               delta += alen;
13046               DIP("cvtsi2ssq %s,%s\n", dis_buf,
13047                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
13048            }
13049            putXMMRegLane32F(
13050               gregOfRexRM(pfx,modrm), 0,
13051               binop(Iop_F64toF32,
13052                     mkexpr(rmode),
13053                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
13054         }
13055         goto decode_success;
13056      }
13057      /* F2 0F 2A = CVTSI2SD
13058         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
13059         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
13060      */
13061      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
13062         modrm = getUChar(delta);
13063         if (sz == 4) {
13064            IRTemp arg32 = newTemp(Ity_I32);
13065            if (epartIsReg(modrm)) {
13066               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
13067               delta += 1;
13068               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
13069                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
13070            } else {
13071               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13072               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
13073               delta += alen;
13074               DIP("cvtsi2sdl %s,%s\n", dis_buf,
13075                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
13076            }
13077            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
13078                              unop(Iop_I32StoF64, mkexpr(arg32))
13079            );
13080         } else {
13081            /* sz == 8 */
13082            IRTemp arg64 = newTemp(Ity_I64);
13083            if (epartIsReg(modrm)) {
13084               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
13085               delta += 1;
13086               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
13087                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
13088            } else {
13089               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13090               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
13091               delta += alen;
13092               DIP("cvtsi2sdq %s,%s\n", dis_buf,
13093                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
13094            }
13095            putXMMRegLane64F(
13096               gregOfRexRM(pfx,modrm),
13097               0,
13098               binop( Iop_I64StoF64,
13099                      get_sse_roundingmode(),
13100                      mkexpr(arg64)
13101               )
13102            );
13103         }
13104         goto decode_success;
13105      }
13106      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
13107         xmm(G) */
13108      if (have66noF2noF3(pfx) && sz == 2) {
13109         IRTemp arg64 = newTemp(Ity_I64);
13110
13111         modrm = getUChar(delta);
13112         if (epartIsReg(modrm)) {
13113            /* Only switch to MMX mode if the source is a MMX register.
13114               This is inconsistent with all other instructions which
13115               convert between XMM and (M64 or MMX), which always switch
13116               to MMX mode even if 64-bit operand is M64 and not MMX.  At
13117               least, that's what the Intel docs seem to me to say.
13118               Fixes #210264. */
13119            do_MMX_preamble();
13120            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
13121            delta += 1;
13122            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
13123                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13124         } else {
13125            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13126            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
13127            delta += alen;
13128            DIP("cvtpi2pd %s,%s\n", dis_buf,
13129                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
13130         }
13131
13132         putXMMRegLane64F(
13133            gregOfRexRM(pfx,modrm), 0,
13134            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
13135         );
13136
13137         putXMMRegLane64F(
13138            gregOfRexRM(pfx,modrm), 1,
13139            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
13140         );
13141
13142         goto decode_success;
13143      }
13144      break;
13145
13146   case 0x2B:
13147      /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
13148      /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
13149      if ( (haveNo66noF2noF3(pfx) && sz == 4)
13150           || (have66noF2noF3(pfx) && sz == 2) ) {
13151         modrm = getUChar(delta);
13152         if (!epartIsReg(modrm)) {
13153            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13154            gen_SEGV_if_not_16_aligned( addr );
13155            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
13156            DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
13157                                    dis_buf,
13158                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13159            delta += alen;
13160            goto decode_success;
13161         }
13162         /* else fall through */
13163      }
13164      break;
13165
13166   case 0x2C:
13167   case 0x2D:
13168      /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
13169         I32 in mmx, according to prevailing SSE rounding mode */
13170      /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
13171         I32 in mmx, rounding towards zero */
13172      if (haveNo66noF2noF3(pfx) && sz == 4) {
13173         IRTemp dst64  = newTemp(Ity_I64);
13174         IRTemp rmode  = newTemp(Ity_I32);
13175         IRTemp f32lo  = newTemp(Ity_F32);
13176         IRTemp f32hi  = newTemp(Ity_F32);
13177         Bool   r2zero = toBool(opc == 0x2C);
13178
13179         do_MMX_preamble();
13180         modrm = getUChar(delta);
13181
13182         if (epartIsReg(modrm)) {
13183            delta += 1;
13184            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
13185            assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
13186            DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
13187                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
13188                                      nameMMXReg(gregLO3ofRM(modrm)));
13189         } else {
13190            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13191            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
13192            assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
13193                                                 mkexpr(addr),
13194                                                 mkU64(4) )));
13195            delta += alen;
13196            DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
13197                                      dis_buf,
13198                                      nameMMXReg(gregLO3ofRM(modrm)));
13199         }
13200
13201         if (r2zero) {
13202            assign(rmode, mkU32((UInt)Irrm_ZERO) );
13203         } else {
13204            assign( rmode, get_sse_roundingmode() );
13205         }
13206
13207         assign(
13208            dst64,
13209            binop( Iop_32HLto64,
13210                   binop( Iop_F64toI32S,
13211                          mkexpr(rmode),
13212                          unop( Iop_F32toF64, mkexpr(f32hi) ) ),
13213                   binop( Iop_F64toI32S,
13214                          mkexpr(rmode),
13215                          unop( Iop_F32toF64, mkexpr(f32lo) ) )
13216                 )
13217         );
13218
13219         putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
13220         goto decode_success;
13221      }
13222      /* F3 0F 2D = CVTSS2SI
13223         when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
13224                       according to prevailing SSE rounding mode
13225         when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
13226                       according to prevailing SSE rounding mode
13227      */
13228      /* F3 0F 2C = CVTTSS2SI
13229         when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
13230                       truncating towards zero
13231         when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
13232                       truncating towards zero
13233      */
13234      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
13235         delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
13236         goto decode_success;
13237      }
13238      /* F2 0F 2D = CVTSD2SI
13239         when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
13240                       according to prevailing SSE rounding mode
13241         when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
13242                       according to prevailing SSE rounding mode
13243      */
13244      /* F2 0F 2C = CVTTSD2SI
13245         when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
13246                       truncating towards zero
13247         when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
13248                       truncating towards zero
13249      */
13250      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
13251         delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
13252         goto decode_success;
13253      }
13254      /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
13255         I32 in mmx, according to prevailing SSE rounding mode */
13256      /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
13257         I32 in mmx, rounding towards zero */
13258      if (have66noF2noF3(pfx) && sz == 2) {
13259         IRTemp dst64  = newTemp(Ity_I64);
13260         IRTemp rmode  = newTemp(Ity_I32);
13261         IRTemp f64lo  = newTemp(Ity_F64);
13262         IRTemp f64hi  = newTemp(Ity_F64);
13263         Bool   r2zero = toBool(opc == 0x2C);
13264
13265         do_MMX_preamble();
13266         modrm = getUChar(delta);
13267
13268         if (epartIsReg(modrm)) {
13269            delta += 1;
13270            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
13271            assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
13272            DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
13273                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
13274                                      nameMMXReg(gregLO3ofRM(modrm)));
13275         } else {
13276            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13277            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
13278            assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
13279                                                 mkexpr(addr),
13280                                                 mkU64(8) )));
13281            delta += alen;
13282            DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
13283                                      dis_buf,
13284                                      nameMMXReg(gregLO3ofRM(modrm)));
13285         }
13286
13287         if (r2zero) {
13288            assign(rmode, mkU32((UInt)Irrm_ZERO) );
13289         } else {
13290            assign( rmode, get_sse_roundingmode() );
13291         }
13292
13293         assign(
13294            dst64,
13295            binop( Iop_32HLto64,
13296                   binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
13297                   binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
13298                 )
13299         );
13300
13301         putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
13302         goto decode_success;
13303      }
13304      break;
13305
13306   case 0x2E:
13307   case 0x2F:
13308      /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
13309      /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
13310      if (have66noF2noF3(pfx) && sz == 2) {
13311         delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
13312         goto decode_success;
13313      }
13314      /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
13315      /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
13316      if (haveNo66noF2noF3(pfx) && sz == 4) {
13317         delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
13318         goto decode_success;
13319      }
13320      break;
13321
13322   case 0x50:
13323      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
13324         to 4 lowest bits of ireg(G) */
13325      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
13326          && epartIsReg(getUChar(delta))) {
13327         /* sz == 8 is a kludge to handle insns with REX.W redundantly
13328            set to 1, which has been known to happen:
13329
13330            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d
13331
13332            20071106: Intel docs say that REX.W isn't redundant: when
13333            present, a 64-bit register is written; when not present, only
13334            the 32-bit half is written.  However, testing on a Core2
13335            machine suggests the entire 64 bit register is written
13336            irrespective of the status of REX.W.  That could be because
13337            of the default rule that says "if the lower half of a 32-bit
13338            register is written, the upper half is zeroed".  By using
            putIReg32 here we inadvertently produce the same behaviour as
13340            the Core2, for the same reason -- putIReg32 implements said
13341            rule.
13342
13343            AMD docs give no indication that REX.W is even valid for this
13344            insn. */
13345         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
13346         goto decode_success;
13347      }
13348      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
13349         2 lowest bits of ireg(G) */
13350      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
13351         /* sz == 8 is a kludge to handle insns with REX.W redundantly
13352            set to 1, which has been known to happen:
13353            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
13354            20071106: see further comments on MOVMSKPS implementation above.
13355         */
13356         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
13357         goto decode_success;
13358      }
13359      break;
13360
13361   case 0x51:
13362      /* F3 0F 51 = SQRTSS -- approx sqrt 32F0x4 from R/M to R */
13363      if (haveF3no66noF2(pfx) && sz == 4) {
13364         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
13365                                            "sqrtss", Iop_Sqrt32F0x4 );
13366         goto decode_success;
13367      }
13368      /* 0F 51 = SQRTPS -- approx sqrt 32Fx4 from R/M to R */
13369      if (haveNo66noF2noF3(pfx) && sz == 4) {
13370         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
13371                                           "sqrtps", Iop_Sqrt32Fx4 );
13372         goto decode_success;
13373      }
13374      /* F2 0F 51 = SQRTSD -- approx sqrt 64F0x2 from R/M to R */
13375      if (haveF2no66noF3(pfx) && sz == 4) {
13376         delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
13377                                            "sqrtsd", Iop_Sqrt64F0x2 );
13378         goto decode_success;
13379      }
13380      /* 66 0F 51 = SQRTPD -- approx sqrt 64Fx2 from R/M to R */
13381      if (have66noF2noF3(pfx) && sz == 2) {
13382         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
13383                                           "sqrtpd", Iop_Sqrt64Fx2 );
13384         goto decode_success;
13385      }
13386      break;
13387
13388   case 0x52:
13389      /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
13390      if (haveF3no66noF2(pfx) && sz == 4) {
13391         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
13392                                            "rsqrtss", Iop_RSqrtEst32F0x4 );
13393         goto decode_success;
13394      }
13395      /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
13396      if (haveNo66noF2noF3(pfx) && sz == 4) {
13397         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
13398                                           "rsqrtps", Iop_RSqrtEst32Fx4 );
13399         goto decode_success;
13400      }
13401      break;
13402
13403   case 0x53:
13404      /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
13405      if (haveF3no66noF2(pfx) && sz == 4) {
13406         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
13407                                            "rcpss", Iop_RecipEst32F0x4 );
13408         goto decode_success;
13409      }
13410      /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
13411      if (haveNo66noF2noF3(pfx) && sz == 4) {
13412         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
13413                                           "rcpps", Iop_RecipEst32Fx4 );
13414         goto decode_success;
13415      }
13416      break;
13417
13418   case 0x54:
13419      /* 0F 54 = ANDPS -- G = G and E */
13420      if (haveNo66noF2noF3(pfx) && sz == 4) {
13421         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
13422         goto decode_success;
13423      }
13424      /* 66 0F 54 = ANDPD -- G = G and E */
13425      if (have66noF2noF3(pfx) && sz == 2) {
13426         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
13427         goto decode_success;
13428      }
13429      break;
13430
13431   case 0x55:
13432      /* 0F 55 = ANDNPS -- G = (not G) and E */
13433      if (haveNo66noF2noF3(pfx) && sz == 4) {
13434         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
13435                                                           Iop_AndV128 );
13436         goto decode_success;
13437      }
13438      /* 66 0F 55 = ANDNPD -- G = (not G) and E */
13439      if (have66noF2noF3(pfx) && sz == 2) {
13440         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
13441                                                           Iop_AndV128 );
13442         goto decode_success;
13443      }
13444      break;
13445
13446   case 0x56:
      /* 0F 56 = ORPS -- G = G or E */
13448      if (haveNo66noF2noF3(pfx) && sz == 4) {
13449         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
13450         goto decode_success;
13451      }
      /* 66 0F 56 = ORPD -- G = G or E */
13453      if (have66noF2noF3(pfx) && sz == 2) {
13454         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
13455         goto decode_success;
13456      }
13457      break;
13458
13459   case 0x57:
13460      /* 66 0F 57 = XORPD -- G = G xor E */
13461      if (have66noF2noF3(pfx) && sz == 2) {
13462         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
13463         goto decode_success;
13464      }
13465      /* 0F 57 = XORPS -- G = G xor E */
13466      if (haveNo66noF2noF3(pfx) && sz == 4) {
13467         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
13468         goto decode_success;
13469      }
13470      break;
13471
13472   case 0x58:
13473      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
13474      if (haveNo66noF2noF3(pfx) && sz == 4) {
13475         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
13476         goto decode_success;
13477      }
13478      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
13479      if (haveF3no66noF2(pfx) && sz == 4) {
13480         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
13481         goto decode_success;
13482      }
13483      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
13484      if (haveF2no66noF3(pfx)
13485          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13486         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
13487         goto decode_success;
13488      }
      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
13490      if (have66noF2noF3(pfx)
13491          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13492         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
13493         goto decode_success;
13494      }
13495      break;
13496
13497   case 0x59:
13498      /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
13499      if (haveF2no66noF3(pfx)
13500          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13501         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
13502         goto decode_success;
13503      }
13504      /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
13505      if (haveF3no66noF2(pfx) && sz == 4) {
13506         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
13507         goto decode_success;
13508      }
13509      /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
13510      if (haveNo66noF2noF3(pfx) && sz == 4) {
13511         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
13512         goto decode_success;
13513      }
13514      /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
13515      if (have66noF2noF3(pfx)
13516          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13517         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
13518         goto decode_success;
13519      }
13520      break;
13521
13522   case 0x5A:
13523      /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
13524         F64 in xmm(G). */
13525      if (haveNo66noF2noF3(pfx)
13526          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13527         delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
13528         goto decode_success;
13529      }
13530      /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
13531         low half xmm(G) */
13532      if (haveF3no66noF2(pfx) && sz == 4) {
13533         IRTemp f32lo = newTemp(Ity_F32);
13534
13535         modrm = getUChar(delta);
13536         if (epartIsReg(modrm)) {
13537            delta += 1;
13538            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
13539            DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13540                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13541         } else {
13542            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13543            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
13544            delta += alen;
13545            DIP("cvtss2sd %s,%s\n", dis_buf,
13546                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13547         }
13548
13549         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
13550                           unop( Iop_F32toF64, mkexpr(f32lo) ) );
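         /* No rounding mode is needed here: widening F32->F64 is
            always exact.  Contrast with CVTSD2SS below, where the
            narrowing Iop_F64toF32 must consult the SSE rounding
            mode. */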
13551
13552         goto decode_success;
13553      }
13554      /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
13555         low 1/4 xmm(G), according to prevailing SSE rounding mode */
13556      if (haveF2no66noF3(pfx) && sz == 4) {
13557         IRTemp rmode = newTemp(Ity_I32);
13558         IRTemp f64lo = newTemp(Ity_F64);
13559
13560         modrm = getUChar(delta);
13561         if (epartIsReg(modrm)) {
13562            delta += 1;
13563            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
13564            DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13565                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13566         } else {
13567            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13568            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
13569            delta += alen;
13570            DIP("cvtsd2ss %s,%s\n", dis_buf,
13571                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13572         }
13573
13574         assign( rmode, get_sse_roundingmode() );
13575         putXMMRegLane32F(
13576            gregOfRexRM(pfx,modrm), 0,
13577            binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
13578         );
13579
13580         goto decode_success;
13581      }
13582      /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
13583         lo half xmm(G), rounding according to prevailing SSE rounding
13584         mode, and zero upper half */
      /* Note, this is practically identical to CVTPD2DQ.  It would be
         nice to merge them together. */
13587      if (have66noF2noF3(pfx) && sz == 2) {
13588         delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
13589         goto decode_success;
13590      }
13591      break;
13592
13593   case 0x5B:
13594      /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
13595         xmm(G), rounding towards zero */
13596      /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
13597         xmm(G), as per the prevailing rounding mode */
13598      if ( (have66noF2noF3(pfx) && sz == 2)
13599           || (haveF3no66noF2(pfx) && sz == 4) ) {
         /* FIXME: fragile -- infers the truncating (F3) variant from
            sz == 4 rather than from the prefixes directly. */
         Bool r2zero = toBool(sz == 4);
13601         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
13602         goto decode_success;
13603      }
13604      /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
13605         xmm(G) */
13606      if (haveNo66noF2noF3(pfx) && sz == 4) {
13607         delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
13608         goto decode_success;
13609      }
13610      break;
13611
13612   case 0x5C:
13613      /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
13614      if (haveF3no66noF2(pfx) && sz == 4) {
13615         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
13616         goto decode_success;
13617      }
13618      /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
13619      if (haveF2no66noF3(pfx)
13620          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13621         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
13622         goto decode_success;
13623      }
13624      /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
13625      if (haveNo66noF2noF3(pfx) && sz == 4) {
13626         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
13627         goto decode_success;
13628      }
13629      /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
13630      if (have66noF2noF3(pfx) && sz == 2) {
13631         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
13632         goto decode_success;
13633      }
13634      break;
13635
13636   case 0x5D:
13637      /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
13638      if (haveNo66noF2noF3(pfx) && sz == 4) {
13639         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
13640         goto decode_success;
13641      }
13642      /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
13643      if (haveF3no66noF2(pfx) && sz == 4) {
13644         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
13645         goto decode_success;
13646      }
13647      /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
13648      if (haveF2no66noF3(pfx)
13649          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13650         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
13651         goto decode_success;
13652      }
13653      /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
13654      if (have66noF2noF3(pfx) && sz == 2) {
13655         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
13656         goto decode_success;
13657      }
13658      break;
13659
13660   case 0x5E:
13661      /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
13662      if (haveF2no66noF3(pfx) && sz == 4) {
13663         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
13664         goto decode_success;
13665      }
13666      /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
13667      if (haveNo66noF2noF3(pfx) && sz == 4) {
13668         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
13669         goto decode_success;
13670      }
13671      /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
13672      if (haveF3no66noF2(pfx) && sz == 4) {
13673         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
13674         goto decode_success;
13675      }
13676      /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
13677      if (have66noF2noF3(pfx) && sz == 2) {
13678         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
13679         goto decode_success;
13680      }
13681      break;
13682
13683   case 0x5F:
13684      /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
13685      if (haveNo66noF2noF3(pfx) && sz == 4) {
13686         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
13687         goto decode_success;
13688      }
13689      /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
13690      if (haveF3no66noF2(pfx) && sz == 4) {
13691         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
13692         goto decode_success;
13693      }
13694      /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
13695      if (haveF2no66noF3(pfx)
13696          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13697         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
13698         goto decode_success;
13699      }
13700      /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
13701      if (have66noF2noF3(pfx) && sz == 2) {
13702         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
13703         goto decode_success;
13704      }
13705      break;
13706
13707   case 0x60:
13708      /* 66 0F 60 = PUNPCKLBW */
13709      if (have66noF2noF3(pfx) && sz == 2) {
13710         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13711                                    "punpcklbw",
13712                                    Iop_InterleaveLO8x16, True );
13713         goto decode_success;
13714      }
13715      break;
13716
13717   case 0x61:
13718      /* 66 0F 61 = PUNPCKLWD */
13719      if (have66noF2noF3(pfx) && sz == 2) {
13720         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13721                                    "punpcklwd",
13722                                    Iop_InterleaveLO16x8, True );
13723         goto decode_success;
13724      }
13725      break;
13726
13727   case 0x62:
13728      /* 66 0F 62 = PUNPCKLDQ */
13729      if (have66noF2noF3(pfx) && sz == 2) {
13730         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13731                                    "punpckldq",
13732                                    Iop_InterleaveLO32x4, True );
13733         goto decode_success;
13734      }
13735      break;
13736
13737   case 0x63:
13738      /* 66 0F 63 = PACKSSWB */
13739      if (have66noF2noF3(pfx) && sz == 2) {
13740         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13741                                    "packsswb",
13742                                    Iop_QNarrowBin16Sto8Sx16, True );
13743         goto decode_success;
13744      }
13745      break;
13746
13747   case 0x64:
13748      /* 66 0F 64 = PCMPGTB */
13749      if (have66noF2noF3(pfx) && sz == 2) {
13750         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13751                                    "pcmpgtb", Iop_CmpGT8Sx16, False );
13752         goto decode_success;
13753      }
13754      break;
13755
13756   case 0x65:
13757      /* 66 0F 65 = PCMPGTW */
13758      if (have66noF2noF3(pfx) && sz == 2) {
13759         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13760                                    "pcmpgtw", Iop_CmpGT16Sx8, False );
13761         goto decode_success;
13762      }
13763      break;
13764
13765   case 0x66:
13766      /* 66 0F 66 = PCMPGTD */
13767      if (have66noF2noF3(pfx) && sz == 2) {
13768         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13769                                    "pcmpgtd", Iop_CmpGT32Sx4, False );
13770         goto decode_success;
13771      }
13772      break;
13773
13774   case 0x67:
13775      /* 66 0F 67 = PACKUSWB */
13776      if (have66noF2noF3(pfx) && sz == 2) {
13777         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13778                                    "packuswb",
13779                                    Iop_QNarrowBin16Sto8Ux16, True );
13780         goto decode_success;
13781      }
13782      break;
13783
13784   case 0x68:
13785      /* 66 0F 68 = PUNPCKHBW */
13786      if (have66noF2noF3(pfx) && sz == 2) {
13787         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13788                                    "punpckhbw",
13789                                    Iop_InterleaveHI8x16, True );
13790         goto decode_success;
13791      }
13792      break;
13793
13794   case 0x69:
13795      /* 66 0F 69 = PUNPCKHWD */
13796      if (have66noF2noF3(pfx) && sz == 2) {
13797         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13798                                    "punpckhwd",
13799                                    Iop_InterleaveHI16x8, True );
13800         goto decode_success;
13801      }
13802      break;
13803
13804   case 0x6A:
13805      /* 66 0F 6A = PUNPCKHDQ */
13806      if (have66noF2noF3(pfx) && sz == 2) {
13807         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13808                                    "punpckhdq",
13809                                    Iop_InterleaveHI32x4, True );
13810         goto decode_success;
13811      }
13812      break;
13813
13814   case 0x6B:
13815      /* 66 0F 6B = PACKSSDW */
13816      if (have66noF2noF3(pfx) && sz == 2) {
13817         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13818                                    "packssdw",
13819                                    Iop_QNarrowBin32Sto16Sx8, True );
13820         goto decode_success;
13821      }
13822      break;
13823
13824   case 0x6C:
13825      /* 66 0F 6C = PUNPCKLQDQ */
13826      if (have66noF2noF3(pfx) && sz == 2) {
13827         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13828                                    "punpcklqdq",
13829                                    Iop_InterleaveLO64x2, True );
13830         goto decode_success;
13831      }
13832      break;
13833
13834   case 0x6D:
13835      /* 66 0F 6D = PUNPCKHQDQ */
13836      if (have66noF2noF3(pfx) && sz == 2) {
13837         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13838                                    "punpckhqdq",
13839                                    Iop_InterleaveHI64x2, True );
13840         goto decode_success;
13841      }
13842      break;
13843
13844   case 0x6E:
      /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
                    zeroing high 3/4 of xmm,
                    or from ireg64/m64 to xmm lo 1/2,
                    zeroing high 1/2 of xmm. */
13849      if (have66noF2noF3(pfx)) {
13850         vassert(sz == 2 || sz == 8);
13851         if (sz == 2) sz = 4;
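         /* The 0x66 prefix forces sz to 2, but the move is really
            32 bits wide (or 64 with REX.W, sz == 8), hence the
            adjustment. */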
13852         modrm = getUChar(delta);
13853         if (epartIsReg(modrm)) {
13854            delta += 1;
13855            if (sz == 4) {
13856               putXMMReg(
13857                  gregOfRexRM(pfx,modrm),
13858                  unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
13859               );
13860               DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
13861                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13862            } else {
13863               putXMMReg(
13864                  gregOfRexRM(pfx,modrm),
13865                  unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
13866               );
13867               DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
13868                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13869            }
13870         } else {
13871            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
13872            delta += alen;
13873            putXMMReg(
13874               gregOfRexRM(pfx,modrm),
13875               sz == 4
                  ?  unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)) )
                  :  unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)) )
13878            );
13879            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
13880                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13881         }
13882         goto decode_success;
13883      }
13884      break;
13885
13886   case 0x6F:
13887      if (have66noF2noF3(pfx)
13888          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13889         /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
13890         modrm = getUChar(delta);
13891         if (epartIsReg(modrm)) {
13892            putXMMReg( gregOfRexRM(pfx,modrm),
13893                       getXMMReg( eregOfRexRM(pfx,modrm) ));
13894            DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13895                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13896            delta += 1;
13897         } else {
13898            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13899            gen_SEGV_if_not_16_aligned( addr );
13900            putXMMReg( gregOfRexRM(pfx,modrm),
13901                       loadLE(Ity_V128, mkexpr(addr)) );
13902            DIP("movdqa %s,%s\n", dis_buf,
13903                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13904            delta += alen;
13905         }
13906         goto decode_success;
13907      }
13908      if (haveF3no66noF2(pfx) && sz == 4) {
13909         /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
13910         modrm = getUChar(delta);
13911         if (epartIsReg(modrm)) {
13912            putXMMReg( gregOfRexRM(pfx,modrm),
13913                       getXMMReg( eregOfRexRM(pfx,modrm) ));
13914            DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13915                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13916            delta += 1;
13917         } else {
13918            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13919            putXMMReg( gregOfRexRM(pfx,modrm),
13920                       loadLE(Ity_V128, mkexpr(addr)) );
13921            DIP("movdqu %s,%s\n", dis_buf,
13922                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13923            delta += alen;
13924         }
13925         goto decode_success;
13926      }
13927      break;
13928
13929   case 0x70:
13930      /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
13931      if (have66noF2noF3(pfx) && sz == 2) {
13932         delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
13933         goto decode_success;
13934      }
13935      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13936      /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
13937      if (haveNo66noF2noF3(pfx) && sz == 4) {
13938         Int order;
13939         IRTemp sV, dV, s3, s2, s1, s0;
13940         s3 = s2 = s1 = s0 = IRTemp_INVALID;
13941         sV = newTemp(Ity_I64);
13942         dV = newTemp(Ity_I64);
13943         do_MMX_preamble();
13944         modrm = getUChar(delta);
13945         if (epartIsReg(modrm)) {
13946            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
13947            order = (Int)getUChar(delta+1);
13948            delta += 1+1;
13949            DIP("pshufw $%d,%s,%s\n", order,
13950                                      nameMMXReg(eregLO3ofRM(modrm)),
13951                                      nameMMXReg(gregLO3ofRM(modrm)));
13952         } else {
13953            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
13954                              1/*extra byte after amode*/ );
13955            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
13956            order = (Int)getUChar(delta+alen);
13957            delta += 1+alen;
13958            DIP("pshufw $%d,%s,%s\n", order,
13959                                      dis_buf,
13960                                      nameMMXReg(gregLO3ofRM(modrm)));
13961         }
13962         breakup64to16s( sV, &s3, &s2, &s1, &s0 );
13963#        define SEL(n) \
13964                   ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
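         /* Example: order == 0x1B (binary 00 01 10 11) yields
            SEL(0),SEL(1),SEL(2),SEL(3) = s0,s1,s2,s3 from top lane
            down to bottom, i.e. the four 16-bit lanes are reversed. */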
13965         assign(dV,
13966                mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
13967                             SEL((order>>2)&3), SEL((order>>0)&3) )
13968         );
13969         putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
13970#        undef SEL
13971         goto decode_success;
13972      }
13973      /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
13974         mem) to G(xmm), and copy upper half */
13975      if (haveF2no66noF3(pfx) && sz == 4) {
13976         delta = dis_PSHUFxW_128( vbi, pfx, delta,
13977                                  False/*!isAvx*/, False/*!xIsH*/ );
13978         goto decode_success;
13979      }
13980      /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
13981         mem) to G(xmm), and copy lower half */
13982      if (haveF3no66noF2(pfx) && sz == 4) {
13983         delta = dis_PSHUFxW_128( vbi, pfx, delta,
13984                                  False/*!isAvx*/, True/*xIsH*/ );
13985         goto decode_success;
13986      }
13987      break;
13988
13989   case 0x71:
13990      /* 66 0F 71 /2 ib = PSRLW by immediate */
13991      if (have66noF2noF3(pfx) && sz == 2
13992          && epartIsReg(getUChar(delta))
13993          && gregLO3ofRM(getUChar(delta)) == 2) {
13994         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
13995         goto decode_success;
13996      }
13997      /* 66 0F 71 /4 ib = PSRAW by immediate */
13998      if (have66noF2noF3(pfx) && sz == 2
13999          && epartIsReg(getUChar(delta))
14000          && gregLO3ofRM(getUChar(delta)) == 4) {
14001         delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
14002         goto decode_success;
14003      }
14004      /* 66 0F 71 /6 ib = PSLLW by immediate */
14005      if (have66noF2noF3(pfx) && sz == 2
14006          && epartIsReg(getUChar(delta))
14007          && gregLO3ofRM(getUChar(delta)) == 6) {
14008         delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
14009         goto decode_success;
14010      }
14011      break;
14012
14013   case 0x72:
14014      /* 66 0F 72 /2 ib = PSRLD by immediate */
14015      if (have66noF2noF3(pfx) && sz == 2
14016          && epartIsReg(getUChar(delta))
14017          && gregLO3ofRM(getUChar(delta)) == 2) {
14018         delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
14019         goto decode_success;
14020      }
14021      /* 66 0F 72 /4 ib = PSRAD by immediate */
14022      if (have66noF2noF3(pfx) && sz == 2
14023          && epartIsReg(getUChar(delta))
14024          && gregLO3ofRM(getUChar(delta)) == 4) {
14025         delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
14026         goto decode_success;
14027      }
14028      /* 66 0F 72 /6 ib = PSLLD by immediate */
14029      if (have66noF2noF3(pfx) && sz == 2
14030          && epartIsReg(getUChar(delta))
14031          && gregLO3ofRM(getUChar(delta)) == 6) {
14032         delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
14033         goto decode_success;
14034      }
14035      break;
14036
14037   case 0x73:
14038      /* 66 0F 73 /3 ib = PSRLDQ by immediate */
14039      /* note, if mem case ever filled in, 1 byte after amode */
14040      if (have66noF2noF3(pfx) && sz == 2
14041          && epartIsReg(getUChar(delta))
14042          && gregLO3ofRM(getUChar(delta)) == 3) {
14043         Int imm = (Int)getUChar(delta+1);
14044         Int reg = eregOfRexRM(pfx,getUChar(delta));
14045         DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
14046         delta += 2;
14047         IRTemp sV = newTemp(Ity_V128);
14048         assign( sV, getXMMReg(reg) );
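         /* Architecturally the shift amount is in bytes, and any imm
            greater than 15 clears the register; math_PSRLDQ is assumed
            to implement that case. */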
14049         putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
14050         goto decode_success;
14051      }
14052      /* 66 0F 73 /7 ib = PSLLDQ by immediate */
14053      /* note, if mem case ever filled in, 1 byte after amode */
14054      if (have66noF2noF3(pfx) && sz == 2
14055          && epartIsReg(getUChar(delta))
14056          && gregLO3ofRM(getUChar(delta)) == 7) {
14057         Int imm = (Int)getUChar(delta+1);
14058         Int reg = eregOfRexRM(pfx,getUChar(delta));
14059         DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
14060         vassert(imm >= 0 && imm <= 255);
14061         delta += 2;
14062         IRTemp sV = newTemp(Ity_V128);
14063         assign( sV, getXMMReg(reg) );
14064         putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
14065         goto decode_success;
14066      }
14067      /* 66 0F 73 /2 ib = PSRLQ by immediate */
14068      if (have66noF2noF3(pfx) && sz == 2
14069          && epartIsReg(getUChar(delta))
14070          && gregLO3ofRM(getUChar(delta)) == 2) {
14071         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
14072         goto decode_success;
14073      }
14074      /* 66 0F 73 /6 ib = PSLLQ by immediate */
14075      if (have66noF2noF3(pfx) && sz == 2
14076          && epartIsReg(getUChar(delta))
14077          && gregLO3ofRM(getUChar(delta)) == 6) {
14078         delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
14079         goto decode_success;
14080      }
14081      break;
14082
14083   case 0x74:
14084      /* 66 0F 74 = PCMPEQB */
14085      if (have66noF2noF3(pfx) && sz == 2) {
14086         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14087                                    "pcmpeqb", Iop_CmpEQ8x16, False );
14088         goto decode_success;
14089      }
14090      break;
14091
14092   case 0x75:
14093      /* 66 0F 75 = PCMPEQW */
14094      if (have66noF2noF3(pfx) && sz == 2) {
14095         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14096                                    "pcmpeqw", Iop_CmpEQ16x8, False );
14097         goto decode_success;
14098      }
14099      break;
14100
14101   case 0x76:
14102      /* 66 0F 76 = PCMPEQD */
14103      if (have66noF2noF3(pfx) && sz == 2) {
14104         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14105                                    "pcmpeqd", Iop_CmpEQ32x4, False );
14106         goto decode_success;
14107      }
14108      break;
14109
14110   case 0x7E:
14111      /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
14112         G (lo half xmm).  Upper half of G is zeroed out. */
14113      if (haveF3no66noF2(pfx)
14114          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14115         modrm = getUChar(delta);
14116         if (epartIsReg(modrm)) {
14117            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
14118                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
            /* zero bits 127:64 */
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
            DIP("movq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
14123            delta += 1;
14124         } else {
14125            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14126            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
14127            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
14128                             loadLE(Ity_I64, mkexpr(addr)) );
14129            DIP("movsd %s,%s\n", dis_buf,
14130                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
14131            delta += alen;
14132         }
14133         goto decode_success;
14134      }
      /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32,
                    or from xmm low 1/2 to ireg64 or m64. */
      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
14138         if (sz == 2) sz = 4;
14139         modrm = getUChar(delta);
14140         if (epartIsReg(modrm)) {
14141            delta += 1;
14142            if (sz == 4) {
14143               putIReg32( eregOfRexRM(pfx,modrm),
14144                          getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
14145               DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
14146                                    nameIReg32(eregOfRexRM(pfx,modrm)));
14147            } else {
14148               putIReg64( eregOfRexRM(pfx,modrm),
14149                          getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
14150               DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
14151                                    nameIReg64(eregOfRexRM(pfx,modrm)));
14152            }
14153         } else {
14154            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
14155            delta += alen;
14156            storeLE( mkexpr(addr),
14157                     sz == 4
14158                        ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
14159                        : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
14160            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
14161                                  nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
14162         }
14163         goto decode_success;
14164      }
14165      break;
14166
14167   case 0x7F:
14168      /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
14169      if (haveF3no66noF2(pfx) && sz == 4) {
14170         modrm = getUChar(delta);
14171         if (epartIsReg(modrm)) {
14172            goto decode_failure; /* awaiting test case */
14173            delta += 1;
14174            putXMMReg( eregOfRexRM(pfx,modrm),
14175                       getXMMReg(gregOfRexRM(pfx,modrm)) );
14176            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
14177                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
14178         } else {
14179            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
14180            delta += alen;
14181            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
14182            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
14183         }
14184         goto decode_success;
14185      }
14186      /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
14187      if (have66noF2noF3(pfx) && sz == 2) {
14188         modrm = getUChar(delta);
14189         if (epartIsReg(modrm)) {
14190            delta += 1;
14191            putXMMReg( eregOfRexRM(pfx,modrm),
14192                       getXMMReg(gregOfRexRM(pfx,modrm)) );
14193            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
14194                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
14195         } else {
14196            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
14197            gen_SEGV_if_not_16_aligned( addr );
14198            delta += alen;
14199            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
14200            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
14201         }
14202         goto decode_success;
14203      }
14204      break;
14205
14206   case 0xAE:
14207      /* 0F AE /7 = SFENCE -- flush pending operations to memory */
14208      if (haveNo66noF2noF3(pfx)
14209          && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
14210          && sz == 4) {
14211         delta += 1;
14212         /* Insert a memory fence.  It's sometimes important that these
14213            are carried through to the generated code. */
14214         stmt( IRStmt_MBE(Imbe_Fence) );
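         /* Note: this and the lfence/mfence cases below all map onto
            the same Imbe_Fence, i.e. a full barrier -- stronger than
            strictly needed for sfence, but safe. */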
14215         DIP("sfence\n");
14216         goto decode_success;
14217      }
14218      /* mindless duplication follows .. */
14219      /* 0F AE /5 = LFENCE -- flush pending operations to memory */
14220      /* 0F AE /6 = MFENCE -- flush pending operations to memory */
14221      if (haveNo66noF2noF3(pfx)
14222          && epartIsReg(getUChar(delta))
14223          && (gregLO3ofRM(getUChar(delta)) == 5
14224              || gregLO3ofRM(getUChar(delta)) == 6)
14225          && sz == 4) {
14226         delta += 1;
14227         /* Insert a memory fence.  It's sometimes important that these
14228            are carried through to the generated code. */
14229         stmt( IRStmt_MBE(Imbe_Fence) );
14230         DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
14231         goto decode_success;
14232      }
14233
14234      /* 0F AE /7 = CLFLUSH -- flush cache line */
14235      if (haveNo66noF2noF3(pfx)
14236          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
14237          && sz == 4) {
14238
14239         /* This is something of a hack.  We need to know the size of
14240            the cache line containing addr.  Since we don't (easily),
14241            assume 256 on the basis that no real cache would have a
14242            line that big.  It's safe to invalidate more stuff than we
14243            need, just inefficient. */
14244         ULong lineszB = 256ULL;
14245
14246         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14247         delta += alen;
14248
14249         /* Round addr down to the start of the containing block. */
14250         stmt( IRStmt_Put(
14251                  OFFB_CMSTART,
14252                  binop( Iop_And64,
14253                         mkexpr(addr),
14254                         mkU64( ~(lineszB-1) ))) );
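         /* E.g. with lineszB == 256 the mask is ~0xFF, so an addr of
            0x1234 rounds down to 0x1200, and (CMSTART, CMLEN) then
            describe the 256-byte block [0x1200, 0x1300). */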
14255
14256         stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );
14257
14258         jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));
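         /* Ijk_InvalICache makes the dispatcher discard any cached
            translations overlapping [CMSTART, CMSTART+CMLEN) before
            resuming at the next instruction. */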
14259
14260         DIP("clflush %s\n", dis_buf);
14261         goto decode_success;
14262      }
14263
14264      /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
14265      if (haveNo66noF2noF3(pfx)
14266          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
14267          && sz == 4) {
14268         delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
14269         goto decode_success;
14270      }
14271      /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
14272      if (haveNo66noF2noF3(pfx)
14273          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
14274          && sz == 4) {
14275         delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
14276         goto decode_success;
14277      }
14278      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
14279      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
14280          && !epartIsReg(getUChar(delta))
14281          && gregOfRexRM(pfx,getUChar(delta)) == 0) {
14282         delta = dis_FXSAVE(vbi, pfx, delta, sz);
14283         goto decode_success;
14284      }
14285      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
14286      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
14287          && !epartIsReg(getUChar(delta))
14288          && gregOfRexRM(pfx,getUChar(delta)) == 1) {
14289         delta = dis_FXRSTOR(vbi, pfx, delta, sz);
14290         goto decode_success;
14291      }
14292      /* 0F AE /4 = XSAVE mem -- write x87, SSE, AVX state to memory */
14293      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
14294          && !epartIsReg(getUChar(delta))
14295          && gregOfRexRM(pfx,getUChar(delta)) == 4
14296          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
14297         delta = dis_XSAVE(vbi, pfx, delta, sz);
14298         goto decode_success;
14299      }
14300      /* 0F AE /5 = XRSTOR mem -- read x87, SSE, AVX state from memory */
14301      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
14302          && !epartIsReg(getUChar(delta))
14303          && gregOfRexRM(pfx,getUChar(delta)) == 5
14304          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
14305         delta = dis_XRSTOR(vbi, pfx, delta, sz);
14306         goto decode_success;
14307      }
14308      break;
14309
14310   case 0xC2:
14311      /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
14312      if (haveNo66noF2noF3(pfx) && sz == 4) {
14313         Long delta0 = delta;
14314         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
14315         if (delta > delta0) goto decode_success;
14316      }
14317      /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
14318      if (haveF3no66noF2(pfx) && sz == 4) {
14319         Long delta0 = delta;
14320         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
14321         if (delta > delta0) goto decode_success;
14322      }
14323      /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
14324      if (haveF2no66noF3(pfx) && sz == 4) {
14325         Long delta0 = delta;
14326         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
14327         if (delta > delta0) goto decode_success;
14328      }
14329      /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
14330      if (have66noF2noF3(pfx) && sz == 2) {
14331         Long delta0 = delta;
14332         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
14333         if (delta > delta0) goto decode_success;
14334      }
14335      break;
14336
14337   case 0xC3:
14338      /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
14339      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
14340         modrm = getUChar(delta);
14341         if (!epartIsReg(modrm)) {
14342            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14343            storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
14344            DIP("movnti %s,%s\n", dis_buf,
14345                                  nameIRegG(sz, pfx, modrm));
14346            delta += alen;
14347            goto decode_success;
14348         }
14349         /* else fall through */
14350      }
14351      break;
14352
14353   case 0xC4:
14354      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14355      /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
14356         put it into the specified lane of mmx(G). */
14357      if (haveNo66noF2noF3(pfx)
14358          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14359         /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
14360            mmx reg.  t4 is the new lane value.  t5 is the original
14361            mmx value. t6 is the new mmx value. */
14362         Int lane;
14363         t4 = newTemp(Ity_I16);
14364         t5 = newTemp(Ity_I64);
14365         t6 = newTemp(Ity_I64);
14366         modrm = getUChar(delta);
14367         do_MMX_preamble();
14368
14369         assign(t5, getMMXReg(gregLO3ofRM(modrm)));
14370         breakup64to16s( t5, &t3, &t2, &t1, &t0 );
14371
14372         if (epartIsReg(modrm)) {
14373            assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
14374            delta += 1+1;
14375            lane = getUChar(delta-1);
14376            DIP("pinsrw $%d,%s,%s\n", lane,
14377                                      nameIReg16(eregOfRexRM(pfx,modrm)),
14378                                      nameMMXReg(gregLO3ofRM(modrm)));
14379         } else {
14380            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
14381            delta += 1+alen;
14382            lane = getUChar(delta-1);
14383            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
14384            DIP("pinsrw $%d,%s,%s\n", lane,
14385                                      dis_buf,
14386                                      nameMMXReg(gregLO3ofRM(modrm)));
14387         }
14388
14389         switch (lane & 3) {
14390            case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
14391            case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
14392            case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
14393            case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
14394            default: vassert(0);
14395         }
14396         putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
14397         goto decode_success;
14398      }
14399      /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
14400         put it into the specified lane of xmm(G). */
14401      if (have66noF2noF3(pfx)
14402          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
14403         Int lane;
14404         t4 = newTemp(Ity_I16);
14405         modrm = getUChar(delta);
14406         UInt rG = gregOfRexRM(pfx,modrm);
14407         if (epartIsReg(modrm)) {
14408            UInt rE = eregOfRexRM(pfx,modrm);
14409            assign(t4, getIReg16(rE));
14410            delta += 1+1;
14411            lane = getUChar(delta-1);
14412            DIP("pinsrw $%d,%s,%s\n",
14413                lane, nameIReg16(rE), nameXMMReg(rG));
14414         } else {
14415            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
14416                              1/*byte after the amode*/ );
14417            delta += 1+alen;
14418            lane = getUChar(delta-1);
14419            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
14420            DIP("pinsrw $%d,%s,%s\n",
14421                lane, dis_buf, nameXMMReg(rG));
14422         }
14423         IRTemp src_vec = newTemp(Ity_V128);
14424         assign(src_vec, getXMMReg(rG));
14425         IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
14426         putXMMReg(rG, mkexpr(res_vec));
14427         goto decode_success;
14428      }
14429      break;
14430
14431   case 0xC5:
14432      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14433      /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
14434         zero-extend of it in ireg(G). */
14435      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
14436         modrm = getUChar(delta);
14437         if (epartIsReg(modrm)) {
14438            IRTemp sV = newTemp(Ity_I64);
14439            t5 = newTemp(Ity_I16);
14440            do_MMX_preamble();
14441            assign(sV, getMMXReg(eregLO3ofRM(modrm)));
14442            breakup64to16s( sV, &t3, &t2, &t1, &t0 );
14443            switch (getUChar(delta+1) & 3) {
14444               case 0:  assign(t5, mkexpr(t0)); break;
14445               case 1:  assign(t5, mkexpr(t1)); break;
14446               case 2:  assign(t5, mkexpr(t2)); break;
14447               case 3:  assign(t5, mkexpr(t3)); break;
14448               default: vassert(0);
14449            }
14450            if (sz == 8)
14451               putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
14452            else
14453               putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
14454            DIP("pextrw $%d,%s,%s\n",
14455                (Int)getUChar(delta+1),
14456                nameMMXReg(eregLO3ofRM(modrm)),
14457                sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
14458                      : nameIReg32(gregOfRexRM(pfx,modrm))
14459            );
14460            delta += 2;
14461            goto decode_success;
14462         }
14463         /* else fall through */
14464         /* note, for anyone filling in the mem case: this insn has one
14465            byte after the amode and therefore you must pass 1 as the
14466            last arg to disAMode */
14467      }
14468      /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
14469         zero-extend of it in ireg(G). */
14470      if (have66noF2noF3(pfx)
14471          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
14472         Long delta0 = delta;
14473         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
14474                                              False/*!isAvx*/ );
14475         if (delta > delta0) goto decode_success;
14476         /* else fall through -- decoding has failed */
14477      }
14478      break;
14479
14480   case 0xC6:
14481      /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
14482      if (haveNo66noF2noF3(pfx) && sz == 4) {
14483         Int    imm8 = 0;
14484         IRTemp sV   = newTemp(Ity_V128);
14485         IRTemp dV   = newTemp(Ity_V128);
14486         modrm = getUChar(delta);
14487         UInt rG = gregOfRexRM(pfx,modrm);
14488         assign( dV, getXMMReg(rG) );
14489         if (epartIsReg(modrm)) {
14490            UInt rE = eregOfRexRM(pfx,modrm);
14491            assign( sV, getXMMReg(rE) );
14492            imm8 = (Int)getUChar(delta+1);
14493            delta += 1+1;
14494            DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
14495         } else {
14496            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
14497            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14498            imm8 = (Int)getUChar(delta+alen);
14499            delta += 1+alen;
14500            DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
14501         }
14502         IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
14503         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
14504         goto decode_success;
14505      }
14506      /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
14507      if (have66noF2noF3(pfx) && sz == 2) {
14508         Int    select;
14509         IRTemp sV = newTemp(Ity_V128);
14510         IRTemp dV = newTemp(Ity_V128);
14511
14512         modrm = getUChar(delta);
14513         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
14514
14515         if (epartIsReg(modrm)) {
14516            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
14517            select = (Int)getUChar(delta+1);
14518            delta += 1+1;
14519            DIP("shufpd $%d,%s,%s\n", select,
14520                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
14521                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
14522         } else {
14523            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
14524            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14525            select = getUChar(delta+alen);
14526            delta += 1+alen;
14527            DIP("shufpd $%d,%s,%s\n", select,
14528                                      dis_buf,
14529                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
14530         }
14531
14532         IRTemp res = math_SHUFPD_128( sV, dV, select );
14533         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
14534         goto decode_success;
14535      }
14536      break;
14537
14538   case 0xD1:
14539      /* 66 0F D1 = PSRLW by E */
14540      if (have66noF2noF3(pfx) && sz == 2) {
14541         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
14542         goto decode_success;
14543      }
14544      break;
14545
14546   case 0xD2:
14547      /* 66 0F D2 = PSRLD by E */
14548      if (have66noF2noF3(pfx) && sz == 2) {
14549         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
14550         goto decode_success;
14551      }
14552      break;
14553
14554   case 0xD3:
14555      /* 66 0F D3 = PSRLQ by E */
14556      if (have66noF2noF3(pfx) && sz == 2) {
14557         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
14558         goto decode_success;
14559      }
14560      break;
14561
14562   case 0xD4:
14563      /* 66 0F D4 = PADDQ */
14564      if (have66noF2noF3(pfx) && sz == 2) {
14565         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14566                                    "paddq", Iop_Add64x2, False );
14567         goto decode_success;
14568      }
14569      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
14570      /* 0F D4 = PADDQ -- add 64x1 */
14571      if (haveNo66noF2noF3(pfx) && sz == 4) {
14572         do_MMX_preamble();
14573         delta = dis_MMXop_regmem_to_reg (
14574                   vbi, pfx, delta, opc, "paddq", False );
14575         goto decode_success;
14576      }
14577      break;
14578
14579   case 0xD5:
14580      /* 66 0F D5 = PMULLW -- 16x8 multiply */
14581      if (have66noF2noF3(pfx) && sz == 2) {
14582         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14583                                    "pmullw", Iop_Mul16x8, False );
14584         goto decode_success;
14585      }
14586      break;
14587
14588   case 0xD6:
14589      /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
14590         hi half). */
14591      if (haveF3no66noF2(pfx) && sz == 4) {
14592         modrm = getUChar(delta);
14593         if (epartIsReg(modrm)) {
14594            do_MMX_preamble();
14595            putXMMReg( gregOfRexRM(pfx,modrm),
14596                       unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
14597            DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14598                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
14599            delta += 1;
14600            goto decode_success;
14601         }
14602         /* apparently no mem case for this insn */
14603      }
14604      /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
14605         or lo half xmm).  */
14606      if (have66noF2noF3(pfx)
14607          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
14608         modrm = getUChar(delta);
14609         if (epartIsReg(modrm)) {
14610            /* fall through, awaiting test case */
14611            /* dst: lo half copied, hi half zeroed */
14612         } else {
14613            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14614            storeLE( mkexpr(addr),
14615                     getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
14616            DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
14617            delta += alen;
14618            goto decode_success;
14619         }
14620      }
14621      /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
14622      if (haveF2no66noF3(pfx) && sz == 4) {
14623         modrm = getUChar(delta);
14624         if (epartIsReg(modrm)) {
14625            do_MMX_preamble();
14626            putMMXReg( gregLO3ofRM(modrm),
14627                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
14628            DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
14629                                   nameMMXReg(gregLO3ofRM(modrm)));
14630            delta += 1;
14631            goto decode_success;
14632         }
14633         /* apparently no mem case for this insn */
14634      }
14635      break;
14636
14637   case 0xD7:
      /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
         lanes in xmm(E), turn them into a 16-bit value, and put
         zero-extend of it in ireg(G).  Doing this directly is just
         too cumbersome; give up therefore and call a helper. */
14642      if (have66noF2noF3(pfx)
14643          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
14644          && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
14645         delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
14646         goto decode_success;
14647      }
14648      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F D7 = PMOVMSKB -- extract the sign bit from each of the 8
         lanes in mmx(E), collect them into a byte, and write its
         zero-extension to ireg(G). */
14652      if (haveNo66noF2noF3(pfx)
14653          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14654         modrm = getUChar(delta);
14655         if (epartIsReg(modrm)) {
14656            do_MMX_preamble();
14657            t0 = newTemp(Ity_I64);
14658            t1 = newTemp(Ity_I32);
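            /* Collect the top bit of each of the 8 bytes into a
               single byte, then zero-extend that to 32 bits. */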
14659            assign(t0, getMMXReg(eregLO3ofRM(modrm)));
14660            assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
14661            putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
14662            DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14663                                    nameIReg32(gregOfRexRM(pfx,modrm)));
14664            delta += 1;
14665            goto decode_success;
14666         }
14667         /* else fall through */
14668      }
14669      break;
14670
14671   case 0xD8:
14672      /* 66 0F D8 = PSUBUSB */
14673      if (have66noF2noF3(pfx) && sz == 2) {
14674         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14675                                    "psubusb", Iop_QSub8Ux16, False );
14676         goto decode_success;
14677      }
14678      break;
14679
14680   case 0xD9:
14681      /* 66 0F D9 = PSUBUSW */
14682      if (have66noF2noF3(pfx) && sz == 2) {
14683         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14684                                    "psubusw", Iop_QSub16Ux8, False );
14685         goto decode_success;
14686      }
14687      break;
14688
14689   case 0xDA:
14690      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14691      /* 0F DA = PMINUB -- 8x8 unsigned min */
14692      if (haveNo66noF2noF3(pfx) && sz == 4) {
14693         do_MMX_preamble();
14694         delta = dis_MMXop_regmem_to_reg (
14695                    vbi, pfx, delta, opc, "pminub", False );
14696         goto decode_success;
14697      }
14698      /* 66 0F DA = PMINUB -- 8x16 unsigned min */
14699      if (have66noF2noF3(pfx) && sz == 2) {
14700         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14701                                    "pminub", Iop_Min8Ux16, False );
14702         goto decode_success;
14703      }
14704      break;
14705
14706   case 0xDB:
14707      /* 66 0F DB = PAND */
14708      if (have66noF2noF3(pfx) && sz == 2) {
14709         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
14710         goto decode_success;
14711      }
14712      break;
14713
14714   case 0xDC:
14715      /* 66 0F DC = PADDUSB */
14716      if (have66noF2noF3(pfx) && sz == 2) {
14717         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14718                                    "paddusb", Iop_QAdd8Ux16, False );
14719         goto decode_success;
14720      }
14721      break;
14722
14723   case 0xDD:
14724      /* 66 0F DD = PADDUSW */
14725      if (have66noF2noF3(pfx) && sz == 2) {
14726         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14727                                    "paddusw", Iop_QAdd16Ux8, False );
14728         goto decode_success;
14729      }
14730      break;
14731
14732   case 0xDE:
14733      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14734      /* 0F DE = PMAXUB -- 8x8 unsigned max */
14735      if (haveNo66noF2noF3(pfx) && sz == 4) {
14736         do_MMX_preamble();
14737         delta = dis_MMXop_regmem_to_reg (
14738                    vbi, pfx, delta, opc, "pmaxub", False );
14739         goto decode_success;
14740      }
14741      /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
14742      if (have66noF2noF3(pfx) && sz == 2) {
14743         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14744                                    "pmaxub", Iop_Max8Ux16, False );
14745         goto decode_success;
14746      }
14747      break;
14748
14749   case 0xDF:
14750      /* 66 0F DF = PANDN */
14751      if (have66noF2noF3(pfx) && sz == 2) {
14752         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
14753         goto decode_success;
14754      }
14755      break;
14756
14757   case 0xE0:
14758      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14759      /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
14760      if (haveNo66noF2noF3(pfx) && sz == 4) {
14761         do_MMX_preamble();
14762         delta = dis_MMXop_regmem_to_reg (
14763                    vbi, pfx, delta, opc, "pavgb", False );
14764         goto decode_success;
14765      }
14766      /* 66 0F E0 = PAVGB */
14767      if (have66noF2noF3(pfx) && sz == 2) {
14768         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14769                                    "pavgb", Iop_Avg8Ux16, False );
14770         goto decode_success;
14771      }
14772      break;
14773
14774   case 0xE1:
14775      /* 66 0F E1 = PSRAW by E */
14776      if (have66noF2noF3(pfx) && sz == 2) {
14777         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
14778         goto decode_success;
14779      }
14780      break;
14781
14782   case 0xE2:
14783      /* 66 0F E2 = PSRAD by E */
14784      if (have66noF2noF3(pfx) && sz == 2) {
14785         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
14786         goto decode_success;
14787      }
14788      break;
14789
14790   case 0xE3:
14791      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14792      /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
14793      if (haveNo66noF2noF3(pfx) && sz == 4) {
14794         do_MMX_preamble();
14795         delta = dis_MMXop_regmem_to_reg (
14796                    vbi, pfx, delta, opc, "pavgw", False );
14797         goto decode_success;
14798      }
14799      /* 66 0F E3 = PAVGW */
14800      if (have66noF2noF3(pfx) && sz == 2) {
14801         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14802                                    "pavgw", Iop_Avg16Ux8, False );
14803         goto decode_success;
14804      }
14805      break;
14806
14807   case 0xE4:
14808      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
14810      if (haveNo66noF2noF3(pfx) && sz == 4) {
14811         do_MMX_preamble();
14812         delta = dis_MMXop_regmem_to_reg (
                    vbi, pfx, delta, opc, "pmulhuw", False );
14814         goto decode_success;
14815      }
14816      /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
14817      if (have66noF2noF3(pfx) && sz == 2) {
14818         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14819                                    "pmulhuw", Iop_MulHi16Ux8, False );
14820         goto decode_success;
14821      }
14822      break;
14823
14824   case 0xE5:
14825      /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
14826      if (have66noF2noF3(pfx) && sz == 2) {
14827         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14828                                    "pmulhw", Iop_MulHi16Sx8, False );
14829         goto decode_success;
14830      }
14831      break;
14832
14833   case 0xE6:
14834      /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
14835         lo half xmm(G), and zero upper half, rounding towards zero */
14836      /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
14837         lo half xmm(G), according to prevailing rounding mode, and zero
14838         upper half */
14839      if ( (haveF2no66noF3(pfx) && sz == 4)
14840           || (have66noF2noF3(pfx) && sz == 2) ) {
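         /* r2zero: the 66-prefixed form is CVTTPD2DQ, which always
            truncates; the F2 form converts using the prevailing
            rounding mode. */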
14841         delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
14842                                    toBool(sz == 2)/*r2zero*/);
14843         goto decode_success;
14844      }
14845      /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
14846         F64 in xmm(G) */
14847      if (haveF3no66noF2(pfx) && sz == 4) {
14848         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
14849         goto decode_success;
14850      }
14851      break;
14852
14853   case 0xE7:
14854      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14855      /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
14856         Intel manual does not say anything about the usual business of
14857         the FP reg tags getting trashed whenever an MMX insn happens.
14858         So we just leave them alone.
14859      */
14860      if (haveNo66noF2noF3(pfx) && sz == 4) {
14861         modrm = getUChar(delta);
14862         if (!epartIsReg(modrm)) {
14863            /* do_MMX_preamble(); Intel docs don't specify this */
14864            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14865            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
14866            DIP("movntq %s,%s\n", dis_buf,
14867                                  nameMMXReg(gregLO3ofRM(modrm)));
14868            delta += alen;
14869            goto decode_success;
14870         }
14871         /* else fall through */
14872      }
14873      /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
14874      if (have66noF2noF3(pfx) && sz == 2) {
14875         modrm = getUChar(delta);
14876         if (!epartIsReg(modrm)) {
14877            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14878            gen_SEGV_if_not_16_aligned( addr );
14879            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
14880            DIP("movntdq %s,%s\n", dis_buf,
14881                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
14882            delta += alen;
14883            goto decode_success;
14884         }
14885         /* else fall through */
14886      }
14887      break;
14888
14889   case 0xE8:
14890      /* 66 0F E8 = PSUBSB */
14891      if (have66noF2noF3(pfx) && sz == 2) {
14892         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14893                                    "psubsb", Iop_QSub8Sx16, False );
14894         goto decode_success;
14895      }
14896      break;
14897
14898   case 0xE9:
14899      /* 66 0F E9 = PSUBSW */
14900      if (have66noF2noF3(pfx) && sz == 2) {
14901         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14902                                    "psubsw", Iop_QSub16Sx8, False );
14903         goto decode_success;
14904      }
14905      break;
14906
14907   case 0xEA:
14908      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14909      /* 0F EA = PMINSW -- 16x4 signed min */
14910      if (haveNo66noF2noF3(pfx) && sz == 4) {
14911         do_MMX_preamble();
14912         delta = dis_MMXop_regmem_to_reg (
14913                    vbi, pfx, delta, opc, "pminsw", False );
14914         goto decode_success;
14915      }
14916      /* 66 0F EA = PMINSW -- 16x8 signed min */
14917      if (have66noF2noF3(pfx) && sz == 2) {
14918         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14919                                    "pminsw", Iop_Min16Sx8, False );
14920         goto decode_success;
14921      }
14922      break;
14923
14924   case 0xEB:
14925      /* 66 0F EB = POR */
14926      if (have66noF2noF3(pfx) && sz == 2) {
14927         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
14928         goto decode_success;
14929      }
14930      break;
14931
14932   case 0xEC:
14933      /* 66 0F EC = PADDSB */
14934      if (have66noF2noF3(pfx) && sz == 2) {
14935         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14936                                    "paddsb", Iop_QAdd8Sx16, False );
14937         goto decode_success;
14938      }
14939      break;
14940
14941   case 0xED:
14942      /* 66 0F ED = PADDSW */
14943      if (have66noF2noF3(pfx) && sz == 2) {
14944         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14945                                    "paddsw", Iop_QAdd16Sx8, False );
14946         goto decode_success;
14947      }
14948      break;
14949
14950   case 0xEE:
14951      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14952      /* 0F EE = PMAXSW -- 16x4 signed max */
14953      if (haveNo66noF2noF3(pfx) && sz == 4) {
14954         do_MMX_preamble();
14955         delta = dis_MMXop_regmem_to_reg (
14956                    vbi, pfx, delta, opc, "pmaxsw", False );
14957         goto decode_success;
14958      }
14959      /* 66 0F EE = PMAXSW -- 16x8 signed max */
14960      if (have66noF2noF3(pfx) && sz == 2) {
14961         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14962                                    "pmaxsw", Iop_Max16Sx8, False );
14963         goto decode_success;
14964      }
14965      break;
14966
14967   case 0xEF:
14968      /* 66 0F EF = PXOR */
14969      if (have66noF2noF3(pfx) && sz == 2) {
14970         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
14971         goto decode_success;
14972      }
14973      break;
14974
14975   case 0xF1:
14976      /* 66 0F F1 = PSLLW by E */
14977      if (have66noF2noF3(pfx) && sz == 2) {
14978         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
14979         goto decode_success;
14980      }
14981      break;
14982
14983   case 0xF2:
14984      /* 66 0F F2 = PSLLD by E */
14985      if (have66noF2noF3(pfx) && sz == 2) {
14986         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
14987         goto decode_success;
14988      }
14989      break;
14990
14991   case 0xF3:
14992      /* 66 0F F3 = PSLLQ by E */
14993      if (have66noF2noF3(pfx) && sz == 2) {
14994         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
14995         goto decode_success;
14996      }
14997      break;
14998
14999   case 0xF4:
      /* 66 0F F4 = PMULUDQ -- unsigned widening multiply: 32-bit lane 0
         of dst x lane 0 of src forms the lower 64-bit half of the
         result, and lane 2 x lane 2 forms the upper 64-bit half */
15003      if (have66noF2noF3(pfx) && sz == 2) {
15004         IRTemp sV = newTemp(Ity_V128);
15005         IRTemp dV = newTemp(Ity_V128);
15006         modrm = getUChar(delta);
15007         UInt rG = gregOfRexRM(pfx,modrm);
15008         assign( dV, getXMMReg(rG) );
15009         if (epartIsReg(modrm)) {
15010            UInt rE = eregOfRexRM(pfx,modrm);
15011            assign( sV, getXMMReg(rE) );
15012            delta += 1;
15013            DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15014         } else {
15015            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15016            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15017            delta += alen;
15018            DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
15019         }
15020         putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
15021         goto decode_success;
15022      }
15023      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
      /* 0F F4 = PMULUDQ -- unsigned widening multiply of the low
         32-bit lanes (lane 0 x lane 0), forming a 64-bit result */
15026      if (haveNo66noF2noF3(pfx) && sz == 4) {
15027         IRTemp sV = newTemp(Ity_I64);
15028         IRTemp dV = newTemp(Ity_I64);
15029         t1 = newTemp(Ity_I32);
15030         t0 = newTemp(Ity_I32);
15031         modrm = getUChar(delta);
15032
15033         do_MMX_preamble();
15034         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15035
15036         if (epartIsReg(modrm)) {
15037            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15038            delta += 1;
15039            DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
15040                                   nameMMXReg(gregLO3ofRM(modrm)));
15041         } else {
15042            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15043            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15044            delta += alen;
15045            DIP("pmuludq %s,%s\n", dis_buf,
15046                                   nameMMXReg(gregLO3ofRM(modrm)));
15047         }
15048
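         /* Widening multiply of the low 32 bits of each operand.
            E.g. (hypothetical values) dV = 0x00000001_00000003 and
            sV = 0x00000000_00000002 give 3 * 2 = 6 as the new 64-bit
            register value. */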
15049         assign( t0, unop(Iop_64to32, mkexpr(dV)) );
15050         assign( t1, unop(Iop_64to32, mkexpr(sV)) );
15051         putMMXReg( gregLO3ofRM(modrm),
15052                    binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
15053         goto decode_success;
15054      }
15055      break;
15056
15057   case 0xF5:
15058      /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
15059         E(xmm or mem) to G(xmm) */
15060      if (have66noF2noF3(pfx) && sz == 2) {
15061         IRTemp sV = newTemp(Ity_V128);
15062         IRTemp dV = newTemp(Ity_V128);
15063         modrm     = getUChar(delta);
15064         UInt   rG = gregOfRexRM(pfx,modrm);
15065         if (epartIsReg(modrm)) {
15066            UInt rE = eregOfRexRM(pfx,modrm);
15067            assign( sV, getXMMReg(rE) );
15068            delta += 1;
15069            DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15070         } else {
15071            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15072            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15073            delta += alen;
15074            DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
15075         }
15076         assign( dV, getXMMReg(rG) );
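         /* Each 32-bit lane of the result is the sum of two adjacent
            signed 16x16->32 products:
            res32[i] = d16[2i]*s16[2i] + d16[2i+1]*s16[2i+1] */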
15077         putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
15078         goto decode_success;
15079      }
15080      break;
15081
15082   case 0xF6:
15083      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
15084      /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
15085      if (haveNo66noF2noF3(pfx) && sz == 4) {
15086         do_MMX_preamble();
15087         delta = dis_MMXop_regmem_to_reg (
15088                    vbi, pfx, delta, opc, "psadbw", False );
15089         goto decode_success;
15090      }
15091      /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
15092         from E(xmm or mem) to G(xmm) */
15093      if (have66noF2noF3(pfx) && sz == 2) {
15094         IRTemp sV  = newTemp(Ity_V128);
15095         IRTemp dV  = newTemp(Ity_V128);
15096         modrm = getUChar(delta);
15097         UInt   rG   = gregOfRexRM(pfx,modrm);
15098         if (epartIsReg(modrm)) {
15099            UInt rE = eregOfRexRM(pfx,modrm);
15100            assign( sV, getXMMReg(rE) );
15101            delta += 1;
15102            DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15103         } else {
15104            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15105            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15106            delta += alen;
15107            DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
15108         }
15109         assign( dV, getXMMReg(rG) );
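         /* Each 64-bit half of the result holds the sum of the 8 byte
            absolute differences in that half, as a u16 zero-extended
            to 64 bits. */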
15110         putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
15111
15112         goto decode_success;
15113      }
15114      break;
15115
15116   case 0xF7:
15117      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
15118      /* 0F F7 = MASKMOVQ -- 8x8 masked store */
15119      if (haveNo66noF2noF3(pfx) && sz == 4) {
15120         Bool ok = False;
15121         delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
15122         if (ok) goto decode_success;
15123      }
15124      /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
15125      if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
15126         delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
15127         goto decode_success;
15128      }
15129      break;
15130
15131   case 0xF8:
15132      /* 66 0F F8 = PSUBB */
15133      if (have66noF2noF3(pfx) && sz == 2) {
15134         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15135                                    "psubb", Iop_Sub8x16, False );
15136         goto decode_success;
15137      }
15138      break;
15139
15140   case 0xF9:
15141      /* 66 0F F9 = PSUBW */
15142      if (have66noF2noF3(pfx) && sz == 2) {
15143         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15144                                    "psubw", Iop_Sub16x8, False );
15145         goto decode_success;
15146      }
15147      break;
15148
15149   case 0xFA:
15150      /* 66 0F FA = PSUBD */
15151      if (have66noF2noF3(pfx) && sz == 2) {
15152         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15153                                    "psubd", Iop_Sub32x4, False );
15154         goto decode_success;
15155      }
15156      break;
15157
15158   case 0xFB:
15159      /* 66 0F FB = PSUBQ */
15160      if (have66noF2noF3(pfx) && sz == 2) {
15161         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15162                                    "psubq", Iop_Sub64x2, False );
15163         goto decode_success;
15164      }
15165      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
15166      /* 0F FB = PSUBQ -- sub 64x1 */
15167      if (haveNo66noF2noF3(pfx) && sz == 4) {
15168         do_MMX_preamble();
15169         delta = dis_MMXop_regmem_to_reg (
15170                   vbi, pfx, delta, opc, "psubq", False );
15171         goto decode_success;
15172      }
15173      break;
15174
15175   case 0xFC:
15176      /* 66 0F FC = PADDB */
15177      if (have66noF2noF3(pfx) && sz == 2) {
15178         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15179                                    "paddb", Iop_Add8x16, False );
15180         goto decode_success;
15181      }
15182      break;
15183
15184   case 0xFD:
15185      /* 66 0F FD = PADDW */
15186      if (have66noF2noF3(pfx) && sz == 2) {
15187         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15188                                    "paddw", Iop_Add16x8, False );
15189         goto decode_success;
15190      }
15191      break;
15192
15193   case 0xFE:
15194      /* 66 0F FE = PADDD */
15195      if (have66noF2noF3(pfx) && sz == 2) {
15196         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
15197                                    "paddd", Iop_Add32x4, False );
15198         goto decode_success;
15199      }
15200      break;
15201
15202   default:
15203      goto decode_failure;
15204
15205   }
15206
15207  decode_failure:
15208   *decode_OK = False;
15209   return deltaIN;
15210
15211  decode_success:
15212   *decode_OK = True;
15213   return delta;
15214}
15215
15216
15217/*------------------------------------------------------------*/
15218/*---                                                      ---*/
15219/*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
15220/*---                                                      ---*/
15221/*------------------------------------------------------------*/
15222
15223static Long dis_MOVDDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
15224                              Long delta, Bool isAvx )
15225{
15226   IRTemp addr   = IRTemp_INVALID;
15227   Int    alen   = 0;
15228   HChar  dis_buf[50];
15229   IRTemp sV    = newTemp(Ity_V128);
15230   IRTemp d0    = newTemp(Ity_I64);
15231   UChar  modrm = getUChar(delta);
15232   UInt   rG    = gregOfRexRM(pfx,modrm);
15233   if (epartIsReg(modrm)) {
15234      UInt rE = eregOfRexRM(pfx,modrm);
15235      assign( sV, getXMMReg(rE) );
15236      DIP("%smovddup %s,%s\n",
15237          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
15238      delta += 1;
15239      assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
15240   } else {
15241      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15242      assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
15243      DIP("%smovddup %s,%s\n",
15244          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
15245      delta += alen;
15246   }
15247   (isAvx ? putYMMRegLoAndZU : putXMMReg)
15248      ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
15249   return delta;
15250}
15251
15252
15253static Long dis_MOVDDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
15254                              Long delta )
15255{
15256   IRTemp addr   = IRTemp_INVALID;
15257   Int    alen   = 0;
15258   HChar  dis_buf[50];
15259   IRTemp d0    = newTemp(Ity_I64);
15260   IRTemp d1    = newTemp(Ity_I64);
15261   UChar  modrm = getUChar(delta);
15262   UInt   rG    = gregOfRexRM(pfx,modrm);
15263   if (epartIsReg(modrm)) {
15264      UInt rE = eregOfRexRM(pfx,modrm);
15265      DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
15266      delta += 1;
15267      assign ( d0, getYMMRegLane64(rE, 0) );
15268      assign ( d1, getYMMRegLane64(rE, 2) );
15269   } else {
15270      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15271      assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
15272      assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
15273                                        mkexpr(addr), mkU64(16))) );
15274      DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
15275      delta += alen;
15276   }
15277   putYMMRegLane64( rG, 0, mkexpr(d0) );
15278   putYMMRegLane64( rG, 1, mkexpr(d0) );
15279   putYMMRegLane64( rG, 2, mkexpr(d1) );
15280   putYMMRegLane64( rG, 3, mkexpr(d1) );
15281   return delta;
15282}
15283
15284
15285static Long dis_MOVSxDUP_128 ( const VexAbiInfo* vbi, Prefix pfx,
15286                               Long delta, Bool isAvx, Bool isL )
15287{
15288   IRTemp addr  = IRTemp_INVALID;
15289   Int    alen  = 0;
15290   HChar  dis_buf[50];
15291   IRTemp sV    = newTemp(Ity_V128);
15292   UChar  modrm = getUChar(delta);
15293   UInt   rG    = gregOfRexRM(pfx,modrm);
15294   IRTemp s3, s2, s1, s0;
15295   s3 = s2 = s1 = s0 = IRTemp_INVALID;
15296   if (epartIsReg(modrm)) {
15297      UInt rE = eregOfRexRM(pfx,modrm);
15298      assign( sV, getXMMReg(rE) );
15299      DIP("%smovs%cdup %s,%s\n",
15300          isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
15301      delta += 1;
15302   } else {
15303      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15304      if (!isAvx)
15305         gen_SEGV_if_not_16_aligned( addr );
15306      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15307      DIP("%smovs%cdup %s,%s\n",
15308          isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
15309      delta += alen;
15310   }
15311   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
15312   (isAvx ? putYMMRegLoAndZU : putXMMReg)
15313      ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
15314                : mkV128from32s( s3, s3, s1, s1 ) );
15315   return delta;
15316}
15317
15318
15319static Long dis_MOVSxDUP_256 ( const VexAbiInfo* vbi, Prefix pfx,
15320                               Long delta, Bool isL )
15321{
15322   IRTemp addr  = IRTemp_INVALID;
15323   Int    alen  = 0;
15324   HChar  dis_buf[50];
15325   IRTemp sV    = newTemp(Ity_V256);
15326   UChar  modrm = getUChar(delta);
15327   UInt   rG    = gregOfRexRM(pfx,modrm);
15328   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
15329   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
15330   if (epartIsReg(modrm)) {
15331      UInt rE = eregOfRexRM(pfx,modrm);
15332      assign( sV, getYMMReg(rE) );
15333      DIP("vmovs%cdup %s,%s\n",
15334          isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
15335      delta += 1;
15336   } else {
15337      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15338      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
15339      DIP("vmovs%cdup %s,%s\n",
15340          isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
15341      delta += alen;
15342   }
15343   breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
15344   putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
15345                                : mkV128from32s( s7, s7, s5, s5 ) );
15346   putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
15347                                : mkV128from32s( s3, s3, s1, s1 ) );
15348   return delta;
15349}
15350
15351
15352static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
15353{
15354   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
15355   IRTemp leftV  = newTemp(Ity_V128);
15356   IRTemp rightV = newTemp(Ity_V128);
15357   IRTemp rm     = newTemp(Ity_I32);
15358   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
15359
15360   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
15361   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
15362
15363   assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
15364   assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
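   /* One vertical op now yields all four horizontal results:
      res (hi..lo) = { s2 op s3, s0 op s1, d2 op d3, d0 op d1 },
      with the even-numbered lanes supplying the left operand. */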
15365
15366   IRTemp res = newTemp(Ity_V128);
15367   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
15368   assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
15369                      mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
15370   return res;
15371}
15372
15373
15374static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
15375{
15376   IRTemp s1, s0, d1, d0;
15377   IRTemp leftV  = newTemp(Ity_V128);
15378   IRTemp rightV = newTemp(Ity_V128);
15379   IRTemp rm     = newTemp(Ity_I32);
15380   s1 = s0 = d1 = d0 = IRTemp_INVALID;
15381
15382   breakupV128to64s( sV, &s1, &s0 );
15383   breakupV128to64s( dV, &d1, &d0 );
15384
15385   assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
15386   assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
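   /* res = { s0 op s1 (hi), d0 op d1 (lo) }: the even-numbered
      lanes supply the left operand, matching HADDPD/HSUBPD. */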
15387
15388   IRTemp res = newTemp(Ity_V128);
15389   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
15390   assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
15391                      mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
15392   return res;
15393}
15394
15395
15396__attribute__((noinline))
15397static
15398Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
15399                        const VexAbiInfo* vbi,
15400                        Prefix pfx, Int sz, Long deltaIN )
15401{
15402   IRTemp addr  = IRTemp_INVALID;
15403   UChar  modrm = 0;
15404   Int    alen  = 0;
15405   HChar  dis_buf[50];
15406
15407   *decode_OK = False;
15408
15409   Long   delta = deltaIN;
15410   UChar  opc   = getUChar(delta);
15411   delta++;
15412   switch (opc) {
15413
15414   case 0x12:
15415      /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
15416         duplicating some lanes (2:2:0:0). */
15417      if (haveF3no66noF2(pfx) && sz == 4) {
15418         delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
15419                                   True/*isL*/ );
15420         goto decode_success;
15421      }
      /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
         duplicating some lanes (1:0:1:0). */
15424      if (haveF2no66noF3(pfx)
15425          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
15426         delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
15427         goto decode_success;
15428      }
15429      break;
15430
15431   case 0x16:
15432      /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
15433         duplicating some lanes (3:3:1:1). */
15434      if (haveF3no66noF2(pfx) && sz == 4) {
15435         delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
15436                                   False/*!isL*/ );
15437         goto decode_success;
15438      }
15439      break;
15440
15441   case 0x7C:
15442   case 0x7D:
15443      /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
15444      /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
15445      if (haveF2no66noF3(pfx) && sz == 4) {
15446         IRTemp eV     = newTemp(Ity_V128);
15447         IRTemp gV     = newTemp(Ity_V128);
15448         Bool   isAdd  = opc == 0x7C;
15449         const HChar* str = isAdd ? "add" : "sub";
15450         modrm         = getUChar(delta);
15451         UInt   rG     = gregOfRexRM(pfx,modrm);
15452         if (epartIsReg(modrm)) {
15453            UInt rE = eregOfRexRM(pfx,modrm);
15454            assign( eV, getXMMReg(rE) );
15455            DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
15456            delta += 1;
15457         } else {
15458            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15459            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
15460            DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
15461            delta += alen;
15462         }
15463
15464         assign( gV, getXMMReg(rG) );
15465         putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
15466         goto decode_success;
15467      }
15468      /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
15469      /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
15470      if (have66noF2noF3(pfx) && sz == 2) {
15471         IRTemp eV     = newTemp(Ity_V128);
15472         IRTemp gV     = newTemp(Ity_V128);
15473         Bool   isAdd  = opc == 0x7C;
15474         const HChar* str = isAdd ? "add" : "sub";
15475         modrm         = getUChar(delta);
15476         UInt   rG     = gregOfRexRM(pfx,modrm);
15477         if (epartIsReg(modrm)) {
15478            UInt rE = eregOfRexRM(pfx,modrm);
15479            assign( eV, getXMMReg(rE) );
15480            DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
15481            delta += 1;
15482         } else {
15483            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15484            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
15485            DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
15486            delta += alen;
15487         }
15488
15489         assign( gV, getXMMReg(rG) );
15490         putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
15491         goto decode_success;
15492      }
15493      break;
15494
15495   case 0xD0:
      /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
15497      if (have66noF2noF3(pfx) && sz == 2) {
15498         IRTemp eV   = newTemp(Ity_V128);
15499         IRTemp gV   = newTemp(Ity_V128);
15500         modrm       = getUChar(delta);
15501         UInt   rG   = gregOfRexRM(pfx,modrm);
15502         if (epartIsReg(modrm)) {
15503            UInt rE = eregOfRexRM(pfx,modrm);
15504            assign( eV, getXMMReg(rE) );
15505            DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15506            delta += 1;
15507         } else {
15508            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15509            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
15510            DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
15511            delta += alen;
15512         }
15513
15514         assign( gV, getXMMReg(rG) );
15515         putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
15516         goto decode_success;
15517      }
15518      /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
15519      if (haveF2no66noF3(pfx) && sz == 4) {
15520         IRTemp eV   = newTemp(Ity_V128);
15521         IRTemp gV   = newTemp(Ity_V128);
15522         modrm       = getUChar(delta);
15523         UInt   rG   = gregOfRexRM(pfx,modrm);
15524
15526         if (epartIsReg(modrm)) {
15527            UInt rE = eregOfRexRM(pfx,modrm);
15528            assign( eV, getXMMReg(rE) );
15529            DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15530            delta += 1;
15531         } else {
15532            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15533            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
15534            DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
15535            delta += alen;
15536         }
15537
15538         assign( gV, getXMMReg(rG) );
15539         putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
15540         goto decode_success;
15541      }
15542      break;
15543
15544   case 0xF0:
15545      /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
15546      if (haveF2no66noF3(pfx) && sz == 4) {
15547         modrm = getUChar(delta);
15548         if (epartIsReg(modrm)) {
15549            goto decode_failure;
15550         } else {
15551            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15552            putXMMReg( gregOfRexRM(pfx,modrm),
15553                       loadLE(Ity_V128, mkexpr(addr)) );
15554            DIP("lddqu %s,%s\n", dis_buf,
15555                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
15556            delta += alen;
15557         }
15558         goto decode_success;
15559      }
15560      break;
15561
15562   default:
15563      goto decode_failure;
15564
15565   }
15566
15567  decode_failure:
15568   *decode_OK = False;
15569   return deltaIN;
15570
15571  decode_success:
15572   *decode_OK = True;
15573   return delta;
15574}
15575
15576
15577/*------------------------------------------------------------*/
15578/*---                                                      ---*/
15579/*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3               ---*/
15580/*---                                                      ---*/
15581/*------------------------------------------------------------*/
15582
15583static
15584IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
15585{
15586   IRTemp sHi        = newTemp(Ity_I64);
15587   IRTemp sLo        = newTemp(Ity_I64);
15588   IRTemp dHi        = newTemp(Ity_I64);
15589   IRTemp dLo        = newTemp(Ity_I64);
15590   IRTemp rHi        = newTemp(Ity_I64);
15591   IRTemp rLo        = newTemp(Ity_I64);
15592   IRTemp sevens     = newTemp(Ity_I64);
15593   IRTemp mask0x80hi = newTemp(Ity_I64);
15594   IRTemp mask0x80lo = newTemp(Ity_I64);
15595   IRTemp maskBit3hi = newTemp(Ity_I64);
15596   IRTemp maskBit3lo = newTemp(Ity_I64);
15597   IRTemp sAnd7hi    = newTemp(Ity_I64);
15598   IRTemp sAnd7lo    = newTemp(Ity_I64);
15599   IRTemp permdHi    = newTemp(Ity_I64);
15600   IRTemp permdLo    = newTemp(Ity_I64);
15601   IRTemp res        = newTemp(Ity_V128);
15602
15603   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
15604   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
15605   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
15606   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
15607
15608   assign( sevens, mkU64(0x0707070707070707ULL) );
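
   /* Overall PSHUFB semantics, for each byte i of the result:
      res[i] = (sV[i] & 0x80) ? 0 : dV[sV[i] & 15].
      Each half is built with Iop_Perm8x8, which only sees an index
      in 0..7, so bit 3 of the index selects between dHi and dLo,
      and bit 7 forces the lane to zero. */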
15609
15610   /* mask0x80hi = Not(SarN8x8(sHi,7))
15611      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
15612      sAnd7hi    = And(sHi,sevens)
15613      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
15614      And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
15615      rHi        = And(permdHi,mask0x80hi)
15616   */
15617   assign(
15618      mask0x80hi,
15619      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
15620
15621   assign(
15622      maskBit3hi,
15623      binop(Iop_SarN8x8,
15624            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
15625            mkU8(7)));
15626
15627   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
15628
15629   assign(
15630      permdHi,
15631      binop(
15632         Iop_Or64,
15633         binop(Iop_And64,
15634               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
15635               mkexpr(maskBit3hi)),
15636         binop(Iop_And64,
15637               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
15638               unop(Iop_Not64,mkexpr(maskBit3hi))) ));
15639
15640   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
15641
15642   /* And the same for the lower half of the result.  What fun. */
15643
15644   assign(
15645      mask0x80lo,
15646      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
15647
15648   assign(
15649      maskBit3lo,
15650      binop(Iop_SarN8x8,
15651            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
15652            mkU8(7)));
15653
15654   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
15655
15656   assign(
15657      permdLo,
15658      binop(
15659         Iop_Or64,
15660         binop(Iop_And64,
15661               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
15662               mkexpr(maskBit3lo)),
15663         binop(Iop_And64,
15664               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
15665               unop(Iop_Not64,mkexpr(maskBit3lo))) ));
15666
15667   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
15668
15669   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
15670   return res;
15671}
15672
15673
15674static
15675IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
15676{
15677   IRTemp sHi, sLo, dHi, dLo;
15678   sHi = sLo = dHi = dLo = IRTemp_INVALID;
15679   breakupV256toV128s( dV, &dHi, &dLo);
15680   breakupV256toV128s( sV, &sHi, &sLo);
15681   IRTemp res = newTemp(Ity_V256);
15682   assign(res, binop(Iop_V128HLtoV256,
15683                     mkexpr(math_PSHUFB_XMM(dHi, sHi)),
15684                     mkexpr(math_PSHUFB_XMM(dLo, sLo))));
15685   return res;
15686}
15687
15688
15689static Long dis_PHADD_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
15690                            Bool isAvx, UChar opc )
15691{
15692   IRTemp addr   = IRTemp_INVALID;
15693   Int    alen   = 0;
15694   HChar  dis_buf[50];
15695   const HChar* str = "???";
15696   IROp   opV64  = Iop_INVALID;
15697   IROp   opCatO = Iop_CatOddLanes16x4;
15698   IROp   opCatE = Iop_CatEvenLanes16x4;
15699   IRTemp sV     = newTemp(Ity_V128);
15700   IRTemp dV     = newTemp(Ity_V128);
15701   IRTemp sHi    = newTemp(Ity_I64);
15702   IRTemp sLo    = newTemp(Ity_I64);
15703   IRTemp dHi    = newTemp(Ity_I64);
15704   IRTemp dLo    = newTemp(Ity_I64);
15705   UChar  modrm  = getUChar(delta);
15706   UInt   rG     = gregOfRexRM(pfx,modrm);
15707   UInt   rV     = isAvx ? getVexNvvvv(pfx) : rG;
15708
15709   switch (opc) {
15710      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
15711      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
15712      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
15713      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
15714      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
15715      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
15716      default: vassert(0);
15717   }
15718   if (opc == 0x02 || opc == 0x06) {
15719      opCatO = Iop_InterleaveHI32x2;
15720      opCatE = Iop_InterleaveLO32x2;
15721   }
15722
15723   assign( dV, getXMMReg(rV) );
15724
15725   if (epartIsReg(modrm)) {
15726      UInt rE = eregOfRexRM(pfx,modrm);
15727      assign( sV, getXMMReg(rE) );
15728      DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
15729          nameXMMReg(rE), nameXMMReg(rG));
15730      delta += 1;
15731   } else {
15732      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15733      if (!isAvx)
15734         gen_SEGV_if_not_16_aligned( addr );
15735      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15736      DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
15737          dis_buf, nameXMMReg(rG));
15738      delta += alen;
15739   }
15740
15741   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
15742   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
15743   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
15744   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
15745
   /* This isn't a particularly efficient way to compute the
      result, but at least it avoids a proliferation of IROps,
      hence avoids complicating all the backends. */
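
   /* opCatE gathers the even-numbered lanes of its two arguments and
      opCatO the odd-numbered ones (via the interleave ops for the
      32-bit cases), so a single vertical opV64 then produces all the
      horizontal results for each 64-bit half. */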
15749
15750   (isAvx ? putYMMRegLoAndZU : putXMMReg)
15751      ( rG,
15752        binop(Iop_64HLtoV128,
15753              binop(opV64,
15754                    binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
15755                    binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
15756              binop(opV64,
15757                    binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
15758                    binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
15759   return delta;
15760}
15761
15762
15763static Long dis_PHADD_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
15764                            UChar opc )
15765{
15766   IRTemp addr   = IRTemp_INVALID;
15767   Int    alen   = 0;
15768   HChar  dis_buf[50];
15769   const HChar* str = "???";
15770   IROp   opV64  = Iop_INVALID;
15771   IROp   opCatO = Iop_CatOddLanes16x4;
15772   IROp   opCatE = Iop_CatEvenLanes16x4;
15773   IRTemp sV     = newTemp(Ity_V256);
15774   IRTemp dV     = newTemp(Ity_V256);
15775   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
15776   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
15777   UChar  modrm  = getUChar(delta);
15778   UInt   rG     = gregOfRexRM(pfx,modrm);
15779   UInt   rV     = getVexNvvvv(pfx);
15780
15781   switch (opc) {
15782      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
15783      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
15784      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
15785      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
15786      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
15787      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
15788      default: vassert(0);
15789   }
15790   if (opc == 0x02 || opc == 0x06) {
15791      opCatO = Iop_InterleaveHI32x2;
15792      opCatE = Iop_InterleaveLO32x2;
15793   }
15794
15795   assign( dV, getYMMReg(rV) );
15796
15797   if (epartIsReg(modrm)) {
15798      UInt rE = eregOfRexRM(pfx,modrm);
15799      assign( sV, getYMMReg(rE) );
15800      DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG));
15801      delta += 1;
15802   } else {
15803      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15804      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
15805      DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG));
15806      delta += alen;
15807   }
15808
15809   breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
15810   breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
15811
   /* This isn't a particularly efficient way to compute the
      result, but at least it avoids a proliferation of IROps,
      hence avoids complicating all the backends. */
15815
15816   putYMMReg( rG,
15817              binop(Iop_V128HLtoV256,
15818                    binop(Iop_64HLtoV128,
15819                          binop(opV64,
15820                                binop(opCatE,mkexpr(s3),mkexpr(s2)),
15821                                binop(opCatO,mkexpr(s3),mkexpr(s2)) ),
15822                          binop(opV64,
15823                                binop(opCatE,mkexpr(d3),mkexpr(d2)),
15824                                binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ),
15825                    binop(Iop_64HLtoV128,
15826                          binop(opV64,
15827                                binop(opCatE,mkexpr(s1),mkexpr(s0)),
15828                                binop(opCatO,mkexpr(s1),mkexpr(s0)) ),
15829                          binop(opV64,
15830                                binop(opCatE,mkexpr(d1),mkexpr(d0)),
15831                                binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) );
15832   return delta;
15833}
15834
15835
15836static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
15837{
15838   IRTemp sVoddsSX  = newTemp(Ity_V128);
15839   IRTemp sVevensSX = newTemp(Ity_V128);
15840   IRTemp dVoddsZX  = newTemp(Ity_V128);
15841   IRTemp dVevensZX = newTemp(Ity_V128);
15842   /* compute dV unsigned x sV signed */
15843   assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
15844   assign( sVevensSX, binop(Iop_SarN16x8,
15845                            binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
15846                            mkU8(8)) );
15847   assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
15848   assign( dVevensZX, binop(Iop_ShrN16x8,
15849                            binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
15850                            mkU8(8)) );
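
   /* Each 16-bit result lane is the signed-saturating sum of two
      unsigned(d) x signed(s) byte products:
      res16[i] = SatS16( d8[2i]*s8[2i] + d8[2i+1]*s8[2i+1] ) */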
15851
15852   IRTemp res = newTemp(Ity_V128);
15853   assign( res, binop(Iop_QAdd16Sx8,
15854                      binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
15855                      binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
15856                     )
15857         );
15858   return res;
15859}
15860
15861
15862static
15863IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV )
15864{
15865   IRTemp sHi, sLo, dHi, dLo;
15866   sHi = sLo = dHi = dLo = IRTemp_INVALID;
15867   breakupV256toV128s( dV, &dHi, &dLo);
15868   breakupV256toV128s( sV, &sHi, &sLo);
15869   IRTemp res = newTemp(Ity_V256);
15870   assign(res, binop(Iop_V128HLtoV256,
15871                     mkexpr(math_PMADDUBSW_128(dHi, sHi)),
15872                     mkexpr(math_PMADDUBSW_128(dLo, sLo))));
15873   return res;
15874}
15875
15876
15877__attribute__((noinline))
15878static
15879Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
15880                             const VexAbiInfo* vbi,
15881                             Prefix pfx, Int sz, Long deltaIN )
15882{
15883   IRTemp addr  = IRTemp_INVALID;
15884   UChar  modrm = 0;
15885   Int    alen  = 0;
15886   HChar  dis_buf[50];
15887
15888   *decode_OK = False;
15889
15890   Long   delta = deltaIN;
15891   UChar  opc   = getUChar(delta);
15892   delta++;
15893   switch (opc) {
15894
15895   case 0x00:
15896      /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
15897      if (have66noF2noF3(pfx)
15898          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15899         IRTemp sV = newTemp(Ity_V128);
15900         IRTemp dV = newTemp(Ity_V128);
15901
15902         modrm = getUChar(delta);
15903         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
15904
15905         if (epartIsReg(modrm)) {
15906            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15907            delta += 1;
15908            DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
15909                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
15910         } else {
15911            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15912            gen_SEGV_if_not_16_aligned( addr );
15913            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15914            delta += alen;
15915            DIP("pshufb %s,%s\n", dis_buf,
15916                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
15917         }
15918
15919         IRTemp res = math_PSHUFB_XMM( dV, sV );
15920         putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
15921         goto decode_success;
15922      }
15923      /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
15924      if (haveNo66noF2noF3(pfx) && sz == 4) {
15925         IRTemp sV      = newTemp(Ity_I64);
15926         IRTemp dV      = newTemp(Ity_I64);
15927
15928         modrm = getUChar(delta);
15929         do_MMX_preamble();
15930         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15931
15932         if (epartIsReg(modrm)) {
15933            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15934            delta += 1;
15935            DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
15936                                  nameMMXReg(gregLO3ofRM(modrm)));
15937         } else {
15938            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15939            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15940            delta += alen;
15941            DIP("pshufb %s,%s\n", dis_buf,
15942                                  nameMMXReg(gregLO3ofRM(modrm)));
15943         }
15944
15945         putMMXReg(
15946            gregLO3ofRM(modrm),
15947            binop(
15948               Iop_And64,
15949               /* permute the lanes */
15950               binop(
15951                  Iop_Perm8x8,
15952                  mkexpr(dV),
15953                  binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
15954               ),
15955               /* mask off lanes which have (index & 0x80) == 0x80 */
15956               unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
15957            )
15958         );
15959         goto decode_success;
15960      }
15961      break;
15962
15963   case 0x01:
15964   case 0x02:
15965   case 0x03:
15966   case 0x05:
15967   case 0x06:
15968   case 0x07:
15969      /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
15970         G to G (xmm). */
15971      /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
15972         G to G (xmm). */
15973      /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
15974         xmm) and G to G (xmm). */
15975      /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
15976         G to G (xmm). */
15977      /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
15978         G to G (xmm). */
15979      /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
15980         xmm) and G to G (xmm). */
15981      if (have66noF2noF3(pfx)
15982          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15983         delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
15984         goto decode_success;
15985      }
15986      /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
15987      /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
15988         to G (mmx). */
15989      /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
15990         to G (mmx). */
15991      /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
15992         mmx) and G to G (mmx). */
15993      /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
15994         to G (mmx). */
15995      /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
15996         to G (mmx). */
15997      /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
15998         mmx) and G to G (mmx). */
15999      if (haveNo66noF2noF3(pfx) && sz == 4) {
16000         const HChar* str = "???";
16001         IROp   opV64  = Iop_INVALID;
16002         IROp   opCatO = Iop_CatOddLanes16x4;
16003         IROp   opCatE = Iop_CatEvenLanes16x4;
16004         IRTemp sV     = newTemp(Ity_I64);
16005         IRTemp dV     = newTemp(Ity_I64);
16006
16007         modrm = getUChar(delta);
16008
16009         switch (opc) {
16010            case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
16011            case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
16012            case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
16013            case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
16014            case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
16015            case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
16016            default: vassert(0);
16017         }
16018         if (opc == 0x02 || opc == 0x06) {
16019            opCatO = Iop_InterleaveHI32x2;
16020            opCatE = Iop_InterleaveLO32x2;
16021         }
16022
16023         do_MMX_preamble();
16024         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16025
16026         if (epartIsReg(modrm)) {
16027            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16028            delta += 1;
16029            DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
16030                                     nameMMXReg(gregLO3ofRM(modrm)));
16031         } else {
16032            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16033            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16034            delta += alen;
16035            DIP("ph%s %s,%s\n", str, dis_buf,
16036                                     nameMMXReg(gregLO3ofRM(modrm)));
16037         }
16038
16039         putMMXReg(
16040            gregLO3ofRM(modrm),
16041            binop(opV64,
16042                  binop(opCatE,mkexpr(sV),mkexpr(dV)),
16043                  binop(opCatO,mkexpr(sV),mkexpr(dV))
16044            )
16045         );
16046         goto decode_success;
16047      }
16048      break;
16049
16050   case 0x04:
16051      /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
16052         Unsigned Bytes (XMM) */
16053      if (have66noF2noF3(pfx)
16054          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16055         IRTemp sV = newTemp(Ity_V128);
16056         IRTemp dV = newTemp(Ity_V128);
16057         modrm     = getUChar(delta);
16058         UInt   rG = gregOfRexRM(pfx,modrm);
16059
16060         assign( dV, getXMMReg(rG) );
16061
16062         if (epartIsReg(modrm)) {
16063            UInt rE = eregOfRexRM(pfx,modrm);
16064            assign( sV, getXMMReg(rE) );
16065            delta += 1;
16066            DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
16067         } else {
16068            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16069            gen_SEGV_if_not_16_aligned( addr );
16070            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16071            delta += alen;
16072            DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
16073         }
16074
16075         putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
16076         goto decode_success;
16077      }
16078      /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
16079         Unsigned Bytes (MMX) */
16080      if (haveNo66noF2noF3(pfx) && sz == 4) {
16081         IRTemp sV        = newTemp(Ity_I64);
16082         IRTemp dV        = newTemp(Ity_I64);
16083         IRTemp sVoddsSX  = newTemp(Ity_I64);
16084         IRTemp sVevensSX = newTemp(Ity_I64);
16085         IRTemp dVoddsZX  = newTemp(Ity_I64);
16086         IRTemp dVevensZX = newTemp(Ity_I64);
16087
16088         modrm = getUChar(delta);
16089         do_MMX_preamble();
16090         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16091
16092         if (epartIsReg(modrm)) {
16093            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16094            delta += 1;
16095            DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
16096                                     nameMMXReg(gregLO3ofRM(modrm)));
16097         } else {
16098            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16099            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16100            delta += alen;
16101            DIP("pmaddubsw %s,%s\n", dis_buf,
16102                                     nameMMXReg(gregLO3ofRM(modrm)));
16103         }
16104
         /* Compute dV (unsigned bytes) x sV (signed bytes): widen the
            odd and even byte lanes to 16 bits, zero-extending dV and
            sign-extending sV. */
16106         assign( sVoddsSX,
16107                 binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
16108         assign( sVevensSX,
16109                 binop(Iop_SarN16x4,
16110                       binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
16111                       mkU8(8)) );
16112         assign( dVoddsZX,
16113                 binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
16114         assign( dVevensZX,
16115                 binop(Iop_ShrN16x4,
16116                       binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
16117                       mkU8(8)) );
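
         /* Each 16-bit result lane is hence
               SatS16( s[2i+1]*d[2i+1] + s[2i]*d[2i] )
            with the s bytes taken as signed and the d bytes as
            unsigned. */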
16118
16119         putMMXReg(
16120            gregLO3ofRM(modrm),
16121            binop(Iop_QAdd16Sx4,
16122                  binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
16123                  binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
16124            )
16125         );
16126         goto decode_success;
16127      }
16128      break;
16129
16130   case 0x08:
16131   case 0x09:
16132   case 0x0A:
16133      /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
16134      /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
16135      /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
16136      if (have66noF2noF3(pfx)
16137          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16138         IRTemp sV      = newTemp(Ity_V128);
16139         IRTemp dV      = newTemp(Ity_V128);
16140         IRTemp sHi     = newTemp(Ity_I64);
16141         IRTemp sLo     = newTemp(Ity_I64);
16142         IRTemp dHi     = newTemp(Ity_I64);
16143         IRTemp dLo     = newTemp(Ity_I64);
16144         const HChar* str = "???";
16145         Int    laneszB = 0;
16146
16147         switch (opc) {
16148            case 0x08: laneszB = 1; str = "b"; break;
16149            case 0x09: laneszB = 2; str = "w"; break;
16150            case 0x0A: laneszB = 4; str = "d"; break;
16151            default: vassert(0);
16152         }
16153
16154         modrm = getUChar(delta);
16155         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
16156
16157         if (epartIsReg(modrm)) {
16158            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
16159            delta += 1;
16160            DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
16161                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
16162         } else {
16163            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16164            gen_SEGV_if_not_16_aligned( addr );
16165            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16166            delta += alen;
16167            DIP("psign%s %s,%s\n", str, dis_buf,
16168                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
16169         }
16170
16171         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
16172         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
16173         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
16174         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
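
         /* Per the PSIGN semantics, dis_PSIGN_helper yields, for each
            lane: res = s < 0 ? -d : (s == 0 ? 0 : d). */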
16175
16176         putXMMReg(
16177            gregOfRexRM(pfx,modrm),
16178            binop(Iop_64HLtoV128,
16179                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
16180                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
16181            )
16182         );
16183         goto decode_success;
16184      }
16185      /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
16186      /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
16187      /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
16188      if (haveNo66noF2noF3(pfx) && sz == 4) {
16189         IRTemp sV      = newTemp(Ity_I64);
16190         IRTemp dV      = newTemp(Ity_I64);
16191         const HChar* str = "???";
16192         Int    laneszB = 0;
16193
16194         switch (opc) {
16195            case 0x08: laneszB = 1; str = "b"; break;
16196            case 0x09: laneszB = 2; str = "w"; break;
16197            case 0x0A: laneszB = 4; str = "d"; break;
16198            default: vassert(0);
16199         }
16200
16201         modrm = getUChar(delta);
16202         do_MMX_preamble();
16203         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16204
16205         if (epartIsReg(modrm)) {
16206            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16207            delta += 1;
16208            DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
16209                                        nameMMXReg(gregLO3ofRM(modrm)));
16210         } else {
16211            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16212            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16213            delta += alen;
16214            DIP("psign%s %s,%s\n", str, dis_buf,
16215                                        nameMMXReg(gregLO3ofRM(modrm)));
16216         }
16217
16218         putMMXReg(
16219            gregLO3ofRM(modrm),
16220            dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
16221         );
16222         goto decode_success;
16223      }
16224      break;
16225
16226   case 0x0B:
16227      /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
16228         Scale (XMM) */
16229      if (have66noF2noF3(pfx)
16230          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16231         IRTemp sV  = newTemp(Ity_V128);
16232         IRTemp dV  = newTemp(Ity_V128);
16233         IRTemp sHi = newTemp(Ity_I64);
16234         IRTemp sLo = newTemp(Ity_I64);
16235         IRTemp dHi = newTemp(Ity_I64);
16236         IRTemp dLo = newTemp(Ity_I64);
16237
16238         modrm = getUChar(delta);
16239         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
16240
16241         if (epartIsReg(modrm)) {
16242            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
16243            delta += 1;
16244            DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
16245                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
16246         } else {
16247            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16248            gen_SEGV_if_not_16_aligned( addr );
16249            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16250            delta += alen;
16251            DIP("pmulhrsw %s,%s\n", dis_buf,
16252                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
16253         }
16254
16255         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
16256         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
16257         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
16258         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
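
         /* Per the PMULHRSW semantics, dis_PMULHRSW_helper computes,
            for each 16-bit lane, (a * b + 0x4000) >> 15: the high
            half of the signed product, rounded. */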
16259
16260         putXMMReg(
16261            gregOfRexRM(pfx,modrm),
16262            binop(Iop_64HLtoV128,
16263                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
16264                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
16265            )
16266         );
16267         goto decode_success;
16268      }
16269      /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
16270         (MMX) */
16271      if (haveNo66noF2noF3(pfx) && sz == 4) {
16272         IRTemp sV = newTemp(Ity_I64);
16273         IRTemp dV = newTemp(Ity_I64);
16274
16275         modrm = getUChar(delta);
16276         do_MMX_preamble();
16277         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16278
16279         if (epartIsReg(modrm)) {
16280            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16281            delta += 1;
16282            DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
16283                                    nameMMXReg(gregLO3ofRM(modrm)));
16284         } else {
16285            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16286            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16287            delta += alen;
16288            DIP("pmulhrsw %s,%s\n", dis_buf,
16289                                    nameMMXReg(gregLO3ofRM(modrm)));
16290         }
16291
16292         putMMXReg(
16293            gregLO3ofRM(modrm),
16294            dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
16295         );
16296         goto decode_success;
16297      }
16298      break;
16299
16300   case 0x1C:
16301   case 0x1D:
16302   case 0x1E:
16303      /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
16304      /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
16305      /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
16306      if (have66noF2noF3(pfx)
16307          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16308         IRTemp sV  = newTemp(Ity_V128);
16309         const HChar* str = "???";
16310         Int    laneszB = 0;
16311
16312         switch (opc) {
16313            case 0x1C: laneszB = 1; str = "b"; break;
16314            case 0x1D: laneszB = 2; str = "w"; break;
16315            case 0x1E: laneszB = 4; str = "d"; break;
16316            default: vassert(0);
16317         }
16318
16319         modrm = getUChar(delta);
16320         if (epartIsReg(modrm)) {
16321            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
16322            delta += 1;
16323            DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
16324                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
16325         } else {
16326            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16327            gen_SEGV_if_not_16_aligned( addr );
16328            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16329            delta += alen;
16330            DIP("pabs%s %s,%s\n", str, dis_buf,
16331                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
16332         }
16333
16334         putXMMReg( gregOfRexRM(pfx,modrm),
16335                    mkexpr(math_PABS_XMM(sV, laneszB)) );
16336         goto decode_success;
16337      }
16338      /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
16339      /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
16340      /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
16341      if (haveNo66noF2noF3(pfx) && sz == 4) {
16342         IRTemp sV      = newTemp(Ity_I64);
16343         const HChar* str = "???";
16344         Int    laneszB = 0;
16345
16346         switch (opc) {
16347            case 0x1C: laneszB = 1; str = "b"; break;
16348            case 0x1D: laneszB = 2; str = "w"; break;
16349            case 0x1E: laneszB = 4; str = "d"; break;
16350            default: vassert(0);
16351         }
16352
16353         modrm = getUChar(delta);
16354         do_MMX_preamble();
16355
16356         if (epartIsReg(modrm)) {
16357            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16358            delta += 1;
16359            DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
16360                                       nameMMXReg(gregLO3ofRM(modrm)));
16361         } else {
16362            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
16363            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16364            delta += alen;
16365            DIP("pabs%s %s,%s\n", str, dis_buf,
16366                                       nameMMXReg(gregLO3ofRM(modrm)));
16367         }
16368
16369         putMMXReg( gregLO3ofRM(modrm),
16370                    mkexpr(math_PABS_MMX( sV, laneszB )) );
16371         goto decode_success;
16372      }
16373      break;
16374
16375   default:
16376      break;
16377
16378   }
16379
16380  //decode_failure:
16381   *decode_OK = False;
16382   return deltaIN;
16383
16384  decode_success:
16385   *decode_OK = True;
16386   return delta;
16387}
16388
16389
16390/*------------------------------------------------------------*/
16391/*---                                                      ---*/
16392/*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3               ---*/
16393/*---                                                      ---*/
16394/*------------------------------------------------------------*/
16395
16396__attribute__((noinline))
16397static
16398Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
16399                             const VexAbiInfo* vbi,
16400                             Prefix pfx, Int sz, Long deltaIN )
16401{
16402   Long   d64   = 0;
16403   IRTemp addr  = IRTemp_INVALID;
16404   UChar  modrm = 0;
16405   Int    alen  = 0;
16406   HChar  dis_buf[50];
16407
16408   *decode_OK = False;
16409
16410   Long   delta = deltaIN;
16411   UChar  opc   = getUChar(delta);
16412   delta++;
16413   switch (opc) {
16414
16415   case 0x0F:
16416      /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
16417      if (have66noF2noF3(pfx)
16418          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
16419         IRTemp sV  = newTemp(Ity_V128);
16420         IRTemp dV  = newTemp(Ity_V128);
16421
16422         modrm = getUChar(delta);
16423         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
16424
16425         if (epartIsReg(modrm)) {
16426            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
16427            d64 = (Long)getUChar(delta+1);
16428            delta += 1+1;
16429            DIP("palignr $%lld,%s,%s\n", d64,
16430                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
16431                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
16432         } else {
16433            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
16434            gen_SEGV_if_not_16_aligned( addr );
16435            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
16436            d64 = (Long)getUChar(delta+alen);
16437            delta += alen+1;
16438            DIP("palignr $%lld,%s,%s\n", d64,
16439                                       dis_buf,
16440                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
16441         }
16442
16443         IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
16444         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
16445         goto decode_success;
16446      }
16447      /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
16448      if (haveNo66noF2noF3(pfx) && sz == 4) {
16449         IRTemp sV  = newTemp(Ity_I64);
16450         IRTemp dV  = newTemp(Ity_I64);
16451         IRTemp res = newTemp(Ity_I64);
16452
16453         modrm = getUChar(delta);
16454         do_MMX_preamble();
16455         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
16456
16457         if (epartIsReg(modrm)) {
16458            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
16459            d64 = (Long)getUChar(delta+1);
16460            delta += 1+1;
16461            DIP("palignr $%lld,%s,%s\n",  d64,
16462                                        nameMMXReg(eregLO3ofRM(modrm)),
16463                                        nameMMXReg(gregLO3ofRM(modrm)));
16464         } else {
16465            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
16466            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
16467            d64 = (Long)getUChar(delta+alen);
16468            delta += alen+1;
16469            DIP("palignr $%lld%s,%s\n", d64,
16470                                      dis_buf,
16471                                      nameMMXReg(gregLO3ofRM(modrm)));
16472         }
16473
16474         if (d64 == 0) {
16475            assign( res, mkexpr(sV) );
16476         }
16477         else if (d64 >= 1 && d64 <= 7) {
            assign(res,
                   binop(Iop_Or64,
                         binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
                         binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64)))));
16483         }
16484         else if (d64 == 8) {
16485           assign( res, mkexpr(dV) );
16486         }
16487         else if (d64 >= 9 && d64 <= 15) {
16488            assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
16489         }
16490         else if (d64 >= 16 && d64 <= 255) {
16491            assign( res, mkU64(0) );
16492         }
16493         else
16494            vassert(0);
16495
16496         putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
16497         goto decode_success;
16498      }
16499      break;
16500
16501   default:
16502      break;
16503
16504   }
16505
16506  //decode_failure:
16507   *decode_OK = False;
16508   return deltaIN;
16509
16510  decode_success:
16511   *decode_OK = True;
16512   return delta;
16513}
16514
16515
16516/*------------------------------------------------------------*/
16517/*---                                                      ---*/
16518/*--- Top-level SSE4: dis_ESC_0F__SSE4                     ---*/
16519/*---                                                      ---*/
16520/*------------------------------------------------------------*/
16521
16522__attribute__((noinline))
16523static
16524Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
16525                        const VexArchInfo* archinfo,
16526                        const VexAbiInfo* vbi,
16527                        Prefix pfx, Int sz, Long deltaIN )
16528{
16529   IRTemp addr  = IRTemp_INVALID;
16530   IRType ty    = Ity_INVALID;
16531   UChar  modrm = 0;
16532   Int    alen  = 0;
16533   HChar  dis_buf[50];
16534
16535   *decode_OK = False;
16536
16537   Long   delta = deltaIN;
16538   UChar  opc   = getUChar(delta);
16539   delta++;
16540   switch (opc) {
16541
16542   case 0xB8:
      /* F3 0F B8  = POPCNT{W,L,Q}
         Count the number of 1 bits in the source operand
      */
16546      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
16547          && (sz == 2 || sz == 4 || sz == 8)) {
16548         /*IRType*/ ty  = szToITy(sz);
16549         IRTemp     src = newTemp(ty);
16550         modrm = getUChar(delta);
16551         if (epartIsReg(modrm)) {
16552            assign(src, getIRegE(sz, pfx, modrm));
16553            delta += 1;
16554            DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
16555                nameIRegG(sz, pfx, modrm));
16556         } else {
16557            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
16558            assign(src, loadLE(ty, mkexpr(addr)));
16559            delta += alen;
16560            DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
16561                nameIRegG(sz, pfx, modrm));
16562         }
16563
16564         IRTemp result = gen_POPCOUNT(ty, src);
16565         putIRegG(sz, pfx, modrm, mkexpr(result));
16566
16567         // Update flags.  This is pretty lame .. perhaps can do better
16568         // if this turns out to be performance critical.
16569         // O S A C P are cleared.  Z is set if SRC == 0.
16570         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
16571         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16572         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16573         stmt( IRStmt_Put( OFFB_CC_DEP1,
16574               binop(Iop_Shl64,
16575                     unop(Iop_1Uto64,
16576                          binop(Iop_CmpEQ64,
16577                                widenUto64(mkexpr(src)),
16578                                mkU64(0))),
16579                     mkU8(AMD64G_CC_SHIFT_Z))));
16580
16581         goto decode_success;
16582      }
16583      break;
16584
16585   case 0xBC:
      /* F3 0F BC -- TZCNT (count trailing zeroes).  A BMI1 extension,
         which we can only decode if we're sure this is a BMI1-capable
         CPU that supports TZCNT, since otherwise it's BSF, which
         behaves differently on a zero source.  */
16590      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
16591          && (sz == 2 || sz == 4 || sz == 8)
16592          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) {
16593         /*IRType*/ ty  = szToITy(sz);
16594         IRTemp     src = newTemp(ty);
16595         modrm = getUChar(delta);
16596         if (epartIsReg(modrm)) {
16597            assign(src, getIRegE(sz, pfx, modrm));
16598            delta += 1;
16599            DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
16600                nameIRegG(sz, pfx, modrm));
16601         } else {
16602            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
16603            assign(src, loadLE(ty, mkexpr(addr)));
16604            delta += alen;
16605            DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf,
16606                nameIRegG(sz, pfx, modrm));
16607         }
16608
16609         IRTemp res = gen_TZCNT(ty, src);
16610         putIRegG(sz, pfx, modrm, mkexpr(res));
16611
16612         // Update flags.  This is pretty lame .. perhaps can do better
16613         // if this turns out to be performance critical.
16614         // O S A P are cleared.  Z is set if RESULT == 0.
16615         // C is set if SRC is zero.
16616         IRTemp src64 = newTemp(Ity_I64);
16617         IRTemp res64 = newTemp(Ity_I64);
16618         assign(src64, widenUto64(mkexpr(src)));
16619         assign(res64, widenUto64(mkexpr(res)));
16620
16621         IRTemp oszacp = newTemp(Ity_I64);
16622         assign(
16623            oszacp,
16624            binop(Iop_Or64,
16625                  binop(Iop_Shl64,
16626                        unop(Iop_1Uto64,
16627                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
16628                        mkU8(AMD64G_CC_SHIFT_Z)),
16629                  binop(Iop_Shl64,
16630                        unop(Iop_1Uto64,
16631                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
16632                        mkU8(AMD64G_CC_SHIFT_C))
16633            )
16634         );
16635
16636         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
16637         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16638         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16639         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
16640
16641         goto decode_success;
16642      }
16643      break;
16644
16645   case 0xBD:
      /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
         which we can only decode if we're sure this is an AMD CPU
         that supports LZCNT, since otherwise it's BSR, which behaves
         differently.  Bizarrely, my Sandy Bridge also accepts these
         instructions but produces different results. */
16651      if (haveF3noF2(pfx) /* so both 66 and 48 are possibilities */
16652          && (sz == 2 || sz == 4 || sz == 8)
16653          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
16654         /*IRType*/ ty  = szToITy(sz);
16655         IRTemp     src = newTemp(ty);
16656         modrm = getUChar(delta);
16657         if (epartIsReg(modrm)) {
16658            assign(src, getIRegE(sz, pfx, modrm));
16659            delta += 1;
16660            DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
16661                nameIRegG(sz, pfx, modrm));
16662         } else {
16663            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
16664            assign(src, loadLE(ty, mkexpr(addr)));
16665            delta += alen;
16666            DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
16667                nameIRegG(sz, pfx, modrm));
16668         }
16669
16670         IRTemp res = gen_LZCNT(ty, src);
16671         putIRegG(sz, pfx, modrm, mkexpr(res));
16672
16673         // Update flags.  This is pretty lame .. perhaps can do better
16674         // if this turns out to be performance critical.
16675         // O S A P are cleared.  Z is set if RESULT == 0.
16676         // C is set if SRC is zero.
16677         IRTemp src64 = newTemp(Ity_I64);
16678         IRTemp res64 = newTemp(Ity_I64);
16679         assign(src64, widenUto64(mkexpr(src)));
16680         assign(res64, widenUto64(mkexpr(res)));
16681
16682         IRTemp oszacp = newTemp(Ity_I64);
16683         assign(
16684            oszacp,
16685            binop(Iop_Or64,
16686                  binop(Iop_Shl64,
16687                        unop(Iop_1Uto64,
16688                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
16689                        mkU8(AMD64G_CC_SHIFT_Z)),
16690                  binop(Iop_Shl64,
16691                        unop(Iop_1Uto64,
16692                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
16693                        mkU8(AMD64G_CC_SHIFT_C))
16694            )
16695         );
16696
16697         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
16698         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16699         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16700         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
16701
16702         goto decode_success;
16703      }
16704      break;
16705
16706   default:
16707      break;
16708
16709   }
16710
16711  //decode_failure:
16712   *decode_OK = False;
16713   return deltaIN;
16714
16715  decode_success:
16716   *decode_OK = True;
16717   return delta;
16718}
16719
16720
16721/*------------------------------------------------------------*/
16722/*---                                                      ---*/
16723/*--- Top-level SSE4: dis_ESC_0F38__SSE4                   ---*/
16724/*---                                                      ---*/
16725/*------------------------------------------------------------*/
16726
16727static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
16728                                  IRTemp vec0/*controlling mask*/,
16729                                  UInt gran, IROp opSAR )
16730{
16731   /* The tricky bit is to convert vec0 into a suitable mask, by
16732      copying the most significant bit of each lane into all positions
16733      in the lane. */
16734   IRTemp sh = newTemp(Ity_I8);
16735   assign(sh, mkU8(8 * gran - 1));
16736
16737   IRTemp mask = newTemp(Ity_V128);
16738   assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
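
   /* E.g. with gran == 1, the shift-by-7 maps each control byte
      0x80..0xFF to 0xFF and 0x00..0x7F to 0x00, so lanes whose
      control MSB is set select vecE and the others select vecG. */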
16739
16740   IRTemp notmask = newTemp(Ity_V128);
16741   assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
16742
16743   IRTemp res = newTemp(Ity_V128);
16744   assign(res,  binop(Iop_OrV128,
16745                      binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
16746                      binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
16747   return res;
16748}
16749
16750static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
16751                                  IRTemp vec0/*controlling mask*/,
16752                                  UInt gran, IROp opSAR128 )
16753{
16754   /* The tricky bit is to convert vec0 into a suitable mask, by
16755      copying the most significant bit of each lane into all positions
16756      in the lane. */
16757   IRTemp sh = newTemp(Ity_I8);
16758   assign(sh, mkU8(8 * gran - 1));
16759
16760   IRTemp vec0Hi = IRTemp_INVALID;
16761   IRTemp vec0Lo = IRTemp_INVALID;
16762   breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
16763
16764   IRTemp mask = newTemp(Ity_V256);
16765   assign(mask, binop(Iop_V128HLtoV256,
16766                      binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
16767                      binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
16768
16769   IRTemp notmask = newTemp(Ity_V256);
16770   assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
16771
16772   IRTemp res = newTemp(Ity_V256);
16773   assign(res,  binop(Iop_OrV256,
16774                      binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
16775                      binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
16776   return res;
16777}
16778
16779static Long dis_VBLENDV_128 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
16780                              const HChar *name, UInt gran, IROp opSAR )
16781{
16782   IRTemp addr   = IRTemp_INVALID;
16783   Int    alen   = 0;
16784   HChar  dis_buf[50];
16785   UChar  modrm  = getUChar(delta);
16786   UInt   rG     = gregOfRexRM(pfx, modrm);
16787   UInt   rV     = getVexNvvvv(pfx);
16788   UInt   rIS4   = 0xFF; /* invalid */
16789   IRTemp vecE   = newTemp(Ity_V128);
16790   IRTemp vecV   = newTemp(Ity_V128);
16791   IRTemp vecIS4 = newTemp(Ity_V128);
16792   if (epartIsReg(modrm)) {
16793      delta++;
16794      UInt rE = eregOfRexRM(pfx, modrm);
16795      assign(vecE, getXMMReg(rE));
16796      UChar ib = getUChar(delta);
16797      rIS4 = (ib >> 4) & 0xF;
16798      DIP("%s %s,%s,%s,%s\n",
16799          name, nameXMMReg(rIS4), nameXMMReg(rE),
16800          nameXMMReg(rV), nameXMMReg(rG));
16801   } else {
16802      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16803      delta += alen;
16804      assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
16805      UChar ib = getUChar(delta);
16806      rIS4 = (ib >> 4) & 0xF;
16807      DIP("%s %s,%s,%s,%s\n",
16808          name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
16809   }
16810   delta++;
16811   assign(vecV,   getXMMReg(rV));
16812   assign(vecIS4, getXMMReg(rIS4));
16813   IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
16814   putYMMRegLoAndZU( rG, mkexpr(res) );
16815   return delta;
16816}
16817
16818static Long dis_VBLENDV_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta,
16819                              const HChar *name, UInt gran, IROp opSAR128 )
16820{
16821   IRTemp addr   = IRTemp_INVALID;
16822   Int    alen   = 0;
16823   HChar  dis_buf[50];
16824   UChar  modrm  = getUChar(delta);
16825   UInt   rG     = gregOfRexRM(pfx, modrm);
16826   UInt   rV     = getVexNvvvv(pfx);
16827   UInt   rIS4   = 0xFF; /* invalid */
16828   IRTemp vecE   = newTemp(Ity_V256);
16829   IRTemp vecV   = newTemp(Ity_V256);
16830   IRTemp vecIS4 = newTemp(Ity_V256);
16831   if (epartIsReg(modrm)) {
16832      delta++;
16833      UInt rE = eregOfRexRM(pfx, modrm);
16834      assign(vecE, getYMMReg(rE));
16835      UChar ib = getUChar(delta);
16836      rIS4 = (ib >> 4) & 0xF;
16837      DIP("%s %s,%s,%s,%s\n",
16838          name, nameYMMReg(rIS4), nameYMMReg(rE),
16839          nameYMMReg(rV), nameYMMReg(rG));
16840   } else {
16841      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16842      delta += alen;
16843      assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
16844      UChar ib = getUChar(delta);
16845      rIS4 = (ib >> 4) & 0xF;
16846      DIP("%s %s,%s,%s,%s\n",
16847          name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
16848   }
16849   delta++;
16850   assign(vecV,   getYMMReg(rV));
16851   assign(vecIS4, getYMMReg(rIS4));
16852   IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
16853   putYMMReg( rG, mkexpr(res) );
16854   return delta;
16855}
16856
16857static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
16858{
16859   /* Set Z=1 iff (vecE & vecG) == 0
16860      Set C=1 iff (vecE & not vecG) == 0
16861   */
16862
16863   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
16864
   /* Reduce andV resp. andnV to 64-bit values by or-ing the top and
      bottom 64 bits together.  This relies on the trick:
16867
16868      InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence
16869
16870      InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
16871      InterleaveHI64x2([a,b],[a,b]) == [a,a]
16872
16873      and so the OR of the above 2 exprs produces
16874      [a OR b, a OR b], from which we simply take the lower half.
16875   */
16876   IRTemp and64  = newTemp(Ity_I64);
16877   IRTemp andn64 = newTemp(Ity_I64);
16878
16879   assign(and64,
16880          unop(Iop_V128to64,
16881               binop(Iop_OrV128,
16882                     binop(Iop_InterleaveLO64x2,
16883                           mkexpr(andV), mkexpr(andV)),
16884                     binop(Iop_InterleaveHI64x2,
16885                           mkexpr(andV), mkexpr(andV)))));
16886
16887   assign(andn64,
16888          unop(Iop_V128to64,
16889               binop(Iop_OrV128,
16890                     binop(Iop_InterleaveLO64x2,
16891                           mkexpr(andnV), mkexpr(andnV)),
16892                     binop(Iop_InterleaveHI64x2,
16893                           mkexpr(andnV), mkexpr(andnV)))));
16894
16895   IRTemp z64 = newTemp(Ity_I64);
16896   IRTemp c64 = newTemp(Ity_I64);
16897   if (sign == 64) {
      /* When only the most significant bit matters, just shift it
         arithmetically right across the whole word and invert.  */
16900      assign(z64,
16901             unop(Iop_Not64,
16902                  binop(Iop_Sar64, mkexpr(and64), mkU8(63))));
16903
16904      assign(c64,
16905             unop(Iop_Not64,
16906                  binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
16907   } else {
16908      if (sign == 32) {
         /* When interested in bit 31 and bit 63, mask those bits and
            fall through into the PTEST handling.  */
16911         IRTemp t0 = newTemp(Ity_I64);
16912         IRTemp t1 = newTemp(Ity_I64);
16913         IRTemp t2 = newTemp(Ity_I64);
16914         assign(t0, mkU64(0x8000000080000000ULL));
16915         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
16916         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
16917         and64 = t1;
16918         andn64 = t2;
16919      }
16920      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
16921         slice out the Z and C bits conveniently.  We use the standard
16922         trick all-zeroes -> all-zeroes, anything-else -> all-ones
16923         done by "(x | -x) >>s (word-size - 1)".
16924      */
16925      assign(z64,
16926             unop(Iop_Not64,
16927                  binop(Iop_Sar64,
16928                        binop(Iop_Or64,
16929                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
16930                                    mkexpr(and64)), mkU8(63))));
16931
16932      assign(c64,
16933             unop(Iop_Not64,
16934                  binop(Iop_Sar64,
16935                        binop(Iop_Or64,
16936                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
16937                                    mkexpr(andn64)), mkU8(63))));
16938   }
16939
16940   /* And finally, slice out the Z and C flags and set the flags
16941      thunk to COPY for them.  OSAP are set to zero. */
16942   IRTemp newOSZACP = newTemp(Ity_I64);
16943   assign(newOSZACP,
16944          binop(Iop_Or64,
16945                binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
16946                binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));
16947
16948   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
16949   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
16950   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16951   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16952}
16953
16954
16955/* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
16956   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
16957static Long dis_xTESTy_128 ( const VexAbiInfo* vbi, Prefix pfx,
16958                             Long delta, Bool isAvx, Int sign )
16959{
16960   IRTemp addr   = IRTemp_INVALID;
16961   Int    alen   = 0;
16962   HChar  dis_buf[50];
16963   UChar  modrm  = getUChar(delta);
16964   UInt   rG     = gregOfRexRM(pfx, modrm);
16965   IRTemp vecE = newTemp(Ity_V128);
16966   IRTemp vecG = newTemp(Ity_V128);
16967
16968   if ( epartIsReg(modrm) ) {
16969      UInt rE = eregOfRexRM(pfx, modrm);
16970      assign(vecE, getXMMReg(rE));
16971      delta += 1;
16972      DIP( "%s%stest%s %s,%s\n",
16973           isAvx ? "v" : "", sign == 0 ? "p" : "",
16974           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16975           nameXMMReg(rE), nameXMMReg(rG) );
16976   } else {
16977      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16978      if (!isAvx)
16979         gen_SEGV_if_not_16_aligned( addr );
16980      assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
16981      delta += alen;
16982      DIP( "%s%stest%s %s,%s\n",
16983           isAvx ? "v" : "", sign == 0 ? "p" : "",
16984           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16985           dis_buf, nameXMMReg(rG) );
16986   }
16987
16988   assign(vecG, getXMMReg(rG));
16989
16990   /* Set Z=1 iff (vecE & vecG) == 0
16991      Set C=1 iff (vecE & not vecG) == 0
16992   */
16993
16994   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
16995   IRTemp andV  = newTemp(Ity_V128);
16996   IRTemp andnV = newTemp(Ity_V128);
16997   assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
16998   assign(andnV, binop(Iop_AndV128,
16999                       mkexpr(vecE),
17000                       binop(Iop_XorV128, mkexpr(vecG),
17001                                          mkV128(0xFFFF))));
17002
17003   finish_xTESTy ( andV, andnV, sign );
17004   return delta;
17005}
17006
17007
17008/* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
17009   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
17010static Long dis_xTESTy_256 ( const VexAbiInfo* vbi, Prefix pfx,
17011                             Long delta, Int sign )
17012{
17013   IRTemp addr   = IRTemp_INVALID;
17014   Int    alen   = 0;
17015   HChar  dis_buf[50];
17016   UChar  modrm  = getUChar(delta);
17017   UInt   rG     = gregOfRexRM(pfx, modrm);
17018   IRTemp vecE   = newTemp(Ity_V256);
17019   IRTemp vecG   = newTemp(Ity_V256);
17020
17021   if ( epartIsReg(modrm) ) {
17022      UInt rE = eregOfRexRM(pfx, modrm);
17023      assign(vecE, getYMMReg(rE));
17024      delta += 1;
17025      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
17026           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
17027           nameYMMReg(rE), nameYMMReg(rG) );
17028   } else {
17029      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17030      assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
17031      delta += alen;
17032      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
17033           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
17034           dis_buf, nameYMMReg(rG) );
17035   }
17036
17037   assign(vecG, getYMMReg(rG));
17038
17039   /* Set Z=1 iff (vecE & vecG) == 0
17040      Set C=1 iff (vecE & not vecG) == 0
17041   */
17042
17043   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
17044   IRTemp andV  = newTemp(Ity_V256);
17045   IRTemp andnV = newTemp(Ity_V256);
17046   assign(andV,  binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
17047   assign(andnV, binop(Iop_AndV256,
17048                       mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));
17049
17050   IRTemp andVhi  = IRTemp_INVALID;
17051   IRTemp andVlo  = IRTemp_INVALID;
17052   IRTemp andnVhi = IRTemp_INVALID;
17053   IRTemp andnVlo = IRTemp_INVALID;
17054   breakupV256toV128s( andV, &andVhi, &andVlo );
17055   breakupV256toV128s( andnV, &andnVhi, &andnVlo );
17056
17057   IRTemp andV128  = newTemp(Ity_V128);
17058   IRTemp andnV128 = newTemp(Ity_V128);
17059   assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
17060   assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );
17061
17062   finish_xTESTy ( andV128, andnV128, sign );
17063   return delta;
17064}
17065
17066
17067/* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */
17068static Long dis_PMOVxXBW_128 ( const VexAbiInfo* vbi, Prefix pfx,
17069                               Long delta, Bool isAvx, Bool xIsZ )
17070{
17071   IRTemp addr   = IRTemp_INVALID;
17072   Int    alen   = 0;
17073   HChar  dis_buf[50];
17074   IRTemp srcVec = newTemp(Ity_V128);
17075   UChar  modrm  = getUChar(delta);
17076   const HChar* mbV    = isAvx ? "v" : "";
17077   const HChar  how    = xIsZ ? 'z' : 's';
17078   UInt   rG     = gregOfRexRM(pfx, modrm);
17079   if ( epartIsReg(modrm) ) {
17080      UInt rE = eregOfRexRM(pfx, modrm);
17081      assign( srcVec, getXMMReg(rE) );
17082      delta += 1;
17083      DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
17084   } else {
17085      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17086      assign( srcVec,
17087              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
17088      delta += alen;
17089      DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
17090   }
17091
17092   IRExpr* res
17093      = xIsZ /* do math for either zero or sign extend */
17094        ? binop( Iop_InterleaveLO8x16,
17095                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
17096        : binop( Iop_SarN16x8,
17097                 binop( Iop_ShlN16x8,
17098                        binop( Iop_InterleaveLO8x16,
17099                               IRExpr_Const( IRConst_V128(0) ),
17100                               mkexpr(srcVec) ),
17101                        mkU8(8) ),
17102                 mkU8(8) );
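
   /* Interleaving with zeroes leaves each source byte in the low half
      of a 16-bit lane, which is the zero extension directly; the sign
      extending variant then shifts each lane left by 8 and
      arithmetically back right by 8, replicating the byte's sign bit
      across the top half. */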
17103
17104   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
17105
17106   return delta;
17107}
17108
17109
17110/* Handles 256 bit versions of PMOVZXBW and PMOVSXBW. */
17111static Long dis_PMOVxXBW_256 ( const VexAbiInfo* vbi, Prefix pfx,
17112                               Long delta, Bool xIsZ )
17113{
17114   IRTemp addr   = IRTemp_INVALID;
17115   Int    alen   = 0;
17116   HChar  dis_buf[50];
17117   IRTemp srcVec = newTemp(Ity_V128);
17118   UChar  modrm  = getUChar(delta);
17119   UChar  how    = xIsZ ? 'z' : 's';
17120   UInt   rG     = gregOfRexRM(pfx, modrm);
17121   if ( epartIsReg(modrm) ) {
17122      UInt rE = eregOfRexRM(pfx, modrm);
17123      assign( srcVec, getXMMReg(rE) );
17124      delta += 1;
17125      DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
17126   } else {
17127      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17128      assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
17129      delta += alen;
17130      DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) );
17131   }
17132
   /* First do the zero extension.  */
17134   IRExpr* res
17135      = binop( Iop_V128HLtoV256,
17136               binop( Iop_InterleaveHI8x16,
17137                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
17138               binop( Iop_InterleaveLO8x16,
17139                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   /* Then convert to a sign extension, if needed.  */
17141   if (!xIsZ)
17142      res = binop( Iop_SarN16x16,
17143                   binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) );
17144
17145   putYMMReg ( rG, res );
17146
17147   return delta;
17148}
17149
17150
17151static Long dis_PMOVxXWD_128 ( const VexAbiInfo* vbi, Prefix pfx,
17152                               Long delta, Bool isAvx, Bool xIsZ )
17153{
17154   IRTemp addr   = IRTemp_INVALID;
17155   Int    alen   = 0;
17156   HChar  dis_buf[50];
17157   IRTemp srcVec = newTemp(Ity_V128);
17158   UChar  modrm  = getUChar(delta);
17159   const HChar* mbV    = isAvx ? "v" : "";
17160   const HChar  how    = xIsZ ? 'z' : 's';
17161   UInt   rG     = gregOfRexRM(pfx, modrm);
17162
17163   if ( epartIsReg(modrm) ) {
17164      UInt rE = eregOfRexRM(pfx, modrm);
17165      assign( srcVec, getXMMReg(rE) );
17166      delta += 1;
17167      DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
17168   } else {
17169      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17170      assign( srcVec,
17171              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
17172      delta += alen;
17173      DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
17174   }
17175
17176   IRExpr* res
17177      = binop( Iop_InterleaveLO16x8,
17178               IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
17179   if (!xIsZ)
17180      res = binop(Iop_SarN32x4,
17181                  binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
17182
   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
17185
17186   return delta;
17187}
17188
17189
17190static Long dis_PMOVxXWD_256 ( const VexAbiInfo* vbi, Prefix pfx,
17191                               Long delta, Bool xIsZ )
17192{
17193   IRTemp addr   = IRTemp_INVALID;
17194   Int    alen   = 0;
17195   HChar  dis_buf[50];
17196   IRTemp srcVec = newTemp(Ity_V128);
17197   UChar  modrm  = getUChar(delta);
17198   UChar  how    = xIsZ ? 'z' : 's';
17199   UInt   rG     = gregOfRexRM(pfx, modrm);
17200
17201   if ( epartIsReg(modrm) ) {
17202      UInt rE = eregOfRexRM(pfx, modrm);
17203      assign( srcVec, getXMMReg(rE) );
17204      delta += 1;
17205      DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
17206   } else {
17207      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17208      assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
17209      delta += alen;
17210      DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
17211   }
17212
17213   IRExpr* res
17214      = binop( Iop_V128HLtoV256,
17215               binop( Iop_InterleaveHI16x8,
17216                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
17217               binop( Iop_InterleaveLO16x8,
17218                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
17219   if (!xIsZ)
17220      res = binop(Iop_SarN32x8,
17221                  binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16));
17222
17223   putYMMReg ( rG, res );
17224
17225   return delta;
17226}
17227
17228
17229static Long dis_PMOVSXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
17230                               Long delta, Bool isAvx )
17231{
17232   IRTemp addr     = IRTemp_INVALID;
17233   Int    alen     = 0;
17234   HChar  dis_buf[50];
17235   IRTemp srcBytes = newTemp(Ity_I32);
17236   UChar  modrm    = getUChar(delta);
17237   const HChar* mbV = isAvx ? "v" : "";
17238   UInt   rG       = gregOfRexRM(pfx, modrm);
17239
17240   if ( epartIsReg( modrm ) ) {
17241      UInt rE = eregOfRexRM(pfx, modrm);
17242      assign( srcBytes, getXMMRegLane32( rE, 0 ) );
17243      delta += 1;
17244      DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
17245   } else {
17246      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17247      assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
17248      delta += alen;
17249      DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
17250   }
17251
17252   (isAvx ? putYMMRegLoAndZU : putXMMReg)
17253      ( rG, binop( Iop_64HLtoV128,
17254                   unop( Iop_16Sto64,
17255                         unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
17256                   unop( Iop_16Sto64,
17257                         unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
17258   return delta;
17259}
17260
17261
17262static Long dis_PMOVSXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx, Long delta )
17263{
17264   IRTemp addr     = IRTemp_INVALID;
17265   Int    alen     = 0;
17266   HChar  dis_buf[50];
17267   IRTemp srcBytes = newTemp(Ity_I64);
17268   UChar  modrm    = getUChar(delta);
17269   UInt   rG       = gregOfRexRM(pfx, modrm);
17270   IRTemp s3, s2, s1, s0;
17271   s3 = s2 = s1 = s0 = IRTemp_INVALID;
17272
17273   if ( epartIsReg( modrm ) ) {
17274      UInt rE = eregOfRexRM(pfx, modrm);
17275      assign( srcBytes, getXMMRegLane64( rE, 0 ) );
17276      delta += 1;
17277      DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
17278   } else {
17279      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17280      assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
17281      delta += alen;
17282      DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
17283   }
17284
17285   breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 );
17286   putYMMReg( rG, binop( Iop_V128HLtoV256,
17287                         binop( Iop_64HLtoV128,
17288                                unop( Iop_16Sto64, mkexpr(s3) ),
17289                                unop( Iop_16Sto64, mkexpr(s2) ) ),
17290                         binop( Iop_64HLtoV128,
17291                                unop( Iop_16Sto64, mkexpr(s1) ),
17292                                unop( Iop_16Sto64, mkexpr(s0) ) ) ) );
17293   return delta;
17294}
17295
17296
17297static Long dis_PMOVZXWQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
17298                               Long delta, Bool isAvx )
17299{
17300   IRTemp addr     = IRTemp_INVALID;
17301   Int    alen     = 0;
17302   HChar  dis_buf[50];
17303   IRTemp srcVec = newTemp(Ity_V128);
17304   UChar  modrm    = getUChar(delta);
17305   const HChar* mbV = isAvx ? "v" : "";
17306   UInt   rG       = gregOfRexRM(pfx, modrm);
17307
17308   if ( epartIsReg( modrm ) ) {
17309      UInt rE = eregOfRexRM(pfx, modrm);
17310      assign( srcVec, getXMMReg(rE) );
17311      delta += 1;
17312      DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
17313   } else {
17314      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17315      assign( srcVec,
17316              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
17317      delta += alen;
17318      DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
17319   }
17320
17321   IRTemp zeroVec = newTemp( Ity_V128 );
17322   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
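
   /* Interleaving with zeroes twice widens each of the two lowest
      16-bit lanes to 64 bits: first 16 -> 32, then 32 -> 64. */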
17323
17324   (isAvx ? putYMMRegLoAndZU : putXMMReg)
17325      ( rG, binop( Iop_InterleaveLO16x8,
17326                   mkexpr(zeroVec),
17327                   binop( Iop_InterleaveLO16x8,
17328                          mkexpr(zeroVec), mkexpr(srcVec) ) ) );
17329   return delta;
17330}
17331
17332
17333static Long dis_PMOVZXWQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
17334                               Long delta )
17335{
17336   IRTemp addr     = IRTemp_INVALID;
17337   Int    alen     = 0;
17338   HChar  dis_buf[50];
17339   IRTemp srcVec = newTemp(Ity_V128);
17340   UChar  modrm    = getUChar(delta);
17341   UInt   rG       = gregOfRexRM(pfx, modrm);
17342
17343   if ( epartIsReg( modrm ) ) {
17344      UInt rE = eregOfRexRM(pfx, modrm);
17345      assign( srcVec, getXMMReg(rE) );
17346      delta += 1;
17347      DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
17348   } else {
17349      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17350      assign( srcVec,
17351              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
17352      delta += alen;
17353      DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
17354   }
17355
17356   IRTemp zeroVec = newTemp( Ity_V128 );
17357   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
17358
17359   putYMMReg( rG, binop( Iop_V128HLtoV256,
17360                         binop( Iop_InterleaveHI16x8,
17361                                mkexpr(zeroVec),
17362                                binop( Iop_InterleaveLO16x8,
17363                                       mkexpr(zeroVec), mkexpr(srcVec) ) ),
17364                         binop( Iop_InterleaveLO16x8,
17365                                mkexpr(zeroVec),
17366                                binop( Iop_InterleaveLO16x8,
17367                                       mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
17368   return delta;
17369}
17370
17371
17372/* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ. */
17373static Long dis_PMOVxXDQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
17374                               Long delta, Bool isAvx, Bool xIsZ )
17375{
17376   IRTemp addr   = IRTemp_INVALID;
17377   Int    alen   = 0;
17378   HChar  dis_buf[50];
17379   IRTemp srcI64 = newTemp(Ity_I64);
17380   IRTemp srcVec = newTemp(Ity_V128);
17381   UChar  modrm  = getUChar(delta);
17382   const HChar* mbV = isAvx ? "v" : "";
17383   const HChar  how = xIsZ ? 'z' : 's';
17384   UInt   rG     = gregOfRexRM(pfx, modrm);
17385   /* Compute both srcI64 -- the value to expand -- and srcVec -- same
17386      thing in a V128, with arbitrary junk in the top 64 bits.  Use
17387      one or both of them and let iropt clean up afterwards (as
17388      usual). */
17389   if ( epartIsReg(modrm) ) {
17390      UInt rE = eregOfRexRM(pfx, modrm);
17391      assign( srcVec, getXMMReg(rE) );
17392      assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
17393      delta += 1;
17394      DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
17395   } else {
17396      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17397      assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
17398      assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
17399      delta += alen;
17400      DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
17401   }
17402
17403   IRExpr* res
17404      = xIsZ /* do math for either zero or sign extend */
17405        ? binop( Iop_InterleaveLO32x4,
17406                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
17407        : binop( Iop_64HLtoV128,
17408                 unop( Iop_32Sto64,
17409                       unop( Iop_64HIto32, mkexpr(srcI64) ) ),
17410                 unop( Iop_32Sto64,
17411                       unop( Iop_64to32, mkexpr(srcI64) ) ) );
17412
17413   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
17414
17415   return delta;
17416}
17417
17418
17419/* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ. */
17420static Long dis_PMOVxXDQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
17421                               Long delta, Bool xIsZ )
17422{
17423   IRTemp addr   = IRTemp_INVALID;
17424   Int    alen   = 0;
17425   HChar  dis_buf[50];
17426   IRTemp srcVec = newTemp(Ity_V128);
17427   UChar  modrm  = getUChar(delta);
17428   UChar  how    = xIsZ ? 'z' : 's';
17429   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Compute srcVec, the value to expand: four 32-bit lanes, fetched
      as a whole V128 from either a register or memory. */
17434   if ( epartIsReg(modrm) ) {
17435      UInt rE = eregOfRexRM(pfx, modrm);
17436      assign( srcVec, getXMMReg(rE) );
17437      delta += 1;
17438      DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
17439   } else {
17440      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17441      assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) );
17442      delta += alen;
17443      DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) );
17444   }
17445
   IRExpr* res;
   if (xIsZ)
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
                   binop( Iop_InterleaveLO32x4,
                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   else {
      IRTemp s3, s2, s1, s0;
      s3 = s2 = s1 = s0 = IRTemp_INVALID;
      breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 );
      res = binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s3) ),
                          unop( Iop_32Sto64, mkexpr(s2) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_32Sto64, mkexpr(s1) ),
                          unop( Iop_32Sto64, mkexpr(s0) ) ) );
   }

   putYMMReg ( rG, res );

   return delta;
}


/* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */
static Long dis_PMOVxXBD_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   const HChar  how = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

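   /* Each Iop_InterleaveLO8x16 with zeroVec doubles the lane width of
      the low half of its argument, zero-filling the new high bytes.
      Two rounds therefore turn bytes b3..b0 into the 32-bit lanes
      000000b3 .. 000000b0, i.e. zero extension.  For sign extension,
      each lane is then shifted left by 24 and shifted back
      arithmetically, replicating bit 7 of the original byte. */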
   IRExpr* res
      = binop(Iop_InterleaveLO8x16,
              mkexpr(zeroVec),
              binop(Iop_InterleaveLO8x16,
                    mkexpr(zeroVec), mkexpr(srcVec)));
   if (!xIsZ)
      res = binop(Iop_SarN32x4,
                  binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));

   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );

   return delta;
}


/* Handles 256 bit versions of PMOVZXBD and PMOVSXBD. */
static Long dis_PMOVxXBD_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool xIsZ )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   IRTemp srcVec = newTemp(Ity_V128);
   UChar  modrm  = getUChar(delta);
   UChar  how    = xIsZ ? 'z' : 's';
   UInt   rG     = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
      delta += alen;
      DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

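   /* Same scheme as the 128-bit case: two rounds of interleaving with
      zero produce 32-bit zero extension.  The HI variant in the
      second round picks up source bytes 4..7, which form the upper
      128 bits of the result. */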
   IRExpr* res
      = binop( Iop_V128HLtoV256,
               binop(Iop_InterleaveHI8x16,
                     mkexpr(zeroVec),
                     binop(Iop_InterleaveLO8x16,
                           mkexpr(zeroVec), mkexpr(srcVec)) ),
               binop(Iop_InterleaveLO8x16,
                     mkexpr(zeroVec),
                     binop(Iop_InterleaveLO8x16,
                           mkexpr(zeroVec), mkexpr(srcVec)) ) );
   if (!xIsZ)
      res = binop(Iop_SarN32x8,
                  binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24));

   putYMMReg ( rG, res );

   return delta;
}


/* Handles 128 bit versions of PMOVSXBQ. */
static Long dis_PMOVSXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcBytes = newTemp(Ity_I16);
   UChar  modrm    = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcBytes, getXMMRegLane16( rE, 0 ) );
      delta += 1;
      DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
      delta += alen;
      DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }

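   /* srcBytes holds the two low source bytes; split them apart with
      Iop_16HIto8/Iop_16to8 and sign-extend each one to 64 bits. */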
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, binop( Iop_64HLtoV128,
                   unop( Iop_8Sto64,
                         unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
                   unop( Iop_8Sto64,
                         unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
   return delta;
}


/* Handles 256 bit versions of PMOVSXBQ. */
static Long dis_PMOVSXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcBytes = newTemp(Ity_I32);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcBytes, getXMMRegLane32( rE, 0 ) );
      delta += 1;
      DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
      delta += alen;
      DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   putYMMReg
      ( rG, binop( Iop_V128HLtoV256,
                   binop( Iop_64HLtoV128,
                          unop( Iop_8Sto64,
                                unop( Iop_16HIto8,
                                      unop( Iop_32HIto16,
                                            mkexpr(srcBytes) ) ) ),
                          unop( Iop_8Sto64,
                                unop( Iop_16to8,
                                      unop( Iop_32HIto16,
                                            mkexpr(srcBytes) ) ) ) ),
                   binop( Iop_64HLtoV128,
                          unop( Iop_8Sto64,
                                unop( Iop_16HIto8,
                                      unop( Iop_32to16,
                                            mkexpr(srcBytes) ) ) ),
                          unop( Iop_8Sto64,
                                unop( Iop_16to8,
                                      unop( Iop_32to16,
                                            mkexpr(srcBytes) ) ) ) ) ) );
   return delta;
}


/* Handles 128 bit versions of PMOVZXBQ. */
static Long dis_PMOVZXBQ_128 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128,
                    unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
      delta += alen;
      DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

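   /* Three rounds of interleaving with zero take the two low source
      bytes from 8-bit to 16-, then 32-, then 64-bit lanes,
      zero-filling the new high bytes at every step. */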
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      ( rG, binop( Iop_InterleaveLO8x16,
                   mkexpr(zeroVec),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
   return delta;
}


/* Handles 256 bit versions of PMOVZXBQ. */
static Long dis_PMOVZXBQ_256 ( const VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp srcVec   = newTemp(Ity_V128);
   UChar  modrm    = getUChar(delta);
   UInt   rG       = gregOfRexRM(pfx, modrm);
   if ( epartIsReg(modrm) ) {
      UInt rE = eregOfRexRM(pfx, modrm);
      assign( srcVec, getXMMReg(rE) );
      delta += 1;
      DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( srcVec,
              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) )));
      delta += alen;
      DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   IRTemp zeroVec = newTemp(Ity_V128);
   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );

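   /* As in the 128-bit case, three interleaves with zero widen bytes
      to 64-bit lanes; the HI variant in the final round picks up
      source bytes 2 and 3, which form the upper 128 bits. */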
   putYMMReg
      ( rG, binop( Iop_V128HLtoV256,
                   binop( Iop_InterleaveHI8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec),
                                 binop( Iop_InterleaveLO8x16,
                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) ),
                   binop( Iop_InterleaveLO8x16,
                          mkexpr(zeroVec),
                          binop( Iop_InterleaveLO8x16,
                                 mkexpr(zeroVec),
                                 binop( Iop_InterleaveLO8x16,
                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) )
                 ) );
   return delta;
}


static Long dis_PHMINPOSUW_128 ( const VexAbiInfo* vbi, Prefix pfx,
                                 Long delta, Bool isAvx )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   const HChar* mbV = isAvx ? "v" : "";
   IRTemp sV     = newTemp(Ity_V128);
   IRTemp sHi    = newTemp(Ity_I64);
   IRTemp sLo    = newTemp(Ity_I64);
   IRTemp dLo    = newTemp(Ity_I64);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getXMMReg(rE) );
      delta += 1;
      DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      if (!isAvx)
         gen_SEGV_if_not_16_aligned(addr);
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
   }
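   /* PHMINPOSUW computes the minimum of the eight unsigned words and
      the index of that minimum.  The whole result fits in 64 bits
      (minimum in bits 15:0, index in bits 18:16, everything else
      zero), so the clean helper returns it as a single I64 and
      Iop_64UtoV128 below supplies the zero upper half. */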
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
   assign( dLo, mkIRExprCCall(
                   Ity_I64, 0/*regparms*/,
                   "amd64g_calculate_sse_phminposuw",
                   &amd64g_calculate_sse_phminposuw,
                   mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
         ));
   (isAvx ? putYMMRegLoAndZU : putXMMReg)
      (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
   return delta;
}


static Long dis_AESx ( const VexAbiInfo* vbi, Prefix pfx,
                       Long delta, Bool isAvx, UChar opc )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   rG     = gregOfRexRM(pfx, modrm);
   UInt   regNoL = 0;
   UInt   regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;

   /* This is a nasty kludge.  We need to pass 2 x V128 to the
      helper.  Since we can't do that, use a dirty
      helper to compute the results directly from the XMM regs in
      the guest state.  That means for the memory case, we need to
      move the left operand into a pseudo-register (XMM16, let's
      call it). */
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      delta += 1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      /* alignment check needed ???? */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      delta += alen;
   }

   void*  fn = &amd64g_dirtyhelper_AES;
   const HChar* nm = "amd64g_dirtyhelper_AES";

   /* Round up the arguments.  Note that this is a kludge -- the
      use of mkU64 rather than mkIRExpr_HWord implies the
      assumption that the host's word size is 64-bit. */
   UInt gstOffD = ymmGuestRegOffset(rG);
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);
   IRExpr*  opc4         = mkU64(opc);
   IRExpr*  gstOffDe     = mkU64(gstOffD);
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   IRExpr** args
      = mkIRExprVec_5( IRExpr_GSPTR(), opc4, gstOffDe, gstOffLe, gstOffRe );

   IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper.  Hence this roundabout scheme. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and write
      the second for !isAvx or the third for isAvx.
      AESIMC (0xDB) reads the first register, and writes the second. */
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   if (opc == 0xDB)
      d->fxState[1].fx   = Ifx_Write;
   else if (!isAvx || rG == regNoR)
      d->fxState[1].fx   = Ifx_Modify;
   else {
      d->fxState[1].fx     = Ifx_Read;
      d->nFxState++;
      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = gstOffD;
      d->fxState[2].size   = sizeof(U128);
   }

   stmt( IRStmt_Dirty(d) );
   {
      const HChar* opsuf;
      switch (opc) {
         case 0xDC: opsuf = "enc"; break;
         case 0xDD: opsuf = "enclast"; break;
         case 0xDE: opsuf = "dec"; break;
         case 0xDF: opsuf = "declast"; break;
         case 0xDB: opsuf = "imc"; break;
         default: vassert(0);
      }
      DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
          (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
          nameXMMReg(regNoR),
          (isAvx && opc != 0xDB) ? "," : "",
          (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
   }
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );
   return delta;
}

static Long dis_AESKEYGENASSIST ( const VexAbiInfo* vbi, Prefix pfx,
                                  Long delta, Bool isAvx )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   UInt   regNoL = 0;
   UInt   regNoR = gregOfRexRM(pfx, modrm);
   UChar  imm    = 0;

   /* This is a nasty kludge.  See AESENC et al. instructions. */
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      imm = getUChar(delta+1);
      delta += 1+1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* alignment check ???? */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      imm = getUChar(delta+alen);
      delta += alen+1;
   }

   /* Who ya gonna call?  Presumably not Ghostbusters. */
   void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
   const HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";

   /* Round up the arguments.  Note that this is a kludge -- the
      use of mkU64 rather than mkIRExpr_HWord implies the
      assumption that the host's word size is 64-bit. */
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);

   IRExpr*  imme         = mkU64(imm & 0xFF);
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   IRExpr** args
      = mkIRExprVec_4( IRExpr_GSPTR(), imme, gstOffLe, gstOffRe );

   IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper.  Hence this roundabout scheme. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].fx     = Ifx_Write;
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   stmt( IRStmt_Dirty(d) );

   DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
       (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
       nameXMMReg(regNoR));
   if (isAvx)
      putYMMRegLane128( regNoR, 1, mkV128(0) );
   return delta;
}


__attribute__((noinline))
static
Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
                          const VexAbiInfo* vbi,
                          Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0x10:
   case 0x14:
   case 0x15:
      /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
         66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
         66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
         Blend at various granularities, with XMM0 (implicit operand)
         providing the controlling mask.
      */
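      /* Only the most significant bit of each XMM0 lane matters;
         math_PBLENDVB_128 uses opSAR to replicate that bit across its
         lane so the blend can be done with bitwise operations. */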
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);

         const HChar* nm    = NULL;
         UInt   gran  = 0;
         IROp   opSAR = Iop_INVALID;
         switch (opc) {
            case 0x10:
               nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
               break;
            case 0x14:
               nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
               break;
            case 0x15:
               nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
               break;
         }
         vassert(nm);

         IRTemp vecE = newTemp(Ity_V128);
         IRTemp vecG = newTemp(Ity_V128);
         IRTemp vec0 = newTemp(Ity_V128);

         if ( epartIsReg(modrm) ) {
            assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
            delta += 1;
            DIP( "%s %s,%s\n", nm,
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
            delta += alen;
            DIP( "%s %s,%s\n", nm,
                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
         assign(vec0, getXMMReg(0));

         IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
         putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));

         goto decode_success;
      }
      break;

   case 0x17:
      /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
         Logical compare (set ZF and CF from AND/ANDN of the operands) */
      if (have66noF2noF3(pfx)
          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
         delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
         goto decode_success;
      }
      break;

   case 0x20:
      /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
         Packed Move with Sign Extend from Byte to Word (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   False/*!isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x21:
      /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
         Packed Move with Sign Extend from Byte to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   False/*!isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x22:
      /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
         Packed Move with Sign Extend from Byte to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x23:
      /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
         Packed Move with Sign Extend from Word to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXWD_128(vbi, pfx, delta,
                                  False/*!isAvx*/, False/*!xIsZ*/);
         goto decode_success;
      }
      break;

   case 0x24:
      /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
         Packed Move with Sign Extend from Word to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x25:
      /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
         Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   False/*!isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x28:
      /* 66 0F 38 28 = PMULDQ -- signed widening 32-bit multiply:
         lanes 0 x 0 form the lower 64-bit half and lanes 2 x 2 form
         the upper 64-bit half */
      /* This is a really poor translation -- could be improved if
         performance critical.  It's a copy-paste of PMULUDQ, too. */
      if (have66noF2noF3(pfx) && sz == 2) {
         IRTemp sV = newTemp(Ity_V128);
         IRTemp dV = newTemp(Ity_V128);
         modrm = getUChar(delta);
         UInt rG = gregOfRexRM(pfx,modrm);
         assign( dV, getXMMReg(rG) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
         }

         putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
         goto decode_success;
      }
      break;

   case 0x29:
      /* 66 0F 38 29 = PCMPEQQ
         64x2 equality comparison */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpeqq", Iop_CmpEQ64x2, False );
         goto decode_success;
      }
      break;

   case 0x2A:
      /* 66 0F 38 2A = MOVNTDQA
         "non-temporal" "streaming" load
         Handle like MOVDQA, but only a memory operand is allowed */
      if (have66noF2noF3(pfx) && sz == 2) {
         modrm = getUChar(delta);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            putXMMReg( gregOfRexRM(pfx,modrm),
                       loadLE(Ity_V128, mkexpr(addr)) );
            DIP("movntdqa %s,%s\n", dis_buf,
                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
            delta += alen;
            goto decode_success;
         }
      }
      break;

   case 0x2B:
      /* 66 0f 38 2B /r = PACKUSDW xmm1, xmm2/m128
         2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {

         modrm = getUChar(delta);

         IRTemp argL = newTemp(Ity_V128);
         IRTemp argR = newTemp(Ity_V128);

         if ( epartIsReg(modrm) ) {
            assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1;
            DIP( "packusdw %s,%s\n",
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
            delta += alen;
            DIP( "packusdw %s,%s\n",
                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

         putXMMReg( gregOfRexRM(pfx, modrm),
                    binop( Iop_QNarrowBin32Sto16Ux8,
                           mkexpr(argL), mkexpr(argR)) );

         goto decode_success;
      }
      break;

   case 0x30:
      /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
         Packed Move with Zero Extend from Byte to Word (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x31:
      /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
         Packed Move with Zero Extend from Byte to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x32:
      /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
         Packed Move with Zero Extend from Byte to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x33:
      /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
         Packed Move with Zero Extend from Word to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x34:
      /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
         Packed Move with Zero Extend from Word to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x35:
      /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
         Packed Move with Zero Extend from DWord to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x37:
      /* 66 0F 38 37 = PCMPGTQ
         64x2 signed comparison (the Intel docs do specify it as signed)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtq", Iop_CmpGT64Sx2, False );
         goto decode_success;
      }
      break;

   case 0x38:
   case 0x3C:
      /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128    8Sx16 (signed) min
         66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128    8Sx16 (signed) max
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3C;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxsb" : "pminsb",
                    isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x39:
   case 0x3D:
      /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
         Minimum of Packed Signed Double Word Integers (XMM)
         66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
         Maximum of Packed Signed Double Word Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3D;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxsd" : "pminsd",
                    isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x3A:
   case 0x3E:
      /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
         Minimum of Packed Unsigned Word Integers (XMM)
         66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
         Maximum of Packed Unsigned Word Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3E;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxuw" : "pminuw",
                    isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x3B:
   case 0x3F:
      /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
         Minimum of Packed Unsigned Doubleword Integers (XMM)
         66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
         Maximum of Packed Unsigned Doubleword Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3F;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxud" : "pminud",
                    isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x40:
      /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
         32x4 integer multiply from xmm2/m128 to xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {

         modrm = getUChar(delta);

         IRTemp argL = newTemp(Ity_V128);
         IRTemp argR = newTemp(Ity_V128);

         if ( epartIsReg(modrm) ) {
            assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1;
            DIP( "pmulld %s,%s\n",
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
            delta += alen;
            DIP( "pmulld %s,%s\n",
                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

         putXMMReg( gregOfRexRM(pfx, modrm),
                    binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );

         goto decode_success;
      }
      break;

   case 0x41:
      /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
         Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF:
   case 0xDB:
      /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
                  DD /r = AESENCLAST xmm1, xmm2/m128
                  DE /r = AESDEC xmm1, xmm2/m128
                  DF /r = AESDECLAST xmm1, xmm2/m128

                  DB /r = AESIMC xmm1, xmm2/m128 */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      break;

   case 0xF0:
   case 0xF1:
      /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
         F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
         The decoding on this is a bit unusual.
      */
      if (haveF2noF3(pfx)
          && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
         modrm = getUChar(delta);

         if (opc == 0xF0)
            sz = 1;
         else
            vassert(sz == 2 || sz == 4 || sz == 8);

         IRType tyE = szToITy(sz);
         IRTemp valE = newTemp(tyE);

         /* Show the correctly size-suffixed mnemonic, rather than
            "crc32b" regardless of the operand size. */
         const HChar* suffix = sz==8 ? "q" : sz==4 ? "l"
                                           : sz==2 ? "w" : "b";
         if (epartIsReg(modrm)) {
            assign(valE, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("crc32%s %s,%s\n", suffix, nameIRegE(sz, pfx, modrm),
                nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(valE, loadLE(tyE, mkexpr(addr)));
            delta += alen;
            DIP("crc32%s %s,%s\n", suffix, dis_buf,
                nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
         }

         /* Somewhat funny getting/putting of the crc32 value, in order
            to ensure that it turns into 64-bit gets and puts.  However,
            mask off the upper 32 bits so as to not get memcheck false
            +ves around the helper call. */
         IRTemp valG0 = newTemp(Ity_I64);
         assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
                             mkU64(0xFFFFFFFF)));

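         /* The SSE4.2 CRC32 instruction uses the CRC-32C (Castagnoli)
            polynomial 0x11EDC6F41; the amd64g_calc_crc32{b,w,l,q}
            helpers compute one step of that CRC for each operand
            size. */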
         const HChar* nm = NULL;
         void*  fn = NULL;
         switch (sz) {
            case 1: nm = "amd64g_calc_crc32b";
                    fn = &amd64g_calc_crc32b; break;
            case 2: nm = "amd64g_calc_crc32w";
                    fn = &amd64g_calc_crc32w; break;
            case 4: nm = "amd64g_calc_crc32l";
                    fn = &amd64g_calc_crc32l; break;
            case 8: nm = "amd64g_calc_crc32q";
                    fn = &amd64g_calc_crc32q; break;
         }
         vassert(nm && fn);
         IRTemp valG1 = newTemp(Ity_I64);
         assign(valG1,
                mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
                              mkIRExprVec_2(mkexpr(valG0),
                                            widenUto64(mkexpr(valE)))));

         putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}


/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- Top-level SSE4: dis_ESC_0F3A__SSE4                   ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

static Long dis_PEXTRW ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   IRTemp t0    = IRTemp_INVALID;
   IRTemp t1    = IRTemp_INVALID;
   IRTemp t2    = IRTemp_INVALID;
   IRTemp t3    = IRTemp_INVALID;
   UChar  modrm = getUChar(delta);
   Int    alen  = 0;
   HChar  dis_buf[50];
   UInt   rG    = gregOfRexRM(pfx,modrm);
   Int    imm8_20;
   IRTemp xmm_vec = newTemp(Ity_V128);
   IRTemp d16   = newTemp(Ity_I16);
   const HChar* mbV = isAvx ? "v" : "";

   vassert(0==getRexW(pfx)); /* ensured by caller */
   assign( xmm_vec, getXMMReg(rG) );
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   if ( epartIsReg( modrm ) ) {
      imm8_20 = (Int)(getUChar(delta+1) & 7);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8_20 = (Int)(getUChar(delta+alen) & 7);
   }

   switch (imm8_20) {
      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(t0))); break;
      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(t1))); break;
      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(t2))); break;
      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(t3))); break;
      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
      default: vassert(0);
   }

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx,modrm);
      putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
      delta += 1+1;
      DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
           nameXMMReg( rG ), nameIReg32( rE ) );
   } else {
      storeLE( mkexpr(addr), mkexpr(d16) );
      delta += alen+1;
      DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
   }
   return delta;
}


static Long dis_PEXTRD ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   IRTemp t0    = IRTemp_INVALID;
   IRTemp t1    = IRTemp_INVALID;
   IRTemp t2    = IRTemp_INVALID;
   IRTemp t3    = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   Int    imm8_10;
   IRTemp xmm_vec   = newTemp(Ity_V128);
   IRTemp src_dword = newTemp(Ity_I32);
   const HChar* mbV = isAvx ? "v" : "";

   vassert(0==getRexW(pfx)); /* ensured by caller */
   modrm = getUChar(delta);
   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   if ( epartIsReg( modrm ) ) {
      imm8_10 = (Int)(getUChar(delta+1) & 3);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8_10 = (Int)(getUChar(delta+alen) & 3);
   }

   switch ( imm8_10 ) {
      case 0:  assign( src_dword, mkexpr(t0) ); break;
      case 1:  assign( src_dword, mkexpr(t1) ); break;
      case 2:  assign( src_dword, mkexpr(t2) ); break;
      case 3:  assign( src_dword, mkexpr(t3) ); break;
      default: vassert(0);
   }

   if ( epartIsReg( modrm ) ) {
      putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
      delta += 1+1;
      DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
           nameXMMReg( gregOfRexRM(pfx, modrm) ),
           nameIReg32( eregOfRexRM(pfx, modrm) ) );
   } else {
      storeLE( mkexpr(addr), mkexpr(src_dword) );
      delta += alen+1;
      DIP( "%spextrd $%d, %s,%s\n", mbV,
           imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   }
   return delta;
}


static Long dis_PEXTRQ ( const VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   Int imm8_0;
   IRTemp xmm_vec   = newTemp(Ity_V128);
   IRTemp src_qword = newTemp(Ity_I64);
   const HChar* mbV = isAvx ? "v" : "";

   vassert(1==getRexW(pfx)); /* ensured by caller */
   modrm = getUChar(delta);
   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );

   if ( epartIsReg( modrm ) ) {
      imm8_0 = (Int)(getUChar(delta+1) & 1);
   } else {
      addr   = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8_0 = (Int)(getUChar(delta+alen) & 1);
   }

   switch ( imm8_0 ) {
      case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) );
               break;
      case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
               break;
      default: vassert(0);
   }

   if ( epartIsReg( modrm ) ) {
      putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
      delta += 1+1;
      DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
           nameXMMReg( gregOfRexRM(pfx, modrm) ),
           nameIReg64( eregOfRexRM(pfx, modrm) ) );
   } else {
      storeLE( mkexpr(addr), mkexpr(src_qword) );
      delta += alen+1;
      DIP( "%spextrq $%d, %s,%s\n", mbV,
           imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   }
   return delta;
}

static IRExpr* math_CTZ32(IRExpr *exp)
{
   /* Iop_Ctz32 isn't implemented by the amd64 back end, so use Iop_Ctz64. */
   return unop(Iop_64to32, unop(Iop_Ctz64, unop(Iop_32Uto64, exp)));
}

static Long dis_PCMPISTRI_3A ( UChar modrm, UInt regNoL, UInt regNoR,
                               Long delta, UChar opc, UChar imm,
                               HChar dis_buf[])
{
   /* We only handle PCMPISTRI for now */
   vassert((opc & 0x03) == 0x03);
   /* And only an immediate byte of 0x38 or 0x3A */
   vassert((imm & ~0x02) == 0x38);

   /* FIXME: Is this correct when regNoL == 16 ? */
   IRTemp argL = newTemp(Ity_V128);
   assign(argL, getXMMReg(regNoL));
   IRTemp argR = newTemp(Ity_V128);
   assign(argR, getXMMReg(regNoR));

   IRTemp zmaskL = newTemp(Ity_I32);
   assign(zmaskL, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argL), mkV128(0)))));
   IRTemp zmaskR = newTemp(Ity_I32);
   assign(zmaskR, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argR), mkV128(0)))));

   /* We want validL = ~(zmaskL | -zmaskL)

      But this formulation kills memcheck's validity tracking when any
      bits above the first "1" are invalid.  So reformulate as:

      validL = (zmaskL ? (1 << ctz(zmaskL)) : 0) - 1
   */
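   /* For example, zmaskL = 0x0008 (first zero byte at index 3) gives
      ctz = 3 and validL = (1 << 3) - 1 = 0x0007, marking exactly the
      bytes before the terminator as valid, while zmaskL = 0 (no zero
      byte at all) gives validL = 0 - 1 = 0xFFFFFFFF, marking every
      byte as valid. */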

   IRExpr *ctzL = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskL)));

   /* Generate a bool expression which is zero iff the original is
      zero.  Do this carefully so memcheck can propagate validity bits
      correctly.
    */
   IRTemp zmaskL_zero = newTemp(Ity_I1);
   assign(zmaskL_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskL), mkU32(0)));

   IRTemp validL = newTemp(Ity_I32);
   assign(validL, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskL_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzL),
                                   mkU32(0)),
                        mkU32(1)));

   /* And similarly for validR. */
   IRExpr *ctzR = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskR)));
   IRTemp zmaskR_zero = newTemp(Ity_I1);
   assign(zmaskR_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskR), mkU32(0)));
   IRTemp validR = newTemp(Ity_I32);
   assign(validR, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskR_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzR),
                                   mkU32(0)),
                        mkU32(1)));

   /* Do the actual comparison. */
   IRExpr *boolResII = unop(Iop_16Uto32,
                            unop(Iop_GetMSBs8x16,
                                 binop(Iop_CmpEQ8x16, mkexpr(argL),
                                                      mkexpr(argR))));

   /* Compute boolResII & validL & validR (i.e., if both valid, use
      the comparison result) */
   IRExpr *intRes1_a = binop(Iop_And32, boolResII,
                             binop(Iop_And32,
                                   mkexpr(validL), mkexpr(validR)));

   /* Compute ~(validL | validR); i.e., if both invalid, force 1. */
   IRExpr *intRes1_b = unop(Iop_Not32, binop(Iop_Or32,
                                             mkexpr(validL), mkexpr(validR)));
   /* Otherwise, zero. */
   IRExpr *intRes1 = binop(Iop_And32, mkU32(0xFFFF),
                           binop(Iop_Or32, intRes1_a, intRes1_b));

18690   /* The "0x30" in imm=0x3A means "polarity=3" means XOR validL with
18691      result. */
   IRTemp intRes2 = newTemp(Ity_I32);
   assign(intRes2, binop(Iop_And32, mkU32(0xFFFF),
                         binop(Iop_Xor32, intRes1, mkexpr(validL))));

   /* If the 0x40 bit were set in imm=0x3A, we would return the index
      of the msb.  Since it is clear, we return the index of the
      lsb. */
   IRExpr *newECX = math_CTZ32(binop(Iop_Or32,
                                     mkexpr(intRes2), mkU32(0x10000)));

   /* And that's our ECX. */
   putIReg32(R_RCX, newECX);

   /* Now for the condition codes... */

   /* C == 0 iff intRes2 == 0 */
   IRExpr *c_bit = IRExpr_ITE( binop(Iop_ExpCmpNE32, mkexpr(intRes2),
                                     mkU32(0)),
                               mkU32(1 << AMD64G_CC_SHIFT_C),
                               mkU32(0));
   /* Z == 1 iff any in argL is 0 */
   IRExpr *z_bit = IRExpr_ITE( mkexpr(zmaskL_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_Z),
                               mkU32(0));
   /* S == 1 iff any in argR is 0 */
   IRExpr *s_bit = IRExpr_ITE( mkexpr(zmaskR_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_S),
                               mkU32(0));
   /* O == IntRes2[0] */
   IRExpr *o_bit = binop(Iop_Shl32, binop(Iop_And32, mkexpr(intRes2),
                                          mkU32(0x01)),
                         mkU8(AMD64G_CC_SHIFT_O));

   /* Put them all together */
   IRTemp cc = newTemp(Ity_I64);
   assign(cc, widenUto64(binop(Iop_Or32,
                               binop(Iop_Or32, c_bit, z_bit),
                               binop(Iop_Or32, s_bit, o_bit))));
   stmt(IRStmt_Put(OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY)));
   stmt(IRStmt_Put(OFFB_CC_DEP1, mkexpr(cc)));
   stmt(IRStmt_Put(OFFB_CC_DEP2, mkU64(0)));
   stmt(IRStmt_Put(OFFB_CC_NDEP, mkU64(0)));

   return delta;
}

/* This can fail, in which case it returns the original (unchanged)
   delta. */
static Long dis_PCMPxSTRx ( const VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc )
{
   Long   delta0  = delta;
   UInt   isISTRx = opc & 2;
   UInt   isxSTRM = (opc & 1) ^ 1;
   UInt   regNoL  = 0;
   UInt   regNoR  = 0;
   UChar  imm     = 0;
   IRTemp addr    = IRTemp_INVALID;
   Int    alen    = 0;
   HChar  dis_buf[50];

   /* This is a nasty kludge.  We need to pass 2 x V128 to the helper
      (which is clean).  Since we can't do that, use a dirty helper to
      compute the results directly from the XMM regs in the guest
      state.  That means for the memory case, we need to move the left
      operand into a pseudo-register (XMM16, let's call it). */
   UChar modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      regNoR = gregOfRexRM(pfx, modrm);
      imm = getUChar(delta+1);
      delta += 1+1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      regNoR = gregOfRexRM(pfx, modrm);
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* No alignment check; I guess that makes sense, given that
         these insns are for dealing with C style strings. */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      imm = getUChar(delta+alen);
      delta += alen+1;
   }

   /* Print the insn here, since dis_PCMPISTRI_3A doesn't do so
      itself. */
   if (regNoL == 16) {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, dis_buf, nameXMMReg(regNoR));
   } else {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
   }

   /* Handle special case(s). */
   if (imm == 0x3A && isISTRx && !isxSTRM) {
      return dis_PCMPISTRI_3A ( modrm, regNoL, regNoR, delta,
                                opc, imm, dis_buf);
   }

   /* Now we know the XMM reg numbers for the operands, and the
      immediate byte.  Is it one we can actually handle? Throw out any
      cases for which the helper function has not been verified. */
   switch (imm) {
      case 0x00: case 0x02:
      case 0x08: case 0x0A: case 0x0C: case 0x0E:
      case 0x10: case 0x12: case 0x14:
      case 0x18: case 0x1A:
      case 0x30:            case 0x34:
      case 0x38: case 0x3A:
      case 0x40: case 0x42: case 0x44: case 0x46:
                 case 0x4A:
                 case 0x62:
      case 0x70: case 0x72:
         break;
      // the 16-bit character versions of the above
      case 0x01: case 0x03:
      case 0x09: case 0x0B: case 0x0D:
                 case 0x13:
      case 0x19: case 0x1B:
      case 0x39: case 0x3B:
                            case 0x45:
                 case 0x4B:
         break;
      default:
         return delta0; /*FAIL*/
   }

   /* Who ya gonna call?  Presumably not Ghostbusters. */
   void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
   const HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

   /* Round up the arguments.  Note that this is a kludge -- the use
      of mkU64 rather than mkIRExpr_HWord implies the assumption that
      the host's word size is 64-bit. */
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);

   IRExpr*  opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
   IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
   IRExpr** args
      = mkIRExprVec_6( IRExpr_GSPTR(),
                       opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

   IRTemp   resT = newTemp(Ity_I64);
   IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper.  Hence this roundabout scheme. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].fx     = Ifx_Read;
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   if (isxSTRM) {
      /* Declare that the helper writes XMM0. */
      d->nFxState = 3;
      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = ymmGuestRegOffset(0);
      d->fxState[2].size   = sizeof(U128);
   }

   stmt( IRStmt_Dirty(d) );

   /* Now resT[15:0] holds the new OSZACP values, so the condition
      codes must be updated.  And for an xSTRI case, resT[31:16] holds
      the new ECX value, so stash that too. */
   if (!isxSTRM) {
      putIReg64(R_RCX, binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                             mkU64(0xFFFF)));
   }

   /* Zap the upper half of the dest reg as per AVX conventions. */
   if (isxSTRM && isAvx)
      putYMMRegLane128(/*YMM*/0, 1, mkV128(0));

   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
   ));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}


static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
{
   vassert(imm8 <= 15); /* imm8 is unsigned, so >= 0 always holds */

   // Create a V128 value which has the selected byte in the
   // specified lane, and zeroes everywhere else.
   IRTemp tmp128    = newTemp(Ity_V128);
   IRTemp halfshift = newTemp(Ity_I64);
   assign(halfshift, binop(Iop_Shl64,
                           unop(Iop_8Uto64, mkexpr(u8)),
                           mkU8(8 * (imm8 & 7))));
   if (imm8 < 8) {
      assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   } else {
      assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   }

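   // mkV128 expands each of its 16 mask bits to a whole byte (0x00 or
   // 0xFF), so the mask ~(1 << imm8) keeps every byte of v128 except
   // the destination lane, which is taken from tmp128 instead.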
18905   UShort mask = ~(1 << imm8);
18906   IRTemp res  = newTemp(Ity_V128);
18907   assign( res, binop(Iop_OrV128,
18908                      mkexpr(tmp128),
18909                      binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
18910   return res;
18911}
18912
18913
18914static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
18915{
18916   IRTemp z32 = newTemp(Ity_I32);
18917   assign(z32, mkU32(0));
18918
18919   /* Surround u32 with zeroes as per imm, giving us something we can
18920      OR into a suitably masked-out v128.*/
18921   IRTemp withZs = newTemp(Ity_V128);
18922   UShort mask = 0;
18923   switch (imm8) {
18924      case 3:  mask = 0x0FFF;
18925               assign(withZs, mkV128from32s(u32, z32, z32, z32));
18926               break;
18927      case 2:  mask = 0xF0FF;
18928               assign(withZs, mkV128from32s(z32, u32, z32, z32));
18929               break;
18930      case 1:  mask = 0xFF0F;
18931               assign(withZs, mkV128from32s(z32, z32, u32, z32));
18932               break;
18933      case 0:  mask = 0xFFF0;
18934               assign(withZs, mkV128from32s(z32, z32, z32, u32));
18935               break;
18936      default: vassert(0);
18937   }
18938
18939   IRTemp res = newTemp(Ity_V128);
18940   assign(res, binop( Iop_OrV128,
18941                      mkexpr(withZs),
18942                      binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
18943   return res;
18944}
18945
18946
18947static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
18948{
18949   /* Surround u64 with zeroes as per imm, giving us something we can
18950      OR into a suitably masked-out v128.*/
18951   IRTemp withZs = newTemp(Ity_V128);
18952   UShort mask = 0;
18953   if (imm8 == 0) {
18954      mask = 0xFF00;
18955      assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
18956   } else {
18957      vassert(imm8 == 1);
18958      mask = 0x00FF;
18959      assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
18960   }
18961
18962   IRTemp res = newTemp(Ity_V128);
18963   assign( res, binop( Iop_OrV128,
18964                       mkexpr(withZs),
18965                       binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
18966   return res;
18967}
18968
18969
18970static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
18971{
18972   const IRTemp inval = IRTemp_INVALID;
18973   IRTemp dstDs[4] = { inval, inval, inval, inval };
18974   breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );
18975
18976   vassert(imm8 <= 255);
18977   dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */
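   /* Bits 3:0 of imm8 ("zmask") force lanes to zero below.
      Illustrative case: imm8 == 0x1A puts toInsertD in lane 1, then
      zeroes lanes 1 and 3 -- note the zmask applies even to the lane
      just written. */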
18978
18979   UInt imm8_zmask = (imm8 & 15);
18980   IRTemp zero_32 = newTemp(Ity_I32);
18981   assign( zero_32, mkU32(0) );
18982   IRTemp resV = newTemp(Ity_V128);
18983   assign( resV, mkV128from32s(
18984                    ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
18985                    ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
18986                    ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
18987                    ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
18988   return resV;
18989}
18990
18991
18992static Long dis_PEXTRB_128_GtoE ( const VexAbiInfo* vbi, Prefix pfx,
18993                                  Long delta, Bool isAvx )
18994{
18995   IRTemp addr     = IRTemp_INVALID;
18996   Int    alen     = 0;
18997   HChar  dis_buf[50];
18998   IRTemp xmm_vec  = newTemp(Ity_V128);
18999   IRTemp sel_lane = newTemp(Ity_I32);
19000   IRTemp shr_lane = newTemp(Ity_I32);
19001   const HChar* mbV = isAvx ? "v" : "";
19002   UChar  modrm    = getUChar(delta);
19003   IRTemp t3, t2, t1, t0;
19004   Int    imm8;
19005   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
19006   t3 = t2 = t1 = t0 = IRTemp_INVALID;
19007   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
19008
19009   if ( epartIsReg( modrm ) ) {
19010      imm8 = (Int)getUChar(delta+1);
19011   } else {
19012      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19013      imm8 = (Int)getUChar(delta+alen);
19014   }
19015   switch ( (imm8 >> 2) & 3 ) {
19016      case 0:  assign( sel_lane, mkexpr(t0) ); break;
19017      case 1:  assign( sel_lane, mkexpr(t1) ); break;
19018      case 2:  assign( sel_lane, mkexpr(t2) ); break;
19019      case 3:  assign( sel_lane, mkexpr(t3) ); break;
19020      default: vassert(0);
19021   }
19022   assign( shr_lane,
19023           binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );
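   /* imm8 bits 3:2 choose the source dword and bits 1:0 the byte
      within it: e.g. imm8 == 6 selects dword 1 shifted right by 16
      bits, i.e. byte 6 of the vector. */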
19024
19025   if ( epartIsReg( modrm ) ) {
19026      putIReg64( eregOfRexRM(pfx,modrm),
19027                 unop( Iop_32Uto64,
19028                       binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
19029      delta += 1+1;
19030      DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
19031           nameXMMReg( gregOfRexRM(pfx, modrm) ),
19032           nameIReg64( eregOfRexRM(pfx, modrm) ) );
19033   } else {
19034      storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
19035      delta += alen+1;
19036      DIP( "%spextrb $%d,%s,%s\n", mbV,
19037           imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
19038   }
19039
19040   return delta;
19041}
19042
19043
19044static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
19045{
19046   vassert(imm8 < 256);
19047   UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
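   /* Each entry expands (via mkV128, one mask bit per byte) a 2-bit
      imm8 field into a mask over the two F64 lanes: e.g. index 2
      (0xFF00) keeps only the upper lane.  Bits 5:4 of imm8 gate which
      products contribute; bits 1:0 gate where the sum is written. */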
19048   IRTemp and_vec = newTemp(Ity_V128);
19049   IRTemp sum_vec = newTemp(Ity_V128);
19050   IRTemp rm      = newTemp(Ity_I32);
19051   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
19052   assign( and_vec, binop( Iop_AndV128,
19053                           triop( Iop_Mul64Fx2,
19054                                  mkexpr(rm),
19055                                  mkexpr(dst_vec), mkexpr(src_vec) ),
19056                           mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );
19057
19058   assign( sum_vec, binop( Iop_Add64F0x2,
19059                           binop( Iop_InterleaveHI64x2,
19060                                  mkexpr(and_vec), mkexpr(and_vec) ),
19061                           binop( Iop_InterleaveLO64x2,
19062                                  mkexpr(and_vec), mkexpr(and_vec) ) ) );
19063   IRTemp res = newTemp(Ity_V128);
19064   assign(res, binop( Iop_AndV128,
19065                      binop( Iop_InterleaveLO64x2,
19066                             mkexpr(sum_vec), mkexpr(sum_vec) ),
19067                      mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
19068   return res;
19069}
19070
19071
19072static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
19073{
19074   vassert(imm8 < 256);
19075   IRTemp tmp_prod_vec = newTemp(Ity_V128);
19076   IRTemp prod_vec     = newTemp(Ity_V128);
19077   IRTemp sum_vec      = newTemp(Ity_V128);
19078   IRTemp rm           = newTemp(Ity_I32);
19079   IRTemp v3, v2, v1, v0;
19080   v3 = v2 = v1 = v0   = IRTemp_INVALID;
19081   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
19082                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
19083                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
19084                             0xFFFF };
19085
19086   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
19087   assign( tmp_prod_vec,
19088           binop( Iop_AndV128,
19089                  triop( Iop_Mul32Fx4,
19090                         mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ),
19091                  mkV128( imm8_perms[((imm8 >> 4)& 15)] ) ) );
19092   breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
19093   assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );
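   /* The v3,v1,v2,v0 ordering is deliberate: it pairs the lanes so
      the interleave/add below produces (p3+p2) and (p1+p0), and the
      second interleave/add stage then propagates the full four-way
      sum into every lane before the final mask is applied. */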
19094
19095   assign( sum_vec, triop( Iop_Add32Fx4,
19096                           mkexpr(rm),
19097                           binop( Iop_InterleaveHI32x4,
19098                                  mkexpr(prod_vec), mkexpr(prod_vec) ),
19099                           binop( Iop_InterleaveLO32x4,
19100                                  mkexpr(prod_vec), mkexpr(prod_vec) ) ) );
19101
19102   IRTemp res = newTemp(Ity_V128);
19103   assign( res, binop( Iop_AndV128,
19104                       triop( Iop_Add32Fx4,
19105                              mkexpr(rm),
19106                              binop( Iop_InterleaveHI32x4,
19107                                     mkexpr(sum_vec), mkexpr(sum_vec) ),
19108                              binop( Iop_InterleaveLO32x4,
19109                                     mkexpr(sum_vec), mkexpr(sum_vec) ) ),
19110                       mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
19111   return res;
19112}
19113
19114
19115static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
19116{
19117   /* Mask out bits of the operands we don't need.  This isn't
19118      strictly necessary, but it does ensure Memcheck doesn't
19119      give us any false uninitialised value errors as a
19120      result. */
19121   UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
19122   UShort dst_mask[2] = { 0x07FF, 0x7FF0 };
19123
19124   IRTemp src_maskV = newTemp(Ity_V128);
19125   IRTemp dst_maskV = newTemp(Ity_V128);
19126   assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
19127   assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));
19128
19129   IRTemp src_masked = newTemp(Ity_V128);
19130   IRTemp dst_masked = newTemp(Ity_V128);
19131   assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
19132   assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));
19133
   /* Generate 4 x 64-bit values that we can hand to a clean helper */
19135   IRTemp sHi = newTemp(Ity_I64);
19136   IRTemp sLo = newTemp(Ity_I64);
19137   assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
19138   assign( sLo, unop(Iop_V128to64,   mkexpr(src_masked)) );
19139
19140   IRTemp dHi = newTemp(Ity_I64);
19141   IRTemp dLo = newTemp(Ity_I64);
19142   assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
19143   assign( dLo, unop(Iop_V128to64,   mkexpr(dst_masked)) );
19144
19145   /* Compute halves of the result separately */
19146   IRTemp resHi = newTemp(Ity_I64);
19147   IRTemp resLo = newTemp(Ity_I64);
19148
19149   IRExpr** argsHi
19150      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
19151                       mkU64( 0x80 | (imm8 & 7) ));
19152   IRExpr** argsLo
19153      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
19154                       mkU64( 0x00 | (imm8 & 7) ));
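   /* The low three bits forward the original imm8 control.  The 0x80
      bit is, on this reading of the helper's interface, a flag asking
      amd64g_calc_mpsadbw for the upper four of the eight 16-bit
      results rather than the lower four. */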
19155
19156   assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
19157                                "amd64g_calc_mpsadbw",
19158                                &amd64g_calc_mpsadbw, argsHi ));
19159   assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
19160                                "amd64g_calc_mpsadbw",
19161                                &amd64g_calc_mpsadbw, argsLo ));
19162
19163   IRTemp res = newTemp(Ity_V128);
19164   assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
19165   return res;
19166}
19167
19168static Long dis_EXTRACTPS ( const VexAbiInfo* vbi, Prefix pfx,
19169                            Long delta, Bool isAvx )
19170{
19171   IRTemp addr       = IRTemp_INVALID;
19172   Int    alen       = 0;
19173   HChar  dis_buf[50];
19174   UChar  modrm      = getUChar(delta);
19175   Int imm8_10;
19176   IRTemp xmm_vec    = newTemp(Ity_V128);
19177   IRTemp src_dword  = newTemp(Ity_I32);
19178   UInt   rG         = gregOfRexRM(pfx,modrm);
19179   IRTemp t3, t2, t1, t0;
19180   t3 = t2 = t1 = t0 = IRTemp_INVALID;
19181
19182   assign( xmm_vec, getXMMReg( rG ) );
19183   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );
19184
19185   if ( epartIsReg( modrm ) ) {
19186      imm8_10 = (Int)(getUChar(delta+1) & 3);
19187   } else {
19188      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19189      imm8_10 = (Int)(getUChar(delta+alen) & 3);
19190   }
19191
19192   switch ( imm8_10 ) {
19193      case 0:  assign( src_dword, mkexpr(t0) ); break;
19194      case 1:  assign( src_dword, mkexpr(t1) ); break;
19195      case 2:  assign( src_dword, mkexpr(t2) ); break;
19196      case 3:  assign( src_dword, mkexpr(t3) ); break;
19197      default: vassert(0);
19198   }
19199
19200   if ( epartIsReg( modrm ) ) {
19201      UInt rE = eregOfRexRM(pfx,modrm);
19202      putIReg32( rE, mkexpr(src_dword) );
19203      delta += 1+1;
19204      DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
19205           nameXMMReg( rG ), nameIReg32( rE ) );
19206   } else {
19207      storeLE( mkexpr(addr), mkexpr(src_dword) );
19208      delta += alen+1;
19209      DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
19210           nameXMMReg( rG ), dis_buf );
19211   }
19212
19213   return delta;
19214}
19215
19216
19217static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
19218{
19219   IRTemp t0 = newTemp(Ity_I64);
19220   IRTemp t1 = newTemp(Ity_I64);
19221   assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
19222              mkexpr(dV)));
19223   assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
19224              mkexpr(sV)));
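   /* imm8 bit 0 picks which 64-bit half of dV is multiplied and bit 4
      which half of sV: e.g. imm8 == 0x11 multiplies the two high
      halves.  The third argument to the helper below plausibly
      selects the low (0) or high (1) 64 bits of the 128-bit
      carry-less product. */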
19225
19226   IRTemp t2 = newTemp(Ity_I64);
19227   IRTemp t3 = newTemp(Ity_I64);
19228
19229   IRExpr** args;
19230
19231   args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
19232   assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
19233                            &amd64g_calculate_pclmul, args));
19234   args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
19235   assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
19236                            &amd64g_calculate_pclmul, args));
19237
19238   IRTemp res     = newTemp(Ity_V128);
19239   assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
19240   return res;
19241}
19242
19243
19244__attribute__((noinline))
19245static
19246Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
19247                          const VexAbiInfo* vbi,
19248                          Prefix pfx, Int sz, Long deltaIN )
19249{
19250   IRTemp addr  = IRTemp_INVALID;
19251   UChar  modrm = 0;
19252   Int    alen  = 0;
19253   HChar  dis_buf[50];
19254
19255   *decode_OK = False;
19256
19257   Long   delta = deltaIN;
19258   UChar  opc   = getUChar(delta);
19259   delta++;
19260   switch (opc) {
19261
19262   case 0x08:
19263      /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
19264      if (have66noF2noF3(pfx) && sz == 2) {
19265
19266         IRTemp src0 = newTemp(Ity_F32);
19267         IRTemp src1 = newTemp(Ity_F32);
19268         IRTemp src2 = newTemp(Ity_F32);
19269         IRTemp src3 = newTemp(Ity_F32);
19270         IRTemp res0 = newTemp(Ity_F32);
19271         IRTemp res1 = newTemp(Ity_F32);
19272         IRTemp res2 = newTemp(Ity_F32);
19273         IRTemp res3 = newTemp(Ity_F32);
19274         IRTemp rm   = newTemp(Ity_I32);
19275         Int    imm  = 0;
19276
19277         modrm = getUChar(delta);
19278
19279         if (epartIsReg(modrm)) {
19280            assign( src0,
19281                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
19282            assign( src1,
19283                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
19284            assign( src2,
19285                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
19286            assign( src3,
19287                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
19288            imm = getUChar(delta+1);
19289            if (imm & ~15) goto decode_failure;
19290            delta += 1+1;
19291            DIP( "roundps $%d,%s,%s\n",
19292                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
19293                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19294         } else {
19295            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19296            gen_SEGV_if_not_16_aligned(addr);
19297            assign( src0, loadLE(Ity_F32,
19298                                 binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
19299            assign( src1, loadLE(Ity_F32,
19300                                 binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
19301            assign( src2, loadLE(Ity_F32,
19302                                 binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
19303            assign( src3, loadLE(Ity_F32,
19304                                 binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
19305            imm = getUChar(delta+alen);
19306            if (imm & ~15) goto decode_failure;
19307            delta += alen+1;
19308            DIP( "roundps $%d,%s,%s\n",
19309                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19310         }
19311
19312         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
19313            that encoding is the same as the encoding for IRRoundingMode,
19314            we can use that value directly in the IR as a rounding
19315            mode. */
19316         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
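         /* Concretely: if imm8 bit 2 is set, the MXCSR.RC field is
            used; otherwise imm8 & 3 encodes 0=nearest, 1=-inf,
            2=+inf, 3=zero, matching Irrm_NEAREST/NegINF/PosINF/ZERO
            in IRRoundingMode. */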
19317
19318         assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
19319         assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
19320         assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
19321         assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );
19322
19323         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
19324         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
19325         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
19326         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );
19327
19328         goto decode_success;
19329      }
19330      break;
19331
19332   case 0x09:
19333      /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
19334      if (have66noF2noF3(pfx) && sz == 2) {
19335
19336         IRTemp src0 = newTemp(Ity_F64);
19337         IRTemp src1 = newTemp(Ity_F64);
19338         IRTemp res0 = newTemp(Ity_F64);
19339         IRTemp res1 = newTemp(Ity_F64);
19340         IRTemp rm   = newTemp(Ity_I32);
19341         Int    imm  = 0;
19342
19343         modrm = getUChar(delta);
19344
19345         if (epartIsReg(modrm)) {
19346            assign( src0,
19347                    getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
19348            assign( src1,
19349                    getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
19350            imm = getUChar(delta+1);
19351            if (imm & ~15) goto decode_failure;
19352            delta += 1+1;
19353            DIP( "roundpd $%d,%s,%s\n",
19354                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
19355                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19356         } else {
19357            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19358            gen_SEGV_if_not_16_aligned(addr);
19359            assign( src0, loadLE(Ity_F64,
19360                                 binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
19361            assign( src1, loadLE(Ity_F64,
19362                                 binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
19363            imm = getUChar(delta+alen);
19364            if (imm & ~15) goto decode_failure;
19365            delta += alen+1;
19366            DIP( "roundpd $%d,%s,%s\n",
19367                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19368         }
19369
19370         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
19371            that encoding is the same as the encoding for IRRoundingMode,
19372            we can use that value directly in the IR as a rounding
19373            mode. */
19374         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
19375
19376         assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
19377         assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );
19378
19379         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
19380         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
19381
19382         goto decode_success;
19383      }
19384      break;
19385
19386   case 0x0A:
19387   case 0x0B:
19388      /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
19389         66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
19390      */
19391      if (have66noF2noF3(pfx) && sz == 2) {
19392
19393         Bool   isD = opc == 0x0B;
19394         IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
19395         IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
19396         Int    imm = 0;
19397
19398         modrm = getUChar(delta);
19399
19400         if (epartIsReg(modrm)) {
19401            assign( src,
19402                    isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
19403                        : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
19404            imm = getUChar(delta+1);
19405            if (imm & ~15) goto decode_failure;
19406            delta += 1+1;
19407            DIP( "rounds%c $%d,%s,%s\n",
19408                 isD ? 'd' : 's',
19409                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
19410                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19411         } else {
19412            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19413            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
19414            imm = getUChar(delta+alen);
19415            if (imm & ~15) goto decode_failure;
19416            delta += alen+1;
19417            DIP( "rounds%c $%d,%s,%s\n",
19418                 isD ? 'd' : 's',
19419                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19420         }
19421
19422         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
19423            that encoding is the same as the encoding for IRRoundingMode,
19424            we can use that value directly in the IR as a rounding
19425            mode. */
19426         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
19427                           (imm & 4) ? get_sse_roundingmode()
19428                                     : mkU32(imm & 3),
19429                           mkexpr(src)) );
19430
19431         if (isD)
19432            putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
19433         else
19434            putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
19435
19436         goto decode_success;
19437      }
19438      break;
19439
19440   case 0x0C:
19441      /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
19442         Blend Packed Single Precision Floating-Point Values (XMM) */
19443      if (have66noF2noF3(pfx) && sz == 2) {
19444
19445         Int imm8;
19446         IRTemp dst_vec = newTemp(Ity_V128);
19447         IRTemp src_vec = newTemp(Ity_V128);
19448
19449         modrm = getUChar(delta);
19450
19451         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
19452
19453         if ( epartIsReg( modrm ) ) {
19454            imm8 = (Int)getUChar(delta+1);
19455            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
19456            delta += 1+1;
19457            DIP( "blendps $%d, %s,%s\n", imm8,
19458                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
19459                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19460         } else {
19461            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19462                             1/* imm8 is 1 byte after the amode */ );
19463            gen_SEGV_if_not_16_aligned( addr );
19464            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19465            imm8 = (Int)getUChar(delta+alen);
19466            delta += alen+1;
19467            DIP( "blendpd $%d, %s,%s\n",
19468                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19469         }
19470
19471         putXMMReg( gregOfRexRM(pfx, modrm),
19472                    mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
19473         goto decode_success;
19474      }
19475      break;
19476
19477   case 0x0D:
19478      /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
19479         Blend Packed Double Precision Floating-Point Values (XMM) */
19480      if (have66noF2noF3(pfx) && sz == 2) {
19481
19482         Int imm8;
19483         IRTemp dst_vec = newTemp(Ity_V128);
19484         IRTemp src_vec = newTemp(Ity_V128);
19485
19486         modrm = getUChar(delta);
19487         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
19488
19489         if ( epartIsReg( modrm ) ) {
19490            imm8 = (Int)getUChar(delta+1);
19491            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
19492            delta += 1+1;
19493            DIP( "blendpd $%d, %s,%s\n", imm8,
19494                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
19495                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19496         } else {
19497            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19498                             1/* imm8 is 1 byte after the amode */ );
19499            gen_SEGV_if_not_16_aligned( addr );
19500            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19501            imm8 = (Int)getUChar(delta+alen);
19502            delta += alen+1;
19503            DIP( "blendpd $%d, %s,%s\n",
19504                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19505         }
19506
19507         putXMMReg( gregOfRexRM(pfx, modrm),
19508                    mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
19509         goto decode_success;
19510      }
19511      break;
19512
19513   case 0x0E:
19514      /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
19515         Blend Packed Words (XMM) */
19516      if (have66noF2noF3(pfx) && sz == 2) {
19517
19518         Int imm8;
19519         IRTemp dst_vec = newTemp(Ity_V128);
19520         IRTemp src_vec = newTemp(Ity_V128);
19521
19522         modrm = getUChar(delta);
19523
19524         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );
19525
19526         if ( epartIsReg( modrm ) ) {
19527            imm8 = (Int)getUChar(delta+1);
19528            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
19529            delta += 1+1;
19530            DIP( "pblendw $%d, %s,%s\n", imm8,
19531                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
19532                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19533         } else {
19534            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19535                             1/* imm8 is 1 byte after the amode */ );
19536            gen_SEGV_if_not_16_aligned( addr );
19537            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19538            imm8 = (Int)getUChar(delta+alen);
19539            delta += alen+1;
19540            DIP( "pblendw $%d, %s,%s\n",
19541                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
19542         }
19543
19544         putXMMReg( gregOfRexRM(pfx, modrm),
19545                    mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
19546         goto decode_success;
19547      }
19548      break;
19549
19550   case 0x14:
      /* 66 0F 3A 14 /r ib = PEXTRB r/m8, xmm, imm8
19552         Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
19553         (XMM) */
19554      if (have66noF2noF3(pfx) && sz == 2) {
19555         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
19556         goto decode_success;
19557      }
19558      break;
19559
19560   case 0x15:
19561      /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
19562         Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
19563         (XMM) */
19564      if (have66noF2noF3(pfx) && sz == 2) {
19565         delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
19566         goto decode_success;
19567      }
19568      break;
19569
19570   case 0x16:
19571      /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
19572         Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
19573         Note that this insn has the same opcodes as PEXTRQ, but
19574         here the REX.W bit is _not_ present */
19575      if (have66noF2noF3(pfx)
19576          && sz == 2 /* REX.W is _not_ present */) {
19577         delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
19578         goto decode_success;
19579      }
19580      /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
19581         Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
19582         Note that this insn has the same opcodes as PEXTRD, but
19583         here the REX.W bit is present */
19584      if (have66noF2noF3(pfx)
19585          && sz == 8 /* REX.W is present */) {
19586         delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
19587         goto decode_success;
19588      }
19589      break;
19590
19591   case 0x17:
19592      /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
19593         float from xmm reg and store in gen.reg or mem.  This is
19594         identical to PEXTRD, except that REX.W appears to be ignored.
19595      */
19596      if (have66noF2noF3(pfx)
19597          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
19598         delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
19599         goto decode_success;
19600      }
19601      break;
19602
19603   case 0x20:
19604      /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
19605         Extract byte from r32/m8 and insert into xmm1 */
19606      if (have66noF2noF3(pfx) && sz == 2) {
19607         Int    imm8;
19608         IRTemp new8 = newTemp(Ity_I8);
19609         modrm = getUChar(delta);
19610         UInt rG = gregOfRexRM(pfx, modrm);
19611         if ( epartIsReg( modrm ) ) {
19612            UInt rE = eregOfRexRM(pfx,modrm);
19613            imm8 = (Int)(getUChar(delta+1) & 0xF);
19614            assign( new8, unop(Iop_32to8, getIReg32(rE)) );
19615            delta += 1+1;
19616            DIP( "pinsrb $%d,%s,%s\n", imm8,
19617                 nameIReg32(rE), nameXMMReg(rG) );
19618         } else {
19619            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19620            imm8 = (Int)(getUChar(delta+alen) & 0xF);
19621            assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
19622            delta += alen+1;
19623            DIP( "pinsrb $%d,%s,%s\n",
19624                 imm8, dis_buf, nameXMMReg(rG) );
19625         }
19626         IRTemp src_vec = newTemp(Ity_V128);
19627         assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
19628         IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
19629         putXMMReg( rG, mkexpr(res) );
19630         goto decode_success;
19631      }
19632      break;
19633
19634   case 0x21:
19635      /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
19636         Insert Packed Single Precision Floating-Point Value (XMM) */
19637      if (have66noF2noF3(pfx) && sz == 2) {
19638         UInt   imm8;
19639         IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
19640         const IRTemp inval = IRTemp_INVALID;
19641
19642         modrm = getUChar(delta);
19643         UInt rG = gregOfRexRM(pfx, modrm);
19644
19645         if ( epartIsReg( modrm ) ) {
19646            UInt   rE = eregOfRexRM(pfx, modrm);
19647            IRTemp vE = newTemp(Ity_V128);
19648            assign( vE, getXMMReg(rE) );
19649            IRTemp dsE[4] = { inval, inval, inval, inval };
19650            breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
19651            imm8 = getUChar(delta+1);
19652            d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
19653            delta += 1+1;
19654            DIP( "insertps $%u, %s,%s\n",
19655                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19656         } else {
19657            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19658            assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
19659            imm8 = getUChar(delta+alen);
19660            delta += alen+1;
19661            DIP( "insertps $%u, %s,%s\n",
19662                 imm8, dis_buf, nameXMMReg(rG) );
19663         }
19664
19665         IRTemp vG = newTemp(Ity_V128);
19666         assign( vG, getXMMReg(rG) );
19667
19668         putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
19669         goto decode_success;
19670      }
19671      break;
19672
19673   case 0x22:
19674      /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
19675         Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
19676      if (have66noF2noF3(pfx)
19677          && sz == 2 /* REX.W is NOT present */) {
19678         Int    imm8_10;
19679         IRTemp src_u32 = newTemp(Ity_I32);
19680         modrm = getUChar(delta);
19681         UInt rG = gregOfRexRM(pfx, modrm);
19682
19683         if ( epartIsReg( modrm ) ) {
19684            UInt rE = eregOfRexRM(pfx,modrm);
19685            imm8_10 = (Int)(getUChar(delta+1) & 3);
19686            assign( src_u32, getIReg32( rE ) );
19687            delta += 1+1;
19688            DIP( "pinsrd $%d, %s,%s\n",
19689                 imm8_10, nameIReg32(rE), nameXMMReg(rG) );
19690         } else {
19691            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19692            imm8_10 = (Int)(getUChar(delta+alen) & 3);
19693            assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
19694            delta += alen+1;
19695            DIP( "pinsrd $%d, %s,%s\n",
19696                 imm8_10, dis_buf, nameXMMReg(rG) );
19697         }
19698
19699         IRTemp src_vec = newTemp(Ity_V128);
19700         assign(src_vec, getXMMReg( rG ));
19701         IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
19702         putXMMReg( rG, mkexpr(res_vec) );
19703         goto decode_success;
19704      }
19705      /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
19706         Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
19707      if (have66noF2noF3(pfx)
19708          && sz == 8 /* REX.W is present */) {
19709         Int imm8_0;
19710         IRTemp src_u64 = newTemp(Ity_I64);
19711         modrm = getUChar(delta);
19712         UInt rG = gregOfRexRM(pfx, modrm);
19713
19714         if ( epartIsReg( modrm ) ) {
19715            UInt rE = eregOfRexRM(pfx,modrm);
19716            imm8_0 = (Int)(getUChar(delta+1) & 1);
19717            assign( src_u64, getIReg64( rE ) );
19718            delta += 1+1;
19719            DIP( "pinsrq $%d, %s,%s\n",
19720                 imm8_0, nameIReg64(rE), nameXMMReg(rG) );
19721         } else {
19722            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19723            imm8_0 = (Int)(getUChar(delta+alen) & 1);
19724            assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
19725            delta += alen+1;
19726            DIP( "pinsrq $%d, %s,%s\n",
19727                 imm8_0, dis_buf, nameXMMReg(rG) );
19728         }
19729
19730         IRTemp src_vec = newTemp(Ity_V128);
19731         assign(src_vec, getXMMReg( rG ));
19732         IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
19733         putXMMReg( rG, mkexpr(res_vec) );
19734         goto decode_success;
19735      }
19736      break;
19737
19738   case 0x40:
19739      /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
19740         Dot Product of Packed Single Precision Floating-Point Values (XMM) */
19741      if (have66noF2noF3(pfx) && sz == 2) {
19742         modrm = getUChar(delta);
19743         Int    imm8;
19744         IRTemp src_vec = newTemp(Ity_V128);
19745         IRTemp dst_vec = newTemp(Ity_V128);
19746         UInt   rG      = gregOfRexRM(pfx, modrm);
19747         assign( dst_vec, getXMMReg( rG ) );
19748         if ( epartIsReg( modrm ) ) {
19749            UInt rE = eregOfRexRM(pfx, modrm);
19750            imm8 = (Int)getUChar(delta+1);
19751            assign( src_vec, getXMMReg(rE) );
19752            delta += 1+1;
19753            DIP( "dpps $%d, %s,%s\n",
19754                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19755         } else {
19756            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19757                             1/* imm8 is 1 byte after the amode */ );
19758            gen_SEGV_if_not_16_aligned( addr );
19759            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19760            imm8 = (Int)getUChar(delta+alen);
19761            delta += alen+1;
19762            DIP( "dpps $%d, %s,%s\n",
19763                 imm8, dis_buf, nameXMMReg(rG) );
19764         }
19765         IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
19766         putXMMReg( rG, mkexpr(res) );
19767         goto decode_success;
19768      }
19769      break;
19770
19771   case 0x41:
19772      /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
19773         Dot Product of Packed Double Precision Floating-Point Values (XMM) */
19774      if (have66noF2noF3(pfx) && sz == 2) {
19775         modrm = getUChar(delta);
19776         Int    imm8;
19777         IRTemp src_vec = newTemp(Ity_V128);
19778         IRTemp dst_vec = newTemp(Ity_V128);
19779         UInt   rG      = gregOfRexRM(pfx, modrm);
19780         assign( dst_vec, getXMMReg( rG ) );
19781         if ( epartIsReg( modrm ) ) {
19782            UInt rE = eregOfRexRM(pfx, modrm);
19783            imm8 = (Int)getUChar(delta+1);
19784            assign( src_vec, getXMMReg(rE) );
19785            delta += 1+1;
19786            DIP( "dppd $%d, %s,%s\n",
19787                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19788         } else {
19789            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19790                             1/* imm8 is 1 byte after the amode */ );
19791            gen_SEGV_if_not_16_aligned( addr );
19792            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19793            imm8 = (Int)getUChar(delta+alen);
19794            delta += alen+1;
19795            DIP( "dppd $%d, %s,%s\n",
19796                 imm8, dis_buf, nameXMMReg(rG) );
19797         }
19798         IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
19799         putXMMReg( rG, mkexpr(res) );
19800         goto decode_success;
19801      }
19802      break;
19803
19804   case 0x42:
19805      /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
         Multiple Packed Sums of Absolute Differences (XMM) */
19807      if (have66noF2noF3(pfx) && sz == 2) {
19808         Int    imm8;
19809         IRTemp src_vec = newTemp(Ity_V128);
19810         IRTemp dst_vec = newTemp(Ity_V128);
19811         modrm          = getUChar(delta);
19812         UInt   rG      = gregOfRexRM(pfx, modrm);
19813
19814         assign( dst_vec, getXMMReg(rG) );
19815
19816         if ( epartIsReg( modrm ) ) {
19817            UInt rE = eregOfRexRM(pfx, modrm);
19818
19819            imm8 = (Int)getUChar(delta+1);
19820            assign( src_vec, getXMMReg(rE) );
19821            delta += 1+1;
19822            DIP( "mpsadbw $%d, %s,%s\n", imm8,
19823                 nameXMMReg(rE), nameXMMReg(rG) );
19824         } else {
19825            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19826                             1/* imm8 is 1 byte after the amode */ );
19827            gen_SEGV_if_not_16_aligned( addr );
19828            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19829            imm8 = (Int)getUChar(delta+alen);
19830            delta += alen+1;
19831            DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
19832         }
19833
19834         putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
19835         goto decode_success;
19836      }
19837      break;
19838
19839   case 0x44:
19840      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
19841       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a. multiplication of polynomials over GF(2))
19843       */
19844      if (have66noF2noF3(pfx) && sz == 2) {
19845
19846         Int imm8;
19847         IRTemp svec = newTemp(Ity_V128);
19848         IRTemp dvec = newTemp(Ity_V128);
19849         modrm       = getUChar(delta);
19850         UInt   rG   = gregOfRexRM(pfx, modrm);
19851
19852         assign( dvec, getXMMReg(rG) );
19853
19854         if ( epartIsReg( modrm ) ) {
19855            UInt rE = eregOfRexRM(pfx, modrm);
19856            imm8 = (Int)getUChar(delta+1);
19857            assign( svec, getXMMReg(rE) );
19858            delta += 1+1;
19859            DIP( "pclmulqdq $%d, %s,%s\n", imm8,
19860                 nameXMMReg(rE), nameXMMReg(rG) );
19861         } else {
19862            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19863                             1/* imm8 is 1 byte after the amode */ );
19864            gen_SEGV_if_not_16_aligned( addr );
19865            assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
19866            imm8 = (Int)getUChar(delta+alen);
19867            delta += alen+1;
19868            DIP( "pclmulqdq $%d, %s,%s\n",
19869                 imm8, dis_buf, nameXMMReg(rG) );
19870         }
19871
19872         putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
19873         goto decode_success;
19874      }
19875      break;
19876
19877   case 0x60:
19878   case 0x61:
19879   case 0x62:
19880   case 0x63:
19881      /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
19882         66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
19883         66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
19884         66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
19885         (selected special cases that actually occur in glibc,
19886          not by any means a complete implementation.)
19887      */
19888      if (have66noF2noF3(pfx) && sz == 2) {
19889         Long delta0 = delta;
19890         delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
19891         if (delta > delta0) goto decode_success;
         /* else fall through; dis_PCMPxSTRx failed to decode it */
19893      }
19894      break;
19895
19896   case 0xDF:
19897      /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
19898      if (have66noF2noF3(pfx) && sz == 2) {
19899         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
19900         goto decode_success;
19901      }
19902      break;
19903
19904   default:
19905      break;
19906
19907   }
19908
19909  decode_failure:
19910   *decode_OK = False;
19911   return deltaIN;
19912
19913  decode_success:
19914   *decode_OK = True;
19915   return delta;
19916}
19917
19918
19919/*------------------------------------------------------------*/
19920/*---                                                      ---*/
19921/*--- Top-level post-escape decoders: dis_ESC_NONE         ---*/
19922/*---                                                      ---*/
19923/*------------------------------------------------------------*/
19924
19925__attribute__((noinline))
19926static
19927Long dis_ESC_NONE (
19928        /*MB_OUT*/DisResult* dres,
19929        /*MB_OUT*/Bool*      expect_CAS,
19930        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
19931        Bool         resteerCisOk,
19932        void*        callback_opaque,
19933        const VexArchInfo* archinfo,
19934        const VexAbiInfo*  vbi,
19935        Prefix pfx, Int sz, Long deltaIN
19936     )
19937{
19938   Long   d64   = 0;
19939   UChar  abyte = 0;
19940   IRTemp addr  = IRTemp_INVALID;
19941   IRTemp t1    = IRTemp_INVALID;
19942   IRTemp t2    = IRTemp_INVALID;
19943   IRTemp t3    = IRTemp_INVALID;
19944   IRTemp t4    = IRTemp_INVALID;
19945   IRTemp t5    = IRTemp_INVALID;
19946   IRType ty    = Ity_INVALID;
19947   UChar  modrm = 0;
19948   Int    am_sz = 0;
19949   Int    d_sz  = 0;
19950   Int    alen  = 0;
19951   HChar  dis_buf[50];
19952
19953   Long   delta = deltaIN;
19954   UChar  opc   = getUChar(delta); delta++;
19955
19956   /* delta now points at the modrm byte.  In most of the cases that
19957      follow, neither the F2 nor F3 prefixes are allowed.  However,
19958      for some basic arithmetic operations we have to allow F2/XACQ or
19959      F3/XREL in the case where the destination is memory and the LOCK
19960      prefix is also present.  Do this check by looking at the modrm
19961      byte but not advancing delta over it. */
19962   /* By default, F2 and F3 are not allowed, so let's start off with
19963      that setting. */
19964   Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
19965   { UChar tmp_modrm = getUChar(delta);
19966     switch (opc) {
19967        case 0x00: /* ADD Gb,Eb */  case 0x01: /* ADD Gv,Ev */
19968        case 0x08: /* OR  Gb,Eb */  case 0x09: /* OR  Gv,Ev */
19969        case 0x10: /* ADC Gb,Eb */  case 0x11: /* ADC Gv,Ev */
19970        case 0x18: /* SBB Gb,Eb */  case 0x19: /* SBB Gv,Ev */
19971        case 0x20: /* AND Gb,Eb */  case 0x21: /* AND Gv,Ev */
19972        case 0x28: /* SUB Gb,Eb */  case 0x29: /* SUB Gv,Ev */
19973        case 0x30: /* XOR Gb,Eb */  case 0x31: /* XOR Gv,Ev */
19974           if (!epartIsReg(tmp_modrm)
19975               && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
19976              /* dst is mem, and we have F2 or F3 but not both */
19977              validF2orF3 = True;
19978           }
19979           break;
19980        default:
19981           break;
19982     }
19983   }
19984
19985   /* Now, in the switch below, for the opc values examined by the
19986      switch above, use validF2orF3 rather than looking at pfx
19987      directly. */
19988   switch (opc) {
19989
19990   case 0x00: /* ADD Gb,Eb */
19991      if (!validF2orF3) goto decode_failure;
19992      delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagNone, True, 1, delta, "add" );
19993      return delta;
19994   case 0x01: /* ADD Gv,Ev */
19995      if (!validF2orF3) goto decode_failure;
19996      delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagNone, True, sz, delta, "add" );
19997      return delta;
19998
19999   case 0x02: /* ADD Eb,Gb */
20000      if (haveF2orF3(pfx)) goto decode_failure;
20001      delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagNone, True, 1, delta, "add" );
20002      return delta;
20003   case 0x03: /* ADD Ev,Gv */
20004      if (haveF2orF3(pfx)) goto decode_failure;
20005      delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagNone, True, sz, delta, "add" );
20006      return delta;
20007
20008   case 0x04: /* ADD Ib, AL */
20009      if (haveF2orF3(pfx)) goto decode_failure;
20010      delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
20011      return delta;
20012   case 0x05: /* ADD Iv, eAX */
20013      if (haveF2orF3(pfx)) goto decode_failure;
20014      delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
20015      return delta;
20016
20017   case 0x08: /* OR Gb,Eb */
20018      if (!validF2orF3) goto decode_failure;
20019      delta = dis_op2_G_E ( vbi, pfx, Iop_Or8, WithFlagNone, True, 1, delta, "or" );
20020      return delta;
20021   case 0x09: /* OR Gv,Ev */
20022      if (!validF2orF3) goto decode_failure;
20023      delta = dis_op2_G_E ( vbi, pfx, Iop_Or8, WithFlagNone, True, sz, delta, "or" );
20024      return delta;
20025
20026   case 0x0A: /* OR Eb,Gb */
20027      if (haveF2orF3(pfx)) goto decode_failure;
20028      delta = dis_op2_E_G ( vbi, pfx, Iop_Or8, WithFlagNone, True, 1, delta, "or" );
20029      return delta;
20030   case 0x0B: /* OR Ev,Gv */
20031      if (haveF2orF3(pfx)) goto decode_failure;
20032      delta = dis_op2_E_G ( vbi, pfx, Iop_Or8, WithFlagNone, True, sz, delta, "or" );
20033      return delta;
20034
20035   case 0x0C: /* OR Ib, AL */
20036      if (haveF2orF3(pfx)) goto decode_failure;
20037      delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
20038      return delta;
20039   case 0x0D: /* OR Iv, eAX */
20040      if (haveF2orF3(pfx)) goto decode_failure;
20041      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
20042      return delta;
20043
20044   case 0x10: /* ADC Gb,Eb */
20045      if (!validF2orF3) goto decode_failure;
20046      delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagCarry, True, 1, delta, "adc" );
20047      return delta;
20048   case 0x11: /* ADC Gv,Ev */
20049      if (!validF2orF3) goto decode_failure;
20050      delta = dis_op2_G_E ( vbi, pfx, Iop_Add8, WithFlagCarry, True, sz, delta, "adc" );
20051      return delta;
20052
20053   case 0x12: /* ADC Eb,Gb */
20054      if (haveF2orF3(pfx)) goto decode_failure;
20055      delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarry, True, 1, delta, "adc" );
20056      return delta;
20057   case 0x13: /* ADC Ev,Gv */
20058      if (haveF2orF3(pfx)) goto decode_failure;
20059      delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarry, True, sz, delta, "adc" );
20060      return delta;
20061
20062   case 0x14: /* ADC Ib, AL */
20063      if (haveF2orF3(pfx)) goto decode_failure;
20064      delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
20065      return delta;
20066   case 0x15: /* ADC Iv, eAX */
20067      if (haveF2orF3(pfx)) goto decode_failure;
20068      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
20069      return delta;
20070
20071   case 0x18: /* SBB Gb,Eb */
20072      if (!validF2orF3) goto decode_failure;
20073      delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, 1, delta, "sbb" );
20074      return delta;
20075   case 0x19: /* SBB Gv,Ev */
20076      if (!validF2orF3) goto decode_failure;
20077      delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, sz, delta, "sbb" );
20078      return delta;
20079
20080   case 0x1A: /* SBB Eb,Gb */
20081      if (haveF2orF3(pfx)) goto decode_failure;
20082      delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, 1, delta, "sbb" );
20083      return delta;
20084   case 0x1B: /* SBB Ev,Gv */
20085      if (haveF2orF3(pfx)) goto decode_failure;
20086      delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagCarry, True, sz, delta, "sbb" );
20087      return delta;
20088
20089   case 0x1C: /* SBB Ib, AL */
20090      if (haveF2orF3(pfx)) goto decode_failure;
20091      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
20092      return delta;
20093   case 0x1D: /* SBB Iv, eAX */
20094      if (haveF2orF3(pfx)) goto decode_failure;
20095      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
20096      return delta;
20097
20098   case 0x20: /* AND Gb,Eb */
20099      if (!validF2orF3) goto decode_failure;
20100      delta = dis_op2_G_E ( vbi, pfx, Iop_And8, WithFlagNone, True, 1, delta, "and" );
20101      return delta;
20102   case 0x21: /* AND Gv,Ev */
20103      if (!validF2orF3) goto decode_failure;
20104      delta = dis_op2_G_E ( vbi, pfx, Iop_And8, WithFlagNone, True, sz, delta, "and" );
20105      return delta;
20106
20107   case 0x22: /* AND Eb,Gb */
20108      if (haveF2orF3(pfx)) goto decode_failure;
20109      delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, True, 1, delta, "and" );
20110      return delta;
20111   case 0x23: /* AND Ev,Gv */
20112      if (haveF2orF3(pfx)) goto decode_failure;
20113      delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, True, sz, delta, "and" );
20114      return delta;
20115
20116   case 0x24: /* AND Ib, AL */
20117      if (haveF2orF3(pfx)) goto decode_failure;
20118      delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
20119      return delta;
20120   case 0x25: /* AND Iv, eAX */
20121      if (haveF2orF3(pfx)) goto decode_failure;
20122      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
20123      return delta;
20124
20125   case 0x28: /* SUB Gb,Eb */
20126      if (!validF2orF3) goto decode_failure;
20127      delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, True, 1, delta, "sub" );
20128      return delta;
20129   case 0x29: /* SUB Gv,Ev */
20130      if (!validF2orF3) goto decode_failure;
20131      delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, True, sz, delta, "sub" );
20132      return delta;
20133
20134   case 0x2A: /* SUB Eb,Gb */
20135      if (haveF2orF3(pfx)) goto decode_failure;
20136      delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, True, 1, delta, "sub" );
20137      return delta;
20138   case 0x2B: /* SUB Ev,Gv */
20139      if (haveF2orF3(pfx)) goto decode_failure;
20140      delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, True, sz, delta, "sub" );
20141      return delta;
20142
20143   case 0x2C: /* SUB Ib, AL */
20144      if (haveF2orF3(pfx)) goto decode_failure;
20145      delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
20146      return delta;
20147   case 0x2D: /* SUB Iv, eAX */
20148      if (haveF2orF3(pfx)) goto decode_failure;
20149      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
20150      return delta;
20151
20152   case 0x30: /* XOR Gb,Eb */
20153      if (!validF2orF3) goto decode_failure;
20154      delta = dis_op2_G_E ( vbi, pfx, Iop_Xor8, WithFlagNone, True, 1, delta, "xor" );
20155      return delta;
20156   case 0x31: /* XOR Gv,Ev */
20157      if (!validF2orF3) goto decode_failure;
20158      delta = dis_op2_G_E ( vbi, pfx, Iop_Xor8, WithFlagNone, True, sz, delta, "xor" );
20159      return delta;
20160
20161   case 0x32: /* XOR Eb,Gb */
20162      if (haveF2orF3(pfx)) goto decode_failure;
20163      delta = dis_op2_E_G ( vbi, pfx, Iop_Xor8, WithFlagNone, True, 1, delta, "xor" );
20164      return delta;
20165   case 0x33: /* XOR Ev,Gv */
20166      if (haveF2orF3(pfx)) goto decode_failure;
20167      delta = dis_op2_E_G ( vbi, pfx, Iop_Xor8, WithFlagNone, True, sz, delta, "xor" );
20168      return delta;
20169
20170   case 0x34: /* XOR Ib, AL */
20171      if (haveF2orF3(pfx)) goto decode_failure;
20172      delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
20173      return delta;
20174   case 0x35: /* XOR Iv, eAX */
20175      if (haveF2orF3(pfx)) goto decode_failure;
20176      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
20177      return delta;
20178
20179   case 0x38: /* CMP Gb,Eb */
20180      if (haveF2orF3(pfx)) goto decode_failure;
20181      delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, False, 1, delta, "cmp" );
20182      return delta;
20183   case 0x39: /* CMP Gv,Ev */
20184      if (haveF2orF3(pfx)) goto decode_failure;
20185      delta = dis_op2_G_E ( vbi, pfx, Iop_Sub8, WithFlagNone, False, sz, delta, "cmp" );
20186      return delta;
20187
20188   case 0x3A: /* CMP Eb,Gb */
20189      if (haveF2orF3(pfx)) goto decode_failure;
20190      delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, False, 1, delta, "cmp" );
20191      return delta;
20192   case 0x3B: /* CMP Ev,Gv */
20193      if (haveF2orF3(pfx)) goto decode_failure;
20194      delta = dis_op2_E_G ( vbi, pfx, Iop_Sub8, WithFlagNone, False, sz, delta, "cmp" );
20195      return delta;
20196
20197   case 0x3C: /* CMP Ib, AL */
20198      if (haveF2orF3(pfx)) goto decode_failure;
20199      delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
20200      return delta;
20201   case 0x3D: /* CMP Iv, eAX */
20202      if (haveF2orF3(pfx)) goto decode_failure;
20203      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
20204      return delta;
20205
20206   case 0x50: /* PUSH eAX */
20207   case 0x51: /* PUSH eCX */
20208   case 0x52: /* PUSH eDX */
20209   case 0x53: /* PUSH eBX */
20210   case 0x55: /* PUSH eBP */
20211   case 0x56: /* PUSH eSI */
20212   case 0x57: /* PUSH eDI */
20213   case 0x54: /* PUSH eSP */
20214      /* This is the Right Way, in that the value to be pushed is
20215         established before %rsp is changed, so that pushq %rsp
20216         correctly pushes the old value. */
20217      if (haveF2orF3(pfx)) goto decode_failure;
20218      vassert(sz == 2 || sz == 4 || sz == 8);
20219      if (sz == 4)
20220         sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
20221      ty = sz==2 ? Ity_I16 : Ity_I64;
20222      t1 = newTemp(ty);
20223      t2 = newTemp(Ity_I64);
20224      assign(t1, getIRegRexB(sz, pfx, opc-0x50));
20225      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
20226      putIReg64(R_RSP, mkexpr(t2) );
20227      storeLE(mkexpr(t2),mkexpr(t1));
20228      DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
20229      return delta;
20230
20231   case 0x58: /* POP eAX */
20232   case 0x59: /* POP eCX */
20233   case 0x5A: /* POP eDX */
20234   case 0x5B: /* POP eBX */
20235   case 0x5D: /* POP eBP */
20236   case 0x5E: /* POP eSI */
20237   case 0x5F: /* POP eDI */
20238   case 0x5C: /* POP eSP */
20239      if (haveF2orF3(pfx)) goto decode_failure;
20240      vassert(sz == 2 || sz == 4 || sz == 8);
20241      if (sz == 4)
20242         sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
20243      t1 = newTemp(szToITy(sz));
20244      t2 = newTemp(Ity_I64);
20245      assign(t2, getIReg64(R_RSP));
20246      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
20247      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
20248      putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
20249      DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
20250      return delta;
20251
20252   case 0x63: /* MOVSX */
20253      if (haveF2orF3(pfx)) goto decode_failure;
20254      if (haveREX(pfx) && 1==getRexW(pfx)) {
20255         vassert(sz == 8);
20256         /* movsx r/m32 to r64 */
20257         modrm = getUChar(delta);
20258         if (epartIsReg(modrm)) {
20259            delta++;
20260            putIRegG(8, pfx, modrm,
20261                             unop(Iop_32Sto64,
20262                                  getIRegE(4, pfx, modrm)));
20263            DIP("movslq %s,%s\n",
20264                nameIRegE(4, pfx, modrm),
20265                nameIRegG(8, pfx, modrm));
20266            return delta;
20267         } else {
20268            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20269            delta += alen;
20270            putIRegG(8, pfx, modrm,
20271                             unop(Iop_32Sto64,
20272                                  loadLE(Ity_I32, mkexpr(addr))));
20273            DIP("movslq %s,%s\n", dis_buf,
20274                nameIRegG(8, pfx, modrm));
20275            return delta;
20276         }
20277      } else {
20278         goto decode_failure;
20279      }
20280
20281   case 0x68: /* PUSH Iv */
20282      if (haveF2orF3(pfx)) goto decode_failure;
20283      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
20284      if (sz == 4) sz = 8;
20285      d64 = getSDisp(imin(4,sz),delta);
20286      delta += imin(4,sz);
20287      goto do_push_I;
20288
20289   case 0x69: /* IMUL Iv, Ev, Gv */
20290      if (haveF2orF3(pfx)) goto decode_failure;
20291      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
20292      return delta;
20293
20294   case 0x6A: /* PUSH Ib, sign-extended to sz */
20295      if (haveF2orF3(pfx)) goto decode_failure;
20296      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
20297      if (sz == 4) sz = 8;
20298      d64 = getSDisp8(delta); delta += 1;
20299      goto do_push_I;
20300   do_push_I:
20301      ty = szToITy(sz);
20302      t1 = newTemp(Ity_I64);
20303      t2 = newTemp(ty);
20304      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
20305      putIReg64(R_RSP, mkexpr(t1) );
      /* stop mkU16 asserting if d64 is a negative 16-bit number
         (bug #132813) */
20308      if (ty == Ity_I16)
20309         d64 &= 0xFFFF;
20310      storeLE( mkexpr(t1), mkU(ty,d64) );
20311      DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
20312      return delta;
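
      /* Example encodings (with no other prefixes):
            6A FF            push $-1          (Ib, sign-extended)
            68 78 56 34 12   push $0x12345678  (Iv, sign-extended)
            66 68 34 12      pushw $0x1234     (16-bit operand size)
         In the first two cases sz has been forced from 4 to 8 above,
         since there is no 32-bit push in 64-bit mode. */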
20313
20314   case 0x6B: /* IMUL Ib, Ev, Gv */
20315      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
20316      return delta;
20317
   case 0x70:   /* JOb (jump overflow) */
   case 0x71:   /* JNOb (jump no overflow) */
20320   case 0x72:   /* JBb/JNAEb (jump below) */
20321   case 0x73:   /* JNBb/JAEb (jump not below) */
20322   case 0x74:   /* JZb/JEb (jump zero) */
20323   case 0x75:   /* JNZb/JNEb (jump not zero) */
20324   case 0x76:   /* JBEb/JNAb (jump below or equal) */
20325   case 0x77:   /* JNBEb/JAb (jump not below or equal) */
20326   case 0x78:   /* JSb (jump negative) */
   case 0x79:   /* JNSb (jump not negative) */
   case 0x7A:   /* JPb/JPEb (jump parity even) */
   case 0x7B:   /* JNPb/JPOb (jump parity odd) */
20330   case 0x7C:   /* JLb/JNGEb (jump less) */
20331   case 0x7D:   /* JGEb/JNLb (jump greater or equal) */
20332   case 0x7E:   /* JLEb/JNGb (jump less or equal) */
20333   case 0x7F: { /* JGb/JNLEb (jump greater) */
20334      Long   jmpDelta;
20335      const HChar* comment  = "";
20336      if (haveF3(pfx)) goto decode_failure;
20337      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20338      jmpDelta = getSDisp8(delta);
20339      vassert(-128 <= jmpDelta && jmpDelta < 128);
20340      d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
20341      delta++;
20342      if (resteerCisOk
20343          && vex_control.guest_chase_cond
20344          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
20345          && jmpDelta < 0
20346          && resteerOkFn( callback_opaque, (Addr64)d64) ) {
20347         /* Speculation: assume this backward branch is taken.  So we
20348            need to emit a side-exit to the insn following this one,
20349            on the negation of the condition, and continue at the
20350            branch target address (d64).  If we wind up back at the
20351            first instruction of the trace, just stop; it's better to
20352            let the IR loop unroller handle that case. */
20353         stmt( IRStmt_Exit(
20354                  mk_amd64g_calculate_condition(
20355                     (AMD64Condcode)(1 ^ (opc - 0x70))),
20356                  Ijk_Boring,
20357                  IRConst_U64(guest_RIP_bbstart+delta),
20358                  OFFB_RIP ) );
20359         dres->whatNext   = Dis_ResteerC;
20360         dres->continueAt = d64;
20361         comment = "(assumed taken)";
20362      }
20363      else
20364      if (resteerCisOk
20365          && vex_control.guest_chase_cond
20366          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
20367          && jmpDelta >= 0
20368          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
20369         /* Speculation: assume this forward branch is not taken.  So
20370            we need to emit a side-exit to d64 (the dest) and continue
20371            disassembling at the insn immediately following this
20372            one. */
20373         stmt( IRStmt_Exit(
20374                  mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
20375                  Ijk_Boring,
20376                  IRConst_U64(d64),
20377                  OFFB_RIP ) );
20378         dres->whatNext   = Dis_ResteerC;
20379         dres->continueAt = guest_RIP_bbstart+delta;
20380         comment = "(assumed not taken)";
20381      }
20382      else {
20383         /* Conservative default translation - end the block at this
20384            point. */
20385         jcc_01( dres, (AMD64Condcode)(opc - 0x70),
20386                 guest_RIP_bbstart+delta, d64 );
20387         vassert(dres->whatNext == Dis_StopHere);
20388      }
20389      DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), (ULong)d64,
20390          comment);
20391      return delta;
20392   }
20393
20394   case 0x80: /* Grp1 Ib,Eb */
20395      modrm = getUChar(delta);
20396      /* Disallow F2/XACQ and F3/XREL for the non-mem case.  Allow
20397         just one for the mem case and also require LOCK in this case.
20398         Note that this erroneously allows XACQ/XREL on CMP since we
20399         don't check the subopcode here.  No big deal. */
20400      if (epartIsReg(modrm) && haveF2orF3(pfx))
20401         goto decode_failure;
20402      if (!epartIsReg(modrm) && haveF2andF3(pfx))
20403         goto decode_failure;
20404      if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
20405         goto decode_failure;
20406      am_sz = lengthAMode(pfx,delta);
20407      sz    = 1;
20408      d_sz  = 1;
20409      d64   = getSDisp8(delta + am_sz);
20410      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
20411      return delta;
20412
20413   case 0x81: /* Grp1 Iv,Ev */
20414      modrm = getUChar(delta);
20415      /* Same comment as for case 0x80 just above. */
20416      if (epartIsReg(modrm) && haveF2orF3(pfx))
20417         goto decode_failure;
20418      if (!epartIsReg(modrm) && haveF2andF3(pfx))
20419         goto decode_failure;
20420      if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
20421         goto decode_failure;
20422      am_sz = lengthAMode(pfx,delta);
20423      d_sz  = imin(sz,4);
20424      d64   = getSDisp(d_sz, delta + am_sz);
20425      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
20426      return delta;
20427
20428   case 0x83: /* Grp1 Ib,Ev */
20429      if (haveF2orF3(pfx)) goto decode_failure;
20430      modrm = getUChar(delta);
20431      am_sz = lengthAMode(pfx,delta);
20432      d_sz  = 1;
20433      d64   = getSDisp8(delta + am_sz);
20434      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
20435      return delta;
20436
20437   case 0x84: /* TEST Eb,Gb */
20438      if (haveF2orF3(pfx)) goto decode_failure;
20439      delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, False,
20440                            1, delta, "test" );
20441      return delta;
20442
20443   case 0x85: /* TEST Ev,Gv */
20444      if (haveF2orF3(pfx)) goto decode_failure;
20445      delta = dis_op2_E_G ( vbi, pfx, Iop_And8, WithFlagNone, False,
20446                            sz, delta, "test" );
20447      return delta;
20448
20449   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
20450      prefix.  Therefore, generate CAS regardless of the presence or
20451      otherwise of a LOCK prefix. */
20452   case 0x86: /* XCHG Gb,Eb */
20453      sz = 1;
20454      /* Fall through ... */
20455   case 0x87: /* XCHG Gv,Ev */
20456      modrm = getUChar(delta);
      /* Check whether F2 or F3 are allowable.  For the mem case, one
         or the other but not both are.  We don't care about the
         presence of LOCK in this case -- XCHG is unusual in this
         respect. */
20461      if (haveF2orF3(pfx)) {
20462         if (epartIsReg(modrm)) {
20463            goto decode_failure;
20464         } else {
20465            if (haveF2andF3(pfx))
20466               goto decode_failure;
20467         }
20468      }
20469      ty = szToITy(sz);
20470      t1 = newTemp(ty); t2 = newTemp(ty);
20471      if (epartIsReg(modrm)) {
20472         assign(t1, getIRegE(sz, pfx, modrm));
20473         assign(t2, getIRegG(sz, pfx, modrm));
20474         putIRegG(sz, pfx, modrm, mkexpr(t1));
20475         putIRegE(sz, pfx, modrm, mkexpr(t2));
20476         delta++;
20477         DIP("xchg%c %s, %s\n",
20478             nameISize(sz), nameIRegG(sz, pfx, modrm),
20479                            nameIRegE(sz, pfx, modrm));
20480      } else {
20481         *expect_CAS = True;
20482         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
20483         assign( t1, loadLE(ty, mkexpr(addr)) );
20484         assign( t2, getIRegG(sz, pfx, modrm) );
20485         casLE( mkexpr(addr),
20486                mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
20487         putIRegG( sz, pfx, modrm, mkexpr(t1) );
20488         delta += alen;
20489         DIP("xchg%c %s, %s\n", nameISize(sz),
20490                                nameIRegG(sz, pfx, modrm), dis_buf);
20491      }
20492      return delta;
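
      /* Note: casLE expresses the locked exchange as a
         compare-and-swap whose expected value is the value just
         loaded (t1).  If another thread changes the location between
         the load and the CAS, the CAS fails and execution restarts
         at this instruction, which gives the required atomicity. */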
20493
20494   case 0x88: { /* MOV Gb,Eb */
20495      /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
20496      Bool ok = True;
20497      delta = dis_mov_G_E(vbi, pfx, 1, delta, &ok);
20498      if (!ok) goto decode_failure;
20499      return delta;
20500   }
20501
20502   case 0x89: { /* MOV Gv,Ev */
20503      /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
20504      Bool ok = True;
20505      delta = dis_mov_G_E(vbi, pfx, sz, delta, &ok);
20506      if (!ok) goto decode_failure;
20507      return delta;
20508   }
20509
20510   case 0x8A: /* MOV Eb,Gb */
20511      if (haveF2orF3(pfx)) goto decode_failure;
20512      delta = dis_mov_E_G(vbi, pfx, 1, delta);
20513      return delta;
20514
20515   case 0x8B: /* MOV Ev,Gv */
20516      if (haveF2orF3(pfx)) goto decode_failure;
20517      delta = dis_mov_E_G(vbi, pfx, sz, delta);
20518      return delta;
20519
20520   case 0x8C: /* MOV S,E -- MOV from a SEGMENT REGISTER */
20521      if (haveF2orF3(pfx)) goto decode_failure;
20522      delta = dis_mov_S_E(vbi, pfx, sz, delta);
20523      return delta;
20524
20525   case 0x8D: /* LEA M,Gv */
20526      if (haveF2orF3(pfx)) goto decode_failure;
20527      if (sz != 4 && sz != 8)
20528         goto decode_failure;
20529      modrm = getUChar(delta);
20530      if (epartIsReg(modrm))
20531         goto decode_failure;
20532      /* NOTE!  this is the one place where a segment override prefix
20533         has no effect on the address calculation.  Therefore we clear
20534         any segment override bits in pfx. */
20535      addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
20536      delta += alen;
20537      /* This is a hack.  But it isn't clear that really doing the
20538         calculation at 32 bits is really worth it.  Hence for leal,
20539         do the full 64-bit calculation and then truncate it. */
20540      putIRegG( sz, pfx, modrm,
20541                         sz == 4
20542                            ? unop(Iop_64to32, mkexpr(addr))
20543                            : mkexpr(addr)
20544              );
20545      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
20546                            nameIRegG(sz,pfx,modrm));
20547      return delta;
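
      /* Example: for "leal 1(%rax,%rbx,4), %ecx", disAMode computes
         rax + 4*rbx + 1 at 64 bits, and the Iop_64to32 above then
         truncates to 32 bits -- which gives the same result as doing
         the whole calculation at 32 bits would. */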
20548
20549   case 0x8F: { /* POPQ m64 / POPW m16 */
20550      Int   len;
20551      UChar rm;
20552      /* There is no encoding for 32-bit pop in 64-bit mode.
20553         So sz==4 actually means sz==8. */
20554      if (haveF2orF3(pfx)) goto decode_failure;
20555      vassert(sz == 2 || sz == 4
20556              || /* tolerate redundant REX.W, see #210481 */ sz == 8);
20557      if (sz == 4) sz = 8;
20558      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
20559
20560      rm = getUChar(delta);
20561
      /* make sure this instruction really is a POP (8F /0) */
20563      if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
20564         goto decode_failure;
      /* and has the correct size */
20566      vassert(sz == 8);
20567
20568      t1 = newTemp(Ity_I64);
20569      t3 = newTemp(Ity_I64);
20570      assign( t1, getIReg64(R_RSP) );
20571      assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
20572
20573      /* Increase RSP; must be done before the STORE.  Intel manual
20574         says: If the RSP register is used as a base register for
20575         addressing a destination operand in memory, the POP
20576         instruction computes the effective address of the operand
20577         after it increments the RSP register.  */
20578      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
20579
20580      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
20581      storeLE( mkexpr(addr), mkexpr(t3) );
20582
20583      DIP("popl %s\n", dis_buf);
20584
20585      delta += len;
20586      return delta;
20587   }
20588
20589   case 0x90: /* XCHG eAX,eAX */
20590      /* detect and handle F3 90 (rep nop) specially */
20591      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
20592         DIP("rep nop (P4 pause)\n");
20593         /* "observe" the hint.  The Vex client needs to be careful not
20594            to cause very long delays as a result, though. */
20595         jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
20596         vassert(dres->whatNext == Dis_StopHere);
20597         return delta;
20598      }
20599      /* detect and handle NOPs specially */
20600      if (/* F2/F3 probably change meaning completely */
20601          !haveF2orF3(pfx)
20602          /* If REX.B is 1, we're not exchanging rAX with itself */
20603          && getRexB(pfx)==0 ) {
20604         DIP("nop\n");
20605         return delta;
20606      }
20607      /* else fall through to normal case. */
20608   case 0x91: /* XCHG rAX,rCX */
20609   case 0x92: /* XCHG rAX,rDX */
20610   case 0x93: /* XCHG rAX,rBX */
20611   case 0x94: /* XCHG rAX,rSP */
20612   case 0x95: /* XCHG rAX,rBP */
20613   case 0x96: /* XCHG rAX,rSI */
20614   case 0x97: /* XCHG rAX,rDI */
20615      /* guard against mutancy */
20616      if (haveF2orF3(pfx)) goto decode_failure;
20617      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
20618      return delta;
20619
20620   case 0x98: /* CBW */
20621      if (haveF2orF3(pfx)) goto decode_failure;
20622      if (sz == 8) {
20623         putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
20624         DIP(/*"cdqe\n"*/"cltq");
20625         return delta;
20626      }
20627      if (sz == 4) {
20628         putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
20629         DIP("cwtl\n");
20630         return delta;
20631      }
20632      if (sz == 2) {
20633         putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
20634         DIP("cbw\n");
20635         return delta;
20636      }
20637      goto decode_failure;
20638
20639   case 0x99: /* CWD/CDQ/CQO */
20640      if (haveF2orF3(pfx)) goto decode_failure;
20641      vassert(sz == 2 || sz == 4 || sz == 8);
20642      ty = szToITy(sz);
20643      putIRegRDX( sz,
20644                  binop(mkSizedOp(ty,Iop_Sar8),
20645                        getIRegRAX(sz),
20646                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
20647      DIP(sz == 2 ? "cwd\n"
20648                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
20649                             : "cqo\n"));
20650      return delta;
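
      /* The Sar trick replicates the sign bit of rAX across all of
         rDX.  Worked example for cqo (sz == 8): if RAX holds
         0x8000000000000000, Sar64 by 63 gives all ones, so RDX:RAX
         becomes the correctly sign-extended 128-bit value. */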
20651
20652   case 0x9B: /* FWAIT (X87 insn) */
20653      /* ignore? */
20654      DIP("fwait\n");
20655      return delta;
20656
20657   case 0x9C: /* PUSHF */ {
20658      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
20659         mode.  So sz==4 actually means sz==8. */
20660      /* 24 July 06: has also been seen with a redundant REX prefix,
20661         so must also allow sz==8. */
20662      if (haveF2orF3(pfx)) goto decode_failure;
20663      vassert(sz == 2 || sz == 4 || sz == 8);
20664      if (sz == 4) sz = 8;
20665      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
20666
20667      t1 = newTemp(Ity_I64);
20668      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
20669      putIReg64(R_RSP, mkexpr(t1) );
20670
20671      t2 = newTemp(Ity_I64);
20672      assign( t2, mk_amd64g_calculate_rflags_all() );
20673
20674      /* Patch in the D flag.  This can simply be a copy of bit 10 of
20675         baseBlock[OFFB_DFLAG]. */
20676      t3 = newTemp(Ity_I64);
20677      assign( t3, binop(Iop_Or64,
20678                        mkexpr(t2),
20679                        binop(Iop_And64,
20680                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
20681                              mkU64(1<<10)))
20682            );
20683
20684      /* And patch in the ID flag. */
20685      t4 = newTemp(Ity_I64);
20686      assign( t4, binop(Iop_Or64,
20687                        mkexpr(t3),
20688                        binop(Iop_And64,
20689                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
20690                                               mkU8(21)),
20691                              mkU64(1<<21)))
20692            );
20693
20694      /* And patch in the AC flag too. */
20695      t5 = newTemp(Ity_I64);
20696      assign( t5, binop(Iop_Or64,
20697                        mkexpr(t4),
20698                        binop(Iop_And64,
20699                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
20700                                               mkU8(18)),
20701                              mkU64(1<<18)))
20702            );
20703
20704      /* if sz==2, the stored value needs to be narrowed. */
20705      if (sz == 2)
20706        storeLE( mkexpr(t1), unop(Iop_32to16,
20707                             unop(Iop_64to32,mkexpr(t5))) );
20708      else
20709        storeLE( mkexpr(t1), mkexpr(t5) );
20710
20711      DIP("pushf%c\n", nameISize(sz));
20712      return delta;
20713   }
20714
20715   case 0x9D: /* POPF */
20716      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
20717         So sz==4 actually means sz==8. */
20718      if (haveF2orF3(pfx)) goto decode_failure;
20719      vassert(sz == 2 || sz == 4);
20720      if (sz == 4) sz = 8;
20721      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
20722      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
20723      assign(t2, getIReg64(R_RSP));
20724      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
20725      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
20726      /* t1 is the flag word.  Mask out everything except OSZACP and
20727         set the flags thunk to AMD64G_CC_OP_COPY. */
20728      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
20729      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
20730      stmt( IRStmt_Put( OFFB_CC_DEP1,
20731                        binop(Iop_And64,
20732                              mkexpr(t1),
20733                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
20734                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
                                     | AMD64G_CC_MASK_S | AMD64G_CC_MASK_O )
20736                             )
20737                       )
20738          );
20739
20740      /* Also need to set the D flag, which is held in bit 10 of t1.
20741         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
20742      stmt( IRStmt_Put(
20743               OFFB_DFLAG,
20744               IRExpr_ITE(
20745                  unop(Iop_64to1,
20746                       binop(Iop_And64,
20747                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
20748                             mkU64(1))),
20749                  mkU64(0xFFFFFFFFFFFFFFFFULL),
20750                  mkU64(1)))
20751          );
20752
20753      /* And set the ID flag */
20754      stmt( IRStmt_Put(
20755               OFFB_IDFLAG,
20756               IRExpr_ITE(
20757                  unop(Iop_64to1,
20758                       binop(Iop_And64,
20759                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
20760                             mkU64(1))),
20761                  mkU64(1),
20762                  mkU64(0)))
20763          );
20764
20765      /* And set the AC flag too */
20766      stmt( IRStmt_Put(
20767               OFFB_ACFLAG,
20768               IRExpr_ITE(
20769                  unop(Iop_64to1,
20770                       binop(Iop_And64,
20771                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
20772                             mkU64(1))),
20773                  mkU64(1),
20774                  mkU64(0)))
20775          );
20776
20777      DIP("popf%c\n", nameISize(sz));
20778      return delta;
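
      /* Note on the DFLAG encoding used above: the guest state holds
         the D flag not as a single bit but as a 64-bit stride, +1
         when DF is clear and -1 (all ones) when DF is set, so the
         string instructions can simply add it, suitably scaled, to
         RSI/RDI.  Hence the ITE maps bit 10 of the popped word to
         either 1 or 0xFFFFFFFFFFFFFFFF. */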
20779
20780   case 0x9E: /* SAHF */
20781      codegen_SAHF();
20782      DIP("sahf\n");
20783      return delta;
20784
20785   case 0x9F: /* LAHF */
20786      codegen_LAHF();
20787      DIP("lahf\n");
20788      return delta;
20789
20790   case 0xA0: /* MOV Ob,AL */
20791      if (have66orF2orF3(pfx)) goto decode_failure;
20792      sz = 1;
20793      /* Fall through ... */
20794   case 0xA1: /* MOV Ov,eAX */
20795      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
20796         goto decode_failure;
20797      d64 = getDisp64(delta);
20798      delta += 8;
20799      ty = szToITy(sz);
20800      addr = newTemp(Ity_I64);
20801      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
20802      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
20803      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
20804                                  segRegTxt(pfx), (ULong)d64,
20805                                  nameIRegRAX(sz));
20806      return delta;
20807
20808   case 0xA2: /* MOV AL,Ob */
20809      if (have66orF2orF3(pfx)) goto decode_failure;
20810      sz = 1;
20811      /* Fall through ... */
20812   case 0xA3: /* MOV eAX,Ov */
20813      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
20814         goto decode_failure;
20815      d64 = getDisp64(delta);
20816      delta += 8;
20817      ty = szToITy(sz);
20818      addr = newTemp(Ity_I64);
20819      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
20820      storeLE( mkexpr(addr), getIRegRAX(sz) );
20821      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
20822                                  segRegTxt(pfx), (ULong)d64);
20823      return delta;
20824
20825   case 0xA4:
20826   case 0xA5:
20827      /* F3 A4: rep movsb */
20828      if (haveF3(pfx) && !haveF2(pfx)) {
20829         if (opc == 0xA4)
20830            sz = 1;
20831         dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
20832                      guest_RIP_curr_instr,
20833                      guest_RIP_bbstart+delta, "rep movs", pfx );
         dres->whatNext = Dis_StopHere;
         return delta;
20836      }
20837      /* A4: movsb */
20838      if (!haveF3(pfx) && !haveF2(pfx)) {
20839         if (opc == 0xA4)
20840            sz = 1;
20841         dis_string_op( dis_MOVS, sz, "movs", pfx );
20842         return delta;
20843      }
20844      goto decode_failure;
20845
20846   case 0xA6:
20847   case 0xA7:
      /* F3 A6/A7: repe cmpsb/repe cmps{w,l,q} */
20849      if (haveF3(pfx) && !haveF2(pfx)) {
20850         if (opc == 0xA6)
20851            sz = 1;
20852         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
20853                      guest_RIP_curr_instr,
20854                      guest_RIP_bbstart+delta, "repe cmps", pfx );
20855         dres->whatNext = Dis_StopHere;
20856         return delta;
20857      }
20858      goto decode_failure;
20859
20860   case 0xAA:
20861   case 0xAB:
20862      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
20863      if (haveF3(pfx) && !haveF2(pfx)) {
20864         if (opc == 0xAA)
20865            sz = 1;
20866         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
20867                      guest_RIP_curr_instr,
20868                      guest_RIP_bbstart+delta, "rep stos", pfx );
20869         vassert(dres->whatNext == Dis_StopHere);
20870         return delta;
20871      }
20872      /* AA/AB: stosb/stos{w,l,q} */
20873      if (!haveF3(pfx) && !haveF2(pfx)) {
20874         if (opc == 0xAA)
20875            sz = 1;
20876         dis_string_op( dis_STOS, sz, "stos", pfx );
20877         return delta;
20878      }
20879      goto decode_failure;
20880
20881   case 0xA8: /* TEST Ib, AL */
20882      if (haveF2orF3(pfx)) goto decode_failure;
20883      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
20884      return delta;
20885   case 0xA9: /* TEST Iv, eAX */
20886      if (haveF2orF3(pfx)) goto decode_failure;
20887      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
20888      return delta;
20889
20890   case 0xAC: /* LODS, no REP prefix */
20891   case 0xAD:
20892      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
20893      return delta;
20894
20895   case 0xAE:
20896   case 0xAF:
20897      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
20898      if (haveF2(pfx) && !haveF3(pfx)) {
20899         if (opc == 0xAE)
20900            sz = 1;
20901         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
20902                      guest_RIP_curr_instr,
20903                      guest_RIP_bbstart+delta, "repne scas", pfx );
20904         vassert(dres->whatNext == Dis_StopHere);
20905         return delta;
20906      }
20907      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
20908      if (!haveF2(pfx) && haveF3(pfx)) {
20909         if (opc == 0xAE)
20910            sz = 1;
20911         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
20912                      guest_RIP_curr_instr,
20913                      guest_RIP_bbstart+delta, "repe scas", pfx );
20914         vassert(dres->whatNext == Dis_StopHere);
20915         return delta;
20916      }
20917      /* AE/AF: scasb/scas{w,l,q} */
20918      if (!haveF2(pfx) && !haveF3(pfx)) {
20919         if (opc == 0xAE)
20920            sz = 1;
20921         dis_string_op( dis_SCAS, sz, "scas", pfx );
20922         return delta;
20923      }
20924      goto decode_failure;
20925
20926   /* XXXX be careful here with moves to AH/BH/CH/DH */
20927   case 0xB0: /* MOV imm,AL */
20928   case 0xB1: /* MOV imm,CL */
20929   case 0xB2: /* MOV imm,DL */
20930   case 0xB3: /* MOV imm,BL */
20931   case 0xB4: /* MOV imm,AH */
20932   case 0xB5: /* MOV imm,CH */
20933   case 0xB6: /* MOV imm,DH */
20934   case 0xB7: /* MOV imm,BH */
20935      if (haveF2orF3(pfx)) goto decode_failure;
20936      d64 = getUChar(delta);
20937      delta += 1;
20938      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
20939      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
20940      return delta;
20941
20942   case 0xB8: /* MOV imm,eAX */
20943   case 0xB9: /* MOV imm,eCX */
20944   case 0xBA: /* MOV imm,eDX */
20945   case 0xBB: /* MOV imm,eBX */
20946   case 0xBC: /* MOV imm,eSP */
20947   case 0xBD: /* MOV imm,eBP */
20948   case 0xBE: /* MOV imm,eSI */
20949   case 0xBF: /* MOV imm,eDI */
20950      /* This is the one-and-only place where 64-bit literals are
20951         allowed in the instruction stream. */
20952      if (haveF2orF3(pfx)) goto decode_failure;
20953      if (sz == 8) {
20954         d64 = getDisp64(delta);
20955         delta += 8;
20956         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
20957         DIP("movabsq $%lld,%s\n", (Long)d64,
20958                                   nameIRegRexB(8,pfx,opc-0xB8));
20959      } else {
20960         d64 = getSDisp(imin(4,sz),delta);
20961         delta += imin(4,sz);
20962         putIRegRexB(sz, pfx, opc-0xB8,
20963                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
20964         DIP("mov%c $%lld,%s\n", nameISize(sz),
20965                                 (Long)d64,
20966                                 nameIRegRexB(sz,pfx,opc-0xB8));
20967      }
20968      return delta;
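
      /* Example encodings: "48 B8 <imm64>" is movabsq $imm64,%rax,
         the only form carrying a full 64-bit literal.  Plain
         "B8 <imm32>" (sz == 4) is movl $imm32,%eax, which zeroes the
         upper half of rAX by the usual 64-bit-mode subregister-write
         rule, and "66 B8 <imm16>" is the 16-bit form. */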
20969
20970   case 0xC0: { /* Grp2 Ib,Eb */
20971      Bool decode_OK = True;
20972      if (haveF2orF3(pfx)) goto decode_failure;
20973      modrm = getUChar(delta);
20974      am_sz = lengthAMode(pfx,delta);
20975      d_sz  = 1;
20976      d64   = getUChar(delta + am_sz);
20977      sz    = 1;
20978      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20979                         mkU8(d64 & 0xFF), NULL, &decode_OK );
20980      if (!decode_OK) goto decode_failure;
20981      return delta;
20982   }
20983
20984   case 0xC1: { /* Grp2 Ib,Ev */
20985      Bool decode_OK = True;
20986      if (haveF2orF3(pfx)) goto decode_failure;
20987      modrm = getUChar(delta);
20988      am_sz = lengthAMode(pfx,delta);
20989      d_sz  = 1;
20990      d64   = getUChar(delta + am_sz);
20991      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20992                         mkU8(d64 & 0xFF), NULL, &decode_OK );
20993      if (!decode_OK) goto decode_failure;
20994      return delta;
20995   }
20996
20997   case 0xC2: /* RET imm16 */
20998      if (have66orF3(pfx)) goto decode_failure;
20999      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21000      d64 = getUDisp16(delta);
21001      delta += 2;
21002      dis_ret(dres, vbi, d64);
21003      DIP("ret $%lld\n", d64);
21004      return delta;
21005
21006   case 0xC3: /* RET */
21007      if (have66(pfx)) goto decode_failure;
21008      /* F3 is acceptable on AMD. */
21009      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21010      dis_ret(dres, vbi, 0);
21011      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
21012      return delta;
21013
21014   case 0xC6: /* C6 /0 = MOV Ib,Eb */
21015      sz = 1;
21016      goto maybe_do_Mov_I_E;
21017   case 0xC7: /* C7 /0 = MOV Iv,Ev */
21018      goto maybe_do_Mov_I_E;
21019   maybe_do_Mov_I_E:
21020      modrm = getUChar(delta);
21021      if (gregLO3ofRM(modrm) == 0) {
21022         if (epartIsReg(modrm)) {
21023            /* Neither F2 nor F3 are allowable. */
21024            if (haveF2orF3(pfx)) goto decode_failure;
21025            delta++; /* mod/rm byte */
21026            d64 = getSDisp(imin(4,sz),delta);
21027            delta += imin(4,sz);
21028            putIRegE(sz, pfx, modrm,
21029                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
21030            DIP("mov%c $%lld, %s\n", nameISize(sz),
21031                                     (Long)d64,
21032                                     nameIRegE(sz,pfx,modrm));
21033         } else {
21034            if (haveF2(pfx)) goto decode_failure;
21035            /* F3(XRELEASE) is allowable here */
21036            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
21037                              /*xtra*/imin(4,sz) );
21038            delta += alen;
21039            d64 = getSDisp(imin(4,sz),delta);
21040            delta += imin(4,sz);
21041            storeLE(mkexpr(addr),
21042                    mkU(szToITy(sz), d64 & mkSizeMask(sz)));
21043            DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
21044         }
21045         return delta;
21046      }
21047      /* BEGIN HACKY SUPPORT FOR xbegin */
21048      if (opc == 0xC7 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4
21049          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21050         delta++; /* mod/rm byte */
21051         d64 = getSDisp(4,delta);
21052         delta += 4;
21053         guest_RIP_next_mustcheck = True;
21054         guest_RIP_next_assumed   = guest_RIP_bbstart + delta;
21055         Addr64 failAddr = guest_RIP_bbstart + delta + d64;
21056         /* EAX contains the failure status code.  Bit 3 is "Set if an
21057            internal buffer overflowed", which seems like the
21058            least-bogus choice we can make here. */
21059         putIRegRAX(4, mkU32(1<<3));
21060         /* And jump to the fail address. */
21061         jmp_lit(dres, Ijk_Boring, failAddr);
21062         vassert(dres->whatNext == Dis_StopHere);
21063         DIP("xbeginq 0x%llx\n", failAddr);
21064         return delta;
21065      }
21066      /* END HACKY SUPPORT FOR xbegin */
21067      /* BEGIN HACKY SUPPORT FOR xabort */
21068      if (opc == 0xC6 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 1
21069          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21070         delta++; /* mod/rm byte */
21071         abyte = getUChar(delta); delta++;
21072         /* There is never a real transaction in progress, so do nothing. */
21073         DIP("xabort $%d", (Int)abyte);
21074         return delta;
21075      }
21076      /* END HACKY SUPPORT FOR xabort */
21077      goto decode_failure;
21078
21079   case 0xC8: /* ENTER */
21080      /* Same comments re operand size as for LEAVE below apply.
21081         Also, only handles the case "enter $imm16, $0"; other cases
21082         for the second operand (nesting depth) are not handled. */
21083      if (sz != 4)
21084         goto decode_failure;
21085      d64 = getUDisp16(delta);
21086      delta += 2;
21087      vassert(d64 >= 0 && d64 <= 0xFFFF);
21088      if (getUChar(delta) != 0)
21089         goto decode_failure;
21090      delta++;
21091      /* Intel docs seem to suggest:
21092           push rbp
21093           temp = rsp
21094           rbp = temp
21095           rsp = rsp - imm16
21096      */
21097      t1 = newTemp(Ity_I64);
21098      assign(t1, getIReg64(R_RBP));
21099      t2 = newTemp(Ity_I64);
21100      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
21101      putIReg64(R_RSP, mkexpr(t2));
21102      storeLE(mkexpr(t2), mkexpr(t1));
21103      putIReg64(R_RBP, mkexpr(t2));
21104      if (d64 > 0) {
21105         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
21106      }
21107      DIP("enter $%u, $0\n", (UInt)d64);
21108      return delta;
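
      /* That is, the "enter $imm16, $0" handled here is equivalent
         to the sequence
              pushq %rbp
              movq  %rsp, %rbp
              subq  $imm16, %rsp
         Forms with a nonzero nesting depth are rejected above. */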
21109
21110   case 0xC9: /* LEAVE */
21111      /* In 64-bit mode this defaults to a 64-bit operand size.  There
21112         is no way to encode a 32-bit variant.  Hence sz==4 but we do
21113         it as if sz=8. */
21114      if (sz != 4)
21115         goto decode_failure;
21116      t1 = newTemp(Ity_I64);
21117      t2 = newTemp(Ity_I64);
21118      assign(t1, getIReg64(R_RBP));
21119      /* First PUT RSP looks redundant, but need it because RSP must
21120         always be up-to-date for Memcheck to work... */
21121      putIReg64(R_RSP, mkexpr(t1));
21122      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
21123      putIReg64(R_RBP, mkexpr(t2));
21124      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
21125      DIP("leave\n");
21126      return delta;
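
      /* Equivalent sequence: "movq %rbp, %rsp ; popq %rbp".  Note
         the saved RBP is loaded via t1 (the old RBP), and the final
         RSP is old-RBP + 8, exactly as a pop would leave it. */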
21127
21128   case 0xCC: /* INT 3 */
21129      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
21130      vassert(dres->whatNext == Dis_StopHere);
21131      DIP("int $0x3\n");
21132      return delta;
21133
21134   case 0xCD: /* INT imm8 */
21135      d64 = getUChar(delta); delta++;
21136
21137      /* Handle int $0xD2 (Solaris fasttrap syscalls). */
21138      if (d64 == 0xD2) {
21139         jmp_lit(dres, Ijk_Sys_int210, guest_RIP_bbstart + delta);
21140         vassert(dres->whatNext == Dis_StopHere);
21141         DIP("int $0xD2\n");
21142         return delta;
21143      }
21144      goto decode_failure;
21145
21146   case 0xD0: { /* Grp2 1,Eb */
21147      Bool decode_OK = True;
21148      if (haveF2orF3(pfx)) goto decode_failure;
21149      modrm = getUChar(delta);
21150      am_sz = lengthAMode(pfx,delta);
21151      d_sz  = 0;
21152      d64   = 1;
21153      sz    = 1;
21154      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
21155                         mkU8(d64), NULL, &decode_OK );
21156      if (!decode_OK) goto decode_failure;
21157      return delta;
21158   }
21159
21160   case 0xD1: { /* Grp2 1,Ev */
21161      Bool decode_OK = True;
21162      if (haveF2orF3(pfx)) goto decode_failure;
21163      modrm = getUChar(delta);
21164      am_sz = lengthAMode(pfx,delta);
21165      d_sz  = 0;
21166      d64   = 1;
21167      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
21168                         mkU8(d64), NULL, &decode_OK );
21169      if (!decode_OK) goto decode_failure;
21170      return delta;
21171   }
21172
21173   case 0xD2: { /* Grp2 CL,Eb */
21174      Bool decode_OK = True;
21175      if (haveF2orF3(pfx)) goto decode_failure;
21176      modrm = getUChar(delta);
21177      am_sz = lengthAMode(pfx,delta);
21178      d_sz  = 0;
21179      sz    = 1;
21180      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
21181                         getIRegCL(), "%cl", &decode_OK );
21182      if (!decode_OK) goto decode_failure;
21183      return delta;
21184   }
21185
21186   case 0xD3: { /* Grp2 CL,Ev */
21187      Bool decode_OK = True;
21188      if (haveF2orF3(pfx)) goto decode_failure;
21189      modrm = getUChar(delta);
21190      am_sz = lengthAMode(pfx,delta);
21191      d_sz  = 0;
21192      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
21193                         getIRegCL(), "%cl", &decode_OK );
21194      if (!decode_OK) goto decode_failure;
21195      return delta;
21196   }
21197
21198   case 0xD8: /* X87 instructions */
21199   case 0xD9:
21200   case 0xDA:
21201   case 0xDB:
21202   case 0xDC:
21203   case 0xDD:
21204   case 0xDE:
21205   case 0xDF: {
21206      Bool redundantREXWok = False;
21207
21208      if (haveF2orF3(pfx))
21209         goto decode_failure;
21210
21211      /* kludge to tolerate redundant rex.w prefixes (should do this
21212         properly one day) */
21213      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
21214      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
21215         redundantREXWok = True;
21216
21217      Bool size_OK = False;
21218      if ( sz == 4 )
21219         size_OK = True;
21220      else if ( sz == 8 )
21221         size_OK = redundantREXWok;
21222      else if ( sz == 2 ) {
         UChar mod_rm = getUChar(delta+0);
         Int   reg    = gregLO3ofRM(mod_rm);
21225         /* The HotSpot JVM uses these */
21226         if ( (opc == 0xDD) && (reg == 0 /* FLDL   */ ||
21227                                reg == 4 /* FNSAVE */ ||
21228                                reg == 6 /* FRSTOR */ ) )
21229            size_OK = True;
21230      }
21231      /* AMD manual says 0x66 size override is ignored, except where
21232         it is meaningful */
21233      if (!size_OK)
21234         goto decode_failure;
21235
21236      Bool decode_OK = False;
21237      delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
21238      if (!decode_OK)
21239         goto decode_failure;
21240
21241      return delta;
21242   }
21243
21244   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
21245   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
21246   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as a count depending on the
         address-size override, not the operand-size one. */
21249      IRExpr* zbit  = NULL;
21250      IRExpr* count = NULL;
21251      IRExpr* cond  = NULL;
21252      const HChar* xtra = NULL;
21253
21254      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
21255      /* So at this point we've rejected any variants which appear to
21256         be governed by the usual operand-size modifiers.  Hence only
21257         the address size prefix can have an effect.  It changes the
21258         size from 64 (default) to 32. */
21259      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
21260      delta++;
21261      if (haveASO(pfx)) {
21262         /* 64to32 of 64-bit get is merely a get-put improvement
21263            trick. */
21264         putIReg32(R_RCX, binop(Iop_Sub32,
21265                                unop(Iop_64to32, getIReg64(R_RCX)),
21266                                mkU32(1)));
21267      } else {
21268         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
21269      }
21270
21271      /* This is correct, both for 32- and 64-bit versions.  If we're
21272         doing a 32-bit dec and the result is zero then the default
21273         zero extension rule will cause the upper 32 bits to be zero
21274         too.  Hence a 64-bit check against zero is OK. */
21275      count = getIReg64(R_RCX);
21276      cond = binop(Iop_CmpNE64, count, mkU64(0));
21277      switch (opc) {
21278         case 0xE2:
21279            xtra = "";
21280            break;
21281         case 0xE1:
21282            xtra = "e";
21283            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
21284            cond = mkAnd1(cond, zbit);
21285            break;
21286         case 0xE0:
21287            xtra = "ne";
21288            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
21289            cond = mkAnd1(cond, zbit);
21290            break;
21291         default:
21292            vassert(0);
21293      }
21294      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );
21295
21296      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", (ULong)d64);
21297      return delta;
21298    }
21299
21300   case 0xE3:
      /* JRCXZ or JECXZ, depending on the address size override. */
21302      if (have66orF2orF3(pfx)) goto decode_failure;
21303      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
21304      delta++;
21305      if (haveASO(pfx)) {
21306         /* 32-bit */
21307         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
21308                                  unop(Iop_32Uto64, getIReg32(R_RCX)),
21309                                  mkU64(0)),
21310                            Ijk_Boring,
21311                            IRConst_U64(d64),
21312                            OFFB_RIP
21313             ));
21314         DIP("jecxz 0x%llx\n", (ULong)d64);
21315      } else {
21316         /* 64-bit */
21317         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
21318                                  getIReg64(R_RCX),
21319                                  mkU64(0)),
21320                            Ijk_Boring,
21321                            IRConst_U64(d64),
21322                            OFFB_RIP
21323               ));
21324         DIP("jrcxz 0x%llx\n", (ULong)d64);
21325      }
21326      return delta;
21327
21328   case 0xE4: /* IN imm8, AL */
21329      sz = 1;
21330      t1 = newTemp(Ity_I64);
21331      abyte = getUChar(delta); delta++;
21332      assign(t1, mkU64( abyte & 0xFF ));
21333      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
21334      goto do_IN;
21335   case 0xE5: /* IN imm8, eAX */
21336      if (!(sz == 2 || sz == 4)) goto decode_failure;
21337      t1 = newTemp(Ity_I64);
21338      abyte = getUChar(delta); delta++;
21339      assign(t1, mkU64( abyte & 0xFF ));
21340      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
21341      goto do_IN;
21342   case 0xEC: /* IN %DX, AL */
21343      sz = 1;
21344      t1 = newTemp(Ity_I64);
21345      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
21346      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
21347                                         nameIRegRAX(sz));
21348      goto do_IN;
21349   case 0xED: /* IN %DX, eAX */
21350      if (!(sz == 2 || sz == 4)) goto decode_failure;
21351      t1 = newTemp(Ity_I64);
21352      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
21353      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
21354                                         nameIRegRAX(sz));
21355      goto do_IN;
21356   do_IN: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving the port number. */
21359      IRDirty* d;
21360      if (haveF2orF3(pfx)) goto decode_failure;
21361      vassert(sz == 1 || sz == 2 || sz == 4);
21362      ty = szToITy(sz);
21363      t2 = newTemp(Ity_I64);
21364      d = unsafeIRDirty_1_N(
21365             t2,
21366             0/*regparms*/,
21367             "amd64g_dirtyhelper_IN",
21368             &amd64g_dirtyhelper_IN,
21369             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
21370          );
21371      /* do the call, dumping the result in t2. */
21372      stmt( IRStmt_Dirty(d) );
21373      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
21374      return delta;
21375   }
21376
21377   case 0xE6: /* OUT AL, imm8 */
21378      sz = 1;
21379      t1 = newTemp(Ity_I64);
21380      abyte = getUChar(delta); delta++;
21381      assign( t1, mkU64( abyte & 0xFF ) );
21382      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
21383      goto do_OUT;
21384   case 0xE7: /* OUT eAX, imm8 */
21385      if (!(sz == 2 || sz == 4)) goto decode_failure;
21386      t1 = newTemp(Ity_I64);
21387      abyte = getUChar(delta); delta++;
21388      assign( t1, mkU64( abyte & 0xFF ) );
21389      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
21390      goto do_OUT;
21391   case 0xEE: /* OUT AL, %DX */
21392      sz = 1;
21393      t1 = newTemp(Ity_I64);
21394      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
21395      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
21396                                          nameIRegRDX(2));
21397      goto do_OUT;
21398   case 0xEF: /* OUT eAX, %DX */
21399      if (!(sz == 2 || sz == 4)) goto decode_failure;
21400      t1 = newTemp(Ity_I64);
21401      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
21402      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
21403                                          nameIRegRDX(2));
21404      goto do_OUT;
21405   do_OUT: {
      /* At this point, sz indicates the width, and t1 is a 64-bit
         value giving the port number. */
21408      IRDirty* d;
21409      if (haveF2orF3(pfx)) goto decode_failure;
21410      vassert(sz == 1 || sz == 2 || sz == 4);
21411      ty = szToITy(sz);
21412      d = unsafeIRDirty_0_N(
21413             0/*regparms*/,
21414             "amd64g_dirtyhelper_OUT",
21415             &amd64g_dirtyhelper_OUT,
21416             mkIRExprVec_3( mkexpr(t1),
21417                            widenUto64( getIRegRAX(sz) ),
21418                            mkU64(sz) )
21419          );
21420      stmt( IRStmt_Dirty(d) );
21421      return delta;
21422   }
21423
21424   case 0xE8: /* CALL J4 */
21425      if (haveF3(pfx)) goto decode_failure;
21426      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21427      d64 = getSDisp32(delta); delta += 4;
21428      d64 += (guest_RIP_bbstart+delta);
21429      /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
21430      t1 = newTemp(Ity_I64);
21431      assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
21432      putIReg64(R_RSP, mkexpr(t1));
21433      storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
21434      t2 = newTemp(Ity_I64);
21435      assign(t2, mkU64((Addr64)d64));
21436      make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
21437      if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
21438         /* follow into the call target. */
21439         dres->whatNext   = Dis_ResteerU;
21440         dres->continueAt = d64;
21441      } else {
21442         jmp_lit(dres, Ijk_Call, d64);
21443         vassert(dres->whatNext == Dis_StopHere);
21444      }
21445      DIP("call 0x%llx\n", (ULong)d64);
21446      return delta;
21447
21448   case 0xE9: /* Jv (jump, 16/32 offset) */
21449      if (haveF3(pfx)) goto decode_failure;
21450      if (sz != 4)
21451         goto decode_failure; /* JRS added 2004 July 11 */
21452      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21453      d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
21454      delta += sz;
21455      if (resteerOkFn(callback_opaque, (Addr64)d64)) {
21456         dres->whatNext   = Dis_ResteerU;
21457         dres->continueAt = d64;
21458      } else {
21459         jmp_lit(dres, Ijk_Boring, d64);
21460         vassert(dres->whatNext == Dis_StopHere);
21461      }
21462      DIP("jmp 0x%llx\n", (ULong)d64);
21463      return delta;
21464
21465   case 0xEB: /* Jb (jump, byte offset) */
21466      if (haveF3(pfx)) goto decode_failure;
21467      if (sz != 4)
21468         goto decode_failure; /* JRS added 2004 July 11 */
21469      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21470      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
21471      delta++;
21472      if (resteerOkFn(callback_opaque, (Addr64)d64)) {
21473         dres->whatNext   = Dis_ResteerU;
21474         dres->continueAt = d64;
21475      } else {
21476         jmp_lit(dres, Ijk_Boring, d64);
21477         vassert(dres->whatNext == Dis_StopHere);
21478      }
21479      DIP("jmp-8 0x%llx\n", (ULong)d64);
21480      return delta;
21481
21482   case 0xF5: /* CMC */
21483   case 0xF8: /* CLC */
21484   case 0xF9: /* STC */
21485      t1 = newTemp(Ity_I64);
21486      t2 = newTemp(Ity_I64);
21487      assign( t1, mk_amd64g_calculate_rflags_all() );
21488      switch (opc) {
21489         case 0xF5:
21490            assign( t2, binop(Iop_Xor64, mkexpr(t1),
21491                                         mkU64(AMD64G_CC_MASK_C)));
21492            DIP("cmc\n");
21493            break;
21494         case 0xF8:
21495            assign( t2, binop(Iop_And64, mkexpr(t1),
21496                                         mkU64(~AMD64G_CC_MASK_C)));
21497            DIP("clc\n");
21498            break;
21499         case 0xF9:
21500            assign( t2, binop(Iop_Or64, mkexpr(t1),
21501                                        mkU64(AMD64G_CC_MASK_C)));
21502            DIP("stc\n");
21503            break;
21504         default:
21505            vpanic("disInstr(x64)(cmc/clc/stc)");
21506      }
21507      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
21508      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
21509      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
21510      /* Set NDEP even though it isn't used.  This makes redundant-PUT
21511         elimination of previous stores to this field work better. */
21512      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
21513      return delta;
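
      /* All three work by materialising the full flags word,
         flipping/clearing/setting just the carry bit in it, and then
         switching the thunk to AMD64G_CC_OP_COPY, in which DEP1
         simply holds the literal flag values.  The other flags are
         therefore preserved exactly. */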
21514
21515   case 0xF6: { /* Grp3 Eb */
21516      Bool decode_OK = True;
21517      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
21518      /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
21519      delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
21520      if (!decode_OK) goto decode_failure;
21521      return delta;
21522   }
21523
21524   case 0xF7: { /* Grp3 Ev */
21525      Bool decode_OK = True;
21526      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
21527      /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
21528      delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
21529      if (!decode_OK) goto decode_failure;
21530      return delta;
21531   }
21532
21533   case 0xFC: /* CLD */
21534      if (haveF2orF3(pfx)) goto decode_failure;
21535      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
21536      DIP("cld\n");
21537      return delta;
21538
21539   case 0xFD: /* STD */
21540      if (haveF2orF3(pfx)) goto decode_failure;
21541      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
21542      DIP("std\n");
21543      return delta;
21544
21545   case 0xFE: { /* Grp4 Eb */
21546      Bool decode_OK = True;
21547      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
21548      /* We now let dis_Grp4 itself decide if F2 and/or F3 are valid */
21549      delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
21550      if (!decode_OK) goto decode_failure;
21551      return delta;
21552   }
21553
21554   case 0xFF: { /* Grp5 Ev */
21555      Bool decode_OK = True;
21556      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
21557      /* We now let dis_Grp5 itself decide if F2 and/or F3 are valid */
21558      delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
21559      if (!decode_OK) goto decode_failure;
21560      return delta;
21561   }
21562
21563   default:
21564      break;
21565
21566   }
21567
21568  decode_failure:
21569   return deltaIN; /* fail */
21570}
21571
21572
21573/*------------------------------------------------------------*/
21574/*---                                                      ---*/
21575/*--- Top-level post-escape decoders: dis_ESC_0F           ---*/
21576/*---                                                      ---*/
21577/*------------------------------------------------------------*/
21578
21579static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
21580{
21581   IRTemp t2 = newTemp(ty);
21582   if (ty == Ity_I64) {
21583      IRTemp m8  = newTemp(Ity_I64);
21584      IRTemp s8  = newTemp(Ity_I64);
21585      IRTemp m16 = newTemp(Ity_I64);
21586      IRTemp s16 = newTemp(Ity_I64);
21587      IRTemp m32 = newTemp(Ity_I64);
21588      assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
21589      assign( s8,
21590              binop(Iop_Or64,
21591                    binop(Iop_Shr64,
21592                          binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
21593                          mkU8(8)),
21594                    binop(Iop_And64,
21595                          binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
21596                          mkexpr(m8))
21597                   )
21598            );
21599
21600      assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
21601      assign( s16,
21602              binop(Iop_Or64,
21603                    binop(Iop_Shr64,
21604                          binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
21605                          mkU8(16)),
21606                    binop(Iop_And64,
21607                          binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
21608                          mkexpr(m16))
21609                   )
21610            );
21611
21612      assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
21613      assign( t2,
21614              binop(Iop_Or64,
21615                    binop(Iop_Shr64,
21616                          binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
21617                          mkU8(32)),
21618                    binop(Iop_And64,
21619                          binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
21620                          mkexpr(m32))
21621                   )
21622            );
21623      return t2;
21624   }
21625   if (ty == Ity_I32) {
21626      assign( t2,
21627         binop(
21628            Iop_Or32,
21629            binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
21630            binop(
21631               Iop_Or32,
21632               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
21633                                mkU32(0x00FF0000)),
21634               binop(Iop_Or32,
21635                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
21636                                      mkU32(0x0000FF00)),
21637                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
21638                                      mkU32(0x000000FF) )
21639            )))
21640      );
21641      return t2;
21642   }
21643   if (ty == Ity_I16) {
21644      assign(t2,
21645             binop(Iop_Or16,
21646                   binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
21647                   binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
21648      return t2;
21649   }
21650   vassert(0);
21651   /*NOTREACHED*/
21652   return IRTemp_INVALID;
21653}
21654
21655
21656__attribute__((noinline))
21657static
21658Long dis_ESC_0F (
21659        /*MB_OUT*/DisResult* dres,
21660        /*MB_OUT*/Bool*      expect_CAS,
21661        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
21662        Bool         resteerCisOk,
21663        void*        callback_opaque,
21664        const VexArchInfo* archinfo,
21665        const VexAbiInfo*  vbi,
21666        Prefix pfx, Int sz, Long deltaIN
21667     )
21668{
21669   Long   d64   = 0;
21670   IRTemp addr  = IRTemp_INVALID;
21671   IRTemp t1    = IRTemp_INVALID;
21672   IRTemp t2    = IRTemp_INVALID;
21673   UChar  modrm = 0;
21674   Int    am_sz = 0;
21675   Int    alen  = 0;
21676   HChar  dis_buf[50];
21677
21678   /* In the first switch, look for ordinary integer insns. */
21679   Long   delta = deltaIN;
21680   UChar  opc   = getUChar(delta);
21681   delta++;
21682   switch (opc) { /* first switch */
21683
21684   case 0x01:
21685   {
21686      modrm = getUChar(delta);
21687      /* 0F 01 /0 -- SGDT */
21688      /* 0F 01 /1 -- SIDT */
21689      if (!epartIsReg(modrm)
21690          && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
21691         /* This is really revolting, but ... since each processor
21692            (core) only has one IDT and one GDT, just let the guest
21693            see it (pass-through semantics).  I can't see any way to
21694            construct a faked-up value, so don't bother to try. */
21695         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21696         delta += alen;
21697         switch (gregLO3ofRM(modrm)) {
21698            case 0: DIP("sgdt %s\n", dis_buf); break;
21699            case 1: DIP("sidt %s\n", dis_buf); break;
21700            default: vassert(0); /*NOTREACHED*/
21701         }
21702         IRDirty* d = unsafeIRDirty_0_N (
21703                          0/*regparms*/,
21704                          "amd64g_dirtyhelper_SxDT",
21705                          &amd64g_dirtyhelper_SxDT,
21706                          mkIRExprVec_2( mkexpr(addr),
21707                                         mkU64(gregLO3ofRM(modrm)) )
21708                      );
21709         /* declare we're writing memory */
21710         d->mFx   = Ifx_Write;
21711         d->mAddr = mkexpr(addr);
21712         d->mSize = 6;
21713         stmt( IRStmt_Dirty(d) );
21714         return delta;
21715      }
21716      /* 0F 01 D0 = XGETBV */
21717      if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21718         delta += 1;
21719         DIP("xgetbv\n");
21720         /* Fault (SEGV) if ECX isn't zero.  Intel docs say #GP and I
21721            am not sure whether that translates into SEGV or something
21722            else, in user space. */
21723         t1 = newTemp(Ity_I32);
21724         assign( t1, getIReg32(R_RCX) );
21725         stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
21726                           Ijk_SigSEGV,
21727                           IRConst_U64(guest_RIP_curr_instr),
21728                           OFFB_RIP
21729         ));
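              /* XGETBV with ECX == 0 returns XCR0 in EDX:EAX.  Report
                 XCR0 = 7, i.e. x87, SSE and AVX state enabled, matching
                 the AVX-capable guest modelled here. */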
21730         putIRegRAX(4, mkU32(7));
21731         putIRegRDX(4, mkU32(0));
21732         return delta;
21733      }
21734      /* BEGIN HACKY SUPPORT FOR xend */
21735      /* 0F 01 D5 = XEND */
21736      if (modrm == 0xD5 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21737         /* We are never in a transaction (xbegin immediately aborts).
21738            So this just always generates a General Protection Fault. */
21739         delta += 1;
21740         jmp_lit(dres, Ijk_SigSEGV, guest_RIP_bbstart + delta);
21741         vassert(dres->whatNext == Dis_StopHere);
21742         DIP("xend\n");
21743         return delta;
21744      }
21745      /* END HACKY SUPPORT FOR xend */
21746      /* BEGIN HACKY SUPPORT FOR xtest */
21747      /* 0F 01 D6 = XTEST */
21748      if (modrm == 0xD6 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21749         /* Set ZF, since there is never a transaction in progress;
21750            CF, OF, SF, PF and AF are always cleared by xtest. */
21751         delta += 1;
21752         DIP("xtest\n");
21753         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
21754         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
21755         stmt( IRStmt_Put( OFFB_CC_DEP1, mkU64(AMD64G_CC_MASK_Z) ));
21756         /* Set NDEP even though it isn't used.  This makes redundant-PUT
21757            elimination of previous stores to this field work better. */
21758         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
21759         return delta;
21760      }
21761      /* END HACKY SUPPORT FOR xtest */
21762      /* 0F 01 F9 = RDTSCP */
21763      if (modrm == 0xF9 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDTSCP)) {
21764         delta += 1;
21765         /* Uses dirty helper:
21766            void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* )
21767            declared to wr rax, rcx, rdx
21768         */
21769         const HChar* fName = "amd64g_dirtyhelper_RDTSCP";
21770         void*        fAddr = &amd64g_dirtyhelper_RDTSCP;
21771         IRDirty* d
21772            = unsafeIRDirty_0_N ( 0/*regparms*/,
21773                                  fName, fAddr, mkIRExprVec_1(IRExpr_GSPTR()) );
21774         /* declare guest state effects */
21775         d->nFxState = 3;
21776         vex_bzero(&d->fxState, sizeof(d->fxState));
21777         d->fxState[0].fx     = Ifx_Write;
21778         d->fxState[0].offset = OFFB_RAX;
21779         d->fxState[0].size   = 8;
21780         d->fxState[1].fx     = Ifx_Write;
21781         d->fxState[1].offset = OFFB_RCX;
21782         d->fxState[1].size   = 8;
21783         d->fxState[2].fx     = Ifx_Write;
21784         d->fxState[2].offset = OFFB_RDX;
21785         d->fxState[2].size   = 8;
21786         /* execute the dirty call, side-effecting guest state */
21787         stmt( IRStmt_Dirty(d) );
21788         /* RDTSCP is a serialising insn.  So, just in case someone is
21789            using it as a memory fence ... */
21790         stmt( IRStmt_MBE(Imbe_Fence) );
21791         DIP("rdtscp\n");
21792         return delta;
21793      }
21794      /* else decode failed */
21795      break;
21796   }
21797
21798   case 0x05: /* SYSCALL */
21799      guest_RIP_next_mustcheck = True;
21800      guest_RIP_next_assumed = guest_RIP_bbstart + delta;
21801      putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
21802      /* It's important that all guest state is up-to-date
21803         at this point.  So we declare an end-of-block here, which
21804         forces any cached guest state to be flushed. */
21805      jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
21806      vassert(dres->whatNext == Dis_StopHere);
21807      DIP("syscall\n");
21808      return delta;
21809
21810   case 0x0B: /* UD2 */
21811      stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
21812      jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
21813      vassert(dres->whatNext == Dis_StopHere);
21814      DIP("ud2\n");
21815      return delta;
21816
21817   case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
21818              /* 0F 0D /1 -- prefetchw mem8 */
21819      if (have66orF2orF3(pfx)) goto decode_failure;
21820      modrm = getUChar(delta);
21821      if (epartIsReg(modrm)) goto decode_failure;
21822      if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
21823         goto decode_failure;
21824      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21825      delta += alen;
21826      switch (gregLO3ofRM(modrm)) {
21827         case 0: DIP("prefetch %s\n", dis_buf); break;
21828         case 1: DIP("prefetchw %s\n", dis_buf); break;
21829         default: vassert(0); /*NOTREACHED*/
21830      }
21831      return delta;
21832
21833   case 0x19:
21834   case 0x1C:
21835   case 0x1D:
21836   case 0x1E:
21837   case 0x1F:
21838      // Intel CET instructions can have any prefixes before the NOP
21839      // and can use any ModRM, SIB and displacement bytes.
21840      modrm = getUChar(delta);
21841      if (epartIsReg(modrm)) {
21842         delta += 1;
21843         DIP("nop%c\n", nameISize(sz));
21844      } else {
21845         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21846         delta += alen;
21847         DIP("nop%c %s\n", nameISize(sz), dis_buf);
21848      }
21849      return delta;
21850
21851   case 0x31: { /* RDTSC */
21852      IRTemp   val  = newTemp(Ity_I64);
21853      IRExpr** args = mkIRExprVec_0();
21854      IRDirty* d    = unsafeIRDirty_1_N (
21855                         val,
21856                         0/*regparms*/,
21857                         "amd64g_dirtyhelper_RDTSC",
21858                         &amd64g_dirtyhelper_RDTSC,
21859                         args
21860                      );
21861      if (have66orF2orF3(pfx)) goto decode_failure;
21862      /* execute the dirty call, dumping the result in val. */
21863      stmt( IRStmt_Dirty(d) );
21864      putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
21865      putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
21866      DIP("rdtsc\n");
21867      return delta;
21868   }
21869
21870   case 0x40: /* CMOVOb (cmov overflow) */
21871   case 0x41: /* CMOVNOb (cmov no overflow) */
21872   case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
21873   case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
21874   case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
21875   case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
21876   case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
21877   case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
21878   case 0x48: /* CMOVSb (cmov negative) */
21879   case 0x49: /* CMOVNSb (cmov not negative) */
21880   case 0x4A: /* CMOVP (cmov parity even) */
21881   case 0x4B: /* CMOVNP (cmov parity odd) */
21882   case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
21883   case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
21884   case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
21885   case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
21886      if (haveF2orF3(pfx)) goto decode_failure;
21887      delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
21888      return delta;
21889
21890   case 0x80:   /* JOb (jump overflow) */
21891   case 0x81:   /* JNOb (jump no overflow) */
21892   case 0x82:   /* JBb/JNAEb (jump below) */
21893   case 0x83:   /* JNBb/JAEb (jump not below) */
21894   case 0x84:   /* JZb/JEb (jump zero) */
21895   case 0x85:   /* JNZb/JNEb (jump not zero) */
21896   case 0x86:   /* JBEb/JNAb (jump below or equal) */
21897   case 0x87:   /* JNBEb/JAb (jump not below or equal) */
21898   case 0x88:   /* JSb (jump negative) */
21899   case 0x89:   /* JNSb (jump not negative) */
21900   case 0x8A:   /* JP (jump parity even) */
21901   case 0x8B:   /* JNP/JPO (jump parity odd) */
21902   case 0x8C:   /* JLb/JNGEb (jump less) */
21903   case 0x8D:   /* JGEb/JNLb (jump greater or equal) */
21904   case 0x8E:   /* JLEb/JNGb (jump less or equal) */
21905   case 0x8F: { /* JGb/JNLEb (jump greater) */
21906      Long   jmpDelta;
21907      const HChar* comment  = "";
21908      if (haveF3(pfx)) goto decode_failure;
21909      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21910      jmpDelta = getSDisp32(delta);
21911      d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
21912      delta += 4;
21913      if (resteerCisOk
21914          && vex_control.guest_chase_cond
21915          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
21916          && jmpDelta < 0
21917          && resteerOkFn( callback_opaque, (Addr64)d64) ) {
21918         /* Speculation: assume this backward branch is taken.  So
21919            we need to emit a side-exit to the insn following this
21920            one, on the negation of the condition, and continue at
21921            the branch target address (d64).  If we wind up back at
21922            the first instruction of the trace, just stop; it's
21923            better to let the IR loop unroller handle that case. */
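              /* Negating a condition is just a matter of flipping the
                 bottom bit of its AMD64Condcode, since the codes come in
                 complementary even/odd pairs: for JZ (opc 0x84, condition
                 Z = 4), the side-exit below tests 1 ^ 4 = 5, i.e. NZ. */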
21924         stmt( IRStmt_Exit(
21925                  mk_amd64g_calculate_condition(
21926                     (AMD64Condcode)(1 ^ (opc - 0x80))),
21927                  Ijk_Boring,
21928                  IRConst_U64(guest_RIP_bbstart+delta),
21929                  OFFB_RIP
21930             ));
21931         dres->whatNext   = Dis_ResteerC;
21932         dres->continueAt = d64;
21933         comment = "(assumed taken)";
21934      }
21935      else
21936      if (resteerCisOk
21937          && vex_control.guest_chase_cond
21938          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
21939          && jmpDelta >= 0
21940          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
21941         /* Speculation: assume this forward branch is not taken.
21942            So we need to emit a side-exit to d64 (the dest) and
21943            continue disassembling at the insn immediately
21944            following this one. */
21945         stmt( IRStmt_Exit(
21946                  mk_amd64g_calculate_condition((AMD64Condcode)
21947                                                (opc - 0x80)),
21948                  Ijk_Boring,
21949                  IRConst_U64(d64),
21950                  OFFB_RIP
21951             ));
21952         dres->whatNext   = Dis_ResteerC;
21953         dres->continueAt = guest_RIP_bbstart+delta;
21954         comment = "(assumed not taken)";
21955      }
21956      else {
21957         /* Conservative default translation - end the block at
21958            this point. */
21959         jcc_01( dres, (AMD64Condcode)(opc - 0x80),
21960                 guest_RIP_bbstart+delta, d64 );
21961         vassert(dres->whatNext == Dis_StopHere);
21962      }
21963      DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), (ULong)d64,
21964          comment);
21965      return delta;
21966   }
21967
21968   case 0x90: /* set-Ob (set if overflow) */
21969   case 0x91: /* set-NOb (set if no overflow) */
21970   case 0x92: /* set-Bb/set-NAEb (set if below) */
21971   case 0x93: /* set-NBb/set-AEb (set if not below) */
21972   case 0x94: /* set-Zb/set-Eb (set if zero) */
21973   case 0x95: /* set-NZb/set-NEb (set if not zero) */
21974   case 0x96: /* set-BEb/set-NAb (set if below or equal) */
21975   case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
21976   case 0x98: /* set-Sb (set if negative) */
21977   case 0x99: /* set-NSb (set if not negative) */
21978   case 0x9A: /* set-P (set if parity even) */
21979   case 0x9B: /* set-NP (set if parity odd) */
21980   case 0x9C: /* set-Lb/set-NGEb (set if less) */
21981   case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
21982   case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
21983   case 0x9F: /* set-Gb/set-NLEb (set if greater) */
21984      if (haveF2orF3(pfx)) goto decode_failure;
21985      t1 = newTemp(Ity_I8);
21986      assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
21987      modrm = getUChar(delta);
21988      if (epartIsReg(modrm)) {
21989         delta++;
21990         putIRegE(1, pfx, modrm, mkexpr(t1));
21991         DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
21992                           nameIRegE(1,pfx,modrm));
21993      } else {
21994         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21995         delta += alen;
21996         storeLE( mkexpr(addr), mkexpr(t1) );
21997         DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
21998      }
21999      return delta;
22000
22001   case 0x1A:
22002   case 0x1B: { /* Future MPX instructions, currently NOPs.
22003                   BNDMK b, m     F3 0F 1B
22004                   BNDCL b, r/m   F3 0F 1A
22005                   BNDCU b, r/m   F2 0F 1A
22006                   BNDCN b, r/m   F2 0F 1B
22007                   BNDMOV b, b/m  66 0F 1A
22008                   BNDMOV b/m, b  66 0F 1B
22009                   BNDLDX b, mib     0F 1A
22010                   BNDSTX mib, b     0F 1B */
22011
22012      /* All instructions have two operands. One operand is always the
22013         bnd register number (bnd0-bnd3, other register numbers are
22014         ignored when MPX isn't enabled, but should generate an
22015         exception if MPX is enabled) given by gregOfRexRM. The other
22016         operand is either a ModRM:reg, ModRM:r/m or a SIB encoded
22017         address, all of which can be decoded by using either
22018         eregOfRexRM or disAMode. */
22019
22020      modrm = getUChar(delta);
22021      Int bnd = gregOfRexRM(pfx,modrm);
22022      const HChar *oper;
22023      if (epartIsReg(modrm)) {
22024         oper = nameIReg64 (eregOfRexRM(pfx,modrm));
22025         delta += 1;
22026      } else {
22027         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22028         delta += alen;
22029         oper = dis_buf;
22030      }
22031
22032      if (haveF3no66noF2 (pfx)) {
22033         if (opc == 0x1B) {
22034            DIP ("bndmk %s, %%bnd%d\n", oper, bnd);
22035         } else /* opc == 0x1A */ {
22036            DIP ("bndcl %s, %%bnd%d\n", oper, bnd);
22037         }
22038      } else if (haveF2no66noF3 (pfx)) {
22039         if (opc == 0x1A) {
22040            DIP ("bndcu %s, %%bnd%d\n", oper, bnd);
22041         } else /* opc == 0x1B */ {
22042            DIP ("bndcn %s, %%bnd%d\n", oper, bnd);
22043         }
22044      } else if (have66noF2noF3 (pfx)) {
22045         if (opc == 0x1A) {
22046            DIP ("bndmov %s, %%bnd%d\n", oper, bnd);
22047         } else /* opc == 0x1B */ {
22048            DIP ("bndmov %%bnd%d, %s\n", bnd, oper);
22049         }
22050      } else if (haveNo66noF2noF3 (pfx)) {
22051         if (opc == 0x1A) {
22052            DIP ("bndldx %s, %%bnd%d\n", oper, bnd);
22053         } else /* opc == 0x1B */ {
22054            DIP ("bndstx %%bnd%d, %s\n", bnd, oper);
22055         }
22056      } else goto decode_failure;
22057
22058      return delta;
22059   }
22060
22061   case 0xA2: { /* CPUID */
22062      /* Uses dirty helper:
22063            void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
22064         declared to mod rax, wr rbx, rcx, rdx
22065      */
22066      IRDirty*     d     = NULL;
22067      const HChar* fName = NULL;
22068      void*        fAddr = NULL;
22069
22070      if (haveF2orF3(pfx)) goto decode_failure;
22071
22072      /* This isn't entirely correct: CPUID should depend on the VEX
22073         capabilities, not on the underlying CPU.  See bug #324882. */
22074      if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
22075          (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
22076          (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX2)) {
22077         fName = "amd64g_dirtyhelper_CPUID_avx2";
22078         fAddr = &amd64g_dirtyhelper_CPUID_avx2;
22079         /* This is a Core-i7-4910-like machine */
22080      }
22081      else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
22082               (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
22083               (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
22084         fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
22085         fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
22086         /* This is a Core-i5-2300-like machine */
22087      }
22088      else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
22089               (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) {
22090         fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
22091         fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
22092         /* This is a Core-i5-670-like machine */
22093      }
22094      else {
22095         /* Give a CPUID for at least a baseline machine, SSE2
22096            only, and no CX16 */
22097         fName = "amd64g_dirtyhelper_CPUID_baseline";
22098         fAddr = &amd64g_dirtyhelper_CPUID_baseline;
22099      }
22100
22101      vassert(fName); vassert(fAddr);
22102      d = unsafeIRDirty_0_N ( 0/*regparms*/,
22103                              fName, fAddr, mkIRExprVec_1(IRExpr_GSPTR()) );
22104      /* declare guest state effects */
22105      d->nFxState = 4;
22106      vex_bzero(&d->fxState, sizeof(d->fxState));
22107      d->fxState[0].fx     = Ifx_Modify;
22108      d->fxState[0].offset = OFFB_RAX;
22109      d->fxState[0].size   = 8;
22110      d->fxState[1].fx     = Ifx_Write;
22111      d->fxState[1].offset = OFFB_RBX;
22112      d->fxState[1].size   = 8;
22113      d->fxState[2].fx     = Ifx_Modify;
22114      d->fxState[2].offset = OFFB_RCX;
22115      d->fxState[2].size   = 8;
22116      d->fxState[3].fx     = Ifx_Write;
22117      d->fxState[3].offset = OFFB_RDX;
22118      d->fxState[3].size   = 8;
22119      /* execute the dirty call, side-effecting guest state */
22120      stmt( IRStmt_Dirty(d) );
22121      /* CPUID is a serialising insn.  So, just in case someone is
22122         using it as a memory fence ... */
22123      stmt( IRStmt_MBE(Imbe_Fence) );
22124      DIP("cpuid\n");
22125      return delta;
22126   }
22127
22128   case 0xA3: { /* BT Gv,Ev */
22129      /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
22130      Bool ok = True;
22131      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
22132      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone, &ok );
22133      if (!ok) goto decode_failure;
22134      return delta;
22135   }
22136
22137   case 0xA4: /* SHLDv imm8,Gv,Ev */
22138      modrm = getUChar(delta);
22139      d64   = delta + lengthAMode(pfx, delta);
22140      vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
22141      delta = dis_SHLRD_Gv_Ev (
22142                 vbi, pfx, delta, modrm, sz,
22143                 mkU8(getUChar(d64)), True, /* literal */
22144                 dis_buf, True /* left */ );
22145      return delta;
22146
22147   case 0xA5: /* SHLDv %cl,Gv,Ev */
22148      modrm = getUChar(delta);
22149      delta = dis_SHLRD_Gv_Ev (
22150                 vbi, pfx, delta, modrm, sz,
22151                 getIRegCL(), False, /* not literal */
22152                 "%cl", True /* left */ );
22153      return delta;
22154
22155   case 0xAB: { /* BTS Gv,Ev */
22156      /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
22157      Bool ok = True;
22158      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
22159      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet, &ok );
22160      if (!ok) goto decode_failure;
22161      return delta;
22162   }
22163
22164   case 0xAC: /* SHRDv imm8,Gv,Ev */
22165      modrm = getUChar(delta);
22166      d64   = delta + lengthAMode(pfx, delta);
22167      vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
22168      delta = dis_SHLRD_Gv_Ev (
22169                 vbi, pfx, delta, modrm, sz,
22170                 mkU8(getUChar(d64)), True, /* literal */
22171                 dis_buf, False /* right */ );
22172      return delta;
22173
22174   case 0xAD: /* SHRDv %cl,Gv,Ev */
22175      modrm = getUChar(delta);
22176      delta = dis_SHLRD_Gv_Ev (
22177                 vbi, pfx, delta, modrm, sz,
22178                 getIRegCL(), False, /* not literal */
22179                 "%cl", False /* right */);
22180      return delta;
22181
22182   case 0xAF: /* IMUL Ev, Gv */
22183      if (haveF2orF3(pfx)) goto decode_failure;
22184      delta = dis_mul_E_G ( vbi, pfx, sz, delta );
22185      return delta;
22186
22187   case 0xB0: { /* CMPXCHG Gb,Eb */
22188      Bool ok = True;
22189      /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
22190      delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
22191      if (!ok) goto decode_failure;
22192      return delta;
22193   }
22194
22195   case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
22196      Bool ok = True;
22197      /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
22198      if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
22199      delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
22200      if (!ok) goto decode_failure;
22201      return delta;
22202   }
22203
22204   case 0xB3: { /* BTR Gv,Ev */
22205      /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
22206      Bool ok = True;
22207      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
22208      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset, &ok );
22209      if (!ok) goto decode_failure;
22210      return delta;
22211   }
22212
22213   case 0xB6: /* MOVZXb Eb,Gv */
22214      if (haveF2orF3(pfx)) goto decode_failure;
22215      if (sz != 2 && sz != 4 && sz != 8)
22216         goto decode_failure;
22217      delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
22218      return delta;
22219
22220   case 0xB7: /* MOVZXw Ew,Gv */
22221      if (haveF2orF3(pfx)) goto decode_failure;
22222      if (sz != 4 && sz != 8)
22223         goto decode_failure;
22224      delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
22225      return delta;
22226
22227   case 0xBA: { /* Grp8 Ib,Ev */
22228      /* We let dis_Grp8_Imm decide whether F2 or F3 are allowable. */
22229      Bool decode_OK = False;
22230      modrm = getUChar(delta);
22231      am_sz = lengthAMode(pfx,delta);
22232      d64   = getSDisp8(delta + am_sz);
22233      delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
22234                             &decode_OK );
22235      if (!decode_OK)
22236         goto decode_failure;
22237      return delta;
22238   }
22239
22240   case 0xBB: { /* BTC Gv,Ev */
22241      /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
22242      Bool ok = False;
22243      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
22244      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp, &ok );
22245      if (!ok) goto decode_failure;
22246      return delta;
22247   }
22248
22249   case 0xBC: /* BSF Gv,Ev */
22250      if (!haveF2orF3(pfx)
22251          || (haveF3noF2(pfx)
22252              && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) {
22253         /* no-F2 no-F3 0F BC = BSF
22254                  or F3 0F BC = REP; BSF on older CPUs.  */
22255         delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
22256         return delta;
22257      }
22258      /* Fall through, since F3 0F BC is TZCNT, and needs to
22259         be handled by dis_ESC_0F__SSE4. */
22260      break;
22261
22262   case 0xBD: /* BSR Gv,Ev */
22263      if (!haveF2orF3(pfx)
22264          || (haveF3noF2(pfx)
22265              && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
22266         /* no-F2 no-F3 0F BD = BSR
22267                  or F3 0F BD = REP; BSR on older CPUs.  */
22268         delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
22269         return delta;
22270      }
22271      /* Fall through, since F3 0F BD is LZCNT, and needs to
22272         be handled by dis_ESC_0F__SSE4. */
22273      break;
22274
22275   case 0xBE: /* MOVSXb Eb,Gv */
22276      if (haveF2orF3(pfx)) goto decode_failure;
22277      if (sz != 2 && sz != 4 && sz != 8)
22278         goto decode_failure;
22279      delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
22280      return delta;
22281
22282   case 0xBF: /* MOVSXw Ew,Gv */
22283      if (haveF2orF3(pfx)) goto decode_failure;
22284      if (sz != 4 && sz != 8)
22285         goto decode_failure;
22286      delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
22287      return delta;
22288
22289   case 0xC0: { /* XADD Gb,Eb */
22290      Bool decode_OK = False;
22291      delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
22292      if (!decode_OK)
22293         goto decode_failure;
22294      return delta;
22295   }
22296
22297   case 0xC1: { /* XADD Gv,Ev */
22298      Bool decode_OK = False;
22299      delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
22300      if (!decode_OK)
22301         goto decode_failure;
22302      return delta;
22303   }
22304
22305   case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
22306      IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
22307      IRTemp  expdHi     = newTemp(elemTy);
22308      IRTemp  expdLo     = newTemp(elemTy);
22309      IRTemp  dataHi     = newTemp(elemTy);
22310      IRTemp  dataLo     = newTemp(elemTy);
22311      IRTemp  oldHi      = newTemp(elemTy);
22312      IRTemp  oldLo      = newTemp(elemTy);
22313      IRTemp  flags_old  = newTemp(Ity_I64);
22314      IRTemp  flags_new  = newTemp(Ity_I64);
22315      IRTemp  success    = newTemp(Ity_I1);
22316      IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
22317      IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
22318      IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
22319      IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
22320      IRTemp expdHi64    = newTemp(Ity_I64);
22321      IRTemp expdLo64    = newTemp(Ity_I64);
22322
22323      /* Translate this using a DCAS, even if there is no LOCK
22324         prefix.  Life is too short to bother with generating two
22325         different translations for the with/without-LOCK-prefix
22326         cases. */
22327      *expect_CAS = True;
22328
22329      /* Decode, and generate address. */
22330      if (have66(pfx)) goto decode_failure;
22331      if (sz != 4 && sz != 8) goto decode_failure;
22332      if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
22333         goto decode_failure;
22334      modrm = getUChar(delta);
22335      if (epartIsReg(modrm)) goto decode_failure;
22336      if (gregLO3ofRM(modrm) != 1) goto decode_failure;
22337      if (haveF2orF3(pfx)) {
22338         /* Since the e-part is memory only, F2 or F3 (one or the
22339            other) is acceptable if LOCK is also present.  But only
22340            for cmpxchg8b. */
22341         if (sz == 8) goto decode_failure;
22342         if (haveF2andF3(pfx) || !haveLOCK(pfx)) goto decode_failure;
22343      }
22344
22345      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22346      delta += alen;
22347
22348      /* cmpxchg16b requires an alignment check. */
22349      if (sz == 8)
22350         gen_SEGV_if_not_16_aligned( addr );
22351
22352      /* Get the expected and new values. */
22353      assign( expdHi64, getIReg64(R_RDX) );
22354      assign( expdLo64, getIReg64(R_RAX) );
22355
22356      /* These are the correctly-sized expected and new values.
22357         However, we also get expdHi64/expdLo64 above as 64-bits
22358         regardless, because we will need them later in the 32-bit
22359         case (paradoxically). */
22360      assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
22361                            : mkexpr(expdHi64) );
22362      assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
22363                            : mkexpr(expdLo64) );
22364      assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
22365      assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
22366
22367      /* Do the DCAS */
22368      stmt( IRStmt_CAS(
22369               mkIRCAS( oldHi, oldLo,
22370                        Iend_LE, mkexpr(addr),
22371                        mkexpr(expdHi), mkexpr(expdLo),
22372                        mkexpr(dataHi), mkexpr(dataLo)
22373            )));
22374
22375      /* success when oldHi:oldLo == expdHi:expdLo */
22376      assign( success,
22377              binop(opCasCmpEQ,
22378                    binop(opOR,
22379                          binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
22380                          binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
22381                    ),
22382                    zero
22383              ));
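           /* The OR of the two XORs is zero exactly when oldHi == expdHi
              and oldLo == expdLo, so 'success' is true iff both halves
              compared equal. */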
22384
22385      /* If the DCAS is successful, that is to say oldHi:oldLo ==
22386         expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
22387         which is where they came from originally.  Both the actual
22388         contents of these two regs, and any shadow values, are
22389         unchanged.  If the DCAS fails then we're putting into
22390         RDX:RAX the value seen in memory. */
22391      /* Now of course there's a complication in the 32-bit case
22392         (bah!): if the DCAS succeeds, we need to leave RDX:RAX
22393         unchanged; but if we use the same scheme as in the 64-bit
22394         case, we get hit by the standard rule that a write to the
22395         bottom 32 bits of an integer register zeros the upper 32
22396         bits.  And so the upper halves of RDX and RAX mysteriously
22397         become zero.  So we have to stuff back in the original
22398         64-bit values which we previously stashed in
22399         expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
22400      /* It's just _so_ much fun ... */
22401      putIRegRDX( 8,
22402                  IRExpr_ITE( mkexpr(success),
22403                              mkexpr(expdHi64),
22404                              sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
22405                                      : mkexpr(oldHi)
22406                ));
22407      putIRegRAX( 8,
22408                  IRExpr_ITE( mkexpr(success),
22409                              mkexpr(expdLo64),
22410                              sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
22411                                      : mkexpr(oldLo)
22412                ));
22413
22414      /* Copy the success bit into the Z flag and leave the others
22415         unchanged */
22416      assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
22417      assign(
22418         flags_new,
22419         binop(Iop_Or64,
22420               binop(Iop_And64, mkexpr(flags_old),
22421                                mkU64(~AMD64G_CC_MASK_Z)),
22422               binop(Iop_Shl64,
22423                     binop(Iop_And64,
22424                           unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
22425                     mkU8(AMD64G_CC_SHIFT_Z)) ));
22426
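           /* With CC_OP_COPY, DEP1 is interpreted as the literal flags
              value, so this installs the old flags with just the Z bit
              replaced by the DCAS outcome. */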
22427      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
22428      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
22429      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
22430      /* Set NDEP even though it isn't used.  This makes
22431         redundant-PUT elimination of previous stores to this field
22432         work better. */
22433      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
22434
22435      /* Sheesh.  Aren't you glad it was me and not you that had to
22436         write and validate all this grunge? */
22437
22438      DIP("cmpxchg%db %s\n", sz == 4 ? 8 : 16, dis_buf);
22439      return delta;
22440   }
22441
22442   case 0xC8: /* BSWAP %eax */
22443   case 0xC9:
22444   case 0xCA:
22445   case 0xCB:
22446   case 0xCC:
22447   case 0xCD:
22448   case 0xCE:
22449   case 0xCF: /* BSWAP %edi */
22450      if (haveF2orF3(pfx)) goto decode_failure;
22451      /* According to the AMD64 docs, this insn can have size 4 or
22452         8. */
22453      if (sz == 4) {
22454         t1 = newTemp(Ity_I32);
22455         assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
22456         t2 = math_BSWAP( t1, Ity_I32 );
22457         putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
22458         DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
22459         return delta;
22460      }
22461      if (sz == 8) {
22462         t1 = newTemp(Ity_I64);
22464         assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
22465         t2 = math_BSWAP( t1, Ity_I64 );
22466         putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
22467         DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
22468         return delta;
22469      }
22470      goto decode_failure;
22471
22472   default:
22473      break;
22474
22475   } /* first switch */
22476
22477
22478   /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
22479   /* In the second switch, pick off MMX insns. */
22480
22481   if (!have66orF2orF3(pfx)) {
22482      /* So there's no SIMD prefix. */
22483
22484      vassert(sz == 4 || sz == 8);
22485
22486      switch (opc) { /* second switch */
22487
22488      case 0x71:
22489      case 0x72:
22490      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
22491
22492      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
22493      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
22494      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
22495      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
22496
22497      case 0xFC:
22498      case 0xFD:
22499      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
22500
22501      case 0xEC:
22502      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
22503
22504      case 0xDC:
22505      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
22506
22507      case 0xF8:
22508      case 0xF9:
22509      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
22510
22511      case 0xE8:
22512      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
22513
22514      case 0xD8:
22515      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
22516
22517      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
22518      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
22519
22520      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
22521
22522      case 0x74:
22523      case 0x75:
22524      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
22525
22526      case 0x64:
22527      case 0x65:
22528      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
22529
22530      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
22531      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
22532      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
22533
22534      case 0x68:
22535      case 0x69:
22536      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
22537
22538      case 0x60:
22539      case 0x61:
22540      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
22541
22542      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
22543      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
22544      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
22545      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
22546
22547      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
22548      case 0xF2:
22549      case 0xF3:
22550
22551      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
22552      case 0xD2:
22553      case 0xD3:
22554
22555      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
22556      case 0xE2: {
22557         Bool decode_OK = False;
22558         delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
22559         if (decode_OK)
22560            return delta;
22561         goto decode_failure;
22562      }
22563
22564      default:
22565         break;
22566      } /* second switch */
22567
22568   }
22569
22570   /* A couple of MMX corner cases */
22571   if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
22572      if (sz != 4)
22573         goto decode_failure;
22574      do_EMMS_preamble();
22575      DIP("{f}emms\n");
22576      return delta;
22577   }
22578
22579   /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
22580   /* Perhaps it's an SSE or SSE2 instruction.  We can try this
22581      without checking the guest hwcaps because SSE2 is a baseline
22582      facility in 64 bit mode. */
22583   {
22584      Bool decode_OK = False;
22585      delta = dis_ESC_0F__SSE2 ( &decode_OK,
22586                                 archinfo, vbi, pfx, sz, deltaIN, dres );
22587      if (decode_OK)
22588         return delta;
22589   }
22590
22591   /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
22592   /* Perhaps it's an SSE3 instruction.  FIXME: check guest hwcaps
22593      first. */
22594   {
22595      Bool decode_OK = False;
22596      delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
22597      if (decode_OK)
22598         return delta;
22599   }
22600
22601   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
22602   /* Perhaps it's an SSE4 instruction.  FIXME: check guest hwcaps
22603      first. */
22604   {
22605      Bool decode_OK = False;
22606      delta = dis_ESC_0F__SSE4 ( &decode_OK,
22607                                 archinfo, vbi, pfx, sz, deltaIN );
22608      if (decode_OK)
22609         return delta;
22610   }
22611
22612  decode_failure:
22613   return deltaIN; /* fail */
22614}
22615
22616
22617/*------------------------------------------------------------*/
22618/*---                                                      ---*/
22619/*--- Top-level post-escape decoders: dis_ESC_0F38         ---*/
22620/*---                                                      ---*/
22621/*------------------------------------------------------------*/
22622
22623__attribute__((noinline))
22624static
22625Long dis_ESC_0F38 (
22626        /*MB_OUT*/DisResult* dres,
22627        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
22628        Bool         resteerCisOk,
22629        void*        callback_opaque,
22630        const VexArchInfo* archinfo,
22631        const VexAbiInfo*  vbi,
22632        Prefix pfx, Int sz, Long deltaIN
22633     )
22634{
22635   Long   delta = deltaIN;
22636   UChar  opc   = getUChar(delta);
22637   delta++;
22638   switch (opc) {
22639
22640   case 0xF0:   /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
22641   case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
22642      if (!haveF2orF3(pfx) && !haveVEX(pfx)
22643          && (sz == 2 || sz == 4 || sz == 8)) {
22644         IRTemp addr  = IRTemp_INVALID;
22645         UChar  modrm = 0;
22646         Int    alen  = 0;
22647         HChar  dis_buf[50];
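              /* MOVBE is just a load or a store combined with a
                 byte-order reversal, so the data path below reuses
                 math_BSWAP. */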
22648         modrm = getUChar(delta);
22649         if (epartIsReg(modrm)) break;
22650         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22651         delta += alen;
22652         IRType ty = szToITy(sz);
22653         IRTemp src = newTemp(ty);
22654         if (opc == 0xF0) { /* LOAD */
22655            assign(src, loadLE(ty, mkexpr(addr)));
22656            IRTemp dst = math_BSWAP(src, ty);
22657            putIRegG(sz, pfx, modrm, mkexpr(dst));
22658            DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
22659         } else { /* STORE */
22660            assign(src, getIRegG(sz, pfx, modrm));
22661            IRTemp dst = math_BSWAP(src, ty);
22662            storeLE(mkexpr(addr), mkexpr(dst));
22663            DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
22664         }
22665         return delta;
22666      }
22667      /* else fall through; maybe one of the decoders below knows what
22668         it is. */
22669      break;
22670   }
22671
22672   default:
22673      break;
22674   }
22675
22676   /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
22677   /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
22678      rather than proceeding indiscriminately. */
22679   {
22680      Bool decode_OK = False;
22681      delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
22682      if (decode_OK)
22683         return delta;
22684   }
22685
22686   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
22687   /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
22688      rather than proceeding indiscriminately. */
22689   {
22690      Bool decode_OK = False;
22691      delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
22692      if (decode_OK)
22693         return delta;
22694   }
22695
22696   /* Ignore previous decode attempts and restart from the beginning of
22697      the instruction. */
22698   delta = deltaIN;
22699   opc   = getUChar(delta);
22700   delta++;
22701
22702   switch (opc) {
22703
22704   case 0xF6: {
22705      /* 66 0F 38 F6 = ADCX r32/64(G), m32/64(E) */
22706      /* F3 0F 38 F6 = ADOX r32/64(G), m32/64(E) */
22707      /* These were introduced in Broadwell.  Gate them on AVX so as to at
22708         least reject them on earlier guests.  Has no host requirements. */
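           /* ADCX is an add-with-carry which reads and writes only CF,
              and ADOX is the same operation on OF.  Keeping the two
              carry chains in separate flags lets multi-precision code
              interleave two independent chains. */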
22709      if (have66noF2noF3(pfx) && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
22710         if (sz == 2) {
22711            sz = 4; /* 66 prefix but operand size is 4/8 */
22712         }
22713         delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagCarryX, True,
22714                               sz, delta, "adcx" );
22715         return delta;
22716      }
22717      if (haveF3no66noF2(pfx) && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
22718         delta = dis_op2_E_G ( vbi, pfx, Iop_Add8, WithFlagOverX, True,
22719                               sz, delta, "adox" );
22720         return delta;
22721      }
22722      /* else fall through */
22723      break;
22724   }
22725
22726   default:
22727      break;
22728   }
22729
22730  /*decode_failure:*/
22731   return deltaIN; /* fail */
22732}
22733
22734
22735/*------------------------------------------------------------*/
22736/*---                                                      ---*/
22737/*--- Top-level post-escape decoders: dis_ESC_0F3A         ---*/
22738/*---                                                      ---*/
22739/*------------------------------------------------------------*/
22740
22741__attribute__((noinline))
22742static
22743Long dis_ESC_0F3A (
22744        /*MB_OUT*/DisResult* dres,
22745        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
22746        Bool         resteerCisOk,
22747        void*        callback_opaque,
22748        const VexArchInfo* archinfo,
22749        const VexAbiInfo*  vbi,
22750        Prefix pfx, Int sz, Long deltaIN
22751     )
22752{
22753   Long   delta = deltaIN;
22754   UChar  opc   = getUChar(delta);
22755   delta++;
22756   switch (opc) {
22757
22758   default:
22759      break;
22760
22761   }
22762
22763   /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
22764   /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
22765      rather than proceeding indiscriminately. */
22766   {
22767      Bool decode_OK = False;
22768      delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
22769      if (decode_OK)
22770         return delta;
22771   }
22772
22773   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
22774   /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
22775      rather than proceeding indiscriminately. */
22776   {
22777      Bool decode_OK = False;
22778      delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
22779      if (decode_OK)
22780         return delta;
22781   }
22782
22783   return deltaIN; /* fail */
22784}
22785
22786
22787/*------------------------------------------------------------*/
22788/*---                                                      ---*/
22789/*--- Top-level post-escape decoders: dis_ESC_0F__VEX      ---*/
22790/*---                                                      ---*/
22791/*------------------------------------------------------------*/
22792
22793/* FIXME: common up with the _256_ version below? */
22794static
22795Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
22796        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
22797        Prefix pfx, Long delta, const HChar* name,
22798        /* The actual operation.  Use either 'op' or 'opFn',
22799           but not both. */
22800        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
22801        Bool invertLeftArg,
22802        Bool swapArgs
22803     )
22804{
22805   UChar  modrm = getUChar(delta);
22806   UInt   rD    = gregOfRexRM(pfx, modrm);
22807   UInt   rSL   = getVexNvvvv(pfx);
22808   IRTemp tSL   = newTemp(Ity_V128);
22809   IRTemp tSR   = newTemp(Ity_V128);
22810   IRTemp addr  = IRTemp_INVALID;
22811   HChar  dis_buf[50];
22812   Int    alen  = 0;
22813   vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);
22814
22815   assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
22816                             : getXMMReg(rSL));
22817
22818   if (epartIsReg(modrm)) {
22819      UInt rSR = eregOfRexRM(pfx, modrm);
22820      delta += 1;
22821      assign(tSR, getXMMReg(rSR));
22822      DIP("%s %s,%s,%s\n",
22823          name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
22824   } else {
22825      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
22826      delta += alen;
22827      assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
22828      DIP("%s %s,%s,%s\n",
22829          name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
22830   }
22831
22832   IRTemp res = IRTemp_INVALID;
22833   if (op != Iop_INVALID) {
22834      vassert(opFn == NULL);
22835      res = newTemp(Ity_V128);
22836      if (requiresRMode(op)) {
22837         IRTemp rm = newTemp(Ity_I32);
22838         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
22839         assign(res, swapArgs
22840                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
22841                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
22842      } else {
22843         assign(res, swapArgs
22844                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
22845                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
22846      }
22847   } else {
22848      vassert(opFn != NULL);
22849      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
22850   }
22851
22852   putYMMRegLoAndZU(rD, mkexpr(res));
22853
22854   *uses_vvvv = True;
22855   return delta;
22856}
22857
22858
22859/* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
22860   for the operation, no inversion of the left arg, and no swapping of
22861   args. */
22862static
22863Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
22864        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
22865        Prefix pfx, Long delta, const HChar* name,
22866        IROp op
22867     )
22868{
22869   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22870             uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
22871}
22872
22873
22874/* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
22875   generator to compute the result, no inversion of the left
22876   arg, and no swapping of args. */
22877static
22878Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
22879        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
22880        Prefix pfx, Long delta, const HChar* name,
22881        IRTemp(*opFn)(IRTemp,IRTemp)
22882     )
22883{
22884   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22885             uses_vvvv, vbi, pfx, delta, name,
22886             Iop_INVALID, opFn, False, False );
22887}
22888
22889
22890/* Vector by scalar shift of V by the amount specified at the bottom
22891   of E. */
22892static ULong dis_AVX128_shiftV_byE ( const VexAbiInfo* vbi,
22893                                     Prefix pfx, Long delta,
22894                                     const HChar* opname, IROp op )
22895{
22896   HChar   dis_buf[50];
22897   Int     alen, size;
22898   IRTemp  addr;
22899   Bool    shl, shr, sar;
22900   UChar   modrm = getUChar(delta);
22901   UInt    rG    = gregOfRexRM(pfx,modrm);
22902   UInt    rV    = getVexNvvvv(pfx);
22903   IRTemp  g0    = newTemp(Ity_V128);
22904   IRTemp  g1    = newTemp(Ity_V128);
22905   IRTemp  amt   = newTemp(Ity_I64);
22906   IRTemp  amt8  = newTemp(Ity_I8);
22907   if (epartIsReg(modrm)) {
22908      UInt rE = eregOfRexRM(pfx,modrm);
22909      assign( amt, getXMMRegLane64(rE, 0) );
22910      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
22911          nameXMMReg(rV), nameXMMReg(rG) );
22912      delta++;
22913   } else {
22914      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22915      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
22916      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
22917      delta += alen;
22918   }
22919   assign( g0, getXMMReg(rV) );
22920   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
22921
22922   shl = shr = sar = False;
22923   size = 0;
22924   switch (op) {
22925      case Iop_ShlN16x8: shl = True; size = 16; break;
22926      case Iop_ShlN32x4: shl = True; size = 32; break;
22927      case Iop_ShlN64x2: shl = True; size = 64; break;
22928      case Iop_SarN16x8: sar = True; size = 16; break;
22929      case Iop_SarN32x4: sar = True; size = 32; break;
22930      case Iop_ShrN16x8: shr = True; size = 16; break;
22931      case Iop_ShrN32x4: shr = True; size = 32; break;
22932      case Iop_ShrN64x2: shr = True; size = 64; break;
22933      default: vassert(0);
22934   }
22935
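        /* If the shift amount is >= the lane width, a logical shift
           must yield zero, while an arithmetic shift behaves like a
           shift by (width - 1), filling each lane with its sign bit.
           The ITEs below encode exactly that. */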
22936   if (shl || shr) {
22937     assign(
22938        g1,
22939        IRExpr_ITE(
22940           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
22941           binop(op, mkexpr(g0), mkexpr(amt8)),
22942           mkV128(0x0000)
22943        )
22944     );
22945   } else
22946   if (sar) {
22947     assign(
22948        g1,
22949        IRExpr_ITE(
22950           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
22951           binop(op, mkexpr(g0), mkexpr(amt8)),
22952           binop(op, mkexpr(g0), mkU8(size-1))
22953        )
22954     );
22955   } else {
22956      vassert(0);
22957   }
22958
22959   putYMMRegLoAndZU( rG, mkexpr(g1) );
22960   return delta;
22961}
22962
22963
22964/* Vector by scalar shift of V by the amount specified at the bottom
22965   of E. */
22966static ULong dis_AVX256_shiftV_byE ( const VexAbiInfo* vbi,
22967                                     Prefix pfx, Long delta,
22968                                     const HChar* opname, IROp op )
22969{
22970   HChar   dis_buf[50];
22971   Int     alen, size;
22972   IRTemp  addr;
22973   Bool    shl, shr, sar;
22974   UChar   modrm = getUChar(delta);
22975   UInt    rG    = gregOfRexRM(pfx,modrm);
22976   UInt    rV    = getVexNvvvv(pfx);
22977   IRTemp  g0    = newTemp(Ity_V256);
22978   IRTemp  g1    = newTemp(Ity_V256);
22979   IRTemp  amt   = newTemp(Ity_I64);
22980   IRTemp  amt8  = newTemp(Ity_I8);
22981   if (epartIsReg(modrm)) {
22982      UInt rE = eregOfRexRM(pfx,modrm);
22983      assign( amt, getXMMRegLane64(rE, 0) );
22984      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
22985          nameYMMReg(rV), nameYMMReg(rG) );
22986      delta++;
22987   } else {
22988      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22989      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
22990      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
22991      delta += alen;
22992   }
22993   assign( g0, getYMMReg(rV) );
22994   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
22995
22996   shl = shr = sar = False;
22997   size = 0;
22998   switch (op) {
22999      case Iop_ShlN16x16: shl = True; size = 16; break;
23000      case Iop_ShlN32x8:  shl = True; size = 32; break;
23001      case Iop_ShlN64x4:  shl = True; size = 64; break;
23002      case Iop_SarN16x16: sar = True; size = 16; break;
23003      case Iop_SarN32x8:  sar = True; size = 32; break;
23004      case Iop_ShrN16x16: shr = True; size = 16; break;
23005      case Iop_ShrN32x8:  shr = True; size = 32; break;
23006      case Iop_ShrN64x4:  shr = True; size = 64; break;
23007      default: vassert(0);
23008   }
23009
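        /* Out-of-range shift amounts are handled as in the 128-bit
           version above: logical shifts yield zero, arithmetic shifts
           act as a shift by (width - 1). */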
23010   if (shl || shr) {
23011     assign(
23012        g1,
23013        IRExpr_ITE(
23014           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
23015           binop(op, mkexpr(g0), mkexpr(amt8)),
23016           binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
23017        )
23018     );
23019   } else
23020   if (sar) {
23021     assign(
23022        g1,
23023        IRExpr_ITE(
23024           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
23025           binop(op, mkexpr(g0), mkexpr(amt8)),
23026           binop(op, mkexpr(g0), mkU8(size-1))
23027        )
23028     );
23029   } else {
23030      vassert(0);
23031   }
23032
23033   putYMMReg( rG, mkexpr(g1) );
23034   return delta;
23035}
23036
23037
23038/* Vector by vector shift of V by the amount specified at the bottom
23039   of E.  Vector by vector shifts are defined for all shift amounts,
23040   so not using Iop_S*x* here (and SSE2 doesn't support variable shifts
23041   anyway).  */
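     /* For example, VPSRLVD shifts each 32-bit lane of V right by the
        amount in the corresponding lane of E; amounts >= 32 yield zero,
        except for VPSRAVD, where they fill the lane with its sign
        bit. */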
23042static ULong dis_AVX_var_shiftV_byE ( const VexAbiInfo* vbi,
23043                                      Prefix pfx, Long delta,
23044                                      const HChar* opname, IROp op, Bool isYMM )
23045{
23046   HChar   dis_buf[50];
23047   Int     alen, size, i;
23048   IRTemp  addr;
23049   UChar   modrm = getUChar(delta);
23050   UInt    rG    = gregOfRexRM(pfx,modrm);
23051   UInt    rV    = getVexNvvvv(pfx);
23052   IRTemp  sV    = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
23053   IRTemp  amt   = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
23054   IRTemp  amts[8], sVs[8], res[8];
23055   if (epartIsReg(modrm)) {
23056      UInt rE = eregOfRexRM(pfx,modrm);
23057      assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) );
23058      if (isYMM) {
23059         DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE),
23060             nameYMMReg(rV), nameYMMReg(rG) );
23061      } else {
23062         DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
23063             nameXMMReg(rV), nameXMMReg(rG) );
23064      }
23065      delta++;
23066   } else {
23067      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23068      assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) );
23069      if (isYMM) {
23070         DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV),
23071             nameYMMReg(rG) );
23072      } else {
23073         DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV),
23074             nameXMMReg(rG) );
23075      }
23076      delta += alen;
23077   }
23078   assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) );
23079
23080   size = 0;
23081   switch (op) {
23082      case Iop_Shl32: size = 32; break;
23083      case Iop_Shl64: size = 64; break;
23084      case Iop_Sar32: size = 32; break;
23085      case Iop_Shr32: size = 32; break;
23086      case Iop_Shr64: size = 64; break;
23087      default: vassert(0);
23088   }
23089
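   /* Do the work one lane at a time: split V and the count vector
      into lanes, shift each lane of V by the count in the
      corresponding lane of E, and reassemble the results.  Counts
      >= the lane width yield zero for the logical shifts and
      sign-fill (a shift by width-1) for the arithmetic Sar32 case. */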
23090   for (i = 0; i < 8; i++) {
23091      sVs[i] = IRTemp_INVALID;
23092      amts[i] = IRTemp_INVALID;
23093   }
23094   switch (size) {
23095      case 32:
23096         if (isYMM) {
23097            breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4],
23098                                  &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
23099            breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4],
23100                                   &amts[3], &amts[2], &amts[1], &amts[0] );
23101         } else {
23102            breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
23103            breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
         }
23105         break;
23106      case 64:
23107         if (isYMM) {
23108            breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
23109            breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
23110         } else {
23111            breakupV128to64s( sV, &sVs[1], &sVs[0] );
23112            breakupV128to64s( amt, &amts[1], &amts[0] );
23113         }
23114         break;
23115      default: vassert(0);
23116   }
23117   for (i = 0; i < 8; i++)
23118      if (sVs[i] != IRTemp_INVALID) {
23119         res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64);
23120         assign( res[i],
23121                 IRExpr_ITE(
23122                    binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U,
23123                          mkexpr(amts[i]),
23124                          size == 32 ? mkU32(size) : mkU64(size)),
23125                    binop(op, mkexpr(sVs[i]),
23126                               unop(size == 32 ? Iop_32to8 : Iop_64to8,
23127                                    mkexpr(amts[i]))),
23128                    op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1))
23129                                    : size == 32 ? mkU32(0) : mkU64(0)
23130         ));
23131      }
23132   switch (size) {
23133      case 32:
23134         for (i = 0; i < 8; i++)
23135            putYMMRegLane32( rG, i, (i < 4 || isYMM)
23136                                    ? mkexpr(res[i]) : mkU32(0) );
23137         break;
23138      case 64:
23139         for (i = 0; i < 4; i++)
23140            putYMMRegLane64( rG, i, (i < 2 || isYMM)
23141                                    ? mkexpr(res[i]) : mkU64(0) );
23142         break;
23143      default: vassert(0);
23144   }
23145
23146   return delta;
23147}
23148
23149
/* Vector by scalar shift of E by an immediate byte, with the result
   written to V (the vvvv register).  Modified version of
   dis_SSE_shiftE_imm. */
23152static
23153Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
23154                                 Long delta, const HChar* opname, IROp op )
23155{
23156   Bool    shl, shr, sar;
23157   UChar   rm   = getUChar(delta);
23158   IRTemp  e0   = newTemp(Ity_V128);
23159   IRTemp  e1   = newTemp(Ity_V128);
23160   UInt    rD   = getVexNvvvv(pfx);
23161   UChar   amt, size;
23162   vassert(epartIsReg(rm));
23163   vassert(gregLO3ofRM(rm) == 2
23164           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
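   /* In the 0F 71/72/73 shift groups, the reg field of the modrm
      byte selects the operation: /2 is a logical right shift, /4 an
      arithmetic right shift, and /6 a left shift. */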
23165   amt = getUChar(delta+1);
23166   delta += 2;
23167   DIP("%s $%d,%s,%s\n", opname,
23168                         (Int)amt,
23169                         nameXMMReg(eregOfRexRM(pfx,rm)),
23170                         nameXMMReg(rD));
23171   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
23172
23173   shl = shr = sar = False;
23174   size = 0;
23175   switch (op) {
23176      case Iop_ShlN16x8: shl = True; size = 16; break;
23177      case Iop_ShlN32x4: shl = True; size = 32; break;
23178      case Iop_ShlN64x2: shl = True; size = 64; break;
23179      case Iop_SarN16x8: sar = True; size = 16; break;
23180      case Iop_SarN32x4: sar = True; size = 32; break;
23181      case Iop_ShrN16x8: shr = True; size = 16; break;
23182      case Iop_ShrN32x4: shr = True; size = 32; break;
23183      case Iop_ShrN64x2: shr = True; size = 64; break;
23184      default: vassert(0);
23185   }
23186
23187   if (shl || shr) {
23188     assign( e1, amt >= size
23189                    ? mkV128(0x0000)
23190                    : binop(op, mkexpr(e0), mkU8(amt))
23191     );
23192   } else
23193   if (sar) {
23194     assign( e1, amt >= size
23195                    ? binop(op, mkexpr(e0), mkU8(size-1))
23196                    : binop(op, mkexpr(e0), mkU8(amt))
23197     );
23198   } else {
23199      vassert(0);
23200   }
23201
23202   putYMMRegLoAndZU( rD, mkexpr(e1) );
23203   return delta;
23204}
23205
23206
/* Vector by scalar shift of E by an immediate byte, with the result
   written to V (the vvvv register).  Modified version of
   dis_AVX128_shiftE_to_V_imm. */
23209static
23210Long dis_AVX256_shiftE_to_V_imm( Prefix pfx,
23211                                 Long delta, const HChar* opname, IROp op )
23212{
23213   Bool    shl, shr, sar;
23214   UChar   rm   = getUChar(delta);
23215   IRTemp  e0   = newTemp(Ity_V256);
23216   IRTemp  e1   = newTemp(Ity_V256);
23217   UInt    rD   = getVexNvvvv(pfx);
23218   UChar   amt, size;
23219   vassert(epartIsReg(rm));
23220   vassert(gregLO3ofRM(rm) == 2
23221           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
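   /* Same /2 (logical right), /4 (arithmetic right), /6 (left)
      group encoding as in the 128-bit version above. */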
23222   amt = getUChar(delta+1);
23223   delta += 2;
23224   DIP("%s $%d,%s,%s\n", opname,
23225                         (Int)amt,
23226                         nameYMMReg(eregOfRexRM(pfx,rm)),
23227                         nameYMMReg(rD));
23228   assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) );
23229
23230   shl = shr = sar = False;
23231   size = 0;
23232   switch (op) {
23233      case Iop_ShlN16x16: shl = True; size = 16; break;
23234      case Iop_ShlN32x8:  shl = True; size = 32; break;
23235      case Iop_ShlN64x4:  shl = True; size = 64; break;
23236      case Iop_SarN16x16: sar = True; size = 16; break;
23237      case Iop_SarN32x8:  sar = True; size = 32; break;
23238      case Iop_ShrN16x16: shr = True; size = 16; break;
23239      case Iop_ShrN32x8:  shr = True; size = 32; break;
23240      case Iop_ShrN64x4:  shr = True; size = 64; break;
23241      default: vassert(0);
23242   }
23243
23245   if (shl || shr) {
23246     assign( e1, amt >= size
23247                    ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
23248                    : binop(op, mkexpr(e0), mkU8(amt))
23249     );
23250   } else
23251   if (sar) {
23252     assign( e1, amt >= size
23253                    ? binop(op, mkexpr(e0), mkU8(size-1))
23254                    : binop(op, mkexpr(e0), mkU8(amt))
23255     );
23256   } else {
23257      vassert(0);
23258   }
23259
23260   putYMMReg( rD, mkexpr(e1) );
23261   return delta;
23262}
23263
23264
23265/* Lower 64-bit lane only AVX128 binary operation:
23266   G[63:0]    = V[63:0] `op` E[63:0]
23267   G[127:64]  = V[127:64]
23268   G[255:128] = 0.
23269   The specified op must be of the 64F0x2 kind, so that it
23270   copies the upper half of the left operand to the result.
23271*/
23272static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
23273                                       const VexAbiInfo* vbi,
23274                                       Prefix pfx, Long delta,
23275                                       const HChar* opname, IROp op )
23276{
23277   HChar   dis_buf[50];
23278   Int     alen;
23279   IRTemp  addr;
23280   UChar   rm    = getUChar(delta);
23281   UInt    rG    = gregOfRexRM(pfx,rm);
23282   UInt    rV    = getVexNvvvv(pfx);
23283   IRExpr* vpart = getXMMReg(rV);
23284   if (epartIsReg(rm)) {
23285      UInt rE = eregOfRexRM(pfx,rm);
23286      putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
23287      DIP("%s %s,%s,%s\n", opname,
23288          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23289      delta = delta+1;
23290   } else {
      /* We can only do a 64-bit memory read, so the upper half of the
         E operand is simply filled with zeroes. */
23293      IRTemp epart = newTemp(Ity_V128);
23294      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23295      assign( epart, unop( Iop_64UtoV128,
23296                           loadLE(Ity_I64, mkexpr(addr))) );
23297      putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
23298      DIP("%s %s,%s,%s\n", opname,
23299          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23300      delta = delta+alen;
23301   }
23302   putYMMRegLane128( rG, 1, mkV128(0) );
23303   *uses_vvvv = True;
23304   return delta;
23305}
23306
23307
23308/* Lower 64-bit lane only AVX128 unary operation:
23309   G[63:0]    = op(E[63:0])
23310   G[127:64]  = V[127:64]
23311   G[255:128] = 0
23312   The specified op must be of the 64F0x2 kind, so that it
23313   copies the upper half of the operand to the result.
23314*/
23315static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
23316                                             const VexAbiInfo* vbi,
23317                                             Prefix pfx, Long delta,
23318                                             const HChar* opname, IROp op )
23319{
23320   HChar   dis_buf[50];
23321   Int     alen;
23322   IRTemp  addr;
23323   UChar   rm  = getUChar(delta);
23324   UInt    rG  = gregOfRexRM(pfx,rm);
23325   UInt    rV  = getVexNvvvv(pfx);
23326   IRTemp  e64 = newTemp(Ity_I64);
23327
23328   /* Fetch E[63:0] */
23329   if (epartIsReg(rm)) {
23330      UInt rE = eregOfRexRM(pfx,rm);
23331      assign(e64, getXMMRegLane64(rE, 0));
23332      DIP("%s %s,%s,%s\n", opname,
23333          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23334      delta += 1;
23335   } else {
23336      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23337      assign(e64, loadLE(Ity_I64, mkexpr(addr)));
23338      DIP("%s %s,%s,%s\n", opname,
23339          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23340      delta += alen;
23341   }
23342
23343   /* Create a value 'arg' as V[127:64]++E[63:0] */
23344   IRTemp arg = newTemp(Ity_V128);
23345   assign(arg,
23346          binop(Iop_SetV128lo64,
23347                getXMMReg(rV), mkexpr(e64)));
23348   /* and apply op to it */
23349   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
23350   *uses_vvvv = True;
23351   return delta;
23352}
23353
23354
23355/* Lower 32-bit lane only AVX128 unary operation:
23356   G[31:0]    = op(E[31:0])
23357   G[127:32]  = V[127:32]
23358   G[255:128] = 0
23359   The specified op must be of the 32F0x4 kind, so that it
23360   copies the upper 3/4 of the operand to the result.
23361*/
23362static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
23363                                             const VexAbiInfo* vbi,
23364                                             Prefix pfx, Long delta,
23365                                             const HChar* opname, IROp op )
23366{
23367   HChar   dis_buf[50];
23368   Int     alen;
23369   IRTemp  addr;
23370   UChar   rm  = getUChar(delta);
23371   UInt    rG  = gregOfRexRM(pfx,rm);
23372   UInt    rV  = getVexNvvvv(pfx);
23373   IRTemp  e32 = newTemp(Ity_I32);
23374
23375   /* Fetch E[31:0] */
23376   if (epartIsReg(rm)) {
23377      UInt rE = eregOfRexRM(pfx,rm);
23378      assign(e32, getXMMRegLane32(rE, 0));
23379      DIP("%s %s,%s,%s\n", opname,
23380          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23381      delta += 1;
23382   } else {
23383      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23384      assign(e32, loadLE(Ity_I32, mkexpr(addr)));
23385      DIP("%s %s,%s,%s\n", opname,
23386          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23387      delta += alen;
23388   }
23389
23390   /* Create a value 'arg' as V[127:32]++E[31:0] */
23391   IRTemp arg = newTemp(Ity_V128);
23392   assign(arg,
23393          binop(Iop_SetV128lo32,
23394                getXMMReg(rV), mkexpr(e32)));
23395   /* and apply op to it */
23396   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
23397   *uses_vvvv = True;
23398   return delta;
23399}
23400
23401
23402/* Lower 32-bit lane only AVX128 binary operation:
23403   G[31:0]    = V[31:0] `op` E[31:0]
23404   G[127:32]  = V[127:32]
23405   G[255:128] = 0.
23406   The specified op must be of the 32F0x4 kind, so that it
23407   copies the upper 3/4 of the left operand to the result.
23408*/
23409static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
23410                                       const VexAbiInfo* vbi,
23411                                       Prefix pfx, Long delta,
23412                                       const HChar* opname, IROp op )
23413{
23414   HChar   dis_buf[50];
23415   Int     alen;
23416   IRTemp  addr;
23417   UChar   rm    = getUChar(delta);
23418   UInt    rG    = gregOfRexRM(pfx,rm);
23419   UInt    rV    = getVexNvvvv(pfx);
23420   IRExpr* vpart = getXMMReg(rV);
23421   if (epartIsReg(rm)) {
23422      UInt rE = eregOfRexRM(pfx,rm);
23423      putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
23424      DIP("%s %s,%s,%s\n", opname,
23425          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23426      delta = delta+1;
23427   } else {
      /* We can only do a 32-bit memory read, so the upper 3/4 of the
         E operand is simply filled with zeroes. */
23430      IRTemp epart = newTemp(Ity_V128);
23431      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23432      assign( epart, unop( Iop_32UtoV128,
23433                           loadLE(Ity_I32, mkexpr(addr))) );
23434      putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
23435      DIP("%s %s,%s,%s\n", opname,
23436          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23437      delta = delta+alen;
23438   }
23439   putYMMRegLane128( rG, 1, mkV128(0) );
23440   *uses_vvvv = True;
23441   return delta;
23442}
23443
23444
23445/* All-lanes AVX128 binary operation:
23446   G[127:0]   = V[127:0] `op` E[127:0]
23447   G[255:128] = 0.
23448*/
23449static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
23450                                  const VexAbiInfo* vbi,
23451                                  Prefix pfx, Long delta,
23452                                  const HChar* opname, IROp op )
23453{
23454   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
23455             uses_vvvv, vbi, pfx, delta, opname, op,
23456             NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
23457   );
23458}
23459
23460
23461/* Handles AVX128 32F/64F comparisons.  A derivative of
23462   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
23463   original delta to indicate failure. */
23464static
23465Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
23466                               const VexAbiInfo* vbi,
23467                               Prefix pfx, Long delta,
23468                               const HChar* opname, Bool all_lanes, Int sz )
23469{
23470   vassert(sz == 4 || sz == 8);
23471   Long    deltaIN = delta;
23472   HChar   dis_buf[50];
23473   Int     alen;
23474   UInt    imm8;
23475   IRTemp  addr;
23476   Bool    preSwap = False;
23477   IROp    op      = Iop_INVALID;
23478   Bool    postNot = False;
23479   IRTemp  plain   = newTemp(Ity_V128);
23480   UChar   rm      = getUChar(delta);
23481   UInt    rG      = gregOfRexRM(pfx, rm);
23482   UInt    rV      = getVexNvvvv(pfx);
23483   IRTemp argL     = newTemp(Ity_V128);
23484   IRTemp argR     = newTemp(Ity_V128);
23485
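   /* The trailing imm8 selects the comparison predicate.
      findSSECmpOp maps it to an IROp and says whether the arguments
      must be swapped and/or the result negated in order to implement
      that predicate. */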
23486   assign(argL, getXMMReg(rV));
23487   if (epartIsReg(rm)) {
23488      imm8 = getUChar(delta+1);
23489      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
23490      if (!ok) return deltaIN; /* FAIL */
23491      UInt rE = eregOfRexRM(pfx,rm);
23492      assign(argR, getXMMReg(rE));
23493      delta += 1+1;
23494      DIP("%s $%u,%s,%s,%s\n",
23495          opname, imm8,
23496          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23497   } else {
23498      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
23499      imm8 = getUChar(delta+alen);
23500      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
23501      if (!ok) return deltaIN; /* FAIL */
23502      assign(argR,
23503             all_lanes   ? loadLE(Ity_V128, mkexpr(addr))
23504             : sz == 8   ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
23505             : /*sz==4*/   unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
23506      delta += alen+1;
23507      DIP("%s $%u,%s,%s,%s\n",
23508          opname, imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23509   }
23510
23511   assign(plain, preSwap ? binop(op, mkexpr(argR), mkexpr(argL))
23512                         : binop(op, mkexpr(argL), mkexpr(argR)));
23513
23514   if (all_lanes) {
23515      /* This is simple: just invert the result, if necessary, and
23516         have done. */
23517      if (postNot) {
23518         putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
23519      } else {
23520         putYMMRegLoAndZU( rG, mkexpr(plain) );
23521      }
23522   }
23523   else
23524   if (!preSwap) {
23525      /* More complex.  It's a one-lane-only, hence need to possibly
23526         invert only that one lane.  But at least the other lanes are
23527         correctly "in" the result, having been copied from the left
23528         operand (argL). */
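      /* Each bit of mkV128's 16-bit immediate selects one byte of
         the constant, so 0x000F gives ones in the low 32 bits and
         0x00FF gives ones in the low 64 bits. */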
23529      if (postNot) {
23530         IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
23531         putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
23532                                                  mask) );
23533      } else {
23534         putYMMRegLoAndZU( rG, mkexpr(plain) );
23535      }
23536   }
23537   else {
23538      /* This is the most complex case.  One-lane-only, but the args
23539         were swapped.  So we have to possibly invert the bottom lane,
23540         and (definitely) we have to copy the upper lane(s) from argL
23541         since, due to the swapping, what's currently there is from
23542         argR, which is not correct. */
23543      IRTemp res     = newTemp(Ity_V128);
23544      IRTemp mask    = newTemp(Ity_V128);
23545      IRTemp notMask = newTemp(Ity_V128);
23546      assign(mask,    mkV128(sz==4 ? 0x000F : 0x00FF));
23547      assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
23548      if (postNot) {
23549         assign(res,
23550                binop(Iop_OrV128,
23551                      binop(Iop_AndV128,
23552                            unop(Iop_NotV128, mkexpr(plain)),
23553                            mkexpr(mask)),
23554                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
23555      } else {
23556         assign(res,
23557                binop(Iop_OrV128,
23558                      binop(Iop_AndV128,
23559                            mkexpr(plain),
23560                            mkexpr(mask)),
23561                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
23562      }
23563      putYMMRegLoAndZU( rG, mkexpr(res) );
23564   }
23565
23566   *uses_vvvv = True;
23567   return delta;
23568}
23569
23570
23571/* Handles AVX256 32F/64F comparisons.  A derivative of
23572   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
23573   original delta to indicate failure. */
23574static
23575Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
23576                               const VexAbiInfo* vbi,
23577                               Prefix pfx, Long delta,
23578                               const HChar* opname, Int sz )
23579{
23580   vassert(sz == 4 || sz == 8);
23581   Long    deltaIN = delta;
23582   HChar   dis_buf[50];
23583   Int     alen;
23584   UInt    imm8;
23585   IRTemp  addr;
23586   Bool    preSwap = False;
23587   IROp    op      = Iop_INVALID;
23588   Bool    postNot = False;
23589   IRTemp  plain   = newTemp(Ity_V256);
23590   UChar   rm      = getUChar(delta);
23591   UInt    rG      = gregOfRexRM(pfx, rm);
23592   UInt    rV      = getVexNvvvv(pfx);
23593   IRTemp argL     = newTemp(Ity_V256);
23594   IRTemp argR     = newTemp(Ity_V256);
23595   IRTemp argLhi   = IRTemp_INVALID;
23596   IRTemp argLlo   = IRTemp_INVALID;
23597   IRTemp argRhi   = IRTemp_INVALID;
23598   IRTemp argRlo   = IRTemp_INVALID;
23599
23600   assign(argL, getYMMReg(rV));
23601   if (epartIsReg(rm)) {
23602      imm8 = getUChar(delta+1);
23603      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
23604                             True/*all_lanes*/, sz);
23605      if (!ok) return deltaIN; /* FAIL */
23606      UInt rE = eregOfRexRM(pfx,rm);
23607      assign(argR, getYMMReg(rE));
23608      delta += 1+1;
23609      DIP("%s $%u,%s,%s,%s\n",
23610          opname, imm8,
23611          nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
23612   } else {
23613      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
23614      imm8 = getUChar(delta+alen);
23615      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
23616                             True/*all_lanes*/, sz);
23617      if (!ok) return deltaIN; /* FAIL */
23618      assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
23619      delta += alen+1;
23620      DIP("%s $%u,%s,%s,%s\n",
23621          opname, imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
23622   }
23623
23624   breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
23625   breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
23626   assign(plain, binop( Iop_V128HLtoV256,
23627                        binop(op, mkexpr(argLhi), mkexpr(argRhi)),
23628                        binop(op, mkexpr(argLlo), mkexpr(argRlo)) ) );
23629
23630   /* This is simple: just invert the result, if necessary, and
23631      have done. */
23632   if (postNot) {
23633      putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
23634   } else {
23635      putYMMReg( rG, mkexpr(plain) );
23636   }
23637
23638   *uses_vvvv = True;
23639   return delta;
23640}
23641
23642
/* Handles AVX128 unary E-to-G all-lanes operations, using the given
   IR generator to compute the result. */
23644static
23645Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
23646                               const VexAbiInfo* vbi,
23647                               Prefix pfx, Long delta,
23648                               const HChar* opname,
23649                               IRTemp (*opFn)(IRTemp) )
23650{
23651   HChar  dis_buf[50];
23652   Int    alen;
23653   IRTemp addr;
23654   IRTemp res  = newTemp(Ity_V128);
23655   IRTemp arg  = newTemp(Ity_V128);
23656   UChar  rm   = getUChar(delta);
23657   UInt   rG   = gregOfRexRM(pfx, rm);
23658   if (epartIsReg(rm)) {
23659      UInt rE = eregOfRexRM(pfx,rm);
23660      assign(arg, getXMMReg(rE));
23661      delta += 1;
23662      DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
23663   } else {
23664      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23665      assign(arg, loadLE(Ity_V128, mkexpr(addr)));
23666      delta += alen;
23667      DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
23668   }
23669   res = opFn(arg);
23670   putYMMRegLoAndZU( rG, mkexpr(res) );
23671   *uses_vvvv = False;
23672   return delta;
23673}
23674
23675
/* Handles AVX128 unary E-to-G all-lanes operations, using a simple
   IROp for the operation. */
23677static
23678Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
23679                                   const VexAbiInfo* vbi,
23680                                   Prefix pfx, Long delta,
23681                                   const HChar* opname, IROp op )
23682{
23683   HChar  dis_buf[50];
23684   Int    alen;
23685   IRTemp addr;
23686   IRTemp arg  = newTemp(Ity_V128);
23687   UChar  rm   = getUChar(delta);
23688   UInt   rG   = gregOfRexRM(pfx, rm);
23689   if (epartIsReg(rm)) {
23690      UInt rE = eregOfRexRM(pfx,rm);
23691      assign(arg, getXMMReg(rE));
23692      delta += 1;
23693      DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
23694   } else {
23695      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23696      assign(arg, loadLE(Ity_V128, mkexpr(addr)));
23697      delta += alen;
23698      DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
23699   }
23700   // Sqrt32Fx4 and Sqrt64Fx2 take a rounding mode, which is faked
23701   // up in the usual way.
23702   Bool needsIRRM = op == Iop_Sqrt32Fx4 || op == Iop_Sqrt64Fx2;
23703   /* XXXROUNDINGFIXME */
23704   IRExpr* res = needsIRRM ? binop(op, get_FAKE_roundingmode(), mkexpr(arg))
23705                           : unop(op, mkexpr(arg));
23706   putYMMRegLoAndZU( rG, res );
23707   *uses_vvvv = False;
23708   return delta;
23709}
23710
23711
23712/* FIXME: common up with the _128_ version above? */
23713static
23714Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
23715        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
23716        Prefix pfx, Long delta, const HChar* name,
23717        /* The actual operation.  Use either 'op' or 'opfn',
23718           but not both. */
23719        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
23720        Bool invertLeftArg,
23721        Bool swapArgs
23722     )
23723{
23724   UChar  modrm = getUChar(delta);
23725   UInt   rD    = gregOfRexRM(pfx, modrm);
23726   UInt   rSL   = getVexNvvvv(pfx);
23727   IRTemp tSL   = newTemp(Ity_V256);
23728   IRTemp tSR   = newTemp(Ity_V256);
23729   IRTemp addr  = IRTemp_INVALID;
23730   HChar  dis_buf[50];
23731   Int    alen  = 0;
23732   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);
23733
23734   assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
23735                             : getYMMReg(rSL));
23736
23737   if (epartIsReg(modrm)) {
23738      UInt rSR = eregOfRexRM(pfx, modrm);
23739      delta += 1;
23740      assign(tSR, getYMMReg(rSR));
23741      DIP("%s %s,%s,%s\n",
23742          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
23743   } else {
23744      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
23745      delta += alen;
23746      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
23747      DIP("%s %s,%s,%s\n",
23748          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
23749   }
23750
23751   IRTemp res = IRTemp_INVALID;
23752   if (op != Iop_INVALID) {
23753      vassert(opFn == NULL);
23754      res = newTemp(Ity_V256);
23755      if (requiresRMode(op)) {
23756         IRTemp rm = newTemp(Ity_I32);
23757         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
23758         assign(res, swapArgs
23759                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
23760                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
23761      } else {
23762         assign(res, swapArgs
23763                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
23764                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
23765      }
23766   } else {
23767      vassert(opFn != NULL);
23768      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
23769   }
23770
23771   putYMMReg(rD, mkexpr(res));
23772
23773   *uses_vvvv = True;
23774   return delta;
23775}
23776
23777
23778/* All-lanes AVX256 binary operation:
23779   G[255:0] = V[255:0] `op` E[255:0]
23780*/
23781static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
23782                                  const VexAbiInfo* vbi,
23783                                  Prefix pfx, Long delta,
23784                                  const HChar* opname, IROp op )
23785{
23786   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23787             uses_vvvv, vbi, pfx, delta, opname, op,
23788             NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
23789   );
23790}
23791
23792
23793/* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp
23794   for the operation, no inversion of the left arg, and no swapping of
23795   args. */
23796static
23797Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple (
23798        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
23799        Prefix pfx, Long delta, const HChar* name,
23800        IROp op
23801     )
23802{
23803   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23804             uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
23805}
23806
23807
23808/* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
23809   generator to compute the result, no inversion of the left
23810   arg, and no swapping of args. */
23811static
23812Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
23813        /*OUT*/Bool* uses_vvvv, const VexAbiInfo* vbi,
23814        Prefix pfx, Long delta, const HChar* name,
23815        IRTemp(*opFn)(IRTemp,IRTemp)
23816     )
23817{
23818   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23819             uses_vvvv, vbi, pfx, delta, name,
23820             Iop_INVALID, opFn, False, False );
23821}
23822
23823
/* Handles AVX256 unary E-to-G all-lanes operations, using the given
   IR generator to compute the result. */
23825static
23826Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
23827                               const VexAbiInfo* vbi,
23828                               Prefix pfx, Long delta,
23829                               const HChar* opname,
23830                               IRTemp (*opFn)(IRTemp) )
23831{
23832   HChar  dis_buf[50];
23833   Int    alen;
23834   IRTemp addr;
23835   IRTemp res  = newTemp(Ity_V256);
23836   IRTemp arg  = newTemp(Ity_V256);
23837   UChar  rm   = getUChar(delta);
23838   UInt   rG   = gregOfRexRM(pfx, rm);
23839   if (epartIsReg(rm)) {
23840      UInt rE = eregOfRexRM(pfx,rm);
23841      assign(arg, getYMMReg(rE));
23842      delta += 1;
23843      DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
23844   } else {
23845      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23846      assign(arg, loadLE(Ity_V256, mkexpr(addr)));
23847      delta += alen;
23848      DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
23849   }
23850   res = opFn(arg);
23851   putYMMReg( rG, mkexpr(res) );
23852   *uses_vvvv = False;
23853   return delta;
23854}
23855
23856
/* Handles AVX256 unary E-to-G all-lanes operations, using a simple
   IROp for the operation. */
23858static
23859Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
23860                                   const VexAbiInfo* vbi,
23861                                   Prefix pfx, Long delta,
23862                                   const HChar* opname, IROp op )
23863{
23864   HChar  dis_buf[50];
23865   Int    alen;
23866   IRTemp addr;
23867   IRTemp arg  = newTemp(Ity_V256);
23868   UChar  rm   = getUChar(delta);
23869   UInt   rG   = gregOfRexRM(pfx, rm);
23870   if (epartIsReg(rm)) {
23871      UInt rE = eregOfRexRM(pfx,rm);
23872      assign(arg, getYMMReg(rE));
23873      delta += 1;
23874      DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
23875   } else {
23876      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23877      assign(arg, loadLE(Ity_V256, mkexpr(addr)));
23878      delta += alen;
23879      DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
23880   }
23881   putYMMReg( rG, unop(op, mkexpr(arg)) );
23882   *uses_vvvv = False;
23883   return delta;
23884}
23885
23886
23887/* The use of ReinterpF64asI64 is ugly.  Surely could do better if we
23888   had a variant of Iop_64x4toV256 that took F64s as args instead. */
23889static Long dis_CVTDQ2PD_256 ( const VexAbiInfo* vbi, Prefix pfx,
23890                               Long delta )
23891{
23892   IRTemp addr  = IRTemp_INVALID;
23893   Int    alen  = 0;
23894   HChar  dis_buf[50];
23895   UChar  modrm = getUChar(delta);
23896   IRTemp sV    = newTemp(Ity_V128);
23897   UInt   rG    = gregOfRexRM(pfx,modrm);
23898   if (epartIsReg(modrm)) {
23899      UInt rE = eregOfRexRM(pfx,modrm);
23900      assign( sV, getXMMReg(rE) );
23901      delta += 1;
23902      DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
23903   } else {
23904      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23905      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
23906      delta += alen;
23907      DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
23908   }
23909   IRTemp s3, s2, s1, s0;
23910   s3 = s2 = s1 = s0 = IRTemp_INVALID;
23911   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
23912   IRExpr* res
23913      = IRExpr_Qop(
23914           Iop_64x4toV256,
23915           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
23916           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
23917           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
23918           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
23919        );
23920   putYMMReg(rG, res);
23921   return delta;
23922}
23923
23924
23925static Long dis_CVTPD2PS_256 ( const VexAbiInfo* vbi, Prefix pfx,
23926                               Long delta )
23927{
23928   IRTemp addr  = IRTemp_INVALID;
23929   Int    alen  = 0;
23930   HChar  dis_buf[50];
23931   UChar  modrm = getUChar(delta);
23932   UInt   rG    = gregOfRexRM(pfx,modrm);
23933   IRTemp argV  = newTemp(Ity_V256);
23934   IRTemp rmode = newTemp(Ity_I32);
23935   if (epartIsReg(modrm)) {
23936      UInt rE = eregOfRexRM(pfx,modrm);
23937      assign( argV, getYMMReg(rE) );
23938      delta += 1;
23939      DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
23940   } else {
23941      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23942      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
23943      delta += alen;
23944      DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
23945   }
23946
23947   assign( rmode, get_sse_roundingmode() );
23948   IRTemp t3, t2, t1, t0;
23949   t3 = t2 = t1 = t0 = IRTemp_INVALID;
23950   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
23951#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), \
23952                          unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
23953   putXMMRegLane32F( rG, 3, CVT(t3) );
23954   putXMMRegLane32F( rG, 2, CVT(t2) );
23955   putXMMRegLane32F( rG, 1, CVT(t1) );
23956   putXMMRegLane32F( rG, 0, CVT(t0) );
23957#  undef CVT
23958   putYMMRegLane128( rG, 1, mkV128(0) );
23959   return delta;
23960}
23961
23962
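/* Common helper for the 256-bit VPUNPCK and VPACK families.  These
   insns operate within each 128-bit lane rather than across the full
   256-bit width, so 'op' (a 128-bit interleave or narrowing op) is
   applied to the low and high V128 halves independently.  Note that
   each application is op(tR, tL), that is, with the arguments swapped
   relative to the parameter order. */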
static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRTemp tR, IROp op )
23964{
23965   IRTemp tLhi, tLlo, tRhi, tRlo;
23966   tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID;
23967   IRTemp res = newTemp(Ity_V256);
23968   breakupV256toV128s( tL, &tLhi, &tLlo );
23969   breakupV256toV128s( tR, &tRhi, &tRlo );
23970   assign( res, binop( Iop_V128HLtoV256,
23971                       binop( op, mkexpr(tRhi), mkexpr(tLhi) ),
23972                       binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) );
23973   return res;
23974}
23975
23976
23977static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR )
23978{
23979   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 );
23980}
23981
23982
23983static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR )
23984{
23985   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 );
23986}
23987
23988
23989static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR )
23990{
23991   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 );
23992}
23993
23994
23995static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR )
23996{
23997   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 );
23998}
23999
24000
24001static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR )
24002{
24003   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 );
24004}
24005
24006
24007static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR )
24008{
24009   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 );
24010}
24011
24012
24013static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR )
24014{
24015   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 );
24016}
24017
24018
24019static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR )
24020{
24021   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 );
24022}
24023
24024
24025static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR )
24026{
24027   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 );
24028}
24029
24030
24031static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR )
24032{
24033   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 );
24034}
24035
24036
24037static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR )
24038{
24039   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 );
24040}
24041
24042
24043static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR )
24044{
24045   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 );
24046}
24047
24048
24049__attribute__((noinline))
24050static
24051Long dis_ESC_0F__VEX (
24052        /*MB_OUT*/DisResult* dres,
24053        /*OUT*/   Bool*      uses_vvvv,
24054        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
24055        Bool         resteerCisOk,
24056        void*        callback_opaque,
24057        const VexArchInfo* archinfo,
24058        const VexAbiInfo*  vbi,
24059        Prefix pfx, Int sz, Long deltaIN
24060     )
24061{
24062   IRTemp addr  = IRTemp_INVALID;
24063   Int    alen  = 0;
24064   HChar  dis_buf[50];
24065   Long   delta = deltaIN;
24066   UChar  opc   = getUChar(delta);
24067   delta++;
24068   *uses_vvvv = False;
24069
24070   switch (opc) {
24071
24072   case 0x10:
24073      /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
24074      /* Move 64 bits from E (mem only) to G (lo half xmm).
24075         Bits 255-64 of the dest are zeroed out. */
24076      if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
24077         UChar modrm = getUChar(delta);
24078         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24079         UInt   rG   = gregOfRexRM(pfx,modrm);
24080         IRTemp z128 = newTemp(Ity_V128);
24081         assign(z128, mkV128(0));
24082         putXMMReg( rG, mkexpr(z128) );
24083         /* FIXME: ALIGNMENT CHECK? */
24084         putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
24085         putYMMRegLane128( rG, 1, mkexpr(z128) );
24086         DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
24087         delta += alen;
24088         goto decode_success;
24089      }
24090      /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
24091      /* Reg form. */
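      /* G[63:0] = E[63:0]; G[127:64] = V[127:64]; G[255:128] = 0. */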
24092      if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
24093         UChar modrm = getUChar(delta);
24094         UInt  rG    = gregOfRexRM(pfx, modrm);
24095         UInt  rE    = eregOfRexRM(pfx, modrm);
24096         UInt  rV    = getVexNvvvv(pfx);
24097         delta++;
24098         DIP("vmovsd %s,%s,%s\n",
24099             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24100         IRTemp res = newTemp(Ity_V128);
24101         assign(res, binop(Iop_64HLtoV128,
24102                           getXMMRegLane64(rV, 1),
24103                           getXMMRegLane64(rE, 0)));
24104         putYMMRegLoAndZU(rG, mkexpr(res));
24105         *uses_vvvv = True;
24106         goto decode_success;
24107      }
24108      /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
24109      /* Move 32 bits from E (mem only) to G (lo half xmm).
24110         Bits 255-32 of the dest are zeroed out. */
24111      if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
24112         UChar modrm = getUChar(delta);
24113         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24114         UInt   rG   = gregOfRexRM(pfx,modrm);
24115         IRTemp z128 = newTemp(Ity_V128);
24116         assign(z128, mkV128(0));
24117         putXMMReg( rG, mkexpr(z128) );
24118         /* FIXME: ALIGNMENT CHECK? */
24119         putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
24120         putYMMRegLane128( rG, 1, mkexpr(z128) );
24121         DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
24122         delta += alen;
24123         goto decode_success;
24124      }
24125      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
24126      /* Reg form. */
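      /* G[31:0] = E[31:0]; G[127:32] = V[127:32]; G[255:128] = 0. */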
24127      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
24128         UChar modrm = getUChar(delta);
24129         UInt  rG    = gregOfRexRM(pfx, modrm);
24130         UInt  rE    = eregOfRexRM(pfx, modrm);
24131         UInt  rV    = getVexNvvvv(pfx);
24132         delta++;
24133         DIP("vmovss %s,%s,%s\n",
24134             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24135         IRTemp res = newTemp(Ity_V128);
24136         assign( res, binop( Iop_64HLtoV128,
24137                             getXMMRegLane64(rV, 1),
24138                             binop(Iop_32HLto64,
24139                                   getXMMRegLane32(rV, 1),
24140                                   getXMMRegLane32(rE, 0)) ) );
24141         putYMMRegLoAndZU(rG, mkexpr(res));
24142         *uses_vvvv = True;
24143         goto decode_success;
24144      }
24145      /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
24146      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24147         UChar modrm = getUChar(delta);
24148         UInt  rG    = gregOfRexRM(pfx, modrm);
24149         if (epartIsReg(modrm)) {
24150            UInt rE = eregOfRexRM(pfx,modrm);
24151            putYMMRegLoAndZU( rG, getXMMReg( rE ));
24152            DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
24153            delta += 1;
24154         } else {
24155            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24156            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
24157            DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
24158            delta += alen;
24159         }
24160         goto decode_success;
24161      }
24162      /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
24163      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24164         UChar modrm = getUChar(delta);
24165         UInt  rG    = gregOfRexRM(pfx, modrm);
24166         if (epartIsReg(modrm)) {
24167            UInt rE = eregOfRexRM(pfx,modrm);
24168            putYMMReg( rG, getYMMReg( rE ));
24169            DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
24170            delta += 1;
24171         } else {
24172            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24173            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
24174            DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
24175            delta += alen;
24176         }
24177         goto decode_success;
24178      }
24179      /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
24180      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24181         UChar modrm = getUChar(delta);
24182         UInt  rG    = gregOfRexRM(pfx, modrm);
24183         if (epartIsReg(modrm)) {
24184            UInt rE = eregOfRexRM(pfx,modrm);
24185            putYMMRegLoAndZU( rG, getXMMReg( rE ));
24186            DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
24187            delta += 1;
24188         } else {
24189            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24190            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
24191            DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
24192            delta += alen;
24193         }
24194         goto decode_success;
24195      }
24196      /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
24197      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24198         UChar modrm = getUChar(delta);
24199         UInt  rG    = gregOfRexRM(pfx, modrm);
24200         if (epartIsReg(modrm)) {
24201            UInt rE = eregOfRexRM(pfx,modrm);
24202            putYMMReg( rG, getYMMReg( rE ));
24203            DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
24204            delta += 1;
24205         } else {
24206            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24207            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
24208            DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
24209            delta += alen;
24210         }
24211         goto decode_success;
24212      }
24213      break;
24214
24215   case 0x11:
24216      /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
24217      /* Move 64 bits from G (low half xmm) to mem only. */
24218      if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
24219         UChar modrm = getUChar(delta);
24220         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24221         UInt   rG   = gregOfRexRM(pfx,modrm);
24222         /* FIXME: ALIGNMENT CHECK? */
24223         storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
24224         DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
24225         delta += alen;
24226         goto decode_success;
24227      }
24228      /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
24229      /* Reg form. */
24230      if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
24231         UChar modrm = getUChar(delta);
24232         UInt  rG    = gregOfRexRM(pfx, modrm);
24233         UInt  rE    = eregOfRexRM(pfx, modrm);
24234         UInt  rV    = getVexNvvvv(pfx);
24235         delta++;
24236         DIP("vmovsd %s,%s,%s\n",
24237             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24238         IRTemp res = newTemp(Ity_V128);
24239         assign(res, binop(Iop_64HLtoV128,
24240                           getXMMRegLane64(rV, 1),
24241                           getXMMRegLane64(rE, 0)));
24242         putYMMRegLoAndZU(rG, mkexpr(res));
24243         *uses_vvvv = True;
24244         goto decode_success;
24245      }
      /* VMOVSS xmm1, m32 = VEX.LIG.F3.0F.WIG 11 /r */
24247      /* Move 32 bits from G (low 1/4 xmm) to mem only. */
24248      if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
24249         UChar modrm = getUChar(delta);
24250         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24251         UInt   rG   = gregOfRexRM(pfx,modrm);
24252         /* FIXME: ALIGNMENT CHECK? */
24253         storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
24254         DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
24255         delta += alen;
24256         goto decode_success;
24257      }
24258      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
24259      /* Reg form. */
24260      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
24261         UChar modrm = getUChar(delta);
24262         UInt  rG    = gregOfRexRM(pfx, modrm);
24263         UInt  rE    = eregOfRexRM(pfx, modrm);
24264         UInt  rV    = getVexNvvvv(pfx);
24265         delta++;
24266         DIP("vmovss %s,%s,%s\n",
24267             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24268         IRTemp res = newTemp(Ity_V128);
24269         assign( res, binop( Iop_64HLtoV128,
24270                             getXMMRegLane64(rV, 1),
24271                             binop(Iop_32HLto64,
24272                                   getXMMRegLane32(rV, 1),
24273                                   getXMMRegLane32(rE, 0)) ) );
24274         putYMMRegLoAndZU(rG, mkexpr(res));
24275         *uses_vvvv = True;
24276         goto decode_success;
24277      }
24278      /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
24279      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24280         UChar modrm = getUChar(delta);
24281         UInt  rG    = gregOfRexRM(pfx,modrm);
24282         if (epartIsReg(modrm)) {
24283            UInt rE = eregOfRexRM(pfx,modrm);
24284            putYMMRegLoAndZU( rE, getXMMReg(rG) );
24285            DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24286            delta += 1;
24287         } else {
24288            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24289            storeLE( mkexpr(addr), getXMMReg(rG) );
24290            DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
24291            delta += alen;
24292         }
24293         goto decode_success;
24294      }
24295      /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
24296      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24297         UChar modrm = getUChar(delta);
24298         UInt  rG    = gregOfRexRM(pfx,modrm);
24299         if (epartIsReg(modrm)) {
24300            UInt rE = eregOfRexRM(pfx,modrm);
24301            putYMMReg( rE, getYMMReg(rG) );
24302            DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24303            delta += 1;
24304         } else {
24305            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24306            storeLE( mkexpr(addr), getYMMReg(rG) );
24307            DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
24308            delta += alen;
24309         }
24310         goto decode_success;
24311      }
24312      /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
24313      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24314         UChar modrm = getUChar(delta);
24315         UInt  rG    = gregOfRexRM(pfx,modrm);
24316         if (epartIsReg(modrm)) {
24317            UInt rE = eregOfRexRM(pfx,modrm);
24318            putYMMRegLoAndZU( rE, getXMMReg(rG) );
24319            DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24320            delta += 1;
24321         } else {
24322            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24323            storeLE( mkexpr(addr), getXMMReg(rG) );
24324            DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
24325            delta += alen;
24326         }
24327         goto decode_success;
24328      }
24329      /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
24330      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24331         UChar modrm = getUChar(delta);
24332         UInt  rG    = gregOfRexRM(pfx,modrm);
24333         if (epartIsReg(modrm)) {
24334            UInt rE = eregOfRexRM(pfx,modrm);
24335            putYMMReg( rE, getYMMReg(rG) );
24336            DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24337            delta += 1;
24338         } else {
24339            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24340            storeLE( mkexpr(addr), getYMMReg(rG) );
24341            DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
24342            delta += alen;
24343         }
24344         goto decode_success;
24345      }
24346      break;
24347
24348   case 0x12:
      /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG 12 /r */
24350      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24351         delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
24352         goto decode_success;
24353      }
      /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 12 /r */
24355      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24356         delta = dis_MOVDDUP_256( vbi, pfx, delta );
24357         goto decode_success;
24358      }
24359      /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
24360      /* Insn only exists in reg form */
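      /* G[63:0] = E[127:64]; G[127:64] = V[127:64]; G[255:128] = 0. */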
24361      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
24362          && epartIsReg(getUChar(delta))) {
24363         UChar modrm = getUChar(delta);
24364         UInt  rG    = gregOfRexRM(pfx, modrm);
24365         UInt  rE    = eregOfRexRM(pfx, modrm);
24366         UInt  rV    = getVexNvvvv(pfx);
24367         delta++;
24368         DIP("vmovhlps %s,%s,%s\n",
24369             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24370         IRTemp res = newTemp(Ity_V128);
24371         assign(res, binop(Iop_64HLtoV128,
24372                           getXMMRegLane64(rV, 1),
24373                           getXMMRegLane64(rE, 1)));
24374         putYMMRegLoAndZU(rG, mkexpr(res));
24375         *uses_vvvv = True;
24376         goto decode_success;
24377      }
24378      /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
24379      /* Insn exists only in mem form, it appears. */
24380      /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
24381      /* Insn exists only in mem form, it appears. */
24382      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24383          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24384         UChar modrm = getUChar(delta);
24385         UInt  rG    = gregOfRexRM(pfx, modrm);
24386         UInt  rV    = getVexNvvvv(pfx);
24387         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24388         delta += alen;
         DIP("vmovlp%c %s,%s,%s\n", have66noF2noF3(pfx) ? 'd' : 's',
             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
24391         IRTemp res = newTemp(Ity_V128);
24392         assign(res, binop(Iop_64HLtoV128,
24393                           getXMMRegLane64(rV, 1),
24394                           loadLE(Ity_I64, mkexpr(addr))));
24395         putYMMRegLoAndZU(rG, mkexpr(res));
24396         *uses_vvvv = True;
24397         goto decode_success;
24398      }
24399      /* VMOVSLDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 12 /r */
24400      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
24401         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
24402                                   True/*isL*/ );
24403         goto decode_success;
24404      }
24405      /* VMOVSLDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 12 /r */
24406      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
24407         delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
24408         goto decode_success;
24409      }
24410      break;
24411
24412   case 0x13:
24413      /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
24414      /* Insn exists only in mem form, it appears. */
24415      /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
24416      /* Insn exists only in mem form, it appears. */
24417      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24418          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24419         UChar modrm = getUChar(delta);
24420         UInt  rG    = gregOfRexRM(pfx, modrm);
24421         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24422         delta += alen;
24423         storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
         DIP("vmovlp%c %s,%s\n", have66noF2noF3(pfx) ? 'd' : 's',
             nameXMMReg(rG), dis_buf);
24425         goto decode_success;
24426      }
24427      break;
24428
24429   case 0x14:
24430   case 0x15:
24431      /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
24432      /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
24433      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24434         Bool   hi    = opc == 0x15;
24435         UChar  modrm = getUChar(delta);
24436         UInt   rG    = gregOfRexRM(pfx,modrm);
24437         UInt   rV    = getVexNvvvv(pfx);
24438         IRTemp eV    = newTemp(Ity_V128);
24439         IRTemp vV    = newTemp(Ity_V128);
24440         assign( vV, getXMMReg(rV) );
24441         if (epartIsReg(modrm)) {
24442            UInt rE = eregOfRexRM(pfx,modrm);
24443            assign( eV, getXMMReg(rE) );
24444            delta += 1;
            DIP("vunpck%sps %s,%s,%s\n", hi ? "h" : "l",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24447         } else {
24448            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24449            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
24450            delta += alen;
            DIP("vunpck%sps %s,%s,%s\n", hi ? "h" : "l",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
24453         }
24454         IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
24455         putYMMRegLoAndZU( rG, mkexpr(res) );
24456         *uses_vvvv = True;
24457         goto decode_success;
24458      }
24459      /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
24460      /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
24461      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24462         Bool   hi    = opc == 0x15;
24463         UChar  modrm = getUChar(delta);
24464         UInt   rG    = gregOfRexRM(pfx,modrm);
24465         UInt   rV    = getVexNvvvv(pfx);
24466         IRTemp eV    = newTemp(Ity_V256);
24467         IRTemp vV    = newTemp(Ity_V256);
24468         assign( vV, getYMMReg(rV) );
24469         if (epartIsReg(modrm)) {
24470            UInt rE = eregOfRexRM(pfx,modrm);
24471            assign( eV, getYMMReg(rE) );
24472            delta += 1;
            DIP("vunpck%sps %s,%s,%s\n", hi ? "h" : "l",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
24475         } else {
24476            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24477            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
24478            delta += alen;
            DIP("vunpck%sps %s,%s,%s\n", hi ? "h" : "l",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
24481         }
24482         IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
24483         putYMMReg( rG, mkexpr(res) );
24484         *uses_vvvv = True;
24485         goto decode_success;
24486      }
24487      /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
24488      /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
24489      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24490         Bool   hi    = opc == 0x15;
24491         UChar  modrm = getUChar(delta);
24492         UInt   rG    = gregOfRexRM(pfx,modrm);
24493         UInt   rV    = getVexNvvvv(pfx);
24494         IRTemp eV    = newTemp(Ity_V128);
24495         IRTemp vV    = newTemp(Ity_V128);
24496         assign( vV, getXMMReg(rV) );
24497         if (epartIsReg(modrm)) {
24498            UInt rE = eregOfRexRM(pfx,modrm);
24499            assign( eV, getXMMReg(rE) );
24500            delta += 1;
24501            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
24502                nameXMMReg(rE), nameXMMReg(rG));
24503         } else {
24504            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24505            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
24506            delta += alen;
24507            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
24508                dis_buf, nameXMMReg(rG));
24509         }
24510         IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
24511         putYMMRegLoAndZU( rG, mkexpr(res) );
24512         *uses_vvvv = True;
24513         goto decode_success;
24514      }
24515      /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
24516      /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
24517      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24518         Bool   hi    = opc == 0x15;
24519         UChar  modrm = getUChar(delta);
24520         UInt   rG    = gregOfRexRM(pfx,modrm);
24521         UInt   rV    = getVexNvvvv(pfx);
24522         IRTemp eV    = newTemp(Ity_V256);
24523         IRTemp vV    = newTemp(Ity_V256);
24524         assign( vV, getYMMReg(rV) );
24525         if (epartIsReg(modrm)) {
24526            UInt rE = eregOfRexRM(pfx,modrm);
24527            assign( eV, getYMMReg(rE) );
24528            delta += 1;
24529            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
24530                nameYMMReg(rE), nameYMMReg(rG));
24531         } else {
24532            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24533            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
24534            delta += alen;
24535            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
24536                dis_buf, nameYMMReg(rG));
24537         }
24538         IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
24539         putYMMReg( rG, mkexpr(res) );
24540         *uses_vvvv = True;
24541         goto decode_success;
24542      }
24543      break;
24544
24545   case 0x16:
24546      /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
24547      /* Insn only exists in reg form */
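      /* Effect: dst[63:0] = rV[63:0], dst[127:64] = rE[63:0], hence
         the Iop_64HLtoV128(rE.lo64, rV.lo64) below (the first arg is
         the high half).  The upper YMM lane of rG is zeroed, as for
         all VEX.128 encodings. */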
24548      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
24549          && epartIsReg(getUChar(delta))) {
24550         UChar modrm = getUChar(delta);
24551         UInt  rG    = gregOfRexRM(pfx, modrm);
24552         UInt  rE    = eregOfRexRM(pfx, modrm);
24553         UInt  rV    = getVexNvvvv(pfx);
24554         delta++;
24555         DIP("vmovlhps %s,%s,%s\n",
24556             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
24557         IRTemp res = newTemp(Ity_V128);
24558         assign(res, binop(Iop_64HLtoV128,
24559                           getXMMRegLane64(rE, 0),
24560                           getXMMRegLane64(rV, 0)));
24561         putYMMRegLoAndZU(rG, mkexpr(res));
24562         *uses_vvvv = True;
24563         goto decode_success;
24564      }
24565      /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
24566      /* Insn exists only in mem form, it appears. */
24567      /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
24568      /* Insn exists only in mem form, it appears. */
24569      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24570          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24571         UChar modrm = getUChar(delta);
24572         UInt  rG    = gregOfRexRM(pfx, modrm);
24573         UInt  rV    = getVexNvvvv(pfx);
24574         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24575         delta += alen;
24576         DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
24577             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
24578         IRTemp res = newTemp(Ity_V128);
24579         assign(res, binop(Iop_64HLtoV128,
24580                           loadLE(Ity_I64, mkexpr(addr)),
24581                           getXMMRegLane64(rV, 0)));
24582         putYMMRegLoAndZU(rG, mkexpr(res));
24583         *uses_vvvv = True;
24584         goto decode_success;
24585      }
24586      /* VMOVSHDUP xmm2/m128, xmm1 = VEX.NDS.128.F3.0F.WIG 16 /r */
24587      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
24588         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
24589                                   False/*!isL*/ );
24590         goto decode_success;
24591      }
24592      /* VMOVSHDUP ymm2/m256, ymm1 = VEX.NDS.256.F3.0F.WIG 16 /r */
24593      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
24594         delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
24595         goto decode_success;
24596      }
24597      break;
24598
24599   case 0x17:
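      /* Stores the high 64-bit lane of the XMM register; the dual of
         case 0x16, which loads it. */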
24600      /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
24601      /* Insn exists only in mem form, it appears. */
24602      /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
24603      /* Insn exists only in mem form, it appears. */
24604      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24605          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24606         UChar modrm = getUChar(delta);
24607         UInt  rG    = gregOfRexRM(pfx, modrm);
24608         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24609         delta += alen;
24610         storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
24611         DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
24612             nameXMMReg(rG), dis_buf);
24613         goto decode_success;
24614      }
24615      break;
24616
24617   case 0x28:
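      /* Aligned moves: cases 0x28 (load) and 0x29 (store) trap with
         SIGSEGV if the memory operand is not 16- (xmm) resp. 32-byte
         (ymm) aligned -- see the gen_SEGV_if_not_*_aligned calls on
         the memory paths.  Register-to-register forms need no
         check. */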
24618      /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
24619      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24620         UChar modrm = getUChar(delta);
24621         UInt  rG    = gregOfRexRM(pfx, modrm);
24622         if (epartIsReg(modrm)) {
24623            UInt rE = eregOfRexRM(pfx,modrm);
24624            putYMMRegLoAndZU( rG, getXMMReg( rE ));
24625            DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
24626            delta += 1;
24627         } else {
24628            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24629            gen_SEGV_if_not_16_aligned( addr );
24630            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
24631            DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
24632            delta += alen;
24633         }
24634         goto decode_success;
24635      }
24636      /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
24637      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24638         UChar modrm = getUChar(delta);
24639         UInt  rG    = gregOfRexRM(pfx, modrm);
24640         if (epartIsReg(modrm)) {
24641            UInt rE = eregOfRexRM(pfx,modrm);
24642            putYMMReg( rG, getYMMReg( rE ));
24643            DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
24644            delta += 1;
24645         } else {
24646            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24647            gen_SEGV_if_not_32_aligned( addr );
24648            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
24649            DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
24650            delta += alen;
24651         }
24652         goto decode_success;
24653      }
24654      /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
24655      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24656         UChar modrm = getUChar(delta);
24657         UInt  rG    = gregOfRexRM(pfx, modrm);
24658         if (epartIsReg(modrm)) {
24659            UInt rE = eregOfRexRM(pfx,modrm);
24660            putYMMRegLoAndZU( rG, getXMMReg( rE ));
24661            DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
24662            delta += 1;
24663         } else {
24664            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24665            gen_SEGV_if_not_16_aligned( addr );
24666            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
24667            DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
24668            delta += alen;
24669         }
24670         goto decode_success;
24671      }
24672      /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
24673      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24674         UChar modrm = getUChar(delta);
24675         UInt  rG    = gregOfRexRM(pfx, modrm);
24676         if (epartIsReg(modrm)) {
24677            UInt rE = eregOfRexRM(pfx,modrm);
24678            putYMMReg( rG, getYMMReg( rE ));
24679            DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
24680            delta += 1;
24681         } else {
24682            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24683            gen_SEGV_if_not_32_aligned( addr );
24684            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
24685            DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
24686            delta += alen;
24687         }
24688         goto decode_success;
24689      }
24690      break;
24691
24692   case 0x29:
24693      /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
24694      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24695         UChar modrm = getUChar(delta);
24696         UInt  rG    = gregOfRexRM(pfx,modrm);
24697         if (epartIsReg(modrm)) {
24698            UInt rE = eregOfRexRM(pfx,modrm);
24699            putYMMRegLoAndZU( rE, getXMMReg(rG) );
24700            DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24701            delta += 1;
24702         } else {
24703            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24704            gen_SEGV_if_not_16_aligned( addr );
24705            storeLE( mkexpr(addr), getXMMReg(rG) );
24706            DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
24707            delta += alen;
24708         }
24709         goto decode_success;
24710      }
24711      /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
24712      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24713         UChar modrm = getUChar(delta);
24714         UInt  rG    = gregOfRexRM(pfx,modrm);
24715         if (epartIsReg(modrm)) {
24716            UInt rE = eregOfRexRM(pfx,modrm);
24717            putYMMReg( rE, getYMMReg(rG) );
24718            DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24719            delta += 1;
24720         } else {
24721            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24722            gen_SEGV_if_not_32_aligned( addr );
24723            storeLE( mkexpr(addr), getYMMReg(rG) );
24724            DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
24725            delta += alen;
24726         }
24727         goto decode_success;
24728      }
24729      /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
24730      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24731         UChar modrm = getUChar(delta);
24732         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMRegLoAndZU( rE, getXMMReg(rG) );
            DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(rG) );
            DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
            delta += alen;
         }
         goto decode_success;
24747      }
24748      /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
24749      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24750         UChar modrm = getUChar(delta);
24751         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            putYMMReg( rE, getYMMReg(rG) );
            DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            storeLE( mkexpr(addr), getYMMReg(rG) );
            DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
            delta += alen;
         }
         goto decode_success;
24766      }
24767      break;
24768
24769   case 0x2A: {
24770      IRTemp rmode = newTemp(Ity_I32);
24771      assign( rmode, get_sse_roundingmode() );
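      /* The SSE rounding mode (from the guest's MXCSR) is captured
         once here and reused below.  I32->F64 widening is always
         exact and so needs no rounding mode; the I64->F64 and the
         int->F32 conversions can lose precision and therefore take
         one. */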
24772      /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
24773      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
24774         UChar  modrm = getUChar(delta);
24775         UInt   rV    = getVexNvvvv(pfx);
24776         UInt   rD    = gregOfRexRM(pfx, modrm);
24777         IRTemp arg32 = newTemp(Ity_I32);
24778         if (epartIsReg(modrm)) {
24779            UInt rS = eregOfRexRM(pfx,modrm);
24780            assign( arg32, getIReg32(rS) );
24781            delta += 1;
24782            DIP("vcvtsi2sdl %s,%s,%s\n",
24783                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
24784         } else {
24785            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24786            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
24787            delta += alen;
24788            DIP("vcvtsi2sdl %s,%s,%s\n",
24789                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24790         }
24791         putXMMRegLane64F( rD, 0,
24792                           unop(Iop_I32StoF64, mkexpr(arg32)));
24793         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24794         putYMMRegLane128( rD, 1, mkV128(0) );
24795         *uses_vvvv = True;
24796         goto decode_success;
24797      }
24798      /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
24799      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
24800         UChar  modrm = getUChar(delta);
24801         UInt   rV    = getVexNvvvv(pfx);
24802         UInt   rD    = gregOfRexRM(pfx, modrm);
24803         IRTemp arg64 = newTemp(Ity_I64);
24804         if (epartIsReg(modrm)) {
24805            UInt rS = eregOfRexRM(pfx,modrm);
24806            assign( arg64, getIReg64(rS) );
24807            delta += 1;
24808            DIP("vcvtsi2sdq %s,%s,%s\n",
24809                nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
24810         } else {
24811            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24812            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
24813            delta += alen;
24814            DIP("vcvtsi2sdq %s,%s,%s\n",
24815                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24816         }
24817         putXMMRegLane64F( rD, 0,
24818                           binop( Iop_I64StoF64,
                                  mkexpr(rmode),
24820                                  mkexpr(arg64)) );
24821         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24822         putYMMRegLane128( rD, 1, mkV128(0) );
24823         *uses_vvvv = True;
24824         goto decode_success;
24825      }
24826      /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
24827      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
24828         UChar  modrm = getUChar(delta);
24829         UInt   rV    = getVexNvvvv(pfx);
24830         UInt   rD    = gregOfRexRM(pfx, modrm);
24831         IRTemp arg64 = newTemp(Ity_I64);
24832         if (epartIsReg(modrm)) {
24833            UInt rS = eregOfRexRM(pfx,modrm);
24834            assign( arg64, getIReg64(rS) );
24835            delta += 1;
24836            DIP("vcvtsi2ssq %s,%s,%s\n",
24837                nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
24838         } else {
24839            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24840            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
24841            delta += alen;
24842            DIP("vcvtsi2ssq %s,%s,%s\n",
24843                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24844         }
24845         putXMMRegLane32F( rD, 0,
24846                           binop(Iop_F64toF32,
24847                                 mkexpr(rmode),
24848                                 binop(Iop_I64StoF64, mkexpr(rmode),
24849                                                      mkexpr(arg64)) ) );
24850         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
24851         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24852         putYMMRegLane128( rD, 1, mkV128(0) );
24853         *uses_vvvv = True;
24854         goto decode_success;
24855      }
24856      /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
24857      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
24858         UChar  modrm = getUChar(delta);
24859         UInt   rV    = getVexNvvvv(pfx);
24860         UInt   rD    = gregOfRexRM(pfx, modrm);
24861         IRTemp arg32 = newTemp(Ity_I32);
24862         if (epartIsReg(modrm)) {
24863            UInt rS = eregOfRexRM(pfx,modrm);
24864            assign( arg32, getIReg32(rS) );
24865            delta += 1;
24866            DIP("vcvtsi2ssl %s,%s,%s\n",
24867                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
24868         } else {
24869            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24870            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
24871            delta += alen;
24872            DIP("vcvtsi2ssl %s,%s,%s\n",
24873                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24874         }
24875         putXMMRegLane32F( rD, 0,
24876                           binop(Iop_F64toF32,
24877                                 mkexpr(rmode),
24878                                 unop(Iop_I32StoF64, mkexpr(arg32)) ) );
24879         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
24880         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24881         putYMMRegLane128( rD, 1, mkV128(0) );
24882         *uses_vvvv = True;
24883         goto decode_success;
24884      }
24885      break;
24886   }
24887
24888   case 0x2B:
24889      /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
24890      /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
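      /* The non-temporal hint is not modelled: the store becomes an
         ordinary little-endian store.  The architectural alignment
         requirement is still enforced, though. */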
24891      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24892          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24893         UChar  modrm = getUChar(delta);
24894         UInt   rS    = gregOfRexRM(pfx, modrm);
24895         IRTemp tS    = newTemp(Ity_V128);
24896         assign(tS, getXMMReg(rS));
24897         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24898         delta += alen;
24899         gen_SEGV_if_not_16_aligned(addr);
24900         storeLE(mkexpr(addr), mkexpr(tS));
24901         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
24902             nameXMMReg(rS), dis_buf);
24903         goto decode_success;
24904      }
24905      /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
24906      /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
24907      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24908          && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
24909         UChar  modrm = getUChar(delta);
24910         UInt   rS    = gregOfRexRM(pfx, modrm);
24911         IRTemp tS    = newTemp(Ity_V256);
24912         assign(tS, getYMMReg(rS));
24913         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24914         delta += alen;
24915         gen_SEGV_if_not_32_aligned(addr);
24916         storeLE(mkexpr(addr), mkexpr(tS));
24917         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
24918             nameYMMReg(rS), dis_buf);
24919         goto decode_success;
24920      }
24921      break;
24922
24923   case 0x2C:
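      /* 0x2C is the truncating (round-to-zero) conversion; 0x2D
         below honours MXCSR.RC instead.  The final argument of the
         helpers is the destination integer width in bytes. */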
      /* VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */
24925      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
24926         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
24927         goto decode_success;
24928      }
24929      /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
24930      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
24931         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
24932         goto decode_success;
24933      }
24934      /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
24935      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
24936         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
24937         goto decode_success;
24938      }
      /* VCVTTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2C /r */
24940      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
24941         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
24942         goto decode_success;
24943      }
24944      break;
24945
24946   case 0x2D:
      /* VCVTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2D /r */
24948      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
24949         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
24950         goto decode_success;
24951      }
24952      /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
24953      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
24954         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
24955         goto decode_success;
24956      }
24957      /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
24958      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
24959         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
24960         goto decode_success;
24961      }
      /* VCVTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2D /r */
24963      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
24964         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
24965         goto decode_success;
24966      }
24967      break;
24968
24969   case 0x2E:
24970   case 0x2F:
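      /* 2E (VUCOMISx) and 2F (VCOMISx) differ only in NaN handling:
         COMIS raises #IA on any NaN operand, UCOMIS only on a
         signalling NaN.  Both write ZF/PF/CF from the comparison and
         clear OF/SF/AF. */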
24971      /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
24972      /* VCOMISD  xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
24973      if (have66noF2noF3(pfx)) {
24974         delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
24975         goto decode_success;
24976      }
24977      /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
24978      /* VCOMISS xmm2/m32, xmm1  = VEX.LIG.0F.WIG 2F /r */
24979      if (haveNo66noF2noF3(pfx)) {
24980         delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
24981         goto decode_success;
24982      }
24983      break;
24984
24985   case 0x50:
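      /* MOVMSKPx collects the sign bit of each lane into the low
         bits of a 32-bit GPR: 2 bits for PD.128, 4 for PD.256 and
         PS.128, 8 for PS.256. */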
24986      /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
24987      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24988         delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
24989         goto decode_success;
24990      }
24991      /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
24992      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24993         delta = dis_MOVMSKPD_256( vbi, pfx, delta );
24994         goto decode_success;
24995      }
24996      /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
24997      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24998         delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
24999         goto decode_success;
25000      }
25001      /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
25002      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25003         delta = dis_MOVMSKPS_256( vbi, pfx, delta );
25004         goto decode_success;
25005      }
25006      break;
25007
25008   case 0x51:
      /* VSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
25010      if (haveF3no66noF2(pfx)) {
25011         delta = dis_AVX128_E_V_to_G_lo32_unary(
25012                    uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
25013         goto decode_success;
25014      }
      /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 51 /r */
25016      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25017         delta = dis_AVX128_E_to_G_unary_all(
25018                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
25019         goto decode_success;
25020      }
      /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 51 /r */
25022      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25023         delta = dis_AVX256_E_to_G_unary_all(
25024                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
25025         goto decode_success;
25026      }
25027      /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
25028      if (haveF2no66noF3(pfx)) {
25029         delta = dis_AVX128_E_V_to_G_lo64_unary(
25030                    uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
25031         goto decode_success;
25032      }
      /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.128.66.0F.WIG 51 /r */
25034      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25035         delta = dis_AVX128_E_to_G_unary_all(
25036                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
25037         goto decode_success;
25038      }
      /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.256.66.0F.WIG 51 /r */
25040      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25041         delta = dis_AVX256_E_to_G_unary_all(
25042                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
25043         goto decode_success;
25044      }
25045      break;
25046
25047   case 0x52:
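      /* The estimate ops here and in case 0x53 are approximations:
         the architecture guarantees only a relative error of at most
         1.5 * 2^-12, hence the Est suffix on the IR ops. */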
      /* VRSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
25049      if (haveF3no66noF2(pfx)) {
25050         delta = dis_AVX128_E_V_to_G_lo32_unary(
25051                    uses_vvvv, vbi, pfx, delta, "vrsqrtss",
25052                    Iop_RSqrtEst32F0x4 );
25053         goto decode_success;
25054      }
      /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 52 /r */
25056      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25057         delta = dis_AVX128_E_to_G_unary_all(
25058                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx4 );
25059         goto decode_success;
25060      }
      /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 52 /r */
25062      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25063         delta = dis_AVX256_E_to_G_unary_all(
25064                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrtEst32Fx8 );
25065         goto decode_success;
25066      }
25067      break;
25068
25069   case 0x53:
      /* VRCPSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
25071      if (haveF3no66noF2(pfx)) {
25072         delta = dis_AVX128_E_V_to_G_lo32_unary(
25073                    uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_RecipEst32F0x4 );
25074         goto decode_success;
25075      }
      /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 53 /r */
25077      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25078         delta = dis_AVX128_E_to_G_unary_all(
25079                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx4 );
25080         goto decode_success;
25081      }
      /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 53 /r */
25083      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25084         delta = dis_AVX256_E_to_G_unary_all(
25085                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_RecipEst32Fx8 );
25086         goto decode_success;
25087      }
25088      break;
25089
25090   case 0x54:
25091      /* VANDPD r/m, rV, r ::: r = rV & r/m */
25092      /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
25093      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25094         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25095                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
25096         goto decode_success;
25097      }
25098      /* VANDPD r/m, rV, r ::: r = rV & r/m */
25099      /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
25100      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25101         delta = dis_AVX256_E_V_to_G(
25102                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
25103         goto decode_success;
25104      }
25105      /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
25106      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25107         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25108                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
25109         goto decode_success;
25110      }
25111      /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
25112      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25113         delta = dis_AVX256_E_V_to_G(
25114                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
25115         goto decode_success;
25116      }
25117      break;
25118
25119   case 0x55:
25120      /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
25121      /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
25122      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25123         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25124                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128,
25125                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
25126         goto decode_success;
25127      }
25128      /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
25129      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25130         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
25131                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256,
25132                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
25133         goto decode_success;
25134      }
25135      /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
25136      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25137         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25138                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128,
25139                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
25140         goto decode_success;
25141      }
25142      /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
25143      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25144         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
25145                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256,
25146                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
25147         goto decode_success;
25148      }
25149      break;
25150
25151   case 0x56:
25152      /* VORPD r/m, rV, r ::: r = rV | r/m */
25153      /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
25154      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25155         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25156                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
25157         goto decode_success;
25158      }
25159      /* VORPD r/m, rV, r ::: r = rV | r/m */
25160      /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
25161      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25162         delta = dis_AVX256_E_V_to_G(
25163                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
25164         goto decode_success;
25165      }
25166      /* VORPS r/m, rV, r ::: r = rV | r/m */
25167      /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
25168      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25169         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25170                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
25171         goto decode_success;
25172      }
25173      /* VORPS r/m, rV, r ::: r = rV | r/m */
25174      /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
25175      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25176         delta = dis_AVX256_E_V_to_G(
25177                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
25178         goto decode_success;
25179      }
25180      break;
25181
25182   case 0x57:
25183      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
25184      /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
25185      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25186         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25187                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
25188         goto decode_success;
25189      }
25190      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
25191      /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
25192      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25193         delta = dis_AVX256_E_V_to_G(
25194                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
25195         goto decode_success;
25196      }
25197      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
25198      /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
25199      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25200         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25201                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
25202         goto decode_success;
25203      }
25204      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
25205      /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
25206      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25207         delta = dis_AVX256_E_V_to_G(
25208                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
25209         goto decode_success;
25210      }
25211      break;
25212
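   /* Cases 0x58, 0x59 and 0x5C .. 0x5F below all follow the same
      prefix-driven pattern: F2 selects the scalar-double (sd) form,
      F3 the scalar-single (ss) form, 66 the packed-double (pd) form
      and no prefix the packed-single (ps) form, with VEX.L choosing
      128- or 256-bit vectors for the packed variants.  All dispatch
      through the generic E_V_to_G helpers. */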
25213   case 0x58:
25214      /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
25215      if (haveF2no66noF3(pfx)) {
25216         delta = dis_AVX128_E_V_to_G_lo64(
25217                    uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
25218         goto decode_success;
25219      }
25220      /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
25221      if (haveF3no66noF2(pfx)) {
25222         delta = dis_AVX128_E_V_to_G_lo32(
25223                    uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
25224         goto decode_success;
25225      }
25226      /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
25227      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25228         delta = dis_AVX128_E_V_to_G(
25229                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
25230         goto decode_success;
25231      }
25232      /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
25233      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25234         delta = dis_AVX256_E_V_to_G(
25235                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
25236         goto decode_success;
25237      }
25238      /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
25239      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25240         delta = dis_AVX128_E_V_to_G(
25241                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
25242         goto decode_success;
25243      }
25244      /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
25245      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25246         delta = dis_AVX256_E_V_to_G(
25247                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
25248         goto decode_success;
25249      }
25250      break;
25251
25252   case 0x59:
25253      /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
25254      if (haveF2no66noF3(pfx)) {
25255         delta = dis_AVX128_E_V_to_G_lo64(
25256                    uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
25257         goto decode_success;
25258      }
25259      /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
25260      if (haveF3no66noF2(pfx)) {
25261         delta = dis_AVX128_E_V_to_G_lo32(
25262                    uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
25263         goto decode_success;
25264      }
25265      /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
25266      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25267         delta = dis_AVX128_E_V_to_G(
25268                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
25269         goto decode_success;
25270      }
25271      /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
25272      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25273         delta = dis_AVX256_E_V_to_G(
25274                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
25275         goto decode_success;
25276      }
25277      /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
25278      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25279         delta = dis_AVX128_E_V_to_G(
25280                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
25281         goto decode_success;
25282      }
25283      /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
25284      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25285         delta = dis_AVX256_E_V_to_G(
25286                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
25287         goto decode_success;
25288      }
25289      break;
25290
25291   case 0x5A:
25292      /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
25293      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25294         delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
25295         goto decode_success;
25296      }
25297      /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
25298      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25299         delta = dis_CVTPS2PD_256( vbi, pfx, delta );
25300         goto decode_success;
25301      }
25302      /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
25303      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25304         delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
25305         goto decode_success;
25306      }
25307      /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
25308      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25309         delta = dis_CVTPD2PS_256( vbi, pfx, delta );
25310         goto decode_success;
25311      }
25312      /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
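      /* The F64->F32 narrowing can lose precision, so Iop_F64toF32
         takes the current MXCSR rounding mode.  The widening SS->SD
         conversion in the next arm is exact and needs none. */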
25313      if (haveF2no66noF3(pfx)) {
25314         UChar  modrm = getUChar(delta);
25315         UInt   rV    = getVexNvvvv(pfx);
25316         UInt   rD    = gregOfRexRM(pfx, modrm);
25317         IRTemp f64lo = newTemp(Ity_F64);
25318         IRTemp rmode = newTemp(Ity_I32);
25319         assign( rmode, get_sse_roundingmode() );
25320         if (epartIsReg(modrm)) {
25321            UInt rS = eregOfRexRM(pfx,modrm);
25322            assign(f64lo, getXMMRegLane64F(rS, 0));
25323            delta += 1;
25324            DIP("vcvtsd2ss %s,%s,%s\n",
25325                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
25326         } else {
25327            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
25328            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
25329            delta += alen;
25330            DIP("vcvtsd2ss %s,%s,%s\n",
25331                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
25332         }
25333         putXMMRegLane32F( rD, 0,
25334                           binop( Iop_F64toF32, mkexpr(rmode),
25335                                                mkexpr(f64lo)) );
25336         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
25337         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
25338         putYMMRegLane128( rD, 1, mkV128(0) );
25339         *uses_vvvv = True;
25340         goto decode_success;
25341      }
25342      /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
25343      if (haveF3no66noF2(pfx)) {
25344         UChar  modrm = getUChar(delta);
25345         UInt   rV    = getVexNvvvv(pfx);
25346         UInt   rD    = gregOfRexRM(pfx, modrm);
25347         IRTemp f32lo = newTemp(Ity_F32);
25348         if (epartIsReg(modrm)) {
25349            UInt rS = eregOfRexRM(pfx,modrm);
25350            assign(f32lo, getXMMRegLane32F(rS, 0));
25351            delta += 1;
25352            DIP("vcvtss2sd %s,%s,%s\n",
25353                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
25354         } else {
25355            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
25356            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
25357            delta += alen;
25358            DIP("vcvtss2sd %s,%s,%s\n",
25359                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
25360         }
25361         putXMMRegLane64F( rD, 0,
25362                           unop( Iop_F32toF64, mkexpr(f32lo)) );
25363         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
25364         putYMMRegLane128( rD, 1, mkV128(0) );
25365         *uses_vvvv = True;
25366         goto decode_success;
25367      }
25368      break;
25369
25370   case 0x5B:
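      /* The r2zero flag distinguishes VCVTPS2DQ (False: round per
         MXCSR.RC) from VCVTTPS2DQ (True: truncate towards zero);
         VCVTDQ2PS at the end is the int->float direction. */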
25371      /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
25372      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25373         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
25374                                    True/*isAvx*/, False/*!r2zero*/ );
25375         goto decode_success;
25376      }
25377      /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
25378      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25379         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
25380                                    False/*!r2zero*/ );
25381         goto decode_success;
25382      }
25383      /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
25384      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
25385         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
25386                                    True/*isAvx*/, True/*r2zero*/ );
25387         goto decode_success;
25388      }
25389      /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
25390      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
25391         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
25392                                    True/*r2zero*/ );
25393         goto decode_success;
25394      }
25395      /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
25396      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25397         delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
25398         goto decode_success;
25399      }
25400      /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
25401      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25402         delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
25403         goto decode_success;
25404      }
25405      break;
25406
25407   case 0x5C:
25408      /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
25409      if (haveF2no66noF3(pfx)) {
25410         delta = dis_AVX128_E_V_to_G_lo64(
25411                    uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
25412         goto decode_success;
25413      }
25414      /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
25415      if (haveF3no66noF2(pfx)) {
25416         delta = dis_AVX128_E_V_to_G_lo32(
25417                    uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
25418         goto decode_success;
25419      }
25420      /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
25421      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25422         delta = dis_AVX128_E_V_to_G(
25423                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
25424         goto decode_success;
25425      }
25426      /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
25427      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25428         delta = dis_AVX256_E_V_to_G(
25429                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
25430         goto decode_success;
25431      }
25432      /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
25433      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25434         delta = dis_AVX128_E_V_to_G(
25435                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
25436         goto decode_success;
25437      }
25438      /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
25439      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25440         delta = dis_AVX256_E_V_to_G(
25441                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
25442         goto decode_success;
25443      }
25444      break;
25445
25446   case 0x5D:
25447      /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
25448      if (haveF2no66noF3(pfx)) {
25449         delta = dis_AVX128_E_V_to_G_lo64(
25450                    uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
25451         goto decode_success;
25452      }
25453      /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
25454      if (haveF3no66noF2(pfx)) {
25455         delta = dis_AVX128_E_V_to_G_lo32(
25456                    uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
25457         goto decode_success;
25458      }
25459      /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
25460      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25461         delta = dis_AVX128_E_V_to_G(
25462                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
25463         goto decode_success;
25464      }
25465      /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
25466      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25467         delta = dis_AVX256_E_V_to_G(
25468                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
25469         goto decode_success;
25470      }
25471      /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
25472      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25473         delta = dis_AVX128_E_V_to_G(
25474                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
25475         goto decode_success;
25476      }
25477      /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
25478      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25479         delta = dis_AVX256_E_V_to_G(
25480                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
25481         goto decode_success;
25482      }
25483      break;
25484
25485   case 0x5E:
25486      /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
25487      if (haveF2no66noF3(pfx)) {
25488         delta = dis_AVX128_E_V_to_G_lo64(
25489                    uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
25490         goto decode_success;
25491      }
25492      /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
25493      if (haveF3no66noF2(pfx)) {
25494         delta = dis_AVX128_E_V_to_G_lo32(
25495                    uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
25496         goto decode_success;
25497      }
25498      /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
25499      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25500         delta = dis_AVX128_E_V_to_G(
25501                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
25502         goto decode_success;
25503      }
25504      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
25505      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25506         delta = dis_AVX256_E_V_to_G(
25507                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
25508         goto decode_success;
25509      }
25510      /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
25511      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25512         delta = dis_AVX128_E_V_to_G(
25513                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
25514         goto decode_success;
25515      }
25516      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
25517      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25518         delta = dis_AVX256_E_V_to_G(
25519                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
25520         goto decode_success;
25521      }
25522      break;
25523
25524   case 0x5F:
25525      /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
25526      if (haveF2no66noF3(pfx)) {
25527         delta = dis_AVX128_E_V_to_G_lo64(
25528                    uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
25529         goto decode_success;
25530      }
25531      /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
25532      if (haveF3no66noF2(pfx)) {
25533         delta = dis_AVX128_E_V_to_G_lo32(
25534                    uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
25535         goto decode_success;
25536      }
25537      /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
25538      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25539         delta = dis_AVX128_E_V_to_G(
25540                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
25541         goto decode_success;
25542      }
25543      /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
25544      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25545         delta = dis_AVX256_E_V_to_G(
25546                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
25547         goto decode_success;
25548      }
25549      /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
25550      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25551         delta = dis_AVX128_E_V_to_G(
25552                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
25553         goto decode_success;
25554      }
25555      /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
25556      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25557         delta = dis_AVX256_E_V_to_G(
25558                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
25559         goto decode_success;
25560      }
25561      break;
25562
25563   case 0x60:
25564      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
25565      /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
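      /* For this and the following unpck/pack cases, swapArgs is set
         because the IR Interleave/QNarrowBin primitives take their
         operands in the opposite order from the instruction's
         SRC1 (= rV) / SRC2 (= r/m) convention. */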
25566      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25567         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25568                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
25569                    Iop_InterleaveLO8x16, NULL,
25570                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25571         goto decode_success;
25572      }
25573      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
25574      /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */
25575      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25576         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25577                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
25578                    math_VPUNPCKLBW_YMM );
25579         goto decode_success;
25580      }
25581      break;
25582
25583   case 0x61:
25584      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
25585      /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
25586      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25587         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25588                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
25589                    Iop_InterleaveLO16x8, NULL,
25590                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25591         goto decode_success;
25592      }
25593      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
25594      /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */
25595      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25596         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25597                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
25598                    math_VPUNPCKLWD_YMM );
25599         goto decode_success;
25600      }
25601      break;
25602
25603   case 0x62:
25604      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
25605      /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
25606      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25607         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25608                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
25609                    Iop_InterleaveLO32x4, NULL,
25610                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25611         goto decode_success;
25612      }
25613      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
25614      /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */
25615      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25616         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25617                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
25618                    math_VPUNPCKLDQ_YMM );
25619         goto decode_success;
25620      }
25621      break;
25622
25623   case 0x63:
25624      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
25625      /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
25626      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25627         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25628                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
25629                    Iop_QNarrowBin16Sto8Sx16, NULL,
25630                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25631         goto decode_success;
25632      }
25633      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
25634      /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */
25635      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25636         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25637                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
25638                    math_VPACKSSWB_YMM );
25639         goto decode_success;
25640      }
25641      break;
25642
25643   case 0x64:
25644      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
25645      /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
25646      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25647         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25648                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
25649         goto decode_success;
25650      }
25651      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
25652      /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */
25653      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25654         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
25655                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 );
25656         goto decode_success;
25657      }
25658      break;
25659
25660   case 0x65:
25661      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
25662      /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
25663      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25664         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25665                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
25666         goto decode_success;
25667      }
25668      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
25669      /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */
25670      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25671         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
25672                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 );
25673         goto decode_success;
25674      }
25675      break;
25676
25677   case 0x66:
25678      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
25679      /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
25680      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25681         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
25682                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
25683         goto decode_success;
25684      }
25685      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
25686      /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */
25687      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25688         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
25689                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 );
25690         goto decode_success;
25691      }
25692      break;
25693
25694   case 0x67:
25695      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
25696      /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
25697      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25698         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25699                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
25700                    Iop_QNarrowBin16Sto8Ux16, NULL,
25701                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25702         goto decode_success;
25703      }
25704      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
25705      /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */
25706      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25707         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25708                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
25709                    math_VPACKUSWB_YMM );
25710         goto decode_success;
25711      }
25712      break;
25713
25714   case 0x68:
25715      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r */
25717      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25718         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25719                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
25720                    Iop_InterleaveHI8x16, NULL,
25721                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25722         goto decode_success;
25723      }
25724      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.256.66.0F.WIG 68 /r */
25726      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25727         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25728                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
25729                    math_VPUNPCKHBW_YMM );
25730         goto decode_success;
25731      }
25732      break;
25733
25734   case 0x69:
25735      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r */
25737      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25738         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25739                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
25740                    Iop_InterleaveHI16x8, NULL,
25741                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25742         goto decode_success;
25743      }
25744      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.256.66.0F.WIG 69 /r */
25746      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25747         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25748                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
25749                    math_VPUNPCKHWD_YMM );
25750         goto decode_success;
25751      }
25752      break;
25753
25754   case 0x6A:
25755      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
25756      /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
25757      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25758         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25759                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
25760                    Iop_InterleaveHI32x4, NULL,
25761                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25762         goto decode_success;
25763      }
25764      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
25765      /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */
25766      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25767         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25768                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
25769                    math_VPUNPCKHDQ_YMM );
25770         goto decode_success;
25771      }
25772      break;
25773
25774   case 0x6B:
25775      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
25776      /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
25777      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25778         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25779                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
25780                    Iop_QNarrowBin32Sto16Sx8, NULL,
25781                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25782         goto decode_success;
25783      }
25784      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
25785      /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */
25786      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25787         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25788                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
25789                    math_VPACKSSDW_YMM );
25790         goto decode_success;
25791      }
25792      break;
25793
25794   case 0x6C:
25795      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r */
25797      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25798         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25799                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
25800                    Iop_InterleaveLO64x2, NULL,
25801                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25802         goto decode_success;
25803      }
25804      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-64bitses(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.256.66.0F.WIG 6C /r */
25806      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25807         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25808                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
25809                    math_VPUNPCKLQDQ_YMM );
25810         goto decode_success;
25811      }
25812      break;
25813
25814   case 0x6D:
25815      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r */
25817      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25818         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
25819                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
25820                    Iop_InterleaveHI64x2, NULL,
25821                    False/*!invertLeftArg*/, True/*swapArgs*/ );
25822         goto decode_success;
25823      }
25824      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-64bitses(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.256.66.0F.WIG 6D /r */
25826      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25827         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
25828                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
25829                    math_VPUNPCKHQDQ_YMM );
25830         goto decode_success;
25831      }
25832      break;
25833
25834   case 0x6E:
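      /* Both forms here are loads into an XMM register.  As with all
         VEX-encoded writes to an XMM register, bits 255:128 of the
         containing YMM register are zeroed; putYMMRegLoAndZU models
         exactly that. */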
25835      /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E */
25836      if (have66noF2noF3(pfx)
25837          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
25838         vassert(sz == 2); /* even tho we are transferring 4, not 2. */
25839         UChar modrm = getUChar(delta);
25840         if (epartIsReg(modrm)) {
25841            delta += 1;
25842            putYMMRegLoAndZU(
25843               gregOfRexRM(pfx,modrm),
25844               unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
25845            );
25846            DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
25847                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)) )
            );
25855            DIP("vmovd %s, %s\n", dis_buf,
25856                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
25857         }
25858         goto decode_success;
25859      }
25860      /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E */
25861      if (have66noF2noF3(pfx)
25862          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
25863         vassert(sz == 2); /* even tho we are transferring 8, not 2. */
25864         UChar modrm = getUChar(delta);
25865         if (epartIsReg(modrm)) {
25866            delta += 1;
25867            putYMMRegLoAndZU(
25868               gregOfRexRM(pfx,modrm),
25869               unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
25870            );
25871            DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
25872                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)) )
            );
25880            DIP("vmovq %s, %s\n", dis_buf,
25881                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
25882         }
25883         goto decode_success;
25884      }
25885      break;
25886
25887   case 0x6F:
25888      /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
25889      /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
25890      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
25891          && 1==getVexL(pfx)/*256*/) {
25892         UChar  modrm = getUChar(delta);
25893         UInt   rD    = gregOfRexRM(pfx, modrm);
25894         IRTemp tD    = newTemp(Ity_V256);
25895         Bool   isA   = have66noF2noF3(pfx);
25896         HChar  ch    = isA ? 'a' : 'u';
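         /* Only the aligned (66-prefixed) variant faults on a
            misaligned memory operand; VMOVDQU has no alignment
            requirement.  Hence the alignment check below is emitted
            only when isA holds. */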
25897         if (epartIsReg(modrm)) {
25898            UInt rS = eregOfRexRM(pfx, modrm);
25899            delta += 1;
25900            assign(tD, getYMMReg(rS));
25901            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
25902         } else {
25903            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
25904            delta += alen;
25905            if (isA)
25906               gen_SEGV_if_not_32_aligned(addr);
25907            assign(tD, loadLE(Ity_V256, mkexpr(addr)));
25908            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
25909         }
25910         putYMMReg(rD, mkexpr(tD));
25911         goto decode_success;
25912      }
25913      /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
25914      /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
25915      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
25916          && 0==getVexL(pfx)/*128*/) {
25917         UChar  modrm = getUChar(delta);
25918         UInt   rD    = gregOfRexRM(pfx, modrm);
25919         IRTemp tD    = newTemp(Ity_V128);
25920         Bool   isA   = have66noF2noF3(pfx);
25921         HChar  ch    = isA ? 'a' : 'u';
25922         if (epartIsReg(modrm)) {
25923            UInt rS = eregOfRexRM(pfx, modrm);
25924            delta += 1;
25925            assign(tD, getXMMReg(rS));
25926            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
25927         } else {
25928            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
25929            delta += alen;
25930            if (isA)
25931               gen_SEGV_if_not_16_aligned(addr);
25932            assign(tD, loadLE(Ity_V128, mkexpr(addr)));
25933            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
25934         }
25935         putYMMRegLoAndZU(rD, mkexpr(tD));
25936         goto decode_success;
25937      }
25938      break;
25939
25940   case 0x70:
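      /* The SIMD prefix selects the variant here: 66 gives VPSHUFD,
         F2 gives VPSHUFLW (shuffles within the low 64 bits of each
         128-bit lane) and F3 gives VPSHUFHW (shuffles within the
         high 64 bits). */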
25941      /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
25942      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25943         delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
25944         goto decode_success;
25945      }
25946      /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */
25947      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25948         delta = dis_PSHUFD_32x8( vbi, pfx, delta);
25949         goto decode_success;
25950      }
25951      /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
25952      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
25953         delta = dis_PSHUFxW_128( vbi, pfx, delta,
25954                                  True/*isAvx*/, False/*!xIsH*/ );
25955         goto decode_success;
25956      }
25957      /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */
25958      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
25959         delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ );
25960         goto decode_success;
25961      }
25962      /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
25963      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
25964         delta = dis_PSHUFxW_128( vbi, pfx, delta,
25965                                  True/*isAvx*/, True/*xIsH*/ );
25966         goto decode_success;
25967      }
25968      /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */
25969      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
25970         delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ );
25971         goto decode_success;
25972      }
25973      break;
25974
25975   case 0x71:
25976      /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
25977      /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
25978      /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
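      /* For these immediate-form shifts, the reg field of the ModRM
         byte is an opcode extension (/2, /4, /6) rather than a
         register number, and the rm field names the source register;
         hence the gregLO3ofRM dispatch below. */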
25979      if (have66noF2noF3(pfx)
25980          && 0==getVexL(pfx)/*128*/
25981          && epartIsReg(getUChar(delta))) {
25982         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
25983            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
25984                                                "vpsrlw", Iop_ShrN16x8 );
25985            *uses_vvvv = True;
25986            goto decode_success;
25987         }
25988         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
25989            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
25990                                                "vpsraw", Iop_SarN16x8 );
25991            *uses_vvvv = True;
25992            goto decode_success;
25993         }
25994         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
25995            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
25996                                                "vpsllw", Iop_ShlN16x8 );
25997            *uses_vvvv = True;
25998            goto decode_success;
25999         }
26000         /* else fall through */
26001      }
26002      /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */
26003      /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */
26004      /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */
26005      if (have66noF2noF3(pfx)
26006          && 1==getVexL(pfx)/*256*/
26007          && epartIsReg(getUChar(delta))) {
26008         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
26009            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26010                                                "vpsrlw", Iop_ShrN16x16 );
26011            *uses_vvvv = True;
26012            goto decode_success;
26013         }
26014         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
26015            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26016                                                "vpsraw", Iop_SarN16x16 );
26017            *uses_vvvv = True;
26018            goto decode_success;
26019         }
26020         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
26021            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26022                                                "vpsllw", Iop_ShlN16x16 );
26023            *uses_vvvv = True;
26024            goto decode_success;
26025         }
26026         /* else fall through */
26027      }
26028      break;
26029
26030   case 0x72:
26031      /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
26032      /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
26033      /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
26034      if (have66noF2noF3(pfx)
26035          && 0==getVexL(pfx)/*128*/
26036          && epartIsReg(getUChar(delta))) {
26037         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
26038            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26039                                                "vpsrld", Iop_ShrN32x4 );
26040            *uses_vvvv = True;
26041            goto decode_success;
26042         }
26043         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
26044            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26045                                                "vpsrad", Iop_SarN32x4 );
26046            *uses_vvvv = True;
26047            goto decode_success;
26048         }
26049         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
26050            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26051                                                "vpslld", Iop_ShlN32x4 );
26052            *uses_vvvv = True;
26053            goto decode_success;
26054         }
26055         /* else fall through */
26056      }
26057      /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */
26058      /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */
26059      /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */
26060      if (have66noF2noF3(pfx)
26061          && 1==getVexL(pfx)/*256*/
26062          && epartIsReg(getUChar(delta))) {
26063         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
26064            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26065                                                "vpsrld", Iop_ShrN32x8 );
26066            *uses_vvvv = True;
26067            goto decode_success;
26068         }
26069         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
26070            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26071                                                "vpsrad", Iop_SarN32x8 );
26072            *uses_vvvv = True;
26073            goto decode_success;
26074         }
26075         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
26076            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26077                                                "vpslld", Iop_ShlN32x8 );
26078            *uses_vvvv = True;
26079            goto decode_success;
26080         }
26081         /* else fall through */
26082      }
26083      break;
26084
26085   case 0x73:
26086      /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
26087      /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
26088      /* VPSRLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
26089      /* VPSLLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
26090      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
26091          && epartIsReg(getUChar(delta))) {
26092         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
26093         Int    rD   = getVexNvvvv(pfx);
26094         IRTemp vecS = newTemp(Ity_V128);
26095         if (gregLO3ofRM(getUChar(delta)) == 3) {
26096            Int imm = (Int)getUChar(delta+1);
26097            DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
26098            delta += 2;
26099            assign( vecS, getXMMReg(rS) );
26100            putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
26101            *uses_vvvv = True;
26102            goto decode_success;
26103         }
26104         if (gregLO3ofRM(getUChar(delta)) == 7) {
26105            Int imm = (Int)getUChar(delta+1);
26106            DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
26107            delta += 2;
26108            assign( vecS, getXMMReg(rS) );
26109            putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
26110            *uses_vvvv = True;
26111            goto decode_success;
26112         }
26113         if (gregLO3ofRM(getUChar(delta)) == 2) {
26114            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26115                                                "vpsrlq", Iop_ShrN64x2 );
26116            *uses_vvvv = True;
26117            goto decode_success;
26118         }
26119         if (gregLO3ofRM(getUChar(delta)) == 6) {
26120            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
26121                                                "vpsllq", Iop_ShlN64x2 );
26122            *uses_vvvv = True;
26123            goto decode_success;
26124         }
26125         /* else fall through */
26126      }
26127      /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */
26128      /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */
26129      /* VPSRLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */
26130      /* VPSLLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */
26131      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
26132          && epartIsReg(getUChar(delta))) {
26133         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
26134         Int    rD   = getVexNvvvv(pfx);
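         /* The 256-bit whole-register byte shifts (VPSRLDQ/VPSLLDQ)
            operate on each 128-bit lane independently, hence the
            split into two V128 halves below. */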
26135         if (gregLO3ofRM(getUChar(delta)) == 3) {
26136            IRTemp vecS0 = newTemp(Ity_V128);
26137            IRTemp vecS1 = newTemp(Ity_V128);
26138            Int imm = (Int)getUChar(delta+1);
26139            DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
26140            delta += 2;
26141            assign( vecS0, getYMMRegLane128(rS, 0));
26142            assign( vecS1, getYMMRegLane128(rS, 1));
26143            putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm )));
26144            putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, imm )));
26145            *uses_vvvv = True;
26146            goto decode_success;
26147         }
26148         if (gregLO3ofRM(getUChar(delta)) == 7) {
26149            IRTemp vecS0 = newTemp(Ity_V128);
26150            IRTemp vecS1 = newTemp(Ity_V128);
26151            Int imm = (Int)getUChar(delta+1);
26152            DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
26153            delta += 2;
26154            assign( vecS0, getYMMRegLane128(rS, 0));
26155            assign( vecS1, getYMMRegLane128(rS, 1));
26156            putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm )));
26157            putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm )));
26158            *uses_vvvv = True;
26159            goto decode_success;
26160         }
26161         if (gregLO3ofRM(getUChar(delta)) == 2) {
26162            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26163                                                "vpsrlq", Iop_ShrN64x4 );
26164            *uses_vvvv = True;
26165            goto decode_success;
26166         }
26167         if (gregLO3ofRM(getUChar(delta)) == 6) {
26168            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
26169                                                "vpsllq", Iop_ShlN64x4 );
26170            *uses_vvvv = True;
26171            goto decode_success;
26172         }
26173         /* else fall through */
26174      }
26175      break;
26176
26177   case 0x74:
26178      /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
26179      /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
26180      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26181         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26182                    uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
26183         goto decode_success;
26184      }
26185      /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
26186      /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */
26187      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26188         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26189                    uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 );
26190         goto decode_success;
26191      }
26192      break;
26193
26194   case 0x75:
26195      /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
26196      /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
26197      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26198         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26199                    uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
26200         goto decode_success;
26201      }
26202      /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
26203      /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */
26204      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26205         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26206                    uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 );
26207         goto decode_success;
26208      }
26209      break;
26210
26211   case 0x76:
26212      /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
26213      /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
26214      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26215         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26216                    uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
26217         goto decode_success;
26218      }
26219      /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
26220      /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */
26221      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26222         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26223                    uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 );
26224         goto decode_success;
26225      }
26226      break;
26227
26228   case 0x77:
26229      /* VZEROUPPER = VEX.128.0F.WIG 77 */
26230      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26231         Int i;
26232         IRTemp zero128 = newTemp(Ity_V128);
26233         assign(zero128, mkV128(0));
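         /* VZEROUPPER zeroes bits 255:128 of all 16 YMM registers,
            leaving the low (XMM) halves untouched. */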
26234         for (i = 0; i < 16; i++) {
26235            putYMMRegLane128(i, 1, mkexpr(zero128));
26236         }
26237         DIP("vzeroupper\n");
26238         goto decode_success;
26239      }
26240      /* VZEROALL = VEX.256.0F.WIG 77 */
26241      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26242         Int i;
26243         IRTemp zero128 = newTemp(Ity_V128);
26244         assign(zero128, mkV128(0));
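         /* VZEROALL clears all 16 YMM registers in full:
            putYMMRegLoAndZU writes zero to the low lane and zeroes
            the upper lane as a side effect. */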
26245         for (i = 0; i < 16; i++) {
26246            putYMMRegLoAndZU(i, mkexpr(zero128));
26247         }
26248         DIP("vzeroall\n");
26249         goto decode_success;
26250      }
26251      break;
26252
26253   case 0x7C:
26254   case 0x7D:
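      /* The 'horizontal' ops form sums (or differences) of adjacent
         lane pairs: within each 128-bit result lane, the low half
         comes pairwise from the V register and the high half
         pairwise from the E operand. */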
26255      /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
26256      /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
26257      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26258         IRTemp sV     = newTemp(Ity_V128);
26259         IRTemp dV     = newTemp(Ity_V128);
26260         Bool   isAdd  = opc == 0x7C;
26261         const HChar* str = isAdd ? "add" : "sub";
26262         UChar modrm   = getUChar(delta);
26263         UInt   rG     = gregOfRexRM(pfx,modrm);
26264         UInt   rV     = getVexNvvvv(pfx);
26265         if (epartIsReg(modrm)) {
26266            UInt rE = eregOfRexRM(pfx,modrm);
26267            assign( sV, getXMMReg(rE) );
26268            DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
26269                nameXMMReg(rV), nameXMMReg(rG));
26270            delta += 1;
26271         } else {
26272            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26273            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
26274            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
26275                nameXMMReg(rV), nameXMMReg(rG));
26276            delta += alen;
26277         }
26278         assign( dV, getXMMReg(rV) );
26279         putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
26280         *uses_vvvv = True;
26281         goto decode_success;
26282      }
26283      /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
26284      /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
26285      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26286         IRTemp sV     = newTemp(Ity_V256);
26287         IRTemp dV     = newTemp(Ity_V256);
26288         IRTemp s1, s0, d1, d0;
26289         Bool   isAdd  = opc == 0x7C;
26290         const HChar* str = isAdd ? "add" : "sub";
26291         UChar modrm   = getUChar(delta);
26292         UInt   rG     = gregOfRexRM(pfx,modrm);
26293         UInt   rV     = getVexNvvvv(pfx);
26294         s1 = s0 = d1 = d0 = IRTemp_INVALID;
26295         if (epartIsReg(modrm)) {
26296            UInt rE = eregOfRexRM(pfx,modrm);
26297            assign( sV, getYMMReg(rE) );
26298            DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
26299                nameYMMReg(rV), nameYMMReg(rG));
26300            delta += 1;
26301         } else {
26302            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26303            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
26304            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
26305                nameYMMReg(rV), nameYMMReg(rG));
26306            delta += alen;
26307         }
26308         assign( dV, getYMMReg(rV) );
26309         breakupV256toV128s( dV, &d1, &d0 );
26310         breakupV256toV128s( sV, &s1, &s0 );
26311         putYMMReg( rG, binop(Iop_V128HLtoV256,
26312                              mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
26313                              mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
26314         *uses_vvvv = True;
26315         goto decode_success;
26316      }
26317      /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
26318      /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
26319      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26320         IRTemp sV     = newTemp(Ity_V128);
26321         IRTemp dV     = newTemp(Ity_V128);
26322         Bool   isAdd  = opc == 0x7C;
26323         const HChar* str = isAdd ? "add" : "sub";
26324         UChar modrm   = getUChar(delta);
26325         UInt   rG     = gregOfRexRM(pfx,modrm);
26326         UInt   rV     = getVexNvvvv(pfx);
26327         if (epartIsReg(modrm)) {
26328            UInt rE = eregOfRexRM(pfx,modrm);
26329            assign( sV, getXMMReg(rE) );
26330            DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
26331                nameXMMReg(rV), nameXMMReg(rG));
26332            delta += 1;
26333         } else {
26334            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26335            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
26336            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
26337                nameXMMReg(rV), nameXMMReg(rG));
26338            delta += alen;
26339         }
26340         assign( dV, getXMMReg(rV) );
26341         putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
26342         *uses_vvvv = True;
26343         goto decode_success;
26344      }
26345      /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
26346      /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
26347      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26348         IRTemp sV     = newTemp(Ity_V256);
26349         IRTemp dV     = newTemp(Ity_V256);
26350         IRTemp s1, s0, d1, d0;
26351         Bool   isAdd  = opc == 0x7C;
26352         const HChar* str = isAdd ? "add" : "sub";
26353         UChar modrm   = getUChar(delta);
26354         UInt   rG     = gregOfRexRM(pfx,modrm);
26355         UInt   rV     = getVexNvvvv(pfx);
26356         s1 = s0 = d1 = d0 = IRTemp_INVALID;
26357         if (epartIsReg(modrm)) {
26358            UInt rE = eregOfRexRM(pfx,modrm);
26359            assign( sV, getYMMReg(rE) );
26360            DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
26361                nameYMMReg(rV), nameYMMReg(rG));
26362            delta += 1;
26363         } else {
26364            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26365            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
26366            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
26367                nameYMMReg(rV), nameYMMReg(rG));
26368            delta += alen;
26369         }
26370         assign( dV, getYMMReg(rV) );
26371         breakupV256toV128s( dV, &d1, &d0 );
26372         breakupV256toV128s( sV, &s1, &s0 );
26373         putYMMReg( rG, binop(Iop_V128HLtoV256,
26374                              mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
26375                              mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
26376         *uses_vvvv = True;
26377         goto decode_success;
26378      }
26379      break;
26380
26381   case 0x7E:
26382      /* Note the Intel docs don't make sense for this.  I think they
26383         are wrong.  They seem to imply it is a store when in fact I
26384         think it is a load.  Also it's unclear whether this is W0, W1
26385         or WIG. */
26386      /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
26387      if (haveF3no66noF2(pfx)
26388          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
26389         vassert(sz == 4); /* even tho we are transferring 8, not 4. */
26390         UChar modrm = getUChar(delta);
26391         UInt  rG    = gregOfRexRM(pfx,modrm);
26392         if (epartIsReg(modrm)) {
26393            UInt rE = eregOfRexRM(pfx,modrm);
26394            putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
26395            DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
26396            delta += 1;
26397         } else {
26398            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26399            putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
26400            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
26401            delta += alen;
26402         }
26403         /* zero bits 255:64 */
26404         putXMMRegLane64( rG, 1, mkU64(0) );
26405         putYMMRegLane128( rG, 1, mkV128(0) );
26406         goto decode_success;
26407      }
26408      /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only) */
26409      /* Moves from G to E, so is a store-form insn */
26410      /* Intel docs list this in the VMOVD entry for some reason. */
26411      if (have66noF2noF3(pfx)
26412          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
26413         UChar modrm = getUChar(delta);
26414         UInt  rG    = gregOfRexRM(pfx,modrm);
26415         if (epartIsReg(modrm)) {
26416            UInt rE = eregOfRexRM(pfx,modrm);
26417            DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
26418            putIReg64(rE, getXMMRegLane64(rG, 0));
26419            delta += 1;
26420         } else {
26421            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26422            storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
26423            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
26424            delta += alen;
26425         }
26426         goto decode_success;
26427      }
26428      /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r (reg case only) */
26429      /* Moves from G to E, so is a store-form insn */
26430      if (have66noF2noF3(pfx)
26431          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
26432         UChar modrm = getUChar(delta);
26433         UInt  rG    = gregOfRexRM(pfx,modrm);
26434         if (epartIsReg(modrm)) {
26435            UInt rE = eregOfRexRM(pfx,modrm);
26436            DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
26437            putIReg32(rE, getXMMRegLane32(rG, 0));
26438            delta += 1;
26439         } else {
26440            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26441            storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
26442            DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
26443            delta += alen;
26444         }
26445         goto decode_success;
26446      }
26447      break;
26448
26449   case 0x7F:
26450      /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
26451      /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
26452      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
26453          && 1==getVexL(pfx)/*256*/) {
26454         UChar  modrm = getUChar(delta);
26455         UInt   rS    = gregOfRexRM(pfx, modrm);
26456         IRTemp tS    = newTemp(Ity_V256);
26457         Bool   isA   = have66noF2noF3(pfx);
26458         HChar  ch    = isA ? 'a' : 'u';
26459         assign(tS, getYMMReg(rS));
26460         if (epartIsReg(modrm)) {
26461            UInt rD = eregOfRexRM(pfx, modrm);
26462            delta += 1;
26463            putYMMReg(rD, mkexpr(tS));
26464            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
26465         } else {
26466            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
26467            delta += alen;
26468            if (isA)
26469               gen_SEGV_if_not_32_aligned(addr);
26470            storeLE(mkexpr(addr), mkexpr(tS));
26471            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
26472         }
26473         goto decode_success;
26474      }
26475      /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
26476      /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
26477      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
26478          && 0==getVexL(pfx)/*128*/) {
26479         UChar  modrm = getUChar(delta);
26480         UInt   rS    = gregOfRexRM(pfx, modrm);
26481         IRTemp tS    = newTemp(Ity_V128);
26482         Bool   isA   = have66noF2noF3(pfx);
26483         HChar  ch    = isA ? 'a' : 'u';
26484         assign(tS, getXMMReg(rS));
26485         if (epartIsReg(modrm)) {
26486            UInt rD = eregOfRexRM(pfx, modrm);
26487            delta += 1;
26488            putYMMRegLoAndZU(rD, mkexpr(tS));
26489            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
26490         } else {
26491            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
26492            delta += alen;
26493            if (isA)
26494               gen_SEGV_if_not_16_aligned(addr);
26495            storeLE(mkexpr(addr), mkexpr(tS));
26496            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
26497         }
26498         goto decode_success;
26499      }
26500      break;
26501
26502   case 0xAE:
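      /* 'LZ' means VEX.L must be zero; real hardware raises #UD for
         L=1 encodings of these, so we simply decline to decode
         them. */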
26503      /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
26504      if (haveNo66noF2noF3(pfx)
26505          && 0==getVexL(pfx)/*LZ*/
26506          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
26507          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
26508          && sz == 4) {
26509         delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
26510         goto decode_success;
26511      }
26512      /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
26513      if (haveNo66noF2noF3(pfx)
26514          && 0==getVexL(pfx)/*LZ*/
26515          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
26516          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
26517          && sz == 4) {
26518         delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
26519         goto decode_success;
26520      }
26521      break;
26522
26523   case 0xC2:
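      /* Each helper below returns an unchanged delta if it rejects
         the encoding (e.g. an unsupported imm8 predicate), in which
         case we fall through and try the next form. */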
26524      /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
26525      /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
26526      if (haveF2no66noF3(pfx)) {
26527         Long delta0 = delta;
26528         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26529                                          "vcmpsd", False/*!all_lanes*/,
26530                                          8/*sz*/);
26531         if (delta > delta0) goto decode_success;
26532         /* else fall through -- decoding has failed */
26533      }
26534      /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
26535      /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
26536      if (haveF3no66noF2(pfx)) {
26537         Long delta0 = delta;
26538         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26539                                          "vcmpss", False/*!all_lanes*/,
26540                                          4/*sz*/);
26541         if (delta > delta0) goto decode_success;
26542         /* else fall through -- decoding has failed */
26543      }
26544      /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
26545      /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
26546      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26547         Long delta0 = delta;
26548         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26549                                          "vcmppd", True/*all_lanes*/,
26550                                          8/*sz*/);
26551         if (delta > delta0) goto decode_success;
26552         /* else fall through -- decoding has failed */
26553      }
26554      /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
26555      /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
26556      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26557         Long delta0 = delta;
26558         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26559                                          "vcmppd", 8/*sz*/);
26560         if (delta > delta0) goto decode_success;
26561         /* else fall through -- decoding has failed */
26562      }
26563      /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
26564      /* = VEX.NDS.128.0F.WIG C2 /r ib */
26565      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26566         Long delta0 = delta;
26567         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26568                                          "vcmpps", True/*all_lanes*/,
26569                                          4/*sz*/);
26570         if (delta > delta0) goto decode_success;
26571         /* else fall through -- decoding has failed */
26572      }
26573      /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
26574      /* = VEX.NDS.256.0F.WIG C2 /r ib */
26575      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26576         Long delta0 = delta;
26577         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
26578                                          "vcmpps", 4/*sz*/);
26579         if (delta > delta0) goto decode_success;
26580         /* else fall through -- decoding has failed */
26581      }
26582      break;
26583
26584   case 0xC4:
26585      /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
26586      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26587         UChar  modrm = getUChar(delta);
26588         UInt   rG    = gregOfRexRM(pfx, modrm);
26589         UInt   rV    = getVexNvvvv(pfx);
26590         Int    imm8;
26591         IRTemp new16 = newTemp(Ity_I16);
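         /* Only the low 3 bits of imm8 are significant: they select
            which of the eight 16-bit lanes of the destination is
            replaced. */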
26592
26593         if ( epartIsReg( modrm ) ) {
26594            imm8 = (Int)(getUChar(delta+1) & 7);
26595            assign( new16, unop(Iop_32to16,
26596                                getIReg32(eregOfRexRM(pfx,modrm))) );
26597            delta += 1+1;
26598            DIP( "vpinsrw $%d,%s,%s\n", imm8,
26599                 nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
26600         } else {
26601            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
26602            imm8 = (Int)(getUChar(delta+alen) & 7);
26603            assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
26604            delta += alen+1;
26605            DIP( "vpinsrw $%d,%s,%s\n",
26606                 imm8, dis_buf, nameXMMReg(rG) );
26607         }
26608
26609         IRTemp src_vec = newTemp(Ity_V128);
26610         assign(src_vec, getXMMReg( rV ));
26611         IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
26612         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
26613         *uses_vvvv = True;
26614         goto decode_success;
26615      }
26616      break;
26617
26618   case 0xC5:
26619      /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
26620      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
26622         Long delta0 = delta;
26623         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
26624                                              True/*isAvx*/ );
26625         if (delta > delta0) goto decode_success;
26626         /* else fall through -- decoding has failed */
26627      }
26628      break;
26629
26630   case 0xC6:
      /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
26632      /* = VEX.NDS.128.0F.WIG C6 /r ib */
26633      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26634         Int    imm8 = 0;
26635         IRTemp eV   = newTemp(Ity_V128);
26636         IRTemp vV   = newTemp(Ity_V128);
26637         UInt  modrm = getUChar(delta);
26638         UInt  rG    = gregOfRexRM(pfx,modrm);
26639         UInt  rV    = getVexNvvvv(pfx);
26640         assign( vV, getXMMReg(rV) );
26641         if (epartIsReg(modrm)) {
26642            UInt rE = eregOfRexRM(pfx,modrm);
26643            assign( eV, getXMMReg(rE) );
26644            imm8 = (Int)getUChar(delta+1);
26645            delta += 1+1;
26646            DIP("vshufps $%d,%s,%s,%s\n",
26647                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
26648         } else {
26649            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
26650            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
26651            imm8 = (Int)getUChar(delta+alen);
26652            delta += 1+alen;
26653            DIP("vshufps $%d,%s,%s,%s\n",
26654                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
26655         }
26656         IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
26657         putYMMRegLoAndZU( rG, mkexpr(res) );
26658         *uses_vvvv = True;
26659         goto decode_success;
26660      }
      /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
26662      /* = VEX.NDS.256.0F.WIG C6 /r ib */
26663      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26664         Int    imm8 = 0;
26665         IRTemp eV   = newTemp(Ity_V256);
26666         IRTemp vV   = newTemp(Ity_V256);
26667         UInt  modrm = getUChar(delta);
26668         UInt  rG    = gregOfRexRM(pfx,modrm);
26669         UInt  rV    = getVexNvvvv(pfx);
26670         assign( vV, getYMMReg(rV) );
26671         if (epartIsReg(modrm)) {
26672            UInt rE = eregOfRexRM(pfx,modrm);
26673            assign( eV, getYMMReg(rE) );
26674            imm8 = (Int)getUChar(delta+1);
26675            delta += 1+1;
26676            DIP("vshufps $%d,%s,%s,%s\n",
26677                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
26678         } else {
26679            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
26680            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
26681            imm8 = (Int)getUChar(delta+alen);
26682            delta += 1+alen;
26683            DIP("vshufps $%d,%s,%s,%s\n",
26684                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
26685         }
26686         IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
26687         putYMMReg( rG, mkexpr(res) );
26688         *uses_vvvv = True;
26689         goto decode_success;
26690      }
      /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
26692      /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
26693      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26694         Int    imm8 = 0;
26695         IRTemp eV   = newTemp(Ity_V128);
26696         IRTemp vV   = newTemp(Ity_V128);
26697         UInt  modrm = getUChar(delta);
26698         UInt  rG    = gregOfRexRM(pfx,modrm);
26699         UInt  rV    = getVexNvvvv(pfx);
26700         assign( vV, getXMMReg(rV) );
26701         if (epartIsReg(modrm)) {
26702            UInt rE = eregOfRexRM(pfx,modrm);
26703            assign( eV, getXMMReg(rE) );
26704            imm8 = (Int)getUChar(delta+1);
26705            delta += 1+1;
26706            DIP("vshufpd $%d,%s,%s,%s\n",
26707                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
26708         } else {
26709            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
26710            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
26711            imm8 = (Int)getUChar(delta+alen);
26712            delta += 1+alen;
26713            DIP("vshufpd $%d,%s,%s,%s\n",
26714                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
26715         }
26716         IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
26717         putYMMRegLoAndZU( rG, mkexpr(res) );
26718         *uses_vvvv = True;
26719         goto decode_success;
26720      }
      /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
26722      /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
26723      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26724         Int    imm8 = 0;
26725         IRTemp eV   = newTemp(Ity_V256);
26726         IRTemp vV   = newTemp(Ity_V256);
26727         UInt  modrm = getUChar(delta);
26728         UInt  rG    = gregOfRexRM(pfx,modrm);
26729         UInt  rV    = getVexNvvvv(pfx);
26730         assign( vV, getYMMReg(rV) );
26731         if (epartIsReg(modrm)) {
26732            UInt rE = eregOfRexRM(pfx,modrm);
26733            assign( eV, getYMMReg(rE) );
26734            imm8 = (Int)getUChar(delta+1);
26735            delta += 1+1;
26736            DIP("vshufpd $%d,%s,%s,%s\n",
26737                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
26738         } else {
26739            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
26740            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
26741            imm8 = (Int)getUChar(delta+alen);
26742            delta += 1+alen;
26743            DIP("vshufpd $%d,%s,%s,%s\n",
26744                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
26745         }
26746         IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
26747         putYMMReg( rG, mkexpr(res) );
26748         *uses_vvvv = True;
26749         goto decode_success;
26750      }
26751      break;
26752
26753   case 0xD0:
26754      /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
26755      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26756         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
26757                    uses_vvvv, vbi, pfx, delta,
26758                    "vaddsubpd", math_ADDSUBPD_128 );
26759         goto decode_success;
26760      }
26761      /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
26762      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26763         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
26764                    uses_vvvv, vbi, pfx, delta,
26765                    "vaddsubpd", math_ADDSUBPD_256 );
26766         goto decode_success;
26767      }
26768      /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
26769      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26770         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
26771                    uses_vvvv, vbi, pfx, delta,
26772                    "vaddsubps", math_ADDSUBPS_128 );
26773         goto decode_success;
26774      }
26775      /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
26776      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26777         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
26778                    uses_vvvv, vbi, pfx, delta,
26779                    "vaddsubps", math_ADDSUBPS_256 );
26780         goto decode_success;
26781      }
26782      break;
26783
26784   case 0xD1:
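      /* The 0xD1..0xD3 forms shift by a count taken from the low 64
         bits of the E operand; even for the 256-bit forms the count
         operand remains an xmm/m128. */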
26785      /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
26786      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26787         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
26788                                        "vpsrlw", Iop_ShrN16x8 );
26789         *uses_vvvv = True;
         goto decode_success;
      }
26793      /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */
26794      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26795         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
26796                                        "vpsrlw", Iop_ShrN16x16 );
26797         *uses_vvvv = True;
         goto decode_success;
      }
26801      break;
26802
26803   case 0xD2:
26804      /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
26805      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26806         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
26807                                        "vpsrld", Iop_ShrN32x4 );
26808         *uses_vvvv = True;
26809         goto decode_success;
26810      }
26811      /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */
26812      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26813         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
26814                                        "vpsrld", Iop_ShrN32x8 );
26815         *uses_vvvv = True;
26816         goto decode_success;
26817      }
26818      break;
26819
26820   case 0xD3:
26821      /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
26822      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26823         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
26824                                        "vpsrlq", Iop_ShrN64x2 );
26825         *uses_vvvv = True;
26826         goto decode_success;
26827      }
26828      /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */
26829      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26830         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
26831                                        "vpsrlq", Iop_ShrN64x4 );
26832         *uses_vvvv = True;
26833         goto decode_success;
26834      }
26835      break;
26836
26837   case 0xD4:
26838      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
26839      /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
26840      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26841         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26842                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
26843         goto decode_success;
26844      }
26845      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
26846      /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */
26847      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26848         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26849                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 );
26850         goto decode_success;
26851      }
26852      break;
26853
26854   case 0xD5:
26855      /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
26856      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26857         delta = dis_AVX128_E_V_to_G(
26858                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
26859         goto decode_success;
26860      }
26861      /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */
26862      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26863         delta = dis_AVX256_E_V_to_G(
26864                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 );
26865         goto decode_success;
26866      }
26867      break;
26868
26869   case 0xD6:
26870      /* I can't even find any Intel docs for this one. */
26871      /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
26872         xmm) to E (mem or lo half xmm).  Looks like L==0(128), W==0
26873         (WIG, maybe?) */
26874      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
26875          && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
26876         UChar modrm = getUChar(delta);
26877         UInt  rG    = gregOfRexRM(pfx,modrm);
26878         if (epartIsReg(modrm)) {
26879            /* fall through, awaiting test case */
26880            /* dst: lo half copied, hi half zeroed */
26881         } else {
26882            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
26883            storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
26884            DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
26885            delta += alen;
26886            goto decode_success;
26887         }
26888      }
26889      break;
26890
26891   case 0xD7:
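      /* VPMOVMSKB gathers the most significant bit of each source
         byte into the low bits of the destination GPR (16 bits for
         the xmm form, 32 for the ymm form), zeroing any remaining
         bits. */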
26892      /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
26893      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26894         delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
26895         goto decode_success;
26896      }
      /* VEX.256.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */
26898      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26899         delta = dis_PMOVMSKB_256( vbi, pfx, delta );
26900         goto decode_success;
26901      }
26902      break;
26903
26904   case 0xD8:
26905      /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
26906      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26907         delta = dis_AVX128_E_V_to_G(
26908                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
26909         goto decode_success;
26910      }
26911      /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */
26912      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26913         delta = dis_AVX256_E_V_to_G(
26914                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 );
26915         goto decode_success;
26916      }
26917      break;
26918
26919   case 0xD9:
26920      /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
26921      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26922         delta = dis_AVX128_E_V_to_G(
26923                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
26924         goto decode_success;
26925      }
26926      /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */
26927      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26928         delta = dis_AVX256_E_V_to_G(
26929                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 );
26930         goto decode_success;
26931      }
26932      break;
26933
26934   case 0xDA:
26935      /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
26936      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26937         delta = dis_AVX128_E_V_to_G(
26938                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
26939         goto decode_success;
26940      }
26941      /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */
26942      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26943         delta = dis_AVX256_E_V_to_G(
26944                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 );
26945         goto decode_success;
26946      }
26947      break;
26948
26949   case 0xDB:
26950      /* VPAND r/m, rV, r ::: r = rV & r/m */
26951      /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
26952      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26953         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
26954                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
26955         goto decode_success;
26956      }
26957      /* VPAND r/m, rV, r ::: r = rV & r/m */
26958      /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */
26959      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26960         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
26961                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 );
26962         goto decode_success;
26963      }
26964      break;
26965
26966   case 0xDC:
26967      /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
26968      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26969         delta = dis_AVX128_E_V_to_G(
26970                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
26971         goto decode_success;
26972      }
26973      /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */
26974      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26975         delta = dis_AVX256_E_V_to_G(
26976                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 );
26977         goto decode_success;
26978      }
26979      break;
26980
26981   case 0xDD:
26982      /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
26983      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26984         delta = dis_AVX128_E_V_to_G(
26985                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
26986         goto decode_success;
26987      }
26988      /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */
26989      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
26990         delta = dis_AVX256_E_V_to_G(
26991                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 );
26992         goto decode_success;
26993      }
26994      break;
26995
26996   case 0xDE:
26997      /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
26998      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
26999         delta = dis_AVX128_E_V_to_G(
27000                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
27001         goto decode_success;
27002      }
27003      /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */
27004      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27005         delta = dis_AVX256_E_V_to_G(
27006                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 );
27007         goto decode_success;
27008      }
27009      break;
27010
27011   case 0xDF:
      /* VPANDN r/m, rV, r ::: r = ~rV & r/m */
27013      /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
27014      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27015         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
27016                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
27017                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
27018         goto decode_success;
27019      }
      /* VPANDN r/m, rV, r ::: r = ~rV & r/m */
27021      /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */
27022      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27023         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
27024                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256,
27025                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
27026         goto decode_success;
27027      }
27028      break;
27029
27030   case 0xE0:
27031      /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
27032      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27033         delta = dis_AVX128_E_V_to_G(
27034                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
27035         goto decode_success;
27036      }
27037      /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */
27038      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27039         delta = dis_AVX256_E_V_to_G(
27040                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 );
27041         goto decode_success;
27042      }
27043      break;
27044
27045   case 0xE1:
27046      /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
27047      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27048         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27049                                        "vpsraw", Iop_SarN16x8 );
27050         *uses_vvvv = True;
27051         goto decode_success;
27052      }
27053      /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */
27054      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27055         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27056                                        "vpsraw", Iop_SarN16x16 );
27057         *uses_vvvv = True;
27058         goto decode_success;
27059      }
27060      break;
27061
27062   case 0xE2:
27063      /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
27064      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27065         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27066                                        "vpsrad", Iop_SarN32x4 );
27067         *uses_vvvv = True;
27068         goto decode_success;
27069      }
27070      /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */
27071      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27072         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27073                                        "vpsrad", Iop_SarN32x8 );
27074         *uses_vvvv = True;
27075         goto decode_success;
27076      }
27077      break;
27078
27079   case 0xE3:
27080      /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
27081      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27082         delta = dis_AVX128_E_V_to_G(
27083                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
27084         goto decode_success;
27085      }
27086      /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */
27087      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27088         delta = dis_AVX256_E_V_to_G(
27089                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 );
27090         goto decode_success;
27091      }
27092      break;
27093
27094   case 0xE4:
27095      /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
27096      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27097         delta = dis_AVX128_E_V_to_G(
27098                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
27099         goto decode_success;
27100      }
27101      /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */
27102      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27103         delta = dis_AVX256_E_V_to_G(
27104                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 );
27105         goto decode_success;
27106      }
27107      break;
27108
27109   case 0xE5:
27110      /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
27111      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27112         delta = dis_AVX128_E_V_to_G(
27113                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
27114         goto decode_success;
27115      }
27116      /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */
27117      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27118         delta = dis_AVX256_E_V_to_G(
27119                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 );
27120         goto decode_success;
27121      }
27122      break;
27123
27124   case 0xE6:
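      /* Note: the "T" (truncating) forms below pass r2zero=True,
         i.e. they convert with round-towards-zero, whereas the plain
         forms use the prevailing SSE rounding mode. */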
27125      /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
27126      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
27127         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
27128         goto decode_success;
27129      }
27130      /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
27131      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
27132         delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
27133         goto decode_success;
27134      }
27135      /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
27136      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27137         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
27138                                   True/*r2zero*/);
27139         goto decode_success;
27140      }
27141      /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
27142      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27143         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
27144         goto decode_success;
27145      }
27146      /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
27147      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27148         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
27149                                   False/*!r2zero*/);
27150         goto decode_success;
27151      }
27152      /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
27153      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27154         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
27155         goto decode_success;
27156      }
27157      break;
27158
27159   case 0xE7:
27160      /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
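      /* The non-temporal hint is not representable in IR and is
         dropped; only the store itself and the alignment check are
         modelled. */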
27161      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27162         UChar modrm = getUChar(delta);
27163         UInt rG     = gregOfRexRM(pfx,modrm);
27164         if (!epartIsReg(modrm)) {
27165            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
27166            gen_SEGV_if_not_16_aligned( addr );
27167            storeLE( mkexpr(addr), getXMMReg(rG) );
27168            DIP("vmovntdq %s,%s\n", dis_buf, nameXMMReg(rG));
27169            delta += alen;
27170            goto decode_success;
27171         }
27172         /* else fall through */
27173      }
27174      /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
27175      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27176         UChar modrm = getUChar(delta);
27177         UInt rG     = gregOfRexRM(pfx,modrm);
27178         if (!epartIsReg(modrm)) {
27179            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
27180            gen_SEGV_if_not_32_aligned( addr );
27181            storeLE( mkexpr(addr), getYMMReg(rG) );
27182            DIP("vmovntdq %s,%s\n", dis_buf, nameYMMReg(rG));
27183            delta += alen;
27184            goto decode_success;
27185         }
27186         /* else fall through */
27187      }
27188      break;
27189
27190   case 0xE8:
27191      /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
27192      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27193         delta = dis_AVX128_E_V_to_G(
27194                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
27195         goto decode_success;
27196      }
27197      /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */
27198      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27199         delta = dis_AVX256_E_V_to_G(
27200                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 );
27201         goto decode_success;
27202      }
27203      break;
27204
27205   case 0xE9:
27206      /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
27207      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27208         delta = dis_AVX128_E_V_to_G(
27209                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
27210         goto decode_success;
27211      }
27212      /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */
27213      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27214         delta = dis_AVX256_E_V_to_G(
27215                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 );
27216         goto decode_success;
27217      }
27218      break;
27219
27220   case 0xEA:
27221      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
27222      /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
27223      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27224         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27225                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
27226         goto decode_success;
27227      }
27228      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
27229      /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */
27230      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27231         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27232                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 );
27233         goto decode_success;
27234      }
27235      break;
27236
27237   case 0xEB:
27238      /* VPOR r/m, rV, r ::: r = rV | r/m */
27239      /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
27240      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27241         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27242                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
27243         goto decode_success;
27244      }
27245      /* VPOR r/m, rV, r ::: r = rV | r/m */
27246      /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */
27247      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27248         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27249                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 );
27250         goto decode_success;
27251      }
27252      break;
27253
27254   case 0xEC:
27255      /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
27256      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27257         delta = dis_AVX128_E_V_to_G(
27258                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
27259         goto decode_success;
27260      }
27261      /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */
27262      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27263         delta = dis_AVX256_E_V_to_G(
27264                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 );
27265         goto decode_success;
27266      }
27267      break;
27268
27269   case 0xED:
27270      /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
27271      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27272         delta = dis_AVX128_E_V_to_G(
27273                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
27274         goto decode_success;
27275      }
27276      /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */
27277      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27278         delta = dis_AVX256_E_V_to_G(
27279                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 );
27280         goto decode_success;
27281      }
27282      break;
27283
27284   case 0xEE:
27285      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
27286      /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
27287      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27288         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27289                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
27290         goto decode_success;
27291      }
27292      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
27293      /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */
27294      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27295         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27296                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 );
27297         goto decode_success;
27298      }
27299      break;
27300
27301   case 0xEF:
27302      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
27303      /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
27304      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27305         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27306                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
27307         goto decode_success;
27308      }
27309      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
27310      /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */
27311      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27312         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27313                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 );
27314         goto decode_success;
27315      }
27316      break;
27317
27318   case 0xF0:
27319      /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
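      /* LDDQU exists purely to speed up unaligned loads; its
         architectural result is the same as MOVDQU's, so it is
         modelled as a plain unaligned load. */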
27320      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27321         UChar  modrm = getUChar(delta);
27322         UInt   rD    = gregOfRexRM(pfx, modrm);
27323         IRTemp tD    = newTemp(Ity_V256);
27324         if (epartIsReg(modrm)) break;
27325         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
27326         delta += alen;
27327         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
27328         DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
27329         putYMMReg(rD, mkexpr(tD));
27330         goto decode_success;
27331      }
27332      /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
27333      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27334         UChar  modrm = getUChar(delta);
27335         UInt   rD    = gregOfRexRM(pfx, modrm);
27336         IRTemp tD    = newTemp(Ity_V128);
27337         if (epartIsReg(modrm)) break;
27338         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
27339         delta += alen;
27340         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
27341         DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
27342         putYMMRegLoAndZU(rD, mkexpr(tD));
27343         goto decode_success;
27344      }
27345      break;
27346
27347   case 0xF1:
27348      /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
27349      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27350         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27351                                        "vpsllw", Iop_ShlN16x8 );
27352         *uses_vvvv = True;
         goto decode_success;
      }
27356      /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */
27357      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27358         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27359                                        "vpsllw", Iop_ShlN16x16 );
27360         *uses_vvvv = True;
         goto decode_success;
      }
27364      break;
27365
27366   case 0xF2:
27367      /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
27368      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27369         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27370                                        "vpslld", Iop_ShlN32x4 );
27371         *uses_vvvv = True;
27372         goto decode_success;
27373      }
27374      /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */
27375      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27376         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27377                                        "vpslld", Iop_ShlN32x8 );
27378         *uses_vvvv = True;
27379         goto decode_success;
27380      }
27381      break;
27382
27383   case 0xF3:
27384      /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
27385      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27386         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
27387                                        "vpsllq", Iop_ShlN64x2 );
27388         *uses_vvvv = True;
27389         goto decode_success;
27390      }
27391      /* VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */
27392      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27393         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
27394                                        "vpsllq", Iop_ShlN64x4 );
27395         *uses_vvvv = True;
27396         goto decode_success;
27397      }
27398      break;
27399
27400   case 0xF4:
27401      /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
27402      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27403         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
27404                    uses_vvvv, vbi, pfx, delta,
27405                    "vpmuludq", math_PMULUDQ_128 );
27406         goto decode_success;
27407      }
27408      /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */
27409      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27410         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
27411                    uses_vvvv, vbi, pfx, delta,
27412                    "vpmuludq", math_PMULUDQ_256 );
27413         goto decode_success;
27414      }
27415      break;
27416
27417   case 0xF5:
27418      /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
27419      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27420         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
27421                    uses_vvvv, vbi, pfx, delta,
27422                    "vpmaddwd", math_PMADDWD_128 );
27423         goto decode_success;
27424      }
27425      /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */
27426      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27427         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
27428                    uses_vvvv, vbi, pfx, delta,
27429                    "vpmaddwd", math_PMADDWD_256 );
27430         goto decode_success;
27431      }
27432      break;
27433
27434   case 0xF6:
27435      /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
27436      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27437         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
27438                    uses_vvvv, vbi, pfx, delta,
27439                    "vpsadbw", math_PSADBW_128 );
27440         goto decode_success;
27441      }
27442      /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */
27443      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27444         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
27445                    uses_vvvv, vbi, pfx, delta,
27446                    "vpsadbw", math_PSADBW_256 );
27447         goto decode_success;
27448      }
27449      break;
27450
27451   case 0xF7:
27452      /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
27453      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
27454          && epartIsReg(getUChar(delta))) {
27455         delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
27456         goto decode_success;
27457      }
27458      break;
27459
27460   case 0xF8:
27461      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
27462      /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
27463      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27464         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27465                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
27466         goto decode_success;
27467      }
27468      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
27469      /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */
27470      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27471         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27472                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 );
27473         goto decode_success;
27474      }
27475      break;
27476
27477   case 0xF9:
27478      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
27479      /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
27480      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27481         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27482                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
27483         goto decode_success;
27484      }
27485      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
27486      /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */
27487      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27488         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27489                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 );
27490         goto decode_success;
27491      }
27492      break;
27493
27494   case 0xFA:
27495      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
27496      /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
27497      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27498         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27499                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
27500         goto decode_success;
27501      }
27502      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
27503      /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */
27504      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27505         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27506                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 );
27507         goto decode_success;
27508      }
27509      break;
27510
27511   case 0xFB:
27512      /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
27513      /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
27514      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27515         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27516                    uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
27517         goto decode_success;
27518      }
27519      /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
27520      /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */
27521      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27522         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27523                    uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 );
27524         goto decode_success;
27525      }
27526      break;
27527
27528   case 0xFC:
27529      /* VPADDB r/m, rV, r ::: r = rV + r/m */
27530      /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
27531      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27532         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27533                    uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
27534         goto decode_success;
27535      }
27536      /* VPADDB r/m, rV, r ::: r = rV + r/m */
27537      /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */
27538      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27539         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27540                    uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 );
27541         goto decode_success;
27542      }
27543      break;
27544
27545   case 0xFD:
27546      /* VPADDW r/m, rV, r ::: r = rV + r/m */
27547      /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
27548      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27549         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27550                    uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
27551         goto decode_success;
27552      }
27553      /* VPADDW r/m, rV, r ::: r = rV + r/m */
27554      /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */
27555      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27556         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27557                    uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 );
27558         goto decode_success;
27559      }
27560      break;
27561
27562   case 0xFE:
27563      /* VPADDD r/m, rV, r ::: r = rV + r/m */
27564      /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
27565      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
27566         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
27567                    uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
27568         goto decode_success;
27569      }
27570      /* VPADDD r/m, rV, r ::: r = rV + r/m */
27571      /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */
27572      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
27573         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
27574                    uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 );
27575         goto decode_success;
27576      }
27577      break;
27578
27579   default:
27580      break;
27581
27582   }
27583
27584  //decode_failure:
27585   return deltaIN;
27586
27587  decode_success:
27588   return delta;
27589}
27590
27591
27592/*------------------------------------------------------------*/
27593/*---                                                      ---*/
27594/*--- Top-level post-escape decoders: dis_ESC_0F38__VEX    ---*/
27595/*---                                                      ---*/
27596/*------------------------------------------------------------*/
27597
27598static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
27599{
27600   /* In the control vector, zero out all but the bottom two bits of
27601      each 32-bit lane. */
27602   IRExpr* cv1 = binop(Iop_ShrN32x4,
27603                       binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
27604                       mkU8(30));
27605   /* And use the resulting cleaned-up control vector as steering
27606      in a Perm operation. */
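   /* That is, result lane i = data lane (ctrl lane i & 3); for
      example, an all-zeroes control vector broadcasts data lane 0
      into every lane. */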
27607   IRTemp res = newTemp(Ity_V128);
27608   assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
27609   return res;
27610}
27611
27612static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
27613{
27614   IRTemp dHi, dLo, cHi, cLo;
27615   dHi = dLo = cHi = cLo = IRTemp_INVALID;
27616   breakupV256toV128s( dataV, &dHi, &dLo );
27617   breakupV256toV128s( ctrlV, &cHi, &cLo );
27618   IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
27619   IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
27620   IRTemp res = newTemp(Ity_V256);
27621   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
27622   return res;
27623}
27624
27625static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
27626{
27627   /* No cleverness here .. */
27628   IRTemp dHi, dLo, cHi, cLo;
27629   dHi = dLo = cHi = cLo = IRTemp_INVALID;
27630   breakupV128to64s( dataV, &dHi, &dLo );
27631   breakupV128to64s( ctrlV, &cHi, &cLo );
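   /* Bit 1 of each 64-bit control half selects the data element:
      e.g. cLo == 2 routes dHi to the low half of the result, while
      cLo == 0 routes dLo there; the high half works likewise. */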
27632   IRExpr* rHi
27633      = IRExpr_ITE( unop(Iop_64to1,
27634                         binop(Iop_Shr64, mkexpr(cHi), mkU8(1))),
27635                    mkexpr(dHi), mkexpr(dLo) );
27636   IRExpr* rLo
27637      = IRExpr_ITE( unop(Iop_64to1,
27638                         binop(Iop_Shr64, mkexpr(cLo), mkU8(1))),
27639                    mkexpr(dHi), mkexpr(dLo) );
27640   IRTemp res = newTemp(Ity_V128);
27641   assign(res, binop(Iop_64HLtoV128, rHi, rLo));
27642   return res;
27643}
27644
27645static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
27646{
27647   IRTemp dHi, dLo, cHi, cLo;
27648   dHi = dLo = cHi = cLo = IRTemp_INVALID;
27649   breakupV256toV128s( dataV, &dHi, &dLo );
27650   breakupV256toV128s( ctrlV, &cHi, &cLo );
27651   IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
27652   IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
27653   IRTemp res = newTemp(Ity_V256);
27654   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
27655   return res;
27656}
27657
27658static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV )
27659{
27660   /* In the control vector, zero out all but the bottom three bits of
27661      each 32-bit lane. */
27662   IRExpr* cv1 = binop(Iop_ShrN32x8,
27663                       binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)),
27664                       mkU8(29));
27665   /* And use the resulting cleaned-up control vector as steering
27666      in a Perm operation. */
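   /* Unlike VPERMILPS, this permutes across the two 128-bit halves:
      result lane i = data lane (ctrl lane i & 7), for all eight
      32-bit lanes. */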
27667   IRTemp res = newTemp(Ity_V256);
27668   assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1));
27669   return res;
27670}
27671
27672static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv,
27673                         const VexAbiInfo* vbi, Prefix pfx, Long delta,
27674                         const HChar* opname, IROp op8 )
27675{
27676   HChar   dis_buf[50];
27677   Int     alen;
27678   Int     size = getRexW(pfx) ? 8 : 4;
27679   IRType  ty   = szToITy(size);
27680   IRTemp  src  = newTemp(ty);
27681   IRTemp  amt  = newTemp(ty);
27682   UChar   rm   = getUChar(delta);
27683
27684   assign( amt, getIRegV(size,pfx) );
27685   if (epartIsReg(rm)) {
27686      assign( src, getIRegE(size,pfx,rm) );
27687      DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx),
27688                           nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
27689      delta++;
27690   } else {
27691      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
27692      assign( src, loadLE(ty, mkexpr(addr)) );
27693      DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf,
27694                           nameIRegG(size,pfx,rm));
27695      delta += alen;
27696   }
27697
27698   putIRegG( size, pfx, rm,
27699             binop(mkSizedOp(ty,op8), mkexpr(src),
27700                   narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt),
27701                                          mkU(ty,8*size-1)))) );
27702   /* Flags aren't modified.  */
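   /* e.g. "sarx %rcx,(%rdi),%rax" computes
      rax = (Long)load64(rdi) >> (rcx & 63), leaving rflags alone. */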
27703   *uses_vvvv = True;
27704   return delta;
27705}
27706
27707
27708static Long dis_FMA ( const VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
27709{
27710   UChar  modrm   = getUChar(delta);
27711   UInt   rG      = gregOfRexRM(pfx, modrm);
27712   UInt   rV      = getVexNvvvv(pfx);
27713   Bool   scalar  = (opc & 0xF) > 7 && (opc & 1);
27714   IRType ty      = getRexW(pfx) ? Ity_F64 : Ity_F32;
27715   IRType vty     = scalar ? ty : (getVexL(pfx) ? Ity_V256 : Ity_V128);
27716   IRTemp addr    = IRTemp_INVALID;
27717   HChar  dis_buf[50];
27718   Int    alen    = 0;
27719   const HChar *name;
27720   const HChar *suffix;
27721   const HChar *order;
27722   Bool   negateRes   = False;
27723   Bool   negateZeven = False;
27724   Bool   negateZodd  = False;
27725   UInt   count = 0;
27726
27727   switch (opc & 0xF) {
27728      case 0x6: name = "addsub"; negateZeven = True; break;
27729      case 0x7: name = "subadd"; negateZodd = True; break;
27730      case 0x8:
27731      case 0x9: name = "add"; break;
27732      case 0xA:
27733      case 0xB: name = "sub"; negateZeven = True; negateZodd = True;
27734         break;
27735      case 0xC:
27736      case 0xD: name = "add"; negateRes = True; negateZeven = True;
27737                                                negateZodd = True; break;
27738      case 0xE:
27739      case 0xF: name = "sub"; negateRes = True; break;
27740      default:  vpanic("dis_FMA(amd64)"); break;
27741   }
27742   switch (opc & 0xF0) {
27743      case 0x90: order = "132"; break;
27744      case 0xA0: order = "213"; break;
27745      case 0xB0: order = "231"; break;
27746      default:   vpanic("dis_FMA(amd64)"); break;
27747   }
27748   if (scalar) {
27749      suffix = ty == Ity_F64 ? "sd" : "ss";
27750   } else {
27751      suffix = ty == Ity_F64 ? "pd" : "ps";
27752   }
27753
27754   // Figure out |count| (the number of elements) by considering |vty| and |ty|.
27755   count = sizeofIRType(vty) / sizeofIRType(ty);
27756   vassert(count == 1 || count == 2 || count == 4 || count == 8);
27757
27758   // Fetch operands into the first |count| elements of |sX|, |sY| and |sZ|.
27759   UInt i;
27760   IRExpr *sX[8], *sY[8], *sZ[8], *res[8];
27761   for (i = 0; i < 8; i++) sX[i] = sY[i] = sZ[i] = res[i] = NULL;
27762
27763   IRExpr* (*getYMMRegLane)(UInt,Int)
27764      = ty == Ity_F32 ? getYMMRegLane32F : getYMMRegLane64F;
27765   void (*putYMMRegLane)(UInt,Int,IRExpr*)
27766      = ty == Ity_F32 ? putYMMRegLane32F : putYMMRegLane64F;
27767
27768   for (i = 0; i < count; i++) {
27769      sX[i] = getYMMRegLane(rG, i);
27770      sZ[i] = getYMMRegLane(rV, i);
27771   }
27772
27773   if (epartIsReg(modrm)) {
27774      UInt rE = eregOfRexRM(pfx, modrm);
27775      delta += 1;
27776      for (i = 0; i < count; i++) {
27777         sY[i] = getYMMRegLane(rE, i);
27778      }
27779      if (vty == Ity_V256) {
27780         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
27781             name, order, suffix, nameYMMReg(rE), nameYMMReg(rV),
27782             nameYMMReg(rG));
27783      } else {
27784         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
27785             name, order, suffix, nameXMMReg(rE), nameXMMReg(rV),
27786             nameXMMReg(rG));
27787      }
27788   } else {
27789      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
27790      delta += alen;
27791      for (i = 0; i < count; i++) {
27792         sY[i] = loadLE(ty, binop(Iop_Add64, mkexpr(addr),
27793                                  mkU64(i * sizeofIRType(ty))));
27794      }
27795      if (vty == Ity_V256) {
27796         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
27797             name, order, suffix, dis_buf, nameYMMReg(rV),
27798             nameYMMReg(rG));
27799      } else {
27800         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
27801             name, order, suffix, dis_buf, nameXMMReg(rV),
27802             nameXMMReg(rG));
27803      }
27804   }
27805
   /* sX/sY/sZ are now in 132 order.  If the instruction requires a
      different order, swap them around.  */
27808
27809#  define COPY_ARR(_dst, _src) \
27810      do { for (int j = 0; j < 8; j++) { _dst[j] = _src[j]; } } while (0)
27811
27812   if ((opc & 0xF0) != 0x90) {
27813      IRExpr* temp[8];
27814      COPY_ARR(temp, sX);
27815      if ((opc & 0xF0) == 0xA0) {
27816         COPY_ARR(sX, sZ);
27817         COPY_ARR(sZ, sY);
27818         COPY_ARR(sY, temp);
27819      } else {
27820         COPY_ARR(sX, sZ);
27821         COPY_ARR(sZ, temp);
27822      }
27823   }
27824
27825#  undef COPY_ARR
27826
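   /* After the swap above, computing sX[i]*sY[i] + sZ[i] realises
      the documented operand orders: 132 gives rG*E + rV, 213 gives
      rV*rG + E, and 231 gives rV*E + rG, where E is the reg-or-mem
      operand.  The multiply-add uses get_FAKE_roundingmode(), so the
      guest's MXCSR rounding mode is not honoured here. */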
27827   for (i = 0; i < count; i++) {
27828      IROp opNEG = ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32;
27829      if ((i & 1) ? negateZodd : negateZeven) {
27830         sZ[i] = unop(opNEG, sZ[i]);
27831      }
27832      res[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32,
27833                          get_FAKE_roundingmode(), sX[i], sY[i], sZ[i]);
27834      if (negateRes) {
27835         res[i] = unop(opNEG, res[i]);
27836      }
27837   }
27838
27839   for (i = 0; i < count; i++) {
27840      putYMMRegLane(rG, i, res[i]);
27841   }
27842
27843   switch (vty) {
27844      case Ity_F32:  putYMMRegLane32(rG, 1, mkU32(0)); /*fallthru*/
27845      case Ity_F64:  putYMMRegLane64(rG, 1, mkU64(0)); /*fallthru*/
27846      case Ity_V128: putYMMRegLane128(rG, 1, mkV128(0)); /*fallthru*/
27847      case Ity_V256: break;
27848      default: vassert(0);
27849   }
27850
27851   return delta;
27852}
27853
27854
27855/* Masked load or masked store. */
27856static ULong dis_VMASKMOV ( Bool *uses_vvvv, const VexAbiInfo* vbi,
27857                            Prefix pfx, Long delta,
27858                            const HChar* opname, Bool isYMM, IRType ty,
27859                            Bool isLoad )
27860{
27861   HChar   dis_buf[50];
27862   Int     alen, i;
27863   IRTemp  addr;
27864   UChar   modrm = getUChar(delta);
27865   UInt    rG    = gregOfRexRM(pfx,modrm);
27866   UInt    rV    = getVexNvvvv(pfx);
27867
27868   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
27869   delta += alen;
27870
27871   /**/ if (isLoad && isYMM) {
27872      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
27873   }
27874   else if (isLoad && !isYMM) {
27875      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
27876   }
27877
27878   else if (!isLoad && isYMM) {
27879      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rG), nameYMMReg(rV), dis_buf );
27880   }
27881   else {
27882      vassert(!isLoad && !isYMM);
27883      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rG), nameXMMReg(rV), dis_buf );
27884   }
27885
27886   vassert(ty == Ity_I32 || ty == Ity_I64);
27887   Bool laneIs32 = ty == Ity_I32;
27888
27889   Int nLanes = (isYMM ? 2 : 1) * (laneIs32 ? 4 : 2);
27890
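   /* Lane i is transferred iff the MSB of mask lane i in rV is set.
      Guarded loads/stores (IRStmt_LoadG/StoreG) are used so that
      lanes whose mask bit is clear generate no memory access at all
      and hence cannot fault -- the whole point of VMASKMOV. */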
27891   for (i = 0; i < nLanes; i++) {
27892      IRExpr* shAmt = laneIs32 ? mkU8(31)    : mkU8(63);
27893      IRExpr* one   = laneIs32 ? mkU32(1)    : mkU64(1);
27894      IROp    opSHR = laneIs32 ? Iop_Shr32   : Iop_Shr64;
27895      IROp    opEQ  = laneIs32 ? Iop_CmpEQ32 : Iop_CmpEQ64;
27896      IRExpr* lane  = (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rV, i );
27897
27898      IRTemp  cond = newTemp(Ity_I1);
27899      assign(cond, binop(opEQ, binop(opSHR, lane, shAmt), one));
27900
27901      IRTemp  data = newTemp(ty);
27902      IRExpr* ea   = binop(Iop_Add64, mkexpr(addr),
27903                                      mkU64(i * (laneIs32 ? 4 : 8)));
27904      if (isLoad) {
27905         stmt(
27906            IRStmt_LoadG(
27907               Iend_LE, laneIs32 ? ILGop_Ident32 : ILGop_Ident64,
27908               data, ea, laneIs32 ? mkU32(0) : mkU64(0), mkexpr(cond)
27909         ));
27910         (laneIs32 ? putYMMRegLane32 : putYMMRegLane64)( rG, i, mkexpr(data) );
27911      } else {
27912         assign(data, (laneIs32 ? getYMMRegLane32 : getYMMRegLane64)( rG, i ));
27913         stmt( IRStmt_StoreG(Iend_LE, ea, mkexpr(data), mkexpr(cond)) );
27914      }
27915   }
27916
27917   if (isLoad && !isYMM)
27918      putYMMRegLane128( rG, 1, mkV128(0) );
27919
27920   *uses_vvvv = True;
27921   return delta;
27922}
27923
27924
27925/* Gather.  */
27926static ULong dis_VGATHER ( Bool *uses_vvvv, const VexAbiInfo* vbi,
27927                           Prefix pfx, Long delta,
27928                           const HChar* opname, Bool isYMM,
27929                           Bool isVM64x, IRType ty )
27930{
27931   HChar  dis_buf[50];
27932   Int    alen, i, vscale, count1, count2;
27933   IRTemp addr;
27934   UChar  modrm = getUChar(delta);
27935   UInt   rG    = gregOfRexRM(pfx,modrm);
27936   UInt   rV    = getVexNvvvv(pfx);
27937   UInt   rI;
27938   IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128;
27939   IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128;
27940   IRTemp cond;
27941   addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI,
27942                         idxTy, &vscale );
27943   if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV)
27944      return delta;
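   /* If any two of the index, mask and destination registers
      coincide, the insn #UDs; the check above therefore rejects such
      encodings (and unparseable VSIB forms) as a decode failure. */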
27945   if (dstTy == Ity_V256) {
27946      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) );
27947   } else {
27948      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) );
27949   }
27950   delta += alen;
27951
27952   if (ty == Ity_I32) {
27953      count1 = isYMM ? 8 : 4;
27954      count2 = isVM64x ? count1 / 2 : count1;
27955   } else {
27956      count1 = count2 = isYMM ? 4 : 2;
27957   }
27958
27959   /* First update the mask register to copies of the sign bit.  */
27960   if (ty == Ity_I32) {
27961      if (isYMM)
27962         putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) );
27963      else
27964         putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) );
27965   } else {
27966      for (i = 0; i < count1; i++) {
27967         putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ),
27968                                       mkU8(63)) );
27969      }
27970   }
27971
27972   /* Next gather the individual elements.  If any fault occurs, the
27973      corresponding mask element will be set and the loop stops.  */
27974   for (i = 0; i < count2; i++) {
27975      IRExpr *expr, *addr_expr;
27976      cond = newTemp(Ity_I1);
27977      assign( cond,
27978              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
27979                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
27980                                  : getYMMRegLane64( rV, i ),
27981                    mkU(ty, 0)) );
27982      expr = ty == Ity_I32 ? getYMMRegLane32( rG, i )
27983                           : getYMMRegLane64( rG, i );
27984      addr_expr = isVM64x ? getYMMRegLane64( rI, i )
27985                          : unop(Iop_32Sto64, getYMMRegLane32( rI, i ));
27986      switch (vscale) {
27987         case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break;
27988         case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break;
27989         case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break;
27990         default: break;
27991      }
27992      addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr);
27993      addr_expr = handleAddrOverrides(vbi, pfx, addr_expr);
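      /* For an inactive lane, substitute RSP -- an address which can
         reasonably be assumed to be mapped -- so the load issued
         below cannot fault; the outer ITE then discards its value. */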
27994      addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP));
27995      expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr);
27996      if (ty == Ity_I32) {
27997         putYMMRegLane32( rG, i, expr );
27998         putYMMRegLane32( rV, i, mkU32(0) );
27999      } else {
28000         putYMMRegLane64( rG, i, expr);
28001         putYMMRegLane64( rV, i, mkU64(0) );
28002      }
28003   }
28004
28005   if (!isYMM || (ty == Ity_I32 && isVM64x)) {
28006      if (ty == Ity_I64 || isYMM)
28007         putYMMRegLane128( rV, 1, mkV128(0) );
28008      else if (ty == Ity_I32 && count2 == 2) {
28009         putYMMRegLane64( rV, 1, mkU64(0) );
28010         putYMMRegLane64( rG, 1, mkU64(0) );
28011      }
28012      putYMMRegLane128( rG, 1, mkV128(0) );
28013   }
28014
28015   *uses_vvvv = True;
28016   return delta;
28017}
28018
28019
28020__attribute__((noinline))
28021static
28022Long dis_ESC_0F38__VEX (
28023        /*MB_OUT*/DisResult* dres,
28024        /*OUT*/   Bool*      uses_vvvv,
28025        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
28026        Bool         resteerCisOk,
28027        void*        callback_opaque,
28028        const VexArchInfo* archinfo,
28029        const VexAbiInfo*  vbi,
28030        Prefix pfx, Int sz, Long deltaIN
28031     )
28032{
28033   IRTemp addr  = IRTemp_INVALID;
28034   Int    alen  = 0;
28035   HChar  dis_buf[50];
28036   Long   delta = deltaIN;
28037   UChar  opc   = getUChar(delta);
28038   delta++;
28039   *uses_vvvv = False;
28040
28041   switch (opc) {
28042
28043   case 0x00:
28044      /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
28045      /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
28046      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28047         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
28048                    uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
28049         goto decode_success;
28050      }
28051      /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
28052      /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */
28053      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28054         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28055                    uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM );
28056         goto decode_success;
28057      }
28058      break;
28059
28060   case 0x01:
28061   case 0x02:
28062   case 0x03:
28063      /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
28064      /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
28065      /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
28066      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28067         delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
28068         *uses_vvvv = True;
28069         goto decode_success;
28070      }
28071      /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */
28072      /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */
28073      /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */
28074      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28075         delta = dis_PHADD_256( vbi, pfx, delta, opc );
28076         *uses_vvvv = True;
28077         goto decode_success;
28078      }
28079      break;
28080
28081   case 0x04:
28082      /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
28083      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28084         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
28085                    uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
28086                    math_PMADDUBSW_128 );
28087         goto decode_success;
28088      }
28089      /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */
28090      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28091         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28092                    uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
28093                    math_PMADDUBSW_256 );
28094         goto decode_success;
28095      }
28096      break;
28097
28098   case 0x05:
28099   case 0x06:
28100   case 0x07:
28101      /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
28102      /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
28103      /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
28104      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28105         delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
28106         *uses_vvvv = True;
28107         goto decode_success;
28108      }
28109      /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */
28110      /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */
28111      /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */
28112      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28113         delta = dis_PHADD_256( vbi, pfx, delta, opc );
28114         *uses_vvvv = True;
28115         goto decode_success;
28116      }
28117      break;
28118
28119   case 0x08:
28120   case 0x09:
28121   case 0x0A:
28122      /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
28123      /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
28124      /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
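      /* Lanewise: result = (src2 < 0) ? -src1 : (src2 == 0) ? 0 : src1,
         where src1 is the vvvv register and src2 the reg-or-mem
         operand; dis_PSIGN_helper builds this per 64-bit chunk. */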
28125      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28126         IRTemp sV      = newTemp(Ity_V128);
28127         IRTemp dV      = newTemp(Ity_V128);
28128         IRTemp sHi, sLo, dHi, dLo;
28129         sHi = sLo = dHi = dLo = IRTemp_INVALID;
28130         HChar  ch      = '?';
28131         Int    laneszB = 0;
28132         UChar  modrm   = getUChar(delta);
28133         UInt   rG      = gregOfRexRM(pfx,modrm);
28134         UInt   rV      = getVexNvvvv(pfx);
28135
28136         switch (opc) {
28137            case 0x08: laneszB = 1; ch = 'b'; break;
28138            case 0x09: laneszB = 2; ch = 'w'; break;
28139            case 0x0A: laneszB = 4; ch = 'd'; break;
28140            default: vassert(0);
28141         }
28142
28143         assign( dV, getXMMReg(rV) );
28144
28145         if (epartIsReg(modrm)) {
28146            UInt rE = eregOfRexRM(pfx,modrm);
28147            assign( sV, getXMMReg(rE) );
28148            delta += 1;
28149            DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
28150                nameXMMReg(rV), nameXMMReg(rG));
28151         } else {
28152            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28153            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
28154            delta += alen;
28155            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
28156                nameXMMReg(rV), nameXMMReg(rG));
28157         }
28158
28159         breakupV128to64s( dV, &dHi, &dLo );
28160         breakupV128to64s( sV, &sHi, &sLo );
28161
28162         putYMMRegLoAndZU(
28163            rG,
28164            binop(Iop_64HLtoV128,
28165                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
28166                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
28167            )
28168         );
28169         *uses_vvvv = True;
28170         goto decode_success;
28171      }
28172      /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */
28173      /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */
28174      /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */
28175      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28176         IRTemp sV      = newTemp(Ity_V256);
28177         IRTemp dV      = newTemp(Ity_V256);
28178         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
28179         s3 = s2 = s1 = s0 = IRTemp_INVALID;
28180         d3 = d2 = d1 = d0 = IRTemp_INVALID;
28181         UChar  ch      = '?';
28182         Int    laneszB = 0;
28183         UChar  modrm   = getUChar(delta);
28184         UInt   rG      = gregOfRexRM(pfx,modrm);
28185         UInt   rV      = getVexNvvvv(pfx);
28186
28187         switch (opc) {
28188            case 0x08: laneszB = 1; ch = 'b'; break;
28189            case 0x09: laneszB = 2; ch = 'w'; break;
28190            case 0x0A: laneszB = 4; ch = 'd'; break;
28191            default: vassert(0);
28192         }
28193
28194         assign( dV, getYMMReg(rV) );
28195
28196         if (epartIsReg(modrm)) {
28197            UInt rE = eregOfRexRM(pfx,modrm);
28198            assign( sV, getYMMReg(rE) );
28199            delta += 1;
28200            DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE),
28201                nameYMMReg(rV), nameYMMReg(rG));
28202         } else {
28203            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28204            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
28205            delta += alen;
28206            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
28207                nameYMMReg(rV), nameYMMReg(rG));
28208         }
28209
28210         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
28211         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
28212
28213         putYMMReg(
28214            rG,
28215            binop( Iop_V128HLtoV256,
28216                   binop(Iop_64HLtoV128,
28217                         dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ),
28218                         dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB )
28219                   ),
28220                   binop(Iop_64HLtoV128,
28221                         dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ),
28222                         dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB )
28223                   )
28224            )
28225         );
28226         *uses_vvvv = True;
28227         goto decode_success;
28228      }
28229      break;
28230
28231   case 0x0B:
28232      /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
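      /* Lanewise: result = (Short)(((Int)a * (Int)b + 0x4000) >> 15),
         a signed 16x16->16 high-half multiply with rounding, built
         per 64-bit chunk by dis_PMULHRSW_helper. */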
28233      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28234         IRTemp sV      = newTemp(Ity_V128);
28235         IRTemp dV      = newTemp(Ity_V128);
28236         IRTemp sHi, sLo, dHi, dLo;
28237         sHi = sLo = dHi = dLo = IRTemp_INVALID;
28238         UChar  modrm   = getUChar(delta);
28239         UInt   rG      = gregOfRexRM(pfx,modrm);
28240         UInt   rV      = getVexNvvvv(pfx);
28241
28242         assign( dV, getXMMReg(rV) );
28243
28244         if (epartIsReg(modrm)) {
28245            UInt rE = eregOfRexRM(pfx,modrm);
28246            assign( sV, getXMMReg(rE) );
28247            delta += 1;
28248            DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
28249                nameXMMReg(rV), nameXMMReg(rG));
28250         } else {
28251            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28252            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
28253            delta += alen;
28254            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
28255                nameXMMReg(rV), nameXMMReg(rG));
28256         }
28257
28258         breakupV128to64s( dV, &dHi, &dLo );
28259         breakupV128to64s( sV, &sHi, &sLo );
28260
28261         putYMMRegLoAndZU(
28262            rG,
28263            binop(Iop_64HLtoV128,
28264                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
28265                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
28266            )
28267         );
28268         *uses_vvvv = True;
28269         goto decode_success;
28270      }
28271      /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */
28272      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28273         IRTemp sV      = newTemp(Ity_V256);
28274         IRTemp dV      = newTemp(Ity_V256);
28275         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
28276         s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
28277         UChar  modrm   = getUChar(delta);
28278         UInt   rG      = gregOfRexRM(pfx,modrm);
28279         UInt   rV      = getVexNvvvv(pfx);
28280
28281         assign( dV, getYMMReg(rV) );
28282
28283         if (epartIsReg(modrm)) {
28284            UInt rE = eregOfRexRM(pfx,modrm);
28285            assign( sV, getYMMReg(rE) );
28286            delta += 1;
28287            DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE),
28288                nameYMMReg(rV), nameYMMReg(rG));
28289         } else {
28290            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
28291            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
28292            delta += alen;
28293            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
28294                nameYMMReg(rV), nameYMMReg(rG));
28295         }
28296
28297         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
28298         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
28299
28300         putYMMReg(
28301            rG,
28302            binop(Iop_V128HLtoV256,
28303                  binop(Iop_64HLtoV128,
28304                        dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ),
28305                        dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ),
28306                  binop(Iop_64HLtoV128,
28307                        dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ),
28308                        dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) )
28309            )
28310         );
28311         *uses_vvvv = True;
28313         goto decode_success;
28314      }
28315      break;
28316
28317   case 0x0C:
28318      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
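      /* Variable-control form: bits 1:0 of each 32-bit control lane
         select which of the four lanes in the corresponding 128-bit
         half of the data operand supplies that result lane. */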
28319      if (have66noF2noF3(pfx)
28320          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
28321         UChar  modrm = getUChar(delta);
28322         UInt   rG    = gregOfRexRM(pfx, modrm);
28323         UInt   rV    = getVexNvvvv(pfx);
28324         IRTemp ctrlV = newTemp(Ity_V128);
28325         if (epartIsReg(modrm)) {
28326            UInt rE = eregOfRexRM(pfx, modrm);
28327            delta += 1;
28328            DIP("vpermilps %s,%s,%s\n",
28329                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
28330            assign(ctrlV, getXMMReg(rE));
28331         } else {
28332            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28333            delta += alen;
28334            DIP("vpermilps %s,%s,%s\n",
28335                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
28336            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
28337         }
28338         IRTemp dataV = newTemp(Ity_V128);
28339         assign(dataV, getXMMReg(rV));
28340         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
28341         putYMMRegLoAndZU(rG, mkexpr(resV));
28342         *uses_vvvv = True;
28343         goto decode_success;
28344      }
28345      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
28346      if (have66noF2noF3(pfx)
28347          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
28348         UChar  modrm = getUChar(delta);
28349         UInt   rG    = gregOfRexRM(pfx, modrm);
28350         UInt   rV    = getVexNvvvv(pfx);
28351         IRTemp ctrlV = newTemp(Ity_V256);
28352         if (epartIsReg(modrm)) {
28353            UInt rE = eregOfRexRM(pfx, modrm);
28354            delta += 1;
28355            DIP("vpermilps %s,%s,%s\n",
28356                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
28357            assign(ctrlV, getYMMReg(rE));
28358         } else {
28359            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28360            delta += alen;
28361            DIP("vpermilps %s,%s,%s\n",
28362                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
28363            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
28364         }
28365         IRTemp dataV = newTemp(Ity_V256);
28366         assign(dataV, getYMMReg(rV));
28367         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
28368         putYMMReg(rG, mkexpr(resV));
28369         *uses_vvvv = True;
28370         goto decode_success;
28371      }
28372      break;
28373
28374   case 0x0D:
28375      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
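      /* Variable-control form: bit 1 of each 64-bit control lane
         selects which of the two qwords in the corresponding 128-bit
         half of the data operand supplies that result lane. */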
28376      if (have66noF2noF3(pfx)
28377          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
28378         UChar  modrm = getUChar(delta);
28379         UInt   rG    = gregOfRexRM(pfx, modrm);
28380         UInt   rV    = getVexNvvvv(pfx);
28381         IRTemp ctrlV = newTemp(Ity_V128);
28382         if (epartIsReg(modrm)) {
28383            UInt rE = eregOfRexRM(pfx, modrm);
28384            delta += 1;
28385            DIP("vpermilpd %s,%s,%s\n",
28386                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
28387            assign(ctrlV, getXMMReg(rE));
28388         } else {
28389            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28390            delta += alen;
28391            DIP("vpermilpd %s,%s,%s\n",
28392                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
28393            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
28394         }
28395         IRTemp dataV = newTemp(Ity_V128);
28396         assign(dataV, getXMMReg(rV));
28397         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
28398         putYMMRegLoAndZU(rG, mkexpr(resV));
28399         *uses_vvvv = True;
28400         goto decode_success;
28401      }
28402      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
28403      if (have66noF2noF3(pfx)
28404          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
28405         UChar  modrm = getUChar(delta);
28406         UInt   rG    = gregOfRexRM(pfx, modrm);
28407         UInt   rV    = getVexNvvvv(pfx);
28408         IRTemp ctrlV = newTemp(Ity_V256);
28409         if (epartIsReg(modrm)) {
28410            UInt rE = eregOfRexRM(pfx, modrm);
28411            delta += 1;
28412            DIP("vpermilpd %s,%s,%s\n",
28413                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
28414            assign(ctrlV, getYMMReg(rE));
28415         } else {
28416            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28417            delta += alen;
28418            DIP("vpermilpd %s,%s,%s\n",
28419                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
28420            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
28421         }
28422         IRTemp dataV = newTemp(Ity_V256);
28423         assign(dataV, getYMMReg(rV));
28424         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
28425         putYMMReg(rG, mkexpr(resV));
28426         *uses_vvvv = True;
28427         goto decode_success;
28428      }
28429      break;
28430
28431   case 0x0E:
28432      /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
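      /* The last argument to dis_xTESTy_* selects which bits are
         examined: 32 (VTESTPS) and 64 (VTESTPD, case 0x0F) look only
         at the sign bit of each lane, while 0 (VPTEST, case 0x17)
         looks at every bit.  ZF is set if the AND of the operands is
         zero in the examined bits, and CF likewise for the AND-NOT. */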
28433      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28434         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
28435         goto decode_success;
28436      }
28437      /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
28438      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28439         delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
28440         goto decode_success;
28441      }
28442      break;
28443
28444   case 0x0F:
28445      /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
28446      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28447         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
28448         goto decode_success;
28449      }
28450      /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
28451      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28452         delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
28453         goto decode_success;
28454      }
28455      break;
28456
28457   case 0x16:
28458      /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */
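      /* VPERMPS performs the same dword-indexed permute as VPERMD
         (case 0x36), so math_VPERMD is reused here. */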
28459      if (have66noF2noF3(pfx)
28460          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
28461         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28462                    uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD );
28463         goto decode_success;
28464      }
28465      break;
28466
28467   case 0x17:
28468      /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
28469      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28470         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
28471         goto decode_success;
28472      }
28473      /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
28474      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28475         delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
28476         goto decode_success;
28477      }
28478      break;
28479
28480   case 0x18:
28481      /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
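      /* This and the other broadcast cases replicate the scalar by
         repeated pairing: 32->64 via Iop_32HLto64, then 64->128 via
         Iop_64HLtoV128 or 64->256 via Iop_64x4toV256. */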
28482      if (have66noF2noF3(pfx)
28483          && 0==getVexL(pfx)/*128*/
28484          && !epartIsReg(getUChar(delta))) {
28485         UChar modrm = getUChar(delta);
28486         UInt  rG    = gregOfRexRM(pfx, modrm);
28487         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28488         delta += alen;
28489         DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
28490         IRTemp t32 = newTemp(Ity_I32);
28491         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
28492         IRTemp t64 = newTemp(Ity_I64);
28493         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28494         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
28495         putYMMRegLoAndZU(rG, res);
28496         goto decode_success;
28497      }
28498      /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
28499      if (have66noF2noF3(pfx)
28500          && 1==getVexL(pfx)/*256*/
28501          && !epartIsReg(getUChar(delta))) {
28502         UChar modrm = getUChar(delta);
28503         UInt  rG    = gregOfRexRM(pfx, modrm);
28504         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28505         delta += alen;
28506         DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
28507         IRTemp t32 = newTemp(Ity_I32);
28508         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
28509         IRTemp t64 = newTemp(Ity_I64);
28510         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28511         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28512                                                  mkexpr(t64), mkexpr(t64));
28513         putYMMReg(rG, res);
28514         goto decode_success;
28515      }
28516      /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */
28517      if (have66noF2noF3(pfx)
28518          && 0==getVexL(pfx)/*128*/
28519          && epartIsReg(getUChar(delta))) {
28520         UChar modrm = getUChar(delta);
28521         UInt  rG    = gregOfRexRM(pfx, modrm);
28522         UInt  rE    = eregOfRexRM(pfx, modrm);
28523         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
28524         IRTemp t32 = newTemp(Ity_I32);
28525         assign(t32, getXMMRegLane32(rE, 0));
28526         IRTemp t64 = newTemp(Ity_I64);
28527         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28528         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
28529         putYMMRegLoAndZU(rG, res);
28530         delta++;
28531         goto decode_success;
28532      }
28533      /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */
28534      if (have66noF2noF3(pfx)
28535          && 1==getVexL(pfx)/*256*/
28536          && epartIsReg(getUChar(delta))) {
28537         UChar modrm = getUChar(delta);
28538         UInt  rG    = gregOfRexRM(pfx, modrm);
28539         UInt  rE    = eregOfRexRM(pfx, modrm);
28540         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
28541         IRTemp t32 = newTemp(Ity_I32);
28542         assign(t32, getXMMRegLane32(rE, 0));
28543         IRTemp t64 = newTemp(Ity_I64);
28544         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28545         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28546                                                  mkexpr(t64), mkexpr(t64));
28547         putYMMReg(rG, res);
28548         delta++;
28549         goto decode_success;
28550      }
28551      break;
28552
28553   case 0x19:
28554      /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
28555      if (have66noF2noF3(pfx)
28556          && 1==getVexL(pfx)/*256*/
28557          && !epartIsReg(getUChar(delta))) {
28558         UChar modrm = getUChar(delta);
28559         UInt  rG    = gregOfRexRM(pfx, modrm);
28560         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28561         delta += alen;
28562         DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
28563         IRTemp t64 = newTemp(Ity_I64);
28564         assign(t64, loadLE(Ity_I64, mkexpr(addr)));
28565         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28566                                                  mkexpr(t64), mkexpr(t64));
28567         putYMMReg(rG, res);
28568         goto decode_success;
28569      }
28570      /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */
28571      if (have66noF2noF3(pfx)
28572          && 1==getVexL(pfx)/*256*/
28573          && epartIsReg(getUChar(delta))) {
28574         UChar modrm = getUChar(delta);
28575         UInt  rG    = gregOfRexRM(pfx, modrm);
28576         UInt  rE    = eregOfRexRM(pfx, modrm);
28577         DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
28578         IRTemp t64 = newTemp(Ity_I64);
28579         assign(t64, getXMMRegLane64(rE, 0));
28580         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28581                                                  mkexpr(t64), mkexpr(t64));
28582         putYMMReg(rG, res);
28583         delta++;
28584         goto decode_success;
28585      }
28586      break;
28587
28588   case 0x1A:
28589      /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
28590      if (have66noF2noF3(pfx)
28591          && 1==getVexL(pfx)/*256*/
28592          && !epartIsReg(getUChar(delta))) {
28593         UChar modrm = getUChar(delta);
28594         UInt  rG    = gregOfRexRM(pfx, modrm);
28595         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28596         delta += alen;
28597         DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
28598         IRTemp t128 = newTemp(Ity_V128);
28599         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
28600         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
28601         goto decode_success;
28602      }
28603      break;
28604
28605   case 0x1C:
28606      /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
28607      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28608         delta = dis_AVX128_E_to_G_unary(
28609                    uses_vvvv, vbi, pfx, delta,
28610                    "vpabsb", math_PABS_XMM_pap1 );
28611         goto decode_success;
28612      }
28613      /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */
28614      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28615         delta = dis_AVX256_E_to_G_unary(
28616                    uses_vvvv, vbi, pfx, delta,
28617                    "vpabsb", math_PABS_YMM_pap1 );
28618         goto decode_success;
28619      }
28620      break;
28621
28622   case 0x1D:
28623      /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
28624      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28625         delta = dis_AVX128_E_to_G_unary(
28626                    uses_vvvv, vbi, pfx, delta,
28627                    "vpabsw", math_PABS_XMM_pap2 );
28628         goto decode_success;
28629      }
28630      /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */
28631      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28632         delta = dis_AVX256_E_to_G_unary(
28633                    uses_vvvv, vbi, pfx, delta,
28634                    "vpabsw", math_PABS_YMM_pap2 );
28635         goto decode_success;
28636      }
28637      break;
28638
28639   case 0x1E:
28640      /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
28641      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28642         delta = dis_AVX128_E_to_G_unary(
28643                    uses_vvvv, vbi, pfx, delta,
28644                    "vpabsd", math_PABS_XMM_pap4 );
28645         goto decode_success;
28646      }
28647      /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */
28648      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28649         delta = dis_AVX256_E_to_G_unary(
28650                    uses_vvvv, vbi, pfx, delta,
28651                    "vpabsd", math_PABS_YMM_pap4 );
28652         goto decode_success;
28653      }
28654      break;
28655
28656   case 0x20:
28657      /* VPMOVSXBW xmm2/m64, xmm1 */
28658      /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
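      /* The dis_PMOVxX* helpers cover both extension kinds: xIsZ
         False gives sign extension here, and the VPMOVZX* cases
         (0x30..0x35 below) pass True for zero extension. */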
28659      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28660         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
28661                                   True/*isAvx*/, False/*!xIsZ*/ );
28662         goto decode_success;
28663      }
28664      /* VPMOVSXBW xmm2/m128, ymm1 */
28665      /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */
28666      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28667         delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ );
28668         goto decode_success;
28669      }
28670      break;
28671
28672   case 0x21:
28673      /* VPMOVSXBD xmm2/m32, xmm1 */
28674      /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
28675      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28676         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
28677                                   True/*isAvx*/, False/*!xIsZ*/ );
28678         goto decode_success;
28679      }
28680      /* VPMOVSXBD xmm2/m64, ymm1 */
28681      /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */
28682      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28683         delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ );
28684         goto decode_success;
28685      }
28686      break;
28687
28688   case 0x22:
28689      /* VPMOVSXBQ xmm2/m16, xmm1 */
28690      /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
28691      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28692         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
28693         goto decode_success;
28694      }
28695      /* VPMOVSXBQ xmm2/m32, ymm1 */
28696      /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */
28697      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28698         delta = dis_PMOVSXBQ_256( vbi, pfx, delta );
28699         goto decode_success;
28700      }
28701      break;
28702
28703   case 0x23:
28704      /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
28705      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28706         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
28707                                   True/*isAvx*/, False/*!xIsZ*/ );
28708         goto decode_success;
28709      }
28710      /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */
28711      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28712         delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ );
28713         goto decode_success;
28714      }
28715      break;
28716
28717   case 0x24:
28718      /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
28719      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28720         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
28721         goto decode_success;
28722      }
28723      /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */
28724      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28725         delta = dis_PMOVSXWQ_256( vbi, pfx, delta );
28726         goto decode_success;
28727      }
28728      break;
28729
28730   case 0x25:
28731      /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
28732      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28733         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
28734                                   True/*isAvx*/, False/*!xIsZ*/ );
28735         goto decode_success;
28736      }
28737      /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */
28738      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28739         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ );
28740         goto decode_success;
28741      }
28742      break;
28743
28744   case 0x28:
28745      /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
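      /* Multiplies the sign-extended low (even-numbered) 32-bit lane
         of each 64-bit lane pair, producing full 64-bit products. */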
28746      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28747         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
28748                    uses_vvvv, vbi, pfx, delta,
28749                    "vpmuldq", math_PMULDQ_128 );
28750         goto decode_success;
28751      }
28752      /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */
28753      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28754         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28755                    uses_vvvv, vbi, pfx, delta,
28756                    "vpmuldq", math_PMULDQ_256 );
28757         goto decode_success;
28758      }
28759      break;
28760
28761   case 0x29:
28762      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
28763      /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
28764      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28765         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28766                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
28767         goto decode_success;
28768      }
28769      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
28770      /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */
28771      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28772         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28773                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 );
28774         goto decode_success;
28775      }
28776      break;
28777
28778   case 0x2A:
28779      /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
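      /* The non-temporal hint has no IR equivalent, so this is
         translated as an ordinary load; only the architecturally
         required 16- (resp. 32-) byte alignment check is retained. */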
28780      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28781          && !epartIsReg(getUChar(delta))) {
28782         UChar  modrm = getUChar(delta);
28783         UInt   rD    = gregOfRexRM(pfx, modrm);
28784         IRTemp tD    = newTemp(Ity_V128);
28785         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28786         delta += alen;
28787         gen_SEGV_if_not_16_aligned(addr);
28788         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
28789         DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
28790         putYMMRegLoAndZU(rD, mkexpr(tD));
28791         goto decode_success;
28792      }
28793      /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */
28794      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28795          && !epartIsReg(getUChar(delta))) {
28796         UChar  modrm = getUChar(delta);
28797         UInt   rD    = gregOfRexRM(pfx, modrm);
28798         IRTemp tD    = newTemp(Ity_V256);
28799         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28800         delta += alen;
28801         gen_SEGV_if_not_32_aligned(addr);
28802         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
28803         DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD));
28804         putYMMReg(rD, mkexpr(tD));
28805         goto decode_success;
28806      }
28807      break;
28808
28809   case 0x2B:
28810      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
28811      /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
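      /* Narrows signed 32-bit lanes to unsigned 16-bit lanes with
         saturation: negative inputs become 0 and inputs above 65535
         become 65535. */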
28812      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28813         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
28814                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
28815                    Iop_QNarrowBin32Sto16Ux8, NULL,
28816                    False/*!invertLeftArg*/, True/*swapArgs*/ );
28817         goto decode_success;
28818      }
28819      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
28820      /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */
28821      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28822         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
28823                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
28824                    math_VPACKUSDW_YMM );
28825         goto decode_success;
28826      }
28827      break;
28828
28829   case 0x2C:
28830      /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2C /r */
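      /* For all VMASKMOV forms the MSB of each mask lane in the
         middle operand enables that lane: disabled lanes read as
         zero on loads, and stores to them are suppressed. */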
28831      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28832          && 0==getRexW(pfx)/*W0*/
28833          && !epartIsReg(getUChar(delta))) {
28834         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
28835                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
28836         goto decode_success;
28837      }
28838      /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2C /r */
28839      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28840          && 0==getRexW(pfx)/*W0*/
28841          && !epartIsReg(getUChar(delta))) {
28842         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
28843                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
28844         goto decode_success;
28845      }
28846      break;
28847
28848   case 0x2D:
28849      /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 2D /r */
28850      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28851          && 0==getRexW(pfx)/*W0*/
28852          && !epartIsReg(getUChar(delta))) {
28853         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
28854                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
28855         goto decode_success;
28856      }
28857      /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 2D /r */
28858      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28859          && 0==getRexW(pfx)/*W0*/
28860          && !epartIsReg(getUChar(delta))) {
28861         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
28862                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
28863         goto decode_success;
28864      }
28865      break;
28866
28867   case 0x2E:
28868      /* VMASKMOVPS xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2E /r */
28869      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28870          && 0==getRexW(pfx)/*W0*/
28871          && !epartIsReg(getUChar(delta))) {
28872         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
28873                               /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
28874         goto decode_success;
28875      }
28876      /* VMASKMOVPS ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2E /r */
28877      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28878          && 0==getRexW(pfx)/*W0*/
28879          && !epartIsReg(getUChar(delta))) {
28880         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
28881                               /*isYMM*/True, Ity_I32, /*!isLoad*/False );
28882         goto decode_success;
28883      }
28884      break;
28885
28886   case 0x2F:
28887      /* VMASKMOVPD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 2F /r */
28888      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28889          && 0==getRexW(pfx)/*W0*/
28890          && !epartIsReg(getUChar(delta))) {
28891         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
28892                               /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
28893         goto decode_success;
28894      }
28895      /* VMASKMOVPD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 2F /r */
28896      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28897          && 0==getRexW(pfx)/*W0*/
28898          && !epartIsReg(getUChar(delta))) {
28899         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
28900                               /*isYMM*/True, Ity_I64, /*!isLoad*/False );
28901         goto decode_success;
28902      }
28903      break;
28904
28905   case 0x30:
28906      /* VPMOVZXBW xmm2/m64, xmm1 */
28907      /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
28908      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28909         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
28910                                   True/*isAvx*/, True/*xIsZ*/ );
28911         goto decode_success;
28912      }
28913      /* VPMOVZXBW xmm2/m128, ymm1 */
28914      /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */
28915      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28916         delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ );
28917         goto decode_success;
28918      }
28919      break;
28920
28921   case 0x31:
28922      /* VPMOVZXBD xmm2/m32, xmm1 */
28923      /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
28924      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28925         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
28926                                   True/*isAvx*/, True/*xIsZ*/ );
28927         goto decode_success;
28928      }
28929      /* VPMOVZXBD xmm2/m64, ymm1 */
28930      /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */
28931      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28932         delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ );
28933         goto decode_success;
28934      }
28935      break;
28936
28937   case 0x32:
28938      /* VPMOVZXBQ xmm2/m16, xmm1 */
28939      /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
28940      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28941         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
28942         goto decode_success;
28943      }
28944      /* VPMOVZXBQ xmm2/m32, ymm1 */
28945      /* VPMOVZXBQ = VEX.256.66.0F38.WIG 32 /r */
28946      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28947         delta = dis_PMOVZXBQ_256( vbi, pfx, delta );
28948         goto decode_success;
28949      }
28950      break;
28951
28952   case 0x33:
28953      /* VPMOVZXWD xmm2/m64, xmm1 */
28954      /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
28955      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28956         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
28957                                   True/*isAvx*/, True/*xIsZ*/ );
28958         goto decode_success;
28959      }
28960      /* VPMOVZXWD xmm2/m128, ymm1 */
28961      /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */
28962      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28963         delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ );
28964         goto decode_success;
28965      }
28966      break;
28967
28968   case 0x34:
28969      /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
28970      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28971         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
28972         goto decode_success;
28973      }
28974      /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */
28975      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28976         delta = dis_PMOVZXWQ_256( vbi, pfx, delta );
28977         goto decode_success;
28978      }
28979      break;
28980
28981   case 0x35:
28982      /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
28983      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28984         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
28985                                   True/*isAvx*/, True/*xIsZ*/ );
28986         goto decode_success;
28987      }
28988      /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */
28989      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28990         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ );
28991         goto decode_success;
28992      }
28993      break;
28994
28995   case 0x36:
28996      /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */
28997      if (have66noF2noF3(pfx)
28998          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
28999         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
29000                    uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD );
29001         goto decode_success;
29002      }
29003      break;
29004
29005   case 0x37:
29006      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
29007      /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
29008      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29009         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29010                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
29011         goto decode_success;
29012      }
29013      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
29014      /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */
29015      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29016         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29017                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 );
29018         goto decode_success;
29019      }
29020      break;
29021
29022   case 0x38:
29023      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
29024      /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
29025      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29026         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29027                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
29028         goto decode_success;
29029      }
29030      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
29031      /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */
29032      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29033         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29034                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 );
29035         goto decode_success;
29036      }
29037      break;
29038
29039   case 0x39:
29040      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
29041      /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
29042      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29043         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29044                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
29045         goto decode_success;
29046      }
29047      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
29048      /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */
29049      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29050         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29051                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 );
29052         goto decode_success;
29053      }
29054      break;
29055
29056   case 0x3A:
29057      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
29058      /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
29059      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29060         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29061                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
29062         goto decode_success;
29063      }
29064      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
29065      /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */
29066      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29067         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29068                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 );
29069         goto decode_success;
29070      }
29071      break;
29072
29073   case 0x3B:
29074      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
29075      /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
29076      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29077         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29078                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
29079         goto decode_success;
29080      }
29081      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
29082      /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */
29083      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29084         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29085                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 );
29086         goto decode_success;
29087      }
29088      break;
29089
29090   case 0x3C:
29091      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
29092      /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
29093      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29094         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29095                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
29096         goto decode_success;
29097      }
29098      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
29099      /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */
29100      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29101         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29102                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 );
29103         goto decode_success;
29104      }
29105      break;
29106
29107   case 0x3D:
29108      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
29109      /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
29110      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29111         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29112                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
29113         goto decode_success;
29114      }
29115      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
29116      /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */
29117      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29118         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29119                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 );
29120         goto decode_success;
29121      }
29122      break;
29123
29124   case 0x3E:
29125      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
29126      /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
29127      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29128         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29129                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
29130         goto decode_success;
29131      }
29132      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
29133      /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */
29134      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29135         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29136                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 );
29137         goto decode_success;
29138      }
29139      break;
29140
29141   case 0x3F:
29142      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
29143      /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
29144      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29145         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29146                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
29147         goto decode_success;
29148      }
29149      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
29150      /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */
29151      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29152         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29153                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 );
29154         goto decode_success;
29155      }
29156      break;
29157
29158   case 0x40:
29159      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
29160      /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
29161      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29162         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
29163                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
29164         goto decode_success;
29165      }
29166      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
29167      /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */
29168      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29169         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
29170                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 );
29171         goto decode_success;
29172      }
29173      break;
29174
29175   case 0x41:
29176      /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
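      /* Finds the minimum unsigned 16-bit lane; the result has that
         minimum in lane 0, its index in lane 1, and zeroes in lanes
         2..7. */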
29177      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29178         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
29179         goto decode_success;
29180      }
29181      break;
29182
29183   case 0x45:
29184      /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */
29185      /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */
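      /* Per-lane variable shifts: each lane of the second source is
         shifted by the count in the corresponding lane of the r/m
         operand.  Counts >= the lane width give 0 for these logical
         shifts; VPSRAVD (case 0x46) fills with the sign bit. */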
29186      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
29187         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd",
29188                                         Iop_Shr32, 1==getVexL(pfx) );
29189         *uses_vvvv = True;
29190         goto decode_success;
29191      }
29192      /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */
29193      /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */
29194      if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
29195         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq",
29196                                         Iop_Shr64, 1==getVexL(pfx) );
29197         *uses_vvvv = True;
29198         goto decode_success;
29199      }
29200      break;
29201
29202   case 0x46:
29203      /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */
29204      /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */
29205      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
29206         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd",
29207                                         Iop_Sar32, 1==getVexL(pfx) );
29208         *uses_vvvv = True;
29209         goto decode_success;
29210      }
29211      break;
29212
29213   case 0x47:
29214      /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */
29215      /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */
29216      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
29217         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd",
29218                                         Iop_Shl32, 1==getVexL(pfx) );
29219         *uses_vvvv = True;
29220         goto decode_success;
29221      }
29222      /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */
29223      /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */
29224      if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
29225         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq",
29226                                         Iop_Shl64, 1==getVexL(pfx) );
29227         *uses_vvvv = True;
29228         goto decode_success;
29229      }
29230      break;
29231
29232   case 0x58:
29233      /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */
29234      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29235          && 0==getRexW(pfx)/*W0*/) {
29236         UChar modrm = getUChar(delta);
29237         UInt  rG    = gregOfRexRM(pfx, modrm);
29238         IRTemp t32 = newTemp(Ity_I32);
29239         if (epartIsReg(modrm)) {
29240            UInt rE = eregOfRexRM(pfx, modrm);
29241            delta++;
29242            DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
29243            assign(t32, getXMMRegLane32(rE, 0));
29244         } else {
29245            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29246            delta += alen;
29247            DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG));
29248            assign(t32, loadLE(Ity_I32, mkexpr(addr)));
29249         }
29250         IRTemp t64 = newTemp(Ity_I64);
29251         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29252         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
29253         putYMMRegLoAndZU(rG, res);
29254         goto decode_success;
29255      }
29256      /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */
29257      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29258          && 0==getRexW(pfx)/*W0*/) {
29259         UChar modrm = getUChar(delta);
29260         UInt  rG    = gregOfRexRM(pfx, modrm);
29261         IRTemp t32 = newTemp(Ity_I32);
29262         if (epartIsReg(modrm)) {
29263            UInt rE = eregOfRexRM(pfx, modrm);
29264            delta++;
29265            DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
29266            assign(t32, getXMMRegLane32(rE, 0));
29267         } else {
29268            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29269            delta += alen;
29270            DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG));
29271            assign(t32, loadLE(Ity_I32, mkexpr(addr)));
29272         }
29273         IRTemp t64 = newTemp(Ity_I64);
29274         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29275         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
29276                                                  mkexpr(t64), mkexpr(t64));
29277         putYMMReg(rG, res);
29278         goto decode_success;
29279      }
29280      break;
29281
29282   case 0x59:
29283      /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */
29284      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29285          && 0==getRexW(pfx)/*W0*/) {
29286         UChar modrm = getUChar(delta);
29287         UInt  rG    = gregOfRexRM(pfx, modrm);
29288         IRTemp t64 = newTemp(Ity_I64);
29289         if (epartIsReg(modrm)) {
29290            UInt rE = eregOfRexRM(pfx, modrm);
29291            delta++;
29292            DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
29293            assign(t64, getXMMRegLane64(rE, 0));
29294         } else {
29295            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29296            delta += alen;
29297            DIP("vpbroadcastq %s,%s\n", dis_buf, nameXMMReg(rG));
29298            assign(t64, loadLE(Ity_I64, mkexpr(addr)));
29299         }
29300         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
29301         putYMMRegLoAndZU(rG, res);
29302         goto decode_success;
29303      }
29304      /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */
29305      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29306          && 0==getRexW(pfx)/*W0*/) {
29307         UChar modrm = getUChar(delta);
29308         UInt  rG    = gregOfRexRM(pfx, modrm);
29309         IRTemp t64 = newTemp(Ity_I64);
29310         if (epartIsReg(modrm)) {
29311            UInt rE = eregOfRexRM(pfx, modrm);
29312            delta++;
29313            DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
29314            assign(t64, getXMMRegLane64(rE, 0));
29315         } else {
29316            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29317            delta += alen;
29318            DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG));
29319            assign(t64, loadLE(Ity_I64, mkexpr(addr)));
29320         }
29321         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
29322                                                  mkexpr(t64), mkexpr(t64));
29323         putYMMReg(rG, res);
29324         goto decode_success;
29325      }
29326      break;
29327
29328   case 0x5A:
29329      /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */
29330      if (have66noF2noF3(pfx)
29331          && 1==getVexL(pfx)/*256*/
29332          && !epartIsReg(getUChar(delta))) {
29333         UChar modrm = getUChar(delta);
29334         UInt  rG    = gregOfRexRM(pfx, modrm);
29335         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29336         delta += alen;
29337         DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG));
29338         IRTemp t128 = newTemp(Ity_V128);
29339         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
29340         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
29341         goto decode_success;
29342      }
29343      break;
29344
29345   case 0x78:
29346      /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */
29347      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29348          && 0==getRexW(pfx)/*W0*/) {
29349         UChar modrm = getUChar(delta);
29350         UInt  rG    = gregOfRexRM(pfx, modrm);
29351         IRTemp t8   = newTemp(Ity_I8);
29352         if (epartIsReg(modrm)) {
29353            UInt rE = eregOfRexRM(pfx, modrm);
29354            delta++;
29355            DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
29356            assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
29357         } else {
29358            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29359            delta += alen;
29360            DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG));
29361            assign(t8, loadLE(Ity_I8, mkexpr(addr)));
29362         }
29363         IRTemp t16 = newTemp(Ity_I16);
29364         assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
29365         IRTemp t32 = newTemp(Ity_I32);
29366         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
29367         IRTemp t64 = newTemp(Ity_I64);
29368         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29369         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
29370         putYMMRegLoAndZU(rG, res);
29371         goto decode_success;
29372      }
29373      /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */
29374      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29375          && 0==getRexW(pfx)/*W0*/) {
29376         UChar modrm = getUChar(delta);
29377         UInt  rG    = gregOfRexRM(pfx, modrm);
29378         IRTemp t8   = newTemp(Ity_I8);
29379         if (epartIsReg(modrm)) {
29380            UInt rE = eregOfRexRM(pfx, modrm);
29381            delta++;
29382            DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
29383            assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
29384         } else {
29385            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29386            delta += alen;
29387            DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG));
29388            assign(t8, loadLE(Ity_I8, mkexpr(addr)));
29389         }
29390         IRTemp t16 = newTemp(Ity_I16);
29391         assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
29392         IRTemp t32 = newTemp(Ity_I32);
29393         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
29394         IRTemp t64 = newTemp(Ity_I64);
29395         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29396         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
29397                                                  mkexpr(t64), mkexpr(t64));
29398         putYMMReg(rG, res);
29399         goto decode_success;
29400      }
29401      break;
29402
29403   case 0x79:
29404      /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */
29405      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29406          && 0==getRexW(pfx)/*W0*/) {
29407         UChar modrm = getUChar(delta);
29408         UInt  rG    = gregOfRexRM(pfx, modrm);
29409         IRTemp t16  = newTemp(Ity_I16);
29410         if (epartIsReg(modrm)) {
29411            UInt rE = eregOfRexRM(pfx, modrm);
29412            delta++;
29413            DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
29414            assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
29415         } else {
29416            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29417            delta += alen;
29418            DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG));
29419            assign(t16, loadLE(Ity_I16, mkexpr(addr)));
29420         }
29421         IRTemp t32 = newTemp(Ity_I32);
29422         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
29423         IRTemp t64 = newTemp(Ity_I64);
29424         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29425         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
29426         putYMMRegLoAndZU(rG, res);
29427         goto decode_success;
29428      }
29429      /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */
29430      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29431          && 0==getRexW(pfx)/*W0*/) {
29432         UChar modrm = getUChar(delta);
29433         UInt  rG    = gregOfRexRM(pfx, modrm);
29434         IRTemp t16  = newTemp(Ity_I16);
29435         if (epartIsReg(modrm)) {
29436            UInt rE = eregOfRexRM(pfx, modrm);
29437            delta++;
29438            DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
29439            assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
29440         } else {
29441            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
29442            delta += alen;
29443            DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG));
29444            assign(t16, loadLE(Ity_I16, mkexpr(addr)));
29445         }
29446         IRTemp t32 = newTemp(Ity_I32);
29447         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
29448         IRTemp t64 = newTemp(Ity_I64);
29449         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
29450         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
29451                                                  mkexpr(t64), mkexpr(t64));
29452         putYMMReg(rG, res);
29453         goto decode_success;
29454      }
29455      break;
29456
29457   case 0x8C:
29458      /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
29459      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29460          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29461         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
29462                               /*!isYMM*/False, Ity_I32, /*isLoad*/True );
29463         goto decode_success;
29464      }
29465      /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
29466      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29467          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29468         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
29469                               /*isYMM*/True, Ity_I32, /*isLoad*/True );
29470         goto decode_success;
29471      }
29472      /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
29473      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29474          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29475         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
29476                               /*!isYMM*/False, Ity_I64, /*isLoad*/True );
29477         goto decode_success;
29478      }
29479      /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
29480      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29481          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29482         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
29483                               /*isYMM*/True, Ity_I64, /*isLoad*/True );
29484         goto decode_success;
29485      }
29486      break;
29487
29488   case 0x8E:
29489      /* VPMASKMOVD xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W0 8E /r */
29490      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29491          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29492         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
29493                               /*!isYMM*/False, Ity_I32, /*!isLoad*/False );
29494         goto decode_success;
29495      }
29496      /* VPMASKMOVD ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W0 8E /r */
29497      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29498          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29499         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
29500                               /*isYMM*/True, Ity_I32, /*!isLoad*/False );
29501         goto decode_success;
29502      }
29503      /* VPMASKMOVQ xmm1, xmm2, m128 = VEX.NDS.128.66.0F38.W1 8E /r */
29504      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29505          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29506         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
29507                               /*!isYMM*/False, Ity_I64, /*!isLoad*/False );
29508         goto decode_success;
29509      }
29510      /* VPMASKMOVQ ymm1, ymm2, m256 = VEX.NDS.256.66.0F38.W1 8E /r */
29511      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29512          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29513         delta = dis_VMASKMOV( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
29514                               /*isYMM*/True, Ity_I64, /*!isLoad*/False );
29515         goto decode_success;
29516      }
29517      break;
29518
29519   case 0x90:
29520      /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */
29521      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29522          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29523         Long delta0 = delta;
29524         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
29525                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
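         /* dis_VGATHER leaves delta unchanged if it fails to decode, so
            a changed delta signals a successful decode. */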
29526         if (delta != delta0)
29527            goto decode_success;
29528      }
29529      /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */
29530      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29531          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29532         Long delta0 = delta;
29533         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
29534                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
29535         if (delta != delta0)
29536            goto decode_success;
29537      }
29538      /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */
29539      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29540          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29541         Long delta0 = delta;
29542         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
29543                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
29544         if (delta != delta0)
29545            goto decode_success;
29546      }
      /* VPGATHERDQ ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */
29548      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29549          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29550         Long delta0 = delta;
29551         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
29552                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
29553         if (delta != delta0)
29554            goto decode_success;
29555      }
29556      break;
29557
29558   case 0x91:
29559      /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */
29560      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29561          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29562         Long delta0 = delta;
29563         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
29564                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
29565         if (delta != delta0)
29566            goto decode_success;
29567      }
29568      /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */
29569      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29570          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29571         Long delta0 = delta;
29572         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
29573                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
29574         if (delta != delta0)
29575            goto decode_success;
29576      }
29577      /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */
29578      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29579          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29580         Long delta0 = delta;
29581         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
29582                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
29583         if (delta != delta0)
29584            goto decode_success;
29585      }
29586      /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */
29587      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29588          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29589         Long delta0 = delta;
29590         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
29591                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
29592         if (delta != delta0)
29593            goto decode_success;
29594      }
29595      break;
29596
29597   case 0x92:
29598      /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */
29599      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29600          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29601         Long delta0 = delta;
29602         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
29603                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
29604         if (delta != delta0)
29605            goto decode_success;
29606      }
29607      /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */
29608      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29609          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29610         Long delta0 = delta;
29611         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
29612                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
29613         if (delta != delta0)
29614            goto decode_success;
29615      }
29616      /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */
29617      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29618          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29619         Long delta0 = delta;
29620         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
29621                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
29622         if (delta != delta0)
29623            goto decode_success;
29624      }
      /* VGATHERDPD ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */
29626      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29627          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29628         Long delta0 = delta;
29629         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
29630                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
29631         if (delta != delta0)
29632            goto decode_success;
29633      }
29634      break;
29635
29636   case 0x93:
29637      /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */
29638      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29639          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29640         Long delta0 = delta;
29641         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
29642                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
29643         if (delta != delta0)
29644            goto decode_success;
29645      }
29646      /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */
29647      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29648          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
29649         Long delta0 = delta;
29650         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
29651                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
29652         if (delta != delta0)
29653            goto decode_success;
29654      }
29655      /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */
29656      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
29657          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29658         Long delta0 = delta;
29659         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
29660                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
29661         if (delta != delta0)
29662            goto decode_success;
29663      }
29664      /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */
29665      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29666          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
29667         Long delta0 = delta;
29668         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
29669                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
29670         if (delta != delta0)
29671            goto decode_success;
29672      }
29673      break;
29674
29675   case 0x96 ... 0x9F:
29676   case 0xA6 ... 0xAF:
29677   case 0xB6 ... 0xBF:
29678      /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */
29679      /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */
29680      /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */
29681      /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */
29682      /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */
29683      /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */
29684      /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */
29685      /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */
29686      /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */
29687      /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */
29688      /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */
29689      /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */
29690      /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */
29691      /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */
29692      /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */
29693      /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */
29694      /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */
29695      /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */
29696      /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */
29697      /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */
29698      /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */
29699      /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */
29700      /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */
29701      /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */
29702      /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */
29703      /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */
29704      /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */
29705      /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */
29706      /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */
29707      /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */
29708      /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */
29709      /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */
29710      /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */
29711      /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */
29712      /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */
29713      /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */
29714      /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */
29715      /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */
29716      /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */
29717      /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */
29718      /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */
29719      /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */
29720      /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */
29721      /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */
29722      /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */
29723      /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */
29724      /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */
29725      /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA /r */
29726      /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */
29727      /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */
29728      /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */
29729      /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */
29730      /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */
29731      /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */
29732      /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */
29733      /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */
29734      /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */
29735      /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */
29736      /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */
29737      /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */
29738      /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */
29739      /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */
29740      /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */
29741      /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */
29742      /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */
29743      /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */
29744      /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */
29745      /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */
29746      /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */
29747      /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */
29748      /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */
29749      /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */
29750      /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */
29751      /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */
29752      /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */
29753      /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */
29754      /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */
29755      /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */
29756      /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */
29757      /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */
29758      /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */
29759      /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */
29760      /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */
29761      /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */
29762      /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */
29763      /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */
29764      /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */
29765      /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */
29766      /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */
29767      /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */
29768      /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */
29769      /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */
29770      /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */
29771      /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */
29772      /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */
29773      /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */
29774      if (have66noF2noF3(pfx)) {
29775         delta = dis_FMA( vbi, pfx, delta, opc );
29776         *uses_vvvv = True;
29777         dres->hint = Dis_HintVerbose;
29778         goto decode_success;
29779      }
29780      break;
29781
29782   case 0xDB:
29783   case 0xDC:
29784   case 0xDD:
29785   case 0xDE:
29786   case 0xDF:
29787      /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
29788      /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
29789      /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
29790      /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
29791      /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
29792      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
29794         if (opc != 0xDB) *uses_vvvv = True;
29795         goto decode_success;
29796      }
29797      break;
29798
29799   case 0xF2:
29800      /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */
29801      /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */
29802      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29803         Int     size = getRexW(pfx) ? 8 : 4;
29804         IRType  ty   = szToITy(size);
29805         IRTemp  dst  = newTemp(ty);
29806         IRTemp  src1 = newTemp(ty);
29807         IRTemp  src2 = newTemp(ty);
29808         UChar   rm   = getUChar(delta);
29809
29810         assign( src1, getIRegV(size,pfx) );
29811         if (epartIsReg(rm)) {
29812            assign( src2, getIRegE(size,pfx,rm) );
29813            DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm),
29814                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
29815            delta++;
29816         } else {
29817            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29818            assign( src2, loadLE(ty, mkexpr(addr)) );
29819            DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
29820                nameIRegG(size,pfx,rm));
29821            delta += alen;
29822         }
29823
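         /* ANDN computes dst = ~src1 & src2, i.e. src2 with the bits of
            src1 cleared; e.g. (32-bit) src1=0x0000FFFF, src2=0x12345678
            gives 0x12340000. */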
29824         assign( dst, binop( mkSizedOp(ty,Iop_And8),
29825                             unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ),
29826                             mkexpr(src2) ) );
29827         putIRegG( size, pfx, rm, mkexpr(dst) );
29828         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29829                                               ? AMD64G_CC_OP_ANDN64
29830                                               : AMD64G_CC_OP_ANDN32)) );
29831         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29832         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
29833         *uses_vvvv = True;
29834         goto decode_success;
29835      }
29836      break;
29837
29838   case 0xF3:
29839      /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */
29840      /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */
29841      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
29842          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) {
29843         Int     size = getRexW(pfx) ? 8 : 4;
29844         IRType  ty   = szToITy(size);
29845         IRTemp  src  = newTemp(ty);
29846         IRTemp  dst  = newTemp(ty);
29847         UChar   rm   = getUChar(delta);
29848
29849         if (epartIsReg(rm)) {
29850            assign( src, getIRegE(size,pfx,rm) );
29851            DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm),
29852                nameIRegV(size,pfx));
29853            delta++;
29854         } else {
29855            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29856            assign( src, loadLE(ty, mkexpr(addr)) );
29857            DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx));
29858            delta += alen;
29859         }
29860
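         /* BLSI isolates the lowest set bit: dst = (0 - src) & src;
            e.g. src=0b10100 gives dst=0b00100, and src=0 gives 0. */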
29861         assign( dst, binop(mkSizedOp(ty,Iop_And8),
29862                            binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0),
29863                                  mkexpr(src)), mkexpr(src)) );
29864         putIRegV( size, pfx, mkexpr(dst) );
29865         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29866                                               ? AMD64G_CC_OP_BLSI64
29867                                               : AMD64G_CC_OP_BLSI32)) );
29868         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29869         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
29870         *uses_vvvv = True;
29871         goto decode_success;
29872      }
29873      /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */
29874      /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */
29875      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
29876          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) {
29877         Int     size = getRexW(pfx) ? 8 : 4;
29878         IRType  ty   = szToITy(size);
29879         IRTemp  src  = newTemp(ty);
29880         IRTemp  dst  = newTemp(ty);
29881         UChar   rm   = getUChar(delta);
29882
29883         if (epartIsReg(rm)) {
29884            assign( src, getIRegE(size,pfx,rm) );
29885            DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm),
29886                nameIRegV(size,pfx));
29887            delta++;
29888         } else {
29889            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29890            assign( src, loadLE(ty, mkexpr(addr)) );
29891            DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx));
29892            delta += alen;
29893         }
29894
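         /* BLSMSK builds a mask up to and including the lowest set bit:
            dst = (src - 1) ^ src; e.g. src=0b10100 gives dst=0b00111. */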
29895         assign( dst, binop(mkSizedOp(ty,Iop_Xor8),
29896                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
29897                                  mkU(ty, 1)), mkexpr(src)) );
29898         putIRegV( size, pfx, mkexpr(dst) );
29899         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29900                                               ? AMD64G_CC_OP_BLSMSK64
29901                                               : AMD64G_CC_OP_BLSMSK32)) );
29902         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29903         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
29904         *uses_vvvv = True;
29905         goto decode_success;
29906      }
29907      /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */
29908      /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */
29909      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
29910          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) {
29911         Int     size = getRexW(pfx) ? 8 : 4;
29912         IRType  ty   = szToITy(size);
29913         IRTemp  src  = newTemp(ty);
29914         IRTemp  dst  = newTemp(ty);
29915         UChar   rm   = getUChar(delta);
29916
29917         if (epartIsReg(rm)) {
29918            assign( src, getIRegE(size,pfx,rm) );
29919            DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm),
29920                nameIRegV(size,pfx));
29921            delta++;
29922         } else {
29923            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29924            assign( src, loadLE(ty, mkexpr(addr)) );
29925            DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx));
29926            delta += alen;
29927         }
29928
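         /* BLSR clears the lowest set bit: dst = (src - 1) & src;
            e.g. src=0b10100 gives dst=0b10000. */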
29929         assign( dst, binop(mkSizedOp(ty,Iop_And8),
29930                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
29931                                  mkU(ty, 1)), mkexpr(src)) );
29932         putIRegV( size, pfx, mkexpr(dst) );
29933         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29934                                               ? AMD64G_CC_OP_BLSR64
29935                                               : AMD64G_CC_OP_BLSR32)) );
29936         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29937         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
29938         *uses_vvvv = True;
29939         goto decode_success;
29940      }
29941      break;
29942
29943   case 0xF5:
29944      /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */
29945      /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */
29946      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29947         Int     size  = getRexW(pfx) ? 8 : 4;
29948         IRType  ty    = szToITy(size);
29949         IRTemp  dst   = newTemp(ty);
29950         IRTemp  src1  = newTemp(ty);
29951         IRTemp  src2  = newTemp(ty);
29952         IRTemp  start = newTemp(Ity_I8);
29953         IRTemp  cond  = newTemp(Ity_I1);
29954         UChar   rm    = getUChar(delta);
29955
29956         assign( src2, getIRegV(size,pfx) );
29957         if (epartIsReg(rm)) {
29958            assign( src1, getIRegE(size,pfx,rm) );
29959            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx),
29960                nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
29961            delta++;
29962         } else {
29963            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29964            assign( src1, loadLE(ty, mkexpr(addr)) );
29965            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
29966                nameIRegG(size,pfx,rm));
29967            delta += alen;
29968         }
29969
29970         assign( start, narrowTo( Ity_I8, mkexpr(src2) ) );
29971         assign( cond, binop(Iop_CmpLT32U,
29972                             unop(Iop_8Uto32, mkexpr(start)),
29973                             mkU32(8*size)) );
29974         /* if (start < opsize) {
29975               if (start == 0)
29976                  dst = 0;
29977               else
29978                  dst = (src1 << (opsize-start)) u>> (opsize-start);
29979            } else {
29980               dst = src1;
29981            } */
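         /* Worked example (32-bit): start=8, src1=0xDEADBEEF gives
            dst = (src1 << 24) u>> 24 = 0x000000EF. */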
29982         assign( dst,
29983                 IRExpr_ITE(
29984                    mkexpr(cond),
29985                    IRExpr_ITE(
29986                       binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)),
29987                       mkU(ty, 0),
29988                       binop(
29989                          mkSizedOp(ty,Iop_Shr8),
29990                          binop(
29991                             mkSizedOp(ty,Iop_Shl8),
29992                             mkexpr(src1),
29993                             binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
29994                          ),
29995                          binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
29996                       )
29997                    ),
29998                    mkexpr(src1)
29999                 )
30000               );
30001         putIRegG( size, pfx, rm, mkexpr(dst) );
30002         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
30003                                               ? AMD64G_CC_OP_BLSR64
30004                                               : AMD64G_CC_OP_BLSR32)) );
30005         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
30006         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) );
30007         *uses_vvvv = True;
30008         goto decode_success;
30009      }
30010      /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */
30011      /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */
30012      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30013         Int     size = getRexW(pfx) ? 8 : 4;
30014         IRType  ty   = szToITy(size);
30015         IRTemp  src  = newTemp(ty);
30016         IRTemp  mask = newTemp(ty);
30017         UChar   rm   = getUChar(delta);
30018
30019         assign( src, getIRegV(size,pfx) );
30020         if (epartIsReg(rm)) {
30021            assign( mask, getIRegE(size,pfx,rm) );
30022            DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm),
30023                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
30024            delta++;
30025         } else {
30026            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30027            assign( mask, loadLE(ty, mkexpr(addr)) );
30028            DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
30029                nameIRegG(size,pfx,rm));
30030            delta += alen;
30031         }
30032
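         /* PDEP scatters the low-order bits of src into the set-bit
            positions of mask, working upwards from the least significant
            set bit; e.g. (32-bit) src=0b1011, mask=0b11010 gives 0b01010.
            The computation is done out of line by a helper. */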
30033         IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)),
30034                                        widenUto64(mkexpr(mask)) );
30035         putIRegG( size, pfx, rm,
30036                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
30037                                              "amd64g_calculate_pdep",
30038                                              &amd64g_calculate_pdep, args)) );
30039         *uses_vvvv = True;
30040         /* Flags aren't modified.  */
30041         goto decode_success;
30042      }
30043      /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */
30044      /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */
30045      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30046         Int     size = getRexW(pfx) ? 8 : 4;
30047         IRType  ty   = szToITy(size);
30048         IRTemp  src  = newTemp(ty);
30049         IRTemp  mask = newTemp(ty);
30050         UChar   rm   = getUChar(delta);
30051
30052         assign( src, getIRegV(size,pfx) );
30053         if (epartIsReg(rm)) {
30054            assign( mask, getIRegE(size,pfx,rm) );
30055            DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm),
30056                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
30057            delta++;
30058         } else {
30059            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30060            assign( mask, loadLE(ty, mkexpr(addr)) );
30061            DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
30062                nameIRegG(size,pfx,rm));
30063            delta += alen;
30064         }
30065
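         /* PEXT is the inverse of PDEP: it gathers the bits of src found
            at the set positions of mask and packs them at the low end;
            e.g. (32-bit) src=0b01010, mask=0b11010 gives 0b011. */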
         /* First mask off the bits of src that are not set in mask; they
            are ignored, and it should be fine if they contain undefined
            values.  */
30068         IRExpr* masked = binop(mkSizedOp(ty,Iop_And8),
30069                                mkexpr(src), mkexpr(mask));
30070         IRExpr** args = mkIRExprVec_2( widenUto64(masked),
30071                                        widenUto64(mkexpr(mask)) );
30072         putIRegG( size, pfx, rm,
30073                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
30074                                              "amd64g_calculate_pext",
30075                                              &amd64g_calculate_pext, args)) );
30076         *uses_vvvv = True;
30077         /* Flags aren't modified.  */
30078         goto decode_success;
30079      }
30080      break;
30081
30082   case 0xF6:
30083      /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */
30084      /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */
30085      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30086         Int     size = getRexW(pfx) ? 8 : 4;
30087         IRType  ty   = szToITy(size);
30088         IRTemp  src1 = newTemp(ty);
30089         IRTemp  src2 = newTemp(ty);
30090         IRTemp  res  = newTemp(size == 8 ? Ity_I128 : Ity_I64);
30091         UChar   rm   = getUChar(delta);
30092
30093         assign( src1, getIRegRDX(size) );
30094         if (epartIsReg(rm)) {
30095            assign( src2, getIRegE(size,pfx,rm) );
30096            DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm),
30097                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
30098            delta++;
30099         } else {
30100            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30101            assign( src2, loadLE(ty, mkexpr(addr)) );
30102            DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
30103                nameIRegG(size,pfx,rm));
30104            delta += alen;
30105         }
30106
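         /* Widening unsigned multiply: RDX (or EDX) * src2.  The low
            half of the product goes to the vvvv register and the high
            half to the ModRM.reg register.  Unlike MUL, the flags are
            left untouched. */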
30107         assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32,
30108                            mkexpr(src1), mkexpr(src2)) );
30109         putIRegV( size, pfx,
30110                   unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) );
30111         putIRegG( size, pfx, rm,
30112                   unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32,
30113                        mkexpr(res)) );
30114         *uses_vvvv = True;
30115         /* Flags aren't modified.  */
30116         goto decode_success;
30117      }
30118      break;
30119
30120   case 0xF7:
30121      /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */
30122      /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */
30123      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30124         delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 );
30125         goto decode_success;
30126      }
30127      /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */
30128      /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */
30129      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30130         delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 );
30131         goto decode_success;
30132      }
30133      /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */
30134      /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */
30135      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30136         delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 );
30137         goto decode_success;
30138      }
30139      /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */
30140      /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */
30141      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
30142         Int     size  = getRexW(pfx) ? 8 : 4;
30143         IRType  ty    = szToITy(size);
30144         IRTemp  dst   = newTemp(ty);
30145         IRTemp  src1  = newTemp(ty);
30146         IRTemp  src2  = newTemp(ty);
30147         IRTemp  stle  = newTemp(Ity_I16);
30148         IRTemp  start = newTemp(Ity_I8);
30149         IRTemp  len   = newTemp(Ity_I8);
30150         UChar   rm    = getUChar(delta);
30151
30152         assign( src2, getIRegV(size,pfx) );
30153         if (epartIsReg(rm)) {
30154            assign( src1, getIRegE(size,pfx,rm) );
30155            DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx),
30156                nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
30157            delta++;
30158         } else {
30159            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
30160            assign( src1, loadLE(ty, mkexpr(addr)) );
30161            DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
30162                nameIRegG(size,pfx,rm));
30163            delta += alen;
30164         }
30165
30166         assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) );
30167         assign( start, unop( Iop_16to8, mkexpr(stle) ) );
30168         assign( len, unop( Iop_16HIto8, mkexpr(stle) ) );
30169         /* if (start+len < opsize) {
30170               if (len != 0)
30171                  dst = (src1 << (opsize-start-len)) u>> (opsize-len);
30172               else
30173                  dst = 0;
30174            } else {
30175               if (start < opsize)
30176                  dst = src1 u>> start;
30177               else
30178                  dst = 0;
30179            } */
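         /* Worked example (32-bit): start=4, len=8, src1=0x12345678
            gives dst = (src1 << 20) u>> 24 = 0x00000067. */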
30180         assign( dst,
30181                 IRExpr_ITE(
30182                    binop(Iop_CmpLT32U,
30183                          binop(Iop_Add32,
30184                                unop(Iop_8Uto32, mkexpr(start)),
30185                                unop(Iop_8Uto32, mkexpr(len))),
30186                          mkU32(8*size)),
30187                    IRExpr_ITE(
30188                       binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)),
30189                       mkU(ty, 0),
30190                       binop(mkSizedOp(ty,Iop_Shr8),
30191                             binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1),
30192                                   binop(Iop_Sub8,
30193                                         binop(Iop_Sub8, mkU8(8*size),
30194                                               mkexpr(start)),
30195                                         mkexpr(len))),
30196                             binop(Iop_Sub8, mkU8(8*size),
30197                                   mkexpr(len)))
30198                    ),
30199                    IRExpr_ITE(
30200                       binop(Iop_CmpLT32U,
30201                             unop(Iop_8Uto32, mkexpr(start)),
30202                             mkU32(8*size)),
30203                       binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1),
30204                             mkexpr(start)),
30205                       mkU(ty, 0)
30206                    )
30207                 )
30208               );
30209         putIRegG( size, pfx, rm, mkexpr(dst) );
30210         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
30211                                               ? AMD64G_CC_OP_ANDN64
30212                                               : AMD64G_CC_OP_ANDN32)) );
30213         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
30214         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
30215         *uses_vvvv = True;
30216         goto decode_success;
30217      }
30218      break;
30219
30220   default:
30221      break;
30222
30223   }
30224
30225  //decode_failure:
30226   return deltaIN;
30227
30228  decode_success:
30229   return delta;
30230}
30231
30232/* operand format:
30233 * [0] = dst
30234 * [n] = srcn
30235 */
30236static Long decode_vregW(Int count, Long delta, UChar modrm, Prefix pfx,
30237                         const VexAbiInfo* vbi, IRTemp *v, UInt *dst, Int swap)
30238{
30239   v[0] = newTemp(Ity_V128);
30240   v[1] = newTemp(Ity_V128);
30241   v[2] = newTemp(Ity_V128);
30242   v[3] = newTemp(Ity_V128);
30243   IRTemp addr = IRTemp_INVALID;
30244   Int    alen = 0;
30245   HChar  dis_buf[50];
30246
30247   *dst = gregOfRexRM(pfx, modrm);
30248   assign( v[0], getXMMReg(*dst) );
30249
30250   if ( epartIsReg( modrm ) ) {
30251      UInt ereg = eregOfRexRM(pfx, modrm);
30252      assign(swap ? v[count-1] : v[count-2], getXMMReg(ereg) );
30253      DIS(dis_buf, "%s", nameXMMReg(ereg));
30254   } else {
30255      Bool extra_byte = (getUChar(delta - 3) & 0xF) != 9;
      addr = disAMode(&alen, vbi, pfx, delta, dis_buf, extra_byte);
30257      assign(swap ? v[count-1] : v[count-2], loadLE(Ity_V128, mkexpr(addr)));
30258      delta += alen - 1;
30259   }
30260
30261   UInt vvvv = getVexNvvvv(pfx);
30262   switch(count) {
30263      case 2:
30264         DIP( "%s,%s", nameXMMReg(*dst), dis_buf );
30265         break;
30266      case 3:
30267         assign( swap ? v[1] : v[2], getXMMReg(vvvv) );
30268         DIP( "%s,%s,%s", nameXMMReg(*dst), nameXMMReg(vvvv), dis_buf );
30269         break;
30270      case 4:
30271         {
30272            assign( v[1], getXMMReg(vvvv) );
30273            UInt src2 = getUChar(delta + 1) >> 4;
30274            assign( swap ? v[2] : v[3], getXMMReg(src2) );
30275            DIP( "%s,%s,%s,%s", nameXMMReg(*dst), nameXMMReg(vvvv),
30276                                nameXMMReg(src2), dis_buf );
30277         }
30278         break;
30279   }
30280   return delta + 1;
30281}
30282
30283static Long dis_FMA4 (Prefix pfx, Long delta, UChar opc,
30284                      Bool* uses_vvvv, const VexAbiInfo* vbi )
30285{
30286   UInt dst;
30287   *uses_vvvv = True;
30288
30289   UChar  modrm   = getUChar(delta);
30290
30291   Bool zero_64F = False;
30292   Bool zero_96F = False;
30293   UInt is_F32   = ((opc & 0x01) == 0x00) ? 1 : 0;
30294   Bool neg      = (opc & 0xF0) == 0x70;
30295   Bool alt      = (opc & 0xF0) == 0x50;
30296   Bool sub      = alt ? (opc & 0x0E) != 0x0E : (opc & 0x0C) == 0x0C;
30297
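   /* FMA4 opcode map, as decoded here: bit 0 selects single (0) vs
      double (1) precision lanes; high nibble 0x7 marks the negated
      (VFNM*) forms and 0x5 the alternating VFMADDSUB/VFMSUBADD forms;
      low nibbles 0xA/0xE (SS) and 0xB/0xF (SD) are the scalar variants,
      which zero the unused upper parts of the destination. */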
30298   IRTemp operand[4];
30299   switch(opc & 0xF) {
30300      case 0x0A: zero_96F = (opc >> 4) != 0x05; break;
30301      case 0x0B: zero_64F = (opc >> 4) != 0x05; break;
30302      case 0x0E: zero_96F = (opc >> 4) != 0x05; break;
30303      case 0x0F: zero_64F = (opc >> 4) != 0x05; break;
30304      default: break;
30305   }
30306   DIP("vfm%s",                  neg ?   "n" : "");
30307   if(alt) DIP("%s",             sub ? "add" : "sub");
30308   DIP("%s",                     sub ? "sub" : "add");
30309   DIP("%c ", (zero_64F || zero_96F) ?   's' : 'p');
30310   DIP("%c ",                is_F32  ?   's' : 'd');
30311   delta = decode_vregW(4, delta, modrm, pfx, vbi, operand, &dst, getRexW(pfx));
30312   DIP("\n");
30313   IRExpr *src[3];
30314
30315   void (*putXMM[2])(UInt,Int,IRExpr*) = {&putXMMRegLane64F, &putXMMRegLane32F};
30316
30317   IROp size_op[] = {Iop_V128to64, Iop_V128HIto64, Iop_64to32, Iop_64HIto32};
30318   IROp neg_op[]  = {Iop_NegF64, Iop_NegF32};
30319   int i, j;
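   /* i ranges over the result lanes (2 doubles or 4 floats) and j over
      the three sources; each lane becomes a fused multiply-add in IR,
      with negations applied as required by the opcode. */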
30320   for(i = 0; i < is_F32 * 2 + 2; i++) {
30321      for(j = 0; j < 3; j++) {
30322         if(is_F32) {
30323            src[j] = unop(Iop_ReinterpI32asF32,
30324                        unop(size_op[i%2+2],
30325                           unop(size_op[i/2],
30326                                 mkexpr(operand[j + 1])
30327                              )
30328                           ));
30329         } else {
30330            src[j] = unop(Iop_ReinterpI64asF64,
30331                        unop(size_op[i%2],
30332                           mkexpr(operand[j + 1])
30333                        ));
30334         }
30335      }
30336      putXMM[is_F32](dst, i, IRExpr_Qop(is_F32 ? Iop_MAddF32 : Iop_MAddF64,
30337                                             get_FAKE_roundingmode(),
30338                                             neg ? unop(neg_op[is_F32], src[0])
30339                                                 : src[0],
30340                                             src[1],
30341                                             sub ? unop(neg_op[is_F32], src[2])
30342                                                 : src[2]
30343                                          ));
30344      if(alt) {
30345         sub = !sub;
30346      }
30347   }
30348
30349   /* Zero out top bits of ymm/xmm register. */
30350   putYMMRegLane128( dst, 1, mkV128(0) );
30351
30352   if(zero_64F || zero_96F) {
30353      putXMMRegLane64( dst, 1, IRExpr_Const(IRConst_U64(0)));
30354   }
30355
30356   if(zero_96F) {
30357      putXMMRegLane32( dst, 1, IRExpr_Const(IRConst_U32(0)));
30358   }
30359
30360   return delta+1;
30361}
30362
30363/*------------------------------------------------------------*/
30364/*---                                                      ---*/
30365/*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX    ---*/
30366/*---                                                      ---*/
30367/*------------------------------------------------------------*/
30368
30369static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
30370{
30371   vassert(imm8 < 256);
30372   IRTemp s3, s2, s1, s0;
30373   s3 = s2 = s1 = s0 = IRTemp_INVALID;
30374   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
30375#  define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
30376                                    : ((_nn)==2) ? s2 : s3)
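   /* Each 2-bit field of imm8 selects one source lane per result lane;
      e.g. imm8 == 0x1B (0b00011011) reverses the four 32-bit lanes. */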
30377   IRTemp res = newTemp(Ity_V128);
30378   assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
30379                              SEL((imm8 >> 4) & 3),
30380                              SEL((imm8 >> 2) & 3),
30381                              SEL((imm8 >> 0) & 3) ));
30382#  undef SEL
30383   return res;
30384}
30385
30386__attribute__((noinline))
30387static
30388Long dis_ESC_0F3A__VEX (
30389        /*MB_OUT*/DisResult* dres,
30390        /*OUT*/   Bool*      uses_vvvv,
30391        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
30392        Bool         resteerCisOk,
30393        void*        callback_opaque,
30394        const VexArchInfo* archinfo,
30395        const VexAbiInfo*  vbi,
30396        Prefix pfx, Int sz, Long deltaIN
30397     )
30398{
30399   IRTemp addr  = IRTemp_INVALID;
30400   Int    alen  = 0;
30401   HChar  dis_buf[50];
30402   Long   delta = deltaIN;
30403   UChar  opc   = getUChar(delta);
30404   delta++;
30405   *uses_vvvv = False;
30406
30407   switch (opc) {
30408
30409   case 0x00:
30410   case 0x01:
30411      /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */
30412      /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */
30413      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
30414          && 1==getRexW(pfx)/*W1*/) {
30415         UChar  modrm = getUChar(delta);
30416         UInt   imm8  = 0;
30417         UInt   rG    = gregOfRexRM(pfx, modrm);
30418         IRTemp sV    = newTemp(Ity_V256);
30419         const HChar *name  = opc == 0 ? "vpermq" : "vpermpd";
30420         if (epartIsReg(modrm)) {
30421            UInt rE = eregOfRexRM(pfx, modrm);
30422            delta += 1;
30423            imm8 = getUChar(delta);
30424            DIP("%s $%u,%s,%s\n",
30425                name, imm8, nameYMMReg(rE), nameYMMReg(rG));
30426            assign(sV, getYMMReg(rE));
30427         } else {
30428            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30429            delta += alen;
30430            imm8 = getUChar(delta);
30431            DIP("%s $%u,%s,%s\n",
30432                name, imm8, dis_buf, nameYMMReg(rG));
30433            assign(sV, loadLE(Ity_V256, mkexpr(addr)));
30434         }
30435         delta++;
30436         IRTemp s[4];
30437         s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
30438         breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]);
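         /* Each 2-bit field of imm8 selects one of the four source
            qwords for the corresponding destination qword; e.g.
            imm8 == 0x1B reverses the qword order. */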
30439         IRTemp dV = newTemp(Ity_V256);
30440         assign(dV, IRExpr_Qop(Iop_64x4toV256,
30441                               mkexpr(s[(imm8 >> 6) & 3]),
30442                               mkexpr(s[(imm8 >> 4) & 3]),
30443                               mkexpr(s[(imm8 >> 2) & 3]),
30444                               mkexpr(s[(imm8 >> 0) & 3])));
30445         putYMMReg(rG, mkexpr(dV));
30446         goto decode_success;
30447      }
30448      break;
30449
30450   case 0x02:
30451      /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */
30452      if (have66noF2noF3(pfx)
30453          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
30454         UChar  modrm = getUChar(delta);
30455         UInt   imm8  = 0;
30456         UInt   rG    = gregOfRexRM(pfx, modrm);
30457         UInt   rV    = getVexNvvvv(pfx);
30458         IRTemp sV    = newTemp(Ity_V128);
30459         IRTemp dV    = newTemp(Ity_V128);
30460         UInt   i;
30461         IRTemp s[4], d[4];
30462         assign(sV, getXMMReg(rV));
30463         if (epartIsReg(modrm)) {
30464            UInt rE = eregOfRexRM(pfx, modrm);
30465            delta += 1;
30466            imm8 = getUChar(delta);
30467            DIP("vpblendd $%u,%s,%s,%s\n",
30468                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
30469            assign(dV, getXMMReg(rE));
30470         } else {
30471            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30472            delta += alen;
30473            imm8 = getUChar(delta);
30474            DIP("vpblendd $%u,%s,%s,%s\n",
30475                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
30476            assign(dV, loadLE(Ity_V128, mkexpr(addr)));
30477         }
30478         delta++;
30479         for (i = 0; i < 4; i++) {
30480            s[i] = IRTemp_INVALID;
30481            d[i] = IRTemp_INVALID;
30482         }
30483         breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] );
30484         breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] );
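         /* Bit i of imm8 selects lane i from the xmm3/m128 operand (d)
            rather than from vvvv (s). */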
30485         for (i = 0; i < 4; i++)
30486            putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
30487         putYMMRegLane128(rG, 1, mkV128(0));
30488         *uses_vvvv = True;
30489         goto decode_success;
30490      }
30491      /* VPBLENDD imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 02 /r ib */
30492      if (have66noF2noF3(pfx)
30493          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30494         UChar  modrm = getUChar(delta);
30495         UInt   imm8  = 0;
30496         UInt   rG    = gregOfRexRM(pfx, modrm);
30497         UInt   rV    = getVexNvvvv(pfx);
30498         IRTemp sV    = newTemp(Ity_V256);
30499         IRTemp dV    = newTemp(Ity_V256);
30500         UInt   i;
30501         IRTemp s[8], d[8];
30502         assign(sV, getYMMReg(rV));
30503         if (epartIsReg(modrm)) {
30504            UInt rE = eregOfRexRM(pfx, modrm);
30505            delta += 1;
30506            imm8 = getUChar(delta);
30507            DIP("vpblendd $%u,%s,%s,%s\n",
30508                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30509            assign(dV, getYMMReg(rE));
30510         } else {
30511            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30512            delta += alen;
30513            imm8 = getUChar(delta);
30514            DIP("vpblendd $%u,%s,%s,%s\n",
30515                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30516            assign(dV, loadLE(Ity_V256, mkexpr(addr)));
30517         }
30518         delta++;
30519         for (i = 0; i < 8; i++) {
30520            s[i] = IRTemp_INVALID;
30521            d[i] = IRTemp_INVALID;
30522         }
30523         breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
30524                               &s[3], &s[2], &s[1], &s[0] );
30525         breakupV256to32s( dV, &d[7], &d[6], &d[5], &d[4],
30526                               &d[3], &d[2], &d[1], &d[0] );
30527         for (i = 0; i < 8; i++)
30528            putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
30529         *uses_vvvv = True;
30530         goto decode_success;
30531      }
30532      break;
30533
30534   case 0x04:
30535      /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
30536      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30537         UChar  modrm = getUChar(delta);
30538         UInt   imm8  = 0;
30539         UInt   rG    = gregOfRexRM(pfx, modrm);
30540         IRTemp sV    = newTemp(Ity_V256);
30541         if (epartIsReg(modrm)) {
30542            UInt rE = eregOfRexRM(pfx, modrm);
30543            delta += 1;
30544            imm8 = getUChar(delta);
30545            DIP("vpermilps $%u,%s,%s\n",
30546                imm8, nameYMMReg(rE), nameYMMReg(rG));
30547            assign(sV, getYMMReg(rE));
30548         } else {
30549            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30550            delta += alen;
30551            imm8 = getUChar(delta);
30552            DIP("vpermilps $%u,%s,%s\n",
30553                imm8, dis_buf, nameYMMReg(rG));
30554            assign(sV, loadLE(Ity_V256, mkexpr(addr)));
30555         }
30556         delta++;
30557         IRTemp  sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
30558         breakupV256toV128s( sV, &sVhi, &sVlo );
30559         IRTemp  dVhi = math_VPERMILPS_128( sVhi, imm8 );
30560         IRTemp  dVlo = math_VPERMILPS_128( sVlo, imm8 );
30561         IRExpr* res  = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
30562         putYMMReg(rG, res);
30563         goto decode_success;
30564      }
30565      /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
30566      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30567         UChar  modrm = getUChar(delta);
30568         UInt   imm8  = 0;
30569         UInt   rG    = gregOfRexRM(pfx, modrm);
30570         IRTemp sV    = newTemp(Ity_V128);
30571         if (epartIsReg(modrm)) {
30572            UInt rE = eregOfRexRM(pfx, modrm);
30573            delta += 1;
30574            imm8 = getUChar(delta);
30575            DIP("vpermilps $%u,%s,%s\n",
30576                imm8, nameXMMReg(rE), nameXMMReg(rG));
30577            assign(sV, getXMMReg(rE));
30578         } else {
30579            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30580            delta += alen;
30581            imm8 = getUChar(delta);
30582            DIP("vpermilps $%u,%s,%s\n",
30583                imm8, dis_buf, nameXMMReg(rG));
30584            assign(sV, loadLE(Ity_V128, mkexpr(addr)));
30585         }
30586         delta++;
30587         putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
30588         goto decode_success;
30589      }
30590      break;
30591
30592   case 0x05:
30593      /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
30594      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30595         UChar  modrm = getUChar(delta);
30596         UInt   imm8  = 0;
30597         UInt   rG    = gregOfRexRM(pfx, modrm);
30598         IRTemp sV    = newTemp(Ity_V128);
30599         if (epartIsReg(modrm)) {
30600            UInt rE = eregOfRexRM(pfx, modrm);
30601            delta += 1;
30602            imm8 = getUChar(delta);
30603            DIP("vpermilpd $%u,%s,%s\n",
30604                imm8, nameXMMReg(rE), nameXMMReg(rG));
30605            assign(sV, getXMMReg(rE));
30606         } else {
30607            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30608            delta += alen;
30609            imm8 = getUChar(delta);
30610            DIP("vpermilpd $%u,%s,%s\n",
30611                imm8, dis_buf, nameXMMReg(rG));
30612            assign(sV, loadLE(Ity_V128, mkexpr(addr)));
30613         }
30614         delta++;
30615         IRTemp s1 = newTemp(Ity_I64);
30616         IRTemp s0 = newTemp(Ity_I64);
30617         assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
30618         assign(s0, unop(Iop_V128to64,   mkexpr(sV)));
30619         IRTemp dV = newTemp(Ity_V128);
30620         assign(dV, binop(Iop_64HLtoV128,
30621                               mkexpr((imm8 & (1<<1)) ? s1 : s0),
30622                               mkexpr((imm8 & (1<<0)) ? s1 : s0)));
30623         putYMMRegLoAndZU(rG, mkexpr(dV));
30624         goto decode_success;
30625      }
30626      /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
30627      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30628         UChar  modrm = getUChar(delta);
30629         UInt   imm8  = 0;
30630         UInt   rG    = gregOfRexRM(pfx, modrm);
30631         IRTemp sV    = newTemp(Ity_V256);
30632         if (epartIsReg(modrm)) {
30633            UInt rE = eregOfRexRM(pfx, modrm);
30634            delta += 1;
30635            imm8 = getUChar(delta);
30636            DIP("vpermilpd $%u,%s,%s\n",
30637                imm8, nameYMMReg(rE), nameYMMReg(rG));
30638            assign(sV, getYMMReg(rE));
30639         } else {
30640            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30641            delta += alen;
30642            imm8 = getUChar(delta);
30643            DIP("vpermilpd $%u,%s,%s\n",
30644                imm8, dis_buf, nameYMMReg(rG));
30645            assign(sV, loadLE(Ity_V256, mkexpr(addr)));
30646         }
30647         delta++;
30648         IRTemp s3, s2, s1, s0;
30649         s3 = s2 = s1 = s0 = IRTemp_INVALID;
30650         breakupV256to64s(sV, &s3, &s2, &s1, &s0);
30651         IRTemp dV = newTemp(Ity_V256);
30652         assign(dV, IRExpr_Qop(Iop_64x4toV256,
30653                               mkexpr((imm8 & (1<<3)) ? s3 : s2),
30654                               mkexpr((imm8 & (1<<2)) ? s3 : s2),
30655                               mkexpr((imm8 & (1<<1)) ? s1 : s0),
30656                               mkexpr((imm8 & (1<<0)) ? s1 : s0)));
30657         putYMMReg(rG, mkexpr(dV));
30658         goto decode_success;
30659      }
30660      break;
30661
30662   case 0x06:
      /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 06 /r ib */
30664      if (have66noF2noF3(pfx)
30665          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30666         UChar  modrm = getUChar(delta);
30667         UInt   imm8  = 0;
30668         UInt   rG    = gregOfRexRM(pfx, modrm);
30669         UInt   rV    = getVexNvvvv(pfx);
30670         IRTemp s00   = newTemp(Ity_V128);
30671         IRTemp s01   = newTemp(Ity_V128);
30672         IRTemp s10   = newTemp(Ity_V128);
30673         IRTemp s11   = newTemp(Ity_V128);
30674         assign(s00, getYMMRegLane128(rV, 0));
30675         assign(s01, getYMMRegLane128(rV, 1));
30676         if (epartIsReg(modrm)) {
30677            UInt rE = eregOfRexRM(pfx, modrm);
30678            delta += 1;
30679            imm8 = getUChar(delta);
30680            DIP("vperm2f128 $%u,%s,%s,%s\n",
30681                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30682            assign(s10, getYMMRegLane128(rE, 0));
30683            assign(s11, getYMMRegLane128(rE, 1));
30684         } else {
30685            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30686            delta += alen;
30687            imm8 = getUChar(delta);
30688            DIP("vperm2f128 $%u,%s,%s,%s\n",
30689                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30690            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
30691                                               mkexpr(addr), mkU64(0))));
30692            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
30693                                               mkexpr(addr), mkU64(16))));
30694         }
30695         delta++;
30696#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
30697                                           : ((_nn)==2) ? s10 : s11)
30698         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
30699         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
30700#        undef SEL
30701         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
30702         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
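         /* Worked example (illustrative): imm8 == 0x30 gives
            rG.lane0 = rV.lane0 and rG.lane1 = rE.lane1, while
            imm8 == 0x81 gives rG.lane0 = rV.lane1 and, since bit 7
            is set, zeroes rG.lane1. */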
30703         *uses_vvvv = True;
30704         goto decode_success;
30705      }
30706      break;
30707
30708   case 0x08:
30709      /* VROUNDPS imm8, xmm2/m128, xmm1 */
30710      /* VROUNDPS = VEX.NDS.128.66.0F3A.WIG 08 ib */
30711      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30712         UChar  modrm = getUChar(delta);
30713         UInt   rG    = gregOfRexRM(pfx, modrm);
30714         IRTemp src   = newTemp(Ity_V128);
30715         IRTemp s0    = IRTemp_INVALID;
30716         IRTemp s1    = IRTemp_INVALID;
30717         IRTemp s2    = IRTemp_INVALID;
30718         IRTemp s3    = IRTemp_INVALID;
30719         IRTemp rm    = newTemp(Ity_I32);
30720         Int    imm   = 0;
30721
30724         if (epartIsReg(modrm)) {
30725            UInt rE = eregOfRexRM(pfx, modrm);
30726            assign( src, getXMMReg( rE ) );
30727            imm = getUChar(delta+1);
30728            if (imm & ~15) break;
30729            delta += 1+1;
30730            DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
30731         } else {
30732            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30733            assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
30734            imm = getUChar(delta+alen);
30735            if (imm & ~15) break;
30736            delta += alen+1;
30737            DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
30738         }
30739
30740         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
30741            that encoding is the same as the encoding for IRRoundingMode,
30742            we can use that value directly in the IR as a rounding
30743            mode. */
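         /* For reference, a sketch of that shared encoding (assuming
            the IRRoundingMode values defined in libvex_ir.h):
               imm[1:0] == 0  ->  to nearest, even   (Irrm_NEAREST)
               imm[1:0] == 1  ->  towards -infinity  (Irrm_NegINF)
               imm[1:0] == 2  ->  towards +infinity  (Irrm_PosINF)
               imm[1:0] == 3  ->  towards zero       (Irrm_ZERO)
            Bit 2 set means "use MXCSR.RC instead", hence the
            get_sse_roundingmode() arm below. */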
30744         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
30745
30746         breakupV128to32s( src, &s3, &s2, &s1, &s0 );
30747         putYMMRegLane128( rG, 1, mkV128(0) );
30748#        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
30749                             unop(Iop_ReinterpI32asF32, mkexpr(s)))
30750         putYMMRegLane32F( rG, 3, CVT(s3) );
30751         putYMMRegLane32F( rG, 2, CVT(s2) );
30752         putYMMRegLane32F( rG, 1, CVT(s1) );
30753         putYMMRegLane32F( rG, 0, CVT(s0) );
30754#        undef CVT
30755         goto decode_success;
30756      }
30757      /* VROUNDPS imm8, ymm2/m256, ymm1 */
30758      /* VROUNDPS = VEX.NDS.256.66.0F3A.WIG 08 ib */
30759      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30760         UChar  modrm = getUChar(delta);
30761         UInt   rG    = gregOfRexRM(pfx, modrm);
30762         IRTemp src   = newTemp(Ity_V256);
30763         IRTemp s0    = IRTemp_INVALID;
30764         IRTemp s1    = IRTemp_INVALID;
30765         IRTemp s2    = IRTemp_INVALID;
30766         IRTemp s3    = IRTemp_INVALID;
30767         IRTemp s4    = IRTemp_INVALID;
30768         IRTemp s5    = IRTemp_INVALID;
30769         IRTemp s6    = IRTemp_INVALID;
30770         IRTemp s7    = IRTemp_INVALID;
30771         IRTemp rm    = newTemp(Ity_I32);
30772         Int    imm   = 0;
30773
30776         if (epartIsReg(modrm)) {
30777            UInt rE = eregOfRexRM(pfx, modrm);
30778            assign( src, getYMMReg( rE ) );
30779            imm = getUChar(delta+1);
30780            if (imm & ~15) break;
30781            delta += 1+1;
30782            DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
30783         } else {
30784            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30785            assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
30786            imm = getUChar(delta+alen);
30787            if (imm & ~15) break;
30788            delta += alen+1;
30789            DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
30790         }
30791
30792         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
30793            that encoding is the same as the encoding for IRRoundingMode,
30794            we can use that value directly in the IR as a rounding
30795            mode. */
30796         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
30797
30798         breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
30799#        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
30800                             unop(Iop_ReinterpI32asF32, mkexpr(s)))
30801         putYMMRegLane32F( rG, 7, CVT(s7) );
30802         putYMMRegLane32F( rG, 6, CVT(s6) );
30803         putYMMRegLane32F( rG, 5, CVT(s5) );
30804         putYMMRegLane32F( rG, 4, CVT(s4) );
30805         putYMMRegLane32F( rG, 3, CVT(s3) );
30806         putYMMRegLane32F( rG, 2, CVT(s2) );
30807         putYMMRegLane32F( rG, 1, CVT(s1) );
30808         putYMMRegLane32F( rG, 0, CVT(s0) );
30809#        undef CVT
30810         goto decode_success;
      }
      break;
30812
30813   case 0x09:
30814      /* VROUNDPD imm8, xmm2/m128, xmm1 */
30815      /* VROUNDPD = VEX.NDS.128.66.0F3A.WIG 09 ib */
30816      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30817         UChar  modrm = getUChar(delta);
30818         UInt   rG    = gregOfRexRM(pfx, modrm);
30819         IRTemp src   = newTemp(Ity_V128);
30820         IRTemp s0    = IRTemp_INVALID;
30821         IRTemp s1    = IRTemp_INVALID;
30822         IRTemp rm    = newTemp(Ity_I32);
30823         Int    imm   = 0;
30824
30827         if (epartIsReg(modrm)) {
30828            UInt rE = eregOfRexRM(pfx, modrm);
30829            assign( src, getXMMReg( rE ) );
30830            imm = getUChar(delta+1);
30831            if (imm & ~15) break;
30832            delta += 1+1;
30833            DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
30834         } else {
30835            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30836            assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
30837            imm = getUChar(delta+alen);
30838            if (imm & ~15) break;
30839            delta += alen+1;
30840            DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
30841         }
30842
30843         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
30844            that encoding is the same as the encoding for IRRoundingMode,
30845            we can use that value directly in the IR as a rounding
30846            mode. */
30847         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
30848
30849         breakupV128to64s( src, &s1, &s0 );
30850         putYMMRegLane128( rG, 1, mkV128(0) );
30851#        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
30852                             unop(Iop_ReinterpI64asF64, mkexpr(s)))
30853         putYMMRegLane64F( rG, 1, CVT(s1) );
30854         putYMMRegLane64F( rG, 0, CVT(s0) );
30855#        undef CVT
30856         goto decode_success;
30857      }
30858      /* VROUNDPD imm8, ymm2/m256, ymm1 */
30859      /* VROUNDPD = VEX.NDS.256.66.0F3A.WIG 09 ib */
30860      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30861         UChar  modrm = getUChar(delta);
30862         UInt   rG    = gregOfRexRM(pfx, modrm);
30863         IRTemp src   = newTemp(Ity_V256);
30864         IRTemp s0    = IRTemp_INVALID;
30865         IRTemp s1    = IRTemp_INVALID;
30866         IRTemp s2    = IRTemp_INVALID;
30867         IRTemp s3    = IRTemp_INVALID;
30868         IRTemp rm    = newTemp(Ity_I32);
30869         Int    imm   = 0;
30870
30873         if (epartIsReg(modrm)) {
30874            UInt rE = eregOfRexRM(pfx, modrm);
30875            assign( src, getYMMReg( rE ) );
30876            imm = getUChar(delta+1);
30877            if (imm & ~15) break;
30878            delta += 1+1;
30879            DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
30880         } else {
30881            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30882            assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
30883            imm = getUChar(delta+alen);
30884            if (imm & ~15) break;
30885            delta += alen+1;
            DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
30887         }
30888
30889         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
30890            that encoding is the same as the encoding for IRRoundingMode,
30891            we can use that value directly in the IR as a rounding
30892            mode. */
30893         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
30894
30895         breakupV256to64s( src, &s3, &s2, &s1, &s0 );
30896#        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
30897                             unop(Iop_ReinterpI64asF64, mkexpr(s)))
30898         putYMMRegLane64F( rG, 3, CVT(s3) );
30899         putYMMRegLane64F( rG, 2, CVT(s2) );
30900         putYMMRegLane64F( rG, 1, CVT(s1) );
30901         putYMMRegLane64F( rG, 0, CVT(s0) );
30902#        undef CVT
30903         goto decode_success;
      }
      break;
30905
30906   case 0x0A:
30907   case 0x0B:
30908      /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
30909      /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
30910      /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
30911      /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
30912      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30913         UChar  modrm = getUChar(delta);
30914         UInt   rG    = gregOfRexRM(pfx, modrm);
30915         UInt   rV    = getVexNvvvv(pfx);
30916         Bool   isD   = opc == 0x0B;
30917         IRTemp src   = newTemp(isD ? Ity_F64 : Ity_F32);
30918         IRTemp res   = newTemp(isD ? Ity_F64 : Ity_F32);
30919         Int    imm   = 0;
30920
30921         if (epartIsReg(modrm)) {
30922            UInt rE = eregOfRexRM(pfx, modrm);
30923            assign( src,
30924                    isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
30925            imm = getUChar(delta+1);
30926            if (imm & ~15) break;
30927            delta += 1+1;
30928            DIP( "vrounds%c $%d,%s,%s,%s\n",
30929                 isD ? 'd' : 's',
30930                 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
30931         } else {
30932            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30933            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
30934            imm = getUChar(delta+alen);
30935            if (imm & ~15) break;
30936            delta += alen+1;
30937            DIP( "vrounds%c $%d,%s,%s,%s\n",
30938                 isD ? 'd' : 's',
30939                 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
30940         }
30941
30942         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
30943            that encoding is the same as the encoding for IRRoundingMode,
30944            we can use that value directly in the IR as a rounding
30945            mode. */
30946         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
30947                           (imm & 4) ? get_sse_roundingmode()
30948                                     : mkU32(imm & 3),
30949                           mkexpr(src)) );
30950
30951         if (isD)
30952            putXMMRegLane64F( rG, 0, mkexpr(res) );
30953         else {
30954            putXMMRegLane32F( rG, 0, mkexpr(res) );
30955            putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
30956         }
30957         putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
30958         putYMMRegLane128( rG, 1, mkV128(0) );
30959         *uses_vvvv = True;
30960         goto decode_success;
30961      }
30962      break;
30963
30964   case 0x0C:
30965      /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
30966      /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
30967      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30968         UChar  modrm = getUChar(delta);
30969         UInt   imm8;
30970         UInt   rG    = gregOfRexRM(pfx, modrm);
30971         UInt   rV    = getVexNvvvv(pfx);
30972         IRTemp sV    = newTemp(Ity_V256);
30973         IRTemp sE    = newTemp(Ity_V256);
30974         assign ( sV, getYMMReg(rV) );
30975         if (epartIsReg(modrm)) {
30976            UInt rE = eregOfRexRM(pfx, modrm);
30977            delta += 1;
30978            imm8 = getUChar(delta);
30979            DIP("vblendps $%u,%s,%s,%s\n",
30980                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30981            assign(sE, getYMMReg(rE));
30982         } else {
30983            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30984            delta += alen;
30985            imm8 = getUChar(delta);
30986            DIP("vblendps $%u,%s,%s,%s\n",
30987                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30988            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
30989         }
30990         delta++;
30991         putYMMReg( rG,
30992                    mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
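         /* Per the BLENDPS semantics, imm8 bit i == 1 selects 32-bit
            lane i from the E operand (sE), else from the V operand
            (sV); the per-bit selection itself is assumed to be done
            inside math_BLENDPS_256. */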
30993         *uses_vvvv = True;
30994         goto decode_success;
30995      }
30996      /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
30997      /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
30998      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30999         UChar  modrm = getUChar(delta);
31000         UInt   imm8;
31001         UInt   rG    = gregOfRexRM(pfx, modrm);
31002         UInt   rV    = getVexNvvvv(pfx);
31003         IRTemp sV    = newTemp(Ity_V128);
31004         IRTemp sE    = newTemp(Ity_V128);
31005         assign ( sV, getXMMReg(rV) );
31006         if (epartIsReg(modrm)) {
31007            UInt rE = eregOfRexRM(pfx, modrm);
31008            delta += 1;
31009            imm8 = getUChar(delta);
31010            DIP("vblendps $%u,%s,%s,%s\n",
31011                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
31012            assign(sE, getXMMReg(rE));
31013         } else {
31014            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31015            delta += alen;
31016            imm8 = getUChar(delta);
31017            DIP("vblendps $%u,%s,%s,%s\n",
31018                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
31019            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
31020         }
31021         delta++;
31022         putYMMRegLoAndZU( rG,
31023                           mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
31024         *uses_vvvv = True;
31025         goto decode_success;
31026      }
31027      break;
31028
31029   case 0x0D:
31030      /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
31031      /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
31032      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31033         UChar  modrm = getUChar(delta);
31034         UInt   imm8;
31035         UInt   rG    = gregOfRexRM(pfx, modrm);
31036         UInt   rV    = getVexNvvvv(pfx);
31037         IRTemp sV    = newTemp(Ity_V256);
31038         IRTemp sE    = newTemp(Ity_V256);
31039         assign ( sV, getYMMReg(rV) );
31040         if (epartIsReg(modrm)) {
31041            UInt rE = eregOfRexRM(pfx, modrm);
31042            delta += 1;
31043            imm8 = getUChar(delta);
31044            DIP("vblendpd $%u,%s,%s,%s\n",
31045                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31046            assign(sE, getYMMReg(rE));
31047         } else {
31048            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31049            delta += alen;
31050            imm8 = getUChar(delta);
31051            DIP("vblendpd $%u,%s,%s,%s\n",
31052                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31053            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
31054         }
31055         delta++;
31056         putYMMReg( rG,
31057                    mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
31058         *uses_vvvv = True;
31059         goto decode_success;
31060      }
31061      /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
31062      /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
31063      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31064         UChar  modrm = getUChar(delta);
31065         UInt   imm8;
31066         UInt   rG    = gregOfRexRM(pfx, modrm);
31067         UInt   rV    = getVexNvvvv(pfx);
31068         IRTemp sV    = newTemp(Ity_V128);
31069         IRTemp sE    = newTemp(Ity_V128);
31070         assign ( sV, getXMMReg(rV) );
31071         if (epartIsReg(modrm)) {
31072            UInt rE = eregOfRexRM(pfx, modrm);
31073            delta += 1;
31074            imm8 = getUChar(delta);
31075            DIP("vblendpd $%u,%s,%s,%s\n",
31076                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
31077            assign(sE, getXMMReg(rE));
31078         } else {
31079            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31080            delta += alen;
31081            imm8 = getUChar(delta);
31082            DIP("vblendpd $%u,%s,%s,%s\n",
31083                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
31084            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
31085         }
31086         delta++;
31087         putYMMRegLoAndZU( rG,
31088                           mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
31089         *uses_vvvv = True;
31090         goto decode_success;
31091      }
31092      break;
31093
31094   case 0x0E:
31095      /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
31096      /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
31097      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31098         UChar  modrm = getUChar(delta);
31099         UInt   imm8;
31100         UInt   rG    = gregOfRexRM(pfx, modrm);
31101         UInt   rV    = getVexNvvvv(pfx);
31102         IRTemp sV    = newTemp(Ity_V128);
31103         IRTemp sE    = newTemp(Ity_V128);
31104         assign ( sV, getXMMReg(rV) );
31105         if (epartIsReg(modrm)) {
31106            UInt rE = eregOfRexRM(pfx, modrm);
31107            delta += 1;
31108            imm8 = getUChar(delta);
31109            DIP("vpblendw $%u,%s,%s,%s\n",
31110                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
31111            assign(sE, getXMMReg(rE));
31112         } else {
31113            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31114            delta += alen;
31115            imm8 = getUChar(delta);
31116            DIP("vpblendw $%u,%s,%s,%s\n",
31117                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
31118            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
31119         }
31120         delta++;
31121         putYMMRegLoAndZU( rG,
31122                           mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
31123         *uses_vvvv = True;
31124         goto decode_success;
31125      }
31126      /* VPBLENDW imm8, ymm3/m256, ymm2, ymm1 */
31127      /* VPBLENDW = VEX.NDS.256.66.0F3A.WIG 0E /r ib */
31128      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31129         UChar  modrm = getUChar(delta);
31130         UInt   imm8;
31131         UInt   rG    = gregOfRexRM(pfx, modrm);
31132         UInt   rV    = getVexNvvvv(pfx);
31133         IRTemp sV    = newTemp(Ity_V256);
31134         IRTemp sE    = newTemp(Ity_V256);
31135         IRTemp sVhi, sVlo, sEhi, sElo;
31136         sVhi = sVlo = sEhi = sElo = IRTemp_INVALID;
31137         assign ( sV, getYMMReg(rV) );
31138         if (epartIsReg(modrm)) {
31139            UInt rE = eregOfRexRM(pfx, modrm);
31140            delta += 1;
31141            imm8 = getUChar(delta);
31142            DIP("vpblendw $%u,%s,%s,%s\n",
31143                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31144            assign(sE, getYMMReg(rE));
31145         } else {
31146            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31147            delta += alen;
31148            imm8 = getUChar(delta);
31149            DIP("vpblendw $%u,%s,%s,%s\n",
31150                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31151            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
31152         }
31153         delta++;
31154         breakupV256toV128s( sV, &sVhi, &sVlo );
31155         breakupV256toV128s( sE, &sEhi, &sElo );
31156         putYMMReg( rG, binop( Iop_V128HLtoV256,
31157                               mkexpr( math_PBLENDW_128( sEhi, sVhi, imm8) ),
31158                               mkexpr( math_PBLENDW_128( sElo, sVlo, imm8) ) ) );
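         /* Per the AVX2 spec, the same eight imm8 bits control the
            word selects in both 128-bit halves, which is why imm8 is
            passed unmodified to both math_PBLENDW_128 calls. */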
31159         *uses_vvvv = True;
31160         goto decode_success;
31161      }
31162      break;
31163
31164   case 0x0F:
31165      /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
31166      /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
31167      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31168         UChar  modrm = getUChar(delta);
31169         UInt   rG    = gregOfRexRM(pfx, modrm);
31170         UInt   rV    = getVexNvvvv(pfx);
31171         IRTemp sV    = newTemp(Ity_V128);
31172         IRTemp dV    = newTemp(Ity_V128);
31173         UInt   imm8;
31174
31175         assign( dV, getXMMReg(rV) );
31176
31177         if ( epartIsReg( modrm ) ) {
31178            UInt   rE = eregOfRexRM(pfx, modrm);
31179            assign( sV, getXMMReg(rE) );
31180            imm8 = getUChar(delta+1);
31181            delta += 1+1;
31182            DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameXMMReg(rE),
31183                                           nameXMMReg(rV), nameXMMReg(rG));
31184         } else {
31185            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31186            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
31187            imm8 = getUChar(delta+alen);
31188            delta += alen+1;
31189            DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
31190                                           nameXMMReg(rV), nameXMMReg(rG));
31191         }
31192
31193         IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
31194         putYMMRegLoAndZU( rG, mkexpr(res) );
31195         *uses_vvvv = True;
31196         goto decode_success;
31197      }
31198      /* VPALIGNR imm8, ymm3/m256, ymm2, ymm1 */
31199      /* VPALIGNR = VEX.NDS.256.66.0F3A.WIG 0F /r ib */
31200      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31201         UChar  modrm = getUChar(delta);
31202         UInt   rG    = gregOfRexRM(pfx, modrm);
31203         UInt   rV    = getVexNvvvv(pfx);
31204         IRTemp sV    = newTemp(Ity_V256);
31205         IRTemp dV    = newTemp(Ity_V256);
31206         IRTemp sHi, sLo, dHi, dLo;
31207         sHi = sLo = dHi = dLo = IRTemp_INVALID;
31208         UInt   imm8;
31209
31210         assign( dV, getYMMReg(rV) );
31211
31212         if ( epartIsReg( modrm ) ) {
31213            UInt   rE = eregOfRexRM(pfx, modrm);
31214            assign( sV, getYMMReg(rE) );
31215            imm8 = getUChar(delta+1);
31216            delta += 1+1;
31217            DIP("vpalignr $%u,%s,%s,%s\n", imm8, nameYMMReg(rE),
31218                                           nameYMMReg(rV), nameYMMReg(rG));
31219         } else {
31220            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31221            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
31222            imm8 = getUChar(delta+alen);
31223            delta += alen+1;
31224            DIP("vpalignr $%u,%s,%s,%s\n", imm8, dis_buf,
31225                                           nameYMMReg(rV), nameYMMReg(rG));
31226         }
31227
31228         breakupV256toV128s( dV, &dHi, &dLo );
31229         breakupV256toV128s( sV, &sHi, &sLo );
31230         putYMMReg( rG, binop( Iop_V128HLtoV256,
31231                               mkexpr( math_PALIGNR_XMM( sHi, dHi, imm8 ) ),
31232                               mkexpr( math_PALIGNR_XMM( sLo, dLo, imm8 ) ) )
31233                    );
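         /* As with VPBLENDW, the 256-bit form is just two independent
            128-bit PALIGNRs, one per lane, sharing the same byte
            shift amount imm8. */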
31234         *uses_vvvv = True;
31235         goto decode_success;
31236      }
31237      break;
31238
31239   case 0x14:
31240      /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
31241      if (have66noF2noF3(pfx)
31242          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, True/*isAvx*/ );
31244         goto decode_success;
31245      }
31246      break;
31247
31248   case 0x15:
      /* VPEXTRW imm8, xmm2, reg/m16 */
31250      /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
31251      if (have66noF2noF3(pfx)
31252          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31253         delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
31254         goto decode_success;
31255      }
31256      break;
31257
31258   case 0x16:
      /* VPEXTRD imm8, xmm2, r32/m32 */
31260      /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
31261      if (have66noF2noF3(pfx)
31262          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31263         delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
31264         goto decode_success;
31265      }
31266      /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
31267      if (have66noF2noF3(pfx)
31268          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
31269         delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
31270         goto decode_success;
31271      }
31272      break;
31273
31274   case 0x17:
31275      /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
31276      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31277         delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
31278         goto decode_success;
31279      }
31280      break;
31281
31282   case 0x18:
31283      /* VINSERTF128 r/m, rV, rD
31284         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
31285      /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
31286      if (have66noF2noF3(pfx)
31287          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31288         UChar  modrm = getUChar(delta);
31289         UInt   ib    = 0;
31290         UInt   rG    = gregOfRexRM(pfx, modrm);
31291         UInt   rV    = getVexNvvvv(pfx);
31292         IRTemp t128  = newTemp(Ity_V128);
31293         if (epartIsReg(modrm)) {
31294            UInt rE = eregOfRexRM(pfx, modrm);
31295            delta += 1;
31296            assign(t128, getXMMReg(rE));
31297            ib = getUChar(delta);
31298            DIP("vinsertf128 $%u,%s,%s,%s\n",
31299                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31300         } else {
31301            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31302            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
31303            delta += alen;
31304            ib = getUChar(delta);
31305            DIP("vinsertf128 $%u,%s,%s,%s\n",
31306                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31307         }
31308         delta++;
31309         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
31310         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
31311         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
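         /* Only bit 0 of ib is significant: it names the 128-bit lane
            to be overwritten, the other lane having just been copied
            across from rV. */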
31312         *uses_vvvv = True;
31313         goto decode_success;
31314      }
31315      break;
31316
31317   case 0x19:
      /* VEXTRACTF128 $lane_no, rS, r/m
         ::: r/m:V128 = a lane of rS:V256 (RM format) */
      /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
31321      if (have66noF2noF3(pfx)
31322          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31323         UChar  modrm = getUChar(delta);
31324         UInt   ib    = 0;
31325         UInt   rS    = gregOfRexRM(pfx, modrm);
31326         IRTemp t128  = newTemp(Ity_V128);
31327         if (epartIsReg(modrm)) {
31328            UInt rD = eregOfRexRM(pfx, modrm);
31329            delta += 1;
31330            ib = getUChar(delta);
31331            assign(t128, getYMMRegLane128(rS, ib & 1));
31332            putYMMRegLoAndZU(rD, mkexpr(t128));
31333            DIP("vextractf128 $%u,%s,%s\n",
                ib, nameYMMReg(rS), nameXMMReg(rD));
31335         } else {
31336            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31337            delta += alen;
31338            ib = getUChar(delta);
31339            assign(t128, getYMMRegLane128(rS, ib & 1));
31340            storeLE(mkexpr(addr), mkexpr(t128));
31341            DIP("vextractf128 $%u,%s,%s\n",
31342                ib, nameYMMReg(rS), dis_buf);
31343         }
31344         delta++;
31345         /* doesn't use vvvv */
31346         goto decode_success;
31347      }
31348      break;
31349
31350   case 0x20:
31351      /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
31352      if (have66noF2noF3(pfx)
31353          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31354         UChar  modrm  = getUChar(delta);
31355         UInt   rG     = gregOfRexRM(pfx, modrm);
31356         UInt   rV     = getVexNvvvv(pfx);
31357         Int    imm8;
31358         IRTemp src_u8 = newTemp(Ity_I8);
31359
31360         if ( epartIsReg( modrm ) ) {
31361            UInt rE = eregOfRexRM(pfx,modrm);
31362            imm8 = (Int)(getUChar(delta+1) & 15);
31363            assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
31364            delta += 1+1;
31365            DIP( "vpinsrb $%d,%s,%s,%s\n",
31366                 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
31367         } else {
31368            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31369            imm8 = (Int)(getUChar(delta+alen) & 15);
31370            assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
31371            delta += alen+1;
31372            DIP( "vpinsrb $%d,%s,%s,%s\n",
31373                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31374         }
31375
31376         IRTemp src_vec = newTemp(Ity_V128);
31377         assign(src_vec, getXMMReg( rV ));
31378         IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
31379         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31380         *uses_vvvv = True;
31381         goto decode_success;
31382      }
31383      break;
31384
31385   case 0x21:
31386      /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
31387         = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
31388      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31389         UChar  modrm = getUChar(delta);
31390         UInt   rG    = gregOfRexRM(pfx, modrm);
31391         UInt   rV    = getVexNvvvv(pfx);
31392         UInt   imm8;
31393         IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
31394         const IRTemp inval = IRTemp_INVALID;
31395
31396         if ( epartIsReg( modrm ) ) {
31397            UInt   rE = eregOfRexRM(pfx, modrm);
31398            IRTemp vE = newTemp(Ity_V128);
31399            assign( vE, getXMMReg(rE) );
31400            IRTemp dsE[4] = { inval, inval, inval, inval };
31401            breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
31402            imm8 = getUChar(delta+1);
31403            d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
31404            delta += 1+1;
            DIP( "vinsertps $%u, %s,%s\n",
31406                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
31407         } else {
31408            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31409            assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
31410            imm8 = getUChar(delta+alen);
31411            delta += alen+1;
            DIP( "vinsertps $%u, %s,%s\n",
31413                 imm8, dis_buf, nameXMMReg(rG) );
31414         }
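         /* imm8 layout, per the INSERTPS definition: bits 7:6 select
            the source element when the E operand is a register
            ("count_s", used above), bits 5:4 select the destination
            lane and bits 3:0 are a zero mask -- the latter two fields
            presumably being handled inside math_INSERTPS. */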
31415
31416         IRTemp vV = newTemp(Ity_V128);
31417         assign( vV, getXMMReg(rV) );
31418
31419         putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
31420         *uses_vvvv = True;
31421         goto decode_success;
31422      }
31423      break;
31424
31425   case 0x22:
31426      /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
31427      if (have66noF2noF3(pfx)
31428          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
31429         UChar  modrm = getUChar(delta);
31430         UInt   rG    = gregOfRexRM(pfx, modrm);
31431         UInt   rV    = getVexNvvvv(pfx);
31432         Int    imm8_10;
31433         IRTemp src_u32 = newTemp(Ity_I32);
31434
31435         if ( epartIsReg( modrm ) ) {
31436            UInt rE = eregOfRexRM(pfx,modrm);
31437            imm8_10 = (Int)(getUChar(delta+1) & 3);
31438            assign( src_u32, getIReg32( rE ) );
31439            delta += 1+1;
31440            DIP( "vpinsrd $%d,%s,%s,%s\n",
31441                 imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
31442         } else {
31443            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31444            imm8_10 = (Int)(getUChar(delta+alen) & 3);
31445            assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
31446            delta += alen+1;
31447            DIP( "vpinsrd $%d,%s,%s,%s\n",
31448                 imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31449         }
31450
31451         IRTemp src_vec = newTemp(Ity_V128);
31452         assign(src_vec, getXMMReg( rV ));
31453         IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
31454         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31455         *uses_vvvv = True;
31456         goto decode_success;
31457      }
31458      /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
31459      if (have66noF2noF3(pfx)
31460          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
31461         UChar  modrm = getUChar(delta);
31462         UInt   rG    = gregOfRexRM(pfx, modrm);
31463         UInt   rV    = getVexNvvvv(pfx);
31464         Int    imm8_0;
31465         IRTemp src_u64 = newTemp(Ity_I64);
31466
31467         if ( epartIsReg( modrm ) ) {
31468            UInt rE = eregOfRexRM(pfx,modrm);
31469            imm8_0 = (Int)(getUChar(delta+1) & 1);
31470            assign( src_u64, getIReg64( rE ) );
31471            delta += 1+1;
31472            DIP( "vpinsrq $%d,%s,%s,%s\n",
31473                 imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
31474         } else {
31475            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31476            imm8_0 = (Int)(getUChar(delta+alen) & 1);
31477            assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
31478            delta += alen+1;
            DIP( "vpinsrq $%d,%s,%s,%s\n",
31480                 imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31481         }
31482
31483         IRTemp src_vec = newTemp(Ity_V128);
31484         assign(src_vec, getXMMReg( rV ));
31485         IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
31486         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31487         *uses_vvvv = True;
31488         goto decode_success;
31489      }
31490      break;
31491
31492   case 0x38:
31493      /* VINSERTI128 r/m, rV, rD
31494         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
31495      /* VINSERTI128 = VEX.NDS.256.66.0F3A.W0 38 /r ib */
31496      if (have66noF2noF3(pfx)
31497          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31498         UChar  modrm = getUChar(delta);
31499         UInt   ib    = 0;
31500         UInt   rG    = gregOfRexRM(pfx, modrm);
31501         UInt   rV    = getVexNvvvv(pfx);
31502         IRTemp t128  = newTemp(Ity_V128);
31503         if (epartIsReg(modrm)) {
31504            UInt rE = eregOfRexRM(pfx, modrm);
31505            delta += 1;
31506            assign(t128, getXMMReg(rE));
31507            ib = getUChar(delta);
31508            DIP("vinserti128 $%u,%s,%s,%s\n",
31509                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31510         } else {
31511            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31512            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
31513            delta += alen;
31514            ib = getUChar(delta);
31515            DIP("vinserti128 $%u,%s,%s,%s\n",
31516                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31517         }
31518         delta++;
31519         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
31520         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
31521         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
31522         *uses_vvvv = True;
31523         goto decode_success;
31524      }
31525      break;
31526
31527   case 0x39:
31528      /* VEXTRACTI128 $lane_no, rS, r/m
31529         ::: r/m:V128 = a lane of rS:V256 (RM format) */
31530      /* VEXTRACTI128 = VEX.256.66.0F3A.W0 39 /r ib */
31531      if (have66noF2noF3(pfx)
31532          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31533         UChar  modrm = getUChar(delta);
31534         UInt   ib    = 0;
31535         UInt   rS    = gregOfRexRM(pfx, modrm);
31536         IRTemp t128  = newTemp(Ity_V128);
31537         if (epartIsReg(modrm)) {
31538            UInt rD = eregOfRexRM(pfx, modrm);
31539            delta += 1;
31540            ib = getUChar(delta);
31541            assign(t128, getYMMRegLane128(rS, ib & 1));
31542            putYMMRegLoAndZU(rD, mkexpr(t128));
31543            DIP("vextracti128 $%u,%s,%s\n",
                ib, nameYMMReg(rS), nameXMMReg(rD));
31545         } else {
31546            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31547            delta += alen;
31548            ib = getUChar(delta);
31549            assign(t128, getYMMRegLane128(rS, ib & 1));
31550            storeLE(mkexpr(addr), mkexpr(t128));
31551            DIP("vextracti128 $%u,%s,%s\n",
31552                ib, nameYMMReg(rS), dis_buf);
31553         }
31554         delta++;
31555         /* doesn't use vvvv */
31556         goto decode_success;
31557      }
31558      break;
31559
31560   case 0x40:
31561      /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib */
31562      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31563         UChar  modrm   = getUChar(delta);
31564         UInt   rG      = gregOfRexRM(pfx, modrm);
31565         UInt   rV      = getVexNvvvv(pfx);
31566         IRTemp dst_vec = newTemp(Ity_V128);
31567         Int    imm8;
31568         if (epartIsReg( modrm )) {
31569            UInt rE = eregOfRexRM(pfx,modrm);
31570            imm8 = (Int)getUChar(delta+1);
31571            assign( dst_vec, getXMMReg( rE ) );
31572            delta += 1+1;
31573            DIP( "vdpps $%d,%s,%s,%s\n",
31574                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
31575         } else {
31576            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31577            imm8 = (Int)getUChar(delta+alen);
31578            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
31579            delta += alen+1;
31580            DIP( "vdpps $%d,%s,%s,%s\n",
31581                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31582         }
31583
31584         IRTemp src_vec = newTemp(Ity_V128);
31585         assign(src_vec, getXMMReg( rV ));
31586         IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
31587         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31588         *uses_vvvv = True;
31589         goto decode_success;
31590      }
      /* VDPPS imm8, ymm3/m256,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib */
31592      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31593         UChar  modrm   = getUChar(delta);
31594         UInt   rG      = gregOfRexRM(pfx, modrm);
31595         UInt   rV      = getVexNvvvv(pfx);
31596         IRTemp dst_vec = newTemp(Ity_V256);
31597         Int    imm8;
31598         if (epartIsReg( modrm )) {
31599            UInt rE = eregOfRexRM(pfx,modrm);
31600            imm8 = (Int)getUChar(delta+1);
31601            assign( dst_vec, getYMMReg( rE ) );
31602            delta += 1+1;
31603            DIP( "vdpps $%d,%s,%s,%s\n",
31604                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
31605         } else {
31606            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31607            imm8 = (Int)getUChar(delta+alen);
31608            assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
31609            delta += alen+1;
31610            DIP( "vdpps $%d,%s,%s,%s\n",
31611                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
31612         }
31613
31614         IRTemp src_vec = newTemp(Ity_V256);
31615         assign(src_vec, getYMMReg( rV ));
31616         IRTemp s0, s1, d0, d1;
31617         s0 = s1 = d0 = d1 = IRTemp_INVALID;
31618         breakupV256toV128s( dst_vec, &d1, &d0 );
31619         breakupV256toV128s( src_vec, &s1, &s0 );
31620         putYMMReg( rG, binop( Iop_V128HLtoV256,
31621                               mkexpr( math_DPPS_128(s1, d1, imm8) ),
31622                               mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
31623         *uses_vvvv = True;
31624         goto decode_success;
31625      }
31626      break;
31627
31628   case 0x41:
31629      /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib */
31630      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31631         UChar  modrm   = getUChar(delta);
31632         UInt   rG      = gregOfRexRM(pfx, modrm);
31633         UInt   rV      = getVexNvvvv(pfx);
31634         IRTemp dst_vec = newTemp(Ity_V128);
31635         Int    imm8;
31636         if (epartIsReg( modrm )) {
31637            UInt rE = eregOfRexRM(pfx,modrm);
31638            imm8 = (Int)getUChar(delta+1);
31639            assign( dst_vec, getXMMReg( rE ) );
31640            delta += 1+1;
31641            DIP( "vdppd $%d,%s,%s,%s\n",
31642                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
31643         } else {
31644            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31645            imm8 = (Int)getUChar(delta+alen);
31646            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
31647            delta += alen+1;
31648            DIP( "vdppd $%d,%s,%s,%s\n",
31649                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31650         }
31651
31652         IRTemp src_vec = newTemp(Ity_V128);
31653         assign(src_vec, getXMMReg( rV ));
31654         IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
31655         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
31656         *uses_vvvv = True;
31657         goto decode_success;
31658      }
31659      break;
31660
31661   case 0x42:
31662      /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
31663      /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib */
31664      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31665         UChar  modrm   = getUChar(delta);
31666         Int    imm8;
31667         IRTemp src_vec = newTemp(Ity_V128);
31668         IRTemp dst_vec = newTemp(Ity_V128);
31669         UInt   rG      = gregOfRexRM(pfx, modrm);
31670         UInt   rV      = getVexNvvvv(pfx);
31671
31672         assign( dst_vec, getXMMReg(rV) );
31673
31674         if ( epartIsReg( modrm ) ) {
31675            UInt rE = eregOfRexRM(pfx, modrm);
31676
31677            imm8 = (Int)getUChar(delta+1);
31678            assign( src_vec, getXMMReg(rE) );
31679            delta += 1+1;
31680            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
31681                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
31682         } else {
31683            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
31684                             1/* imm8 is 1 byte after the amode */ );
31685            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
31686            imm8 = (Int)getUChar(delta+alen);
31687            delta += alen+1;
31688            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
31689                 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31690         }
31691
31692         putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
31693                                                        src_vec, imm8) ) );
31694         *uses_vvvv = True;
31695         goto decode_success;
31696      }
31697      /* VMPSADBW imm8, ymm3/m256,ymm2,ymm1 */
31698      /* VMPSADBW = VEX.NDS.256.66.0F3A.WIG 42 /r ib */
31699      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31700         UChar  modrm   = getUChar(delta);
31701         Int    imm8;
31702         IRTemp src_vec = newTemp(Ity_V256);
31703         IRTemp dst_vec = newTemp(Ity_V256);
31704         UInt   rG      = gregOfRexRM(pfx, modrm);
31705         UInt   rV      = getVexNvvvv(pfx);
31706         IRTemp sHi, sLo, dHi, dLo;
31707         sHi = sLo = dHi = dLo = IRTemp_INVALID;
31708
31709         assign( dst_vec, getYMMReg(rV) );
31710
31711         if ( epartIsReg( modrm ) ) {
31712            UInt rE = eregOfRexRM(pfx, modrm);
31713
31714            imm8 = (Int)getUChar(delta+1);
31715            assign( src_vec, getYMMReg(rE) );
31716            delta += 1+1;
31717            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
31718                 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
31719         } else {
31720            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
31721                             1/* imm8 is 1 byte after the amode */ );
31722            assign( src_vec, loadLE( Ity_V256, mkexpr(addr) ) );
31723            imm8 = (Int)getUChar(delta+alen);
31724            delta += alen+1;
31725            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
31726                 dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
31727         }
31728
31729         breakupV256toV128s( dst_vec, &dHi, &dLo );
31730         breakupV256toV128s( src_vec, &sHi, &sLo );
31731         putYMMReg( rG, binop( Iop_V128HLtoV256,
31732                               mkexpr( math_MPSADBW_128(dHi, sHi, imm8 >> 3) ),
31733                               mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) );
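         /* The "imm8 >> 3" reflects the AVX2 encoding: imm8 bits 2:0
            control the low 128-bit lane and bits 5:3 the high lane,
            math_MPSADBW_128 being assumed to consult only the low
            bits of its imm8 argument. */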
31734         *uses_vvvv = True;
31735         goto decode_success;
31736      }
31737      break;
31738
31739   case 0x44:
31740      /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
31741      /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
31742      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
31743       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a. multiplication of polynomials over GF(2))
31745       */
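      /* Tiny worked example (illustrative): clmul(0b11, 0b11)
         == 0b101, since (x+1)*(x+1) == x^2 + 1 when coefficients are
         added mod 2.  Per the PCLMULQDQ definition, imm8 bit 0
         selects which quadword of the first source and bit 4 which
         quadword of the second source enter the multiply. */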
31746      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31747         UChar  modrm = getUChar(delta);
31748         Int imm8;
31749         IRTemp sV    = newTemp(Ity_V128);
31750         IRTemp dV    = newTemp(Ity_V128);
31751         UInt   rG    = gregOfRexRM(pfx, modrm);
31752         UInt   rV    = getVexNvvvv(pfx);
31753
31754         assign( dV, getXMMReg(rV) );
31755
31756         if ( epartIsReg( modrm ) ) {
31757            UInt rE = eregOfRexRM(pfx, modrm);
31758            imm8 = (Int)getUChar(delta+1);
31759            assign( sV, getXMMReg(rE) );
31760            delta += 1+1;
31761            DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
31762                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
31763         } else {
31764            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
31765                             1/* imm8 is 1 byte after the amode */ );
31766            assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
31767            imm8 = (Int)getUChar(delta+alen);
31768            delta += alen+1;
31769            DIP( "vpclmulqdq $%d, %s,%s,%s\n",
31770                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
31771         }
31772
31773         putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
31774         *uses_vvvv = True;
31775         goto decode_success;
31776      }
31777      break;
31778
31779   case 0x46:
      /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 46 /r ib */
31781      if (have66noF2noF3(pfx)
31782          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
31783         UChar  modrm = getUChar(delta);
31784         UInt   imm8  = 0;
31785         UInt   rG    = gregOfRexRM(pfx, modrm);
31786         UInt   rV    = getVexNvvvv(pfx);
31787         IRTemp s00   = newTemp(Ity_V128);
31788         IRTemp s01   = newTemp(Ity_V128);
31789         IRTemp s10   = newTemp(Ity_V128);
31790         IRTemp s11   = newTemp(Ity_V128);
31791         assign(s00, getYMMRegLane128(rV, 0));
31792         assign(s01, getYMMRegLane128(rV, 1));
31793         if (epartIsReg(modrm)) {
31794            UInt rE = eregOfRexRM(pfx, modrm);
31795            delta += 1;
31796            imm8 = getUChar(delta);
31797            DIP("vperm2i128 $%u,%s,%s,%s\n",
31798                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
31799            assign(s10, getYMMRegLane128(rE, 0));
31800            assign(s11, getYMMRegLane128(rE, 1));
31801         } else {
31802            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
31803            delta += alen;
31804            imm8 = getUChar(delta);
31805            DIP("vperm2i128 $%u,%s,%s,%s\n",
31806                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
31807            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
31808                                               mkexpr(addr), mkU64(0))));
31809            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
31810                                               mkexpr(addr), mkU64(16))));
31811         }
31812         delta++;
31813#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
31814                                           : ((_nn)==2) ? s10 : s11)
31815         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
31816         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
31817#        undef SEL
31818         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
31819         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
31820         *uses_vvvv = True;
31821         goto decode_success;
31822      }
31823      break;
31824
31825   case 0x4A:
31826      /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
31827         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
31828      /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4 */
31829      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31830         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
31831                                   "vblendvps", 4, Iop_SarN32x4 );
31832         *uses_vvvv = True;
31833         goto decode_success;
31834      }
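      /* The element size (4) and shift op (Iop_SarN32x4) passed here
         suggest the usual BLENDV lowering inside dis_VBLENDV_128/256:
         arithmetically shift each element of the IS4 register right
         by 31 to splat its sign bit into a full-width mask, then use
         that mask to select between the E and V lanes. */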
31835      /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
31836         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
31837      /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
31838      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31839         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
31840                                   "vblendvps", 4, Iop_SarN32x4 );
31841         *uses_vvvv = True;
31842         goto decode_success;
31843      }
31844      break;
31845
31846   case 0x4B:
31847      /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
31848         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
31849      /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4 */
31850      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31851         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
31852                                   "vblendvpd", 8, Iop_SarN64x2 );
31853         *uses_vvvv = True;
31854         goto decode_success;
31855      }
31856      /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
31857         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
31858      /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
31859      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31860         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
31861                                   "vblendvpd", 8, Iop_SarN64x2 );
31862         *uses_vvvv = True;
31863         goto decode_success;
31864      }
31865      break;
31866
31867   case 0x4C:
31868      /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
31869         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
31870      /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4 */
31871      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31872         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
31873                                   "vpblendvb", 1, Iop_SarN8x16 );
31874         *uses_vvvv = True;
31875         goto decode_success;
31876      }
31877      /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4
31878         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
31879      /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */
31880      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31881         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
31882                                   "vpblendvb", 1, Iop_SarN8x16 );
31883         *uses_vvvv = True;
31884         goto decode_success;
31885      }
31886      break;
31887
31888   case 0x60:
31889   case 0x61:
31890   case 0x62:
31891   case 0x63:
31892      /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
31893         VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
31894         VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
31895         VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
31896         (selected special cases that actually occur in glibc,
31897          not by any means a complete implementation.)
31898      */
31899      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31900         Long delta0 = delta;
31901         delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
31902         if (delta > delta0) goto decode_success;
         /* else fall through; dis_PCMPxSTRx failed to decode it */
31904      }
31905      break;
31906
31907   case 0x5C ... 0x5F:
31908   case 0x68 ... 0x6F:
31909   case 0x78 ... 0x7F:
31910      /* FIXME: list the instructions decoded here */
31911      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31912         Long delta0 = delta;
31913         delta = dis_FMA4( pfx, delta, opc, uses_vvvv, vbi );
31914         if (delta > delta0) {
31915            dres->hint = Dis_HintVerbose;
31916            goto decode_success;
31917         }
         /* else fall through; dis_FMA4 failed to decode it */
31919      }
31920      break;
31921
31922   case 0xDF:
31923      /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
31924      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
31926         goto decode_success;
31927      }
31928      break;
31929
31930   case 0xF0:
31931      /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r /i */
31932      /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r /i */
31933      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
31934         Int     size = getRexW(pfx) ? 8 : 4;
31935         IRType  ty   = szToITy(size);
31936         IRTemp  src  = newTemp(ty);
31937         UChar   rm   = getUChar(delta);
31938         UChar   imm8;
31939
31940         if (epartIsReg(rm)) {
31941            imm8 = getUChar(delta+1);
31942            assign( src, getIRegE(size,pfx,rm) );
31943            DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm),
31944                                   nameIRegG(size,pfx,rm));
31945            delta += 2;
31946         } else {
31947            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
31948            imm8 = getUChar(delta+alen);
31949            assign( src, loadLE(ty, mkexpr(addr)) );
31950            DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm));
31951            delta += alen + 1;
31952         }
31953         imm8 &= 8*size-1;
31954
         /* dst = (src >>u imm8) | (src << (8*size-imm8)) */
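         /* Illustrative worked example: with size == 4, src ==
            0x11223344 and imm8 == 8, this computes
               (0x11223344 >>u 8) | (0x11223344 << 24)
             = 0x00112233 | 0x44000000 = 0x44112233,
            that is, a rotate right by 8 bits. */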
31956         putIRegG( size, pfx, rm,
31957                   imm8 == 0 ? mkexpr(src)
31958                   : binop( mkSizedOp(ty,Iop_Or8),
31959                            binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src),
31960                                   mkU8(imm8) ),
31961                            binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src),
31962                                   mkU8(8*size-imm8) ) ) );
31963         /* Flags aren't modified.  */
31964         goto decode_success;
31965      }
31966      break;
31967
31968   default:
31969      break;
31970
31971   }
31972
31973  //decode_failure:
31974   return deltaIN;
31975
31976  decode_success:
31977   return delta;
31978}
31979
31980
31981/*------------------------------------------------------------*/
31982/*---                                                      ---*/
31983/*--- Disassemble a single instruction                     ---*/
31984/*---                                                      ---*/
31985/*------------------------------------------------------------*/
31986
31987/* Disassemble a single instruction into IR.  The instruction is
31988   located in host memory at &guest_code[delta]. */
31989
31990static
31991DisResult disInstr_AMD64_WRK (
31992             /*OUT*/Bool* expect_CAS,
31993             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr ),
31994             Bool         resteerCisOk,
31995             void*        callback_opaque,
31996             Long         delta64,
31997             const VexArchInfo* archinfo,
31998             const VexAbiInfo*  vbi,
31999             Bool         sigill_diag
32000          )
32001{
32002   IRTemp    t1, t2;
32003   UChar     pre;
32004   Int       n, n_prefixes;
32005   DisResult dres;
32006
32007   /* The running delta */
32008   Long delta = delta64;
32009
   /* Holds the delta value at the start of the insn, so that we can
      print consistent error messages for unimplemented insns. */
32012   Long delta_start = delta;
32013
   /* sz denotes the nominal data-op size of the insn; we change it to
      2 if a 0x66 prefix is seen and to 8 if REX.W is 1.  In case of
      conflict, REX.W takes precedence. */
32017   Int sz = 4;
32018
32019   /* pfx holds the summary of prefixes. */
32020   Prefix pfx = PFX_EMPTY;
32021
32022   /* Holds the computed opcode-escape indication. */
32023   Escape esc = ESC_NONE;
32024
32025   /* Set result defaults. */
32026   dres.whatNext    = Dis_Continue;
32027   dres.len         = 0;
32028   dres.continueAt  = 0;
32029   dres.jk_StopHere = Ijk_INVALID;
32030   dres.hint        = Dis_HintNone;
32031   *expect_CAS = False;
32032
32033   vassert(guest_RIP_next_assumed == 0);
32034   vassert(guest_RIP_next_mustcheck == False);
32035
32036   t1 = t2 = IRTemp_INVALID;
32037
32038   DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
32039
32040   /* Spot "Special" instructions (see comment at top of file). */
32041   {
32042      const UChar* code = guest_code + delta;
32043      /* Spot the 16-byte preamble:
32044         48C1C703   rolq $3,  %rdi
32045         48C1C70D   rolq $13, %rdi
32046         48C1C73D   rolq $61, %rdi
32047         48C1C733   rolq $51, %rdi
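         The rotate amounts sum to 3+13+61+51 == 128, so the sequence
         as a whole leaves %rdi unchanged; it is an architectural no-op
         chosen because it is vanishingly unlikely to occur in normal
         code.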
32048      */
32049      if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
32050                                               && code[ 3] == 0x03 &&
32051          code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
32052                                               && code[ 7] == 0x0D &&
32053          code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
32054                                               && code[11] == 0x3D &&
32055          code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
32056                                               && code[15] == 0x33) {
32057         /* Got a "Special" instruction preamble.  Which one is it? */
32058         if (code[16] == 0x48 && code[17] == 0x87
32059                              && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
32060            /* %RDX = client_request ( %RAX ) */
32061            DIP("%%rdx = client_request ( %%rax )\n");
32062            delta += 19;
32063            jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
32064            vassert(dres.whatNext == Dis_StopHere);
32065            goto decode_success;
32066         }
32067         else
32068         if (code[16] == 0x48 && code[17] == 0x87
32069                              && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
32070            /* %RAX = guest_NRADDR */
32071            DIP("%%rax = guest_NRADDR\n");
32072            delta += 19;
32073            putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
32074            goto decode_success;
32075         }
32076         else
32077         if (code[16] == 0x48 && code[17] == 0x87
32078                              && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
32079            /* call-noredir *%RAX */
32080            DIP("call-noredir *%%rax\n");
32081            delta += 19;
32082            t1 = newTemp(Ity_I64);
32083            assign(t1, getIRegRAX(8));
32084            t2 = newTemp(Ity_I64);
32085            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
32086            putIReg64(R_RSP, mkexpr(t2));
32087            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
32088            jmp_treg(&dres, Ijk_NoRedir, t1);
32089            vassert(dres.whatNext == Dis_StopHere);
32090            goto decode_success;
32091         }
32092         else
32093         if (code[16] == 0x48 && code[17] == 0x87
32094                              && code[18] == 0xff /* xchgq %rdi,%rdi */) {
32095           /* IR injection */
32096            DIP("IR injection\n");
32097            vex_inject_ir(irsb, Iend_LE);
32098
            // Invalidate the current insn.  The reason is that the IRop
            // we're injecting here can change; in that case the
            // translation has to be redone.  For ease of handling, we
            // simply invalidate it every time.
32103            stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_RIP_curr_instr)));
32104            stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(19)));
32105
32106            delta += 19;
32107
32108            stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
32109            dres.whatNext    = Dis_StopHere;
32110            dres.jk_StopHere = Ijk_InvalICache;
32111            goto decode_success;
32112         }
32113         /* We don't know what it is. */
32114         goto decode_failure;
32115         /*NOTREACHED*/
32116      }
32117   }
32118
32119   /* Eat prefixes, summarising the result in pfx and sz, and rejecting
32120      as many invalid combinations as possible. */
32121   n_prefixes = 0;
32122   while (True) {
32123      if (n_prefixes > 7) goto decode_failure;
32124      pre = getUChar(delta);
32125      switch (pre) {
32126         case 0x66: pfx |= PFX_66; break;
32127         case 0x67: pfx |= PFX_ASO; break;
32128         case 0xF2: pfx |= PFX_F2; break;
32129         case 0xF3: pfx |= PFX_F3; break;
32130         case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
32131         case 0x2E: pfx |= PFX_CS; break;
32132         case 0x3E: pfx |= PFX_DS; break;
32133         case 0x26: pfx |= PFX_ES; break;
32134         case 0x64: pfx |= PFX_FS; break;
32135         case 0x65: pfx |= PFX_GS; break;
32136         case 0x36: pfx |= PFX_SS; break;
32137         case 0x40 ... 0x4F:
32138            pfx |= PFX_REX;
32139            if (pre & (1<<3)) pfx |= PFX_REXW;
32140            if (pre & (1<<2)) pfx |= PFX_REXR;
32141            if (pre & (1<<1)) pfx |= PFX_REXX;
32142            if (pre & (1<<0)) pfx |= PFX_REXB;
32143            break;
32144         default:
32145            goto not_a_legacy_prefix;
32146      }
32147      n_prefixes++;
32148      delta++;
32149   }
32150
32151   not_a_legacy_prefix:
32152   /* We've used up all the non-VEX prefixes.  Parse and validate a
32153      VEX prefix if that's appropriate. */
32154   if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
32155      /* Used temporarily for holding VEX prefixes. */
32156      UChar vex0 = getUChar(delta);
32157      if (vex0 == 0xC4) {
32158         /* 3-byte VEX */
32159         UChar vex1 = getUChar(delta+1);
32160         UChar vex2 = getUChar(delta+2);
32161         delta += 3;
32162         pfx |= PFX_VEX;
32163         /* Snarf contents of byte 1 */
32164         /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
32165         /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
32166         /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
32167         /* m-mmmm */
32168         switch (vex1 & 0x1F) {
32169            case 1: esc = ESC_0F;   break;
32170            case 2: esc = ESC_0F38; break;
32171            case 3: esc = ESC_0F3A; break;
32172            /* Any other m-mmmm field will #UD */
32173            default: goto decode_failure;
32174         }
32175         /* Snarf contents of byte 2 */
32176         /* W */    pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
32177         /* ~v3 */  pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
32178         /* ~v2 */  pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
32179         /* ~v1 */  pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
32180         /* ~v0 */  pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
32181         /* L */    pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
32182         /* pp */
32183         switch (vex2 & 3) {
32184            case 0: break;
32185            case 1: pfx |= PFX_66; break;
32186            case 2: pfx |= PFX_F3; break;
32187            case 3: pfx |= PFX_F2; break;
32188            default: vassert(0);
32189         }
32190      }
32191      else if (vex0 == 0xC5) {
32192         /* 2-byte VEX */
32193         UChar vex1 = getUChar(delta+1);
32194         delta += 2;
32195         pfx |= PFX_VEX;
32196         /* Snarf contents of byte 1 */
32197         /* R */    pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
32198         /* ~v3 */  pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
32199         /* ~v2 */  pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
32200         /* ~v1 */  pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
32201         /* ~v0 */  pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
32202         /* L */    pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
32203         /* pp */
32204         switch (vex1 & 3) {
32205            case 0: break;
32206            case 1: pfx |= PFX_66; break;
32207            case 2: pfx |= PFX_F3; break;
32208            case 3: pfx |= PFX_F2; break;
32209            default: vassert(0);
32210         }
32211         /* implied: */
32212         esc = ESC_0F;
32213      }
      /* Can't have both VEX and REX */
      if ((pfx & PFX_VEX) && (pfx & PFX_REX))
         goto decode_failure;
32217   }
32218
   /* Reject invalid prefix combinations */
32220   n = 0;
32221   if (pfx & PFX_F2) n++;
32222   if (pfx & PFX_F3) n++;
32223   if (n > 1)
32224      goto decode_failure; /* can't have both */
32225
32226   n = 0;
32227   if (pfx & PFX_CS) n++;
32228   if (pfx & PFX_DS) n++;
32229   if (pfx & PFX_ES) n++;
32230   if (pfx & PFX_FS) n++;
32231   if (pfx & PFX_GS) n++;
32232   if (pfx & PFX_SS) n++;
32233   if (n > 1)
32234      goto decode_failure; /* multiple seg overrides == illegal */
32235
   /* If there's a %fs prefix, reject it unless 'vbi' gives us evidence
      that we should accept it. */
32238   if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_const)
32239      goto decode_failure;
32240
32241   /* Ditto for %gs prefixes. */
32242   if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_const)
32243      goto decode_failure;
32244
32245   /* Set up sz. */
32246   sz = 4;
32247   if (pfx & PFX_66) sz = 2;
32248   if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
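
   /* Illustrative example: for the bytes "66 48 01 d8" (addq
      %rbx,%rax carrying a redundant 0x66), both PFX_66 and PFX_REXW
      are set by the prefix loop above, and sz ends up as 8. */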
32249
32250   /* Now we should be looking at the primary opcode byte or the
32251      leading escapes.  Check that any LOCK prefix is actually
32252      allowed. */
32253   if (haveLOCK(pfx)) {
32254      if (can_be_used_with_LOCK_prefix( &guest_code[delta] )) {
32255         DIP("lock ");
32256      } else {
32257         *expect_CAS = False;
32258         goto decode_failure;
32259      }
32260   }
32261
32262   /* Eat up opcode escape bytes, until we're really looking at the
32263      primary opcode byte.  But only if there's no VEX present. */
32264   if (!(pfx & PFX_VEX)) {
32265      vassert(esc == ESC_NONE);
32266      pre = getUChar(delta);
32267      if (pre == 0x0F) {
32268         delta++;
32269         pre = getUChar(delta);
32270         switch (pre) {
32271            case 0x38: esc = ESC_0F38; delta++; break;
32272            case 0x3A: esc = ESC_0F3A; delta++; break;
32273            default:   esc = ESC_0F; break;
32274         }
32275      }
32276   }
32277
32278   /* So now we're really really looking at the primary opcode
32279      byte. */
32280   Long delta_at_primary_opcode = delta;
32281
32282   if (!(pfx & PFX_VEX)) {
32283      /* Handle non-VEX prefixed instructions.  "Legacy" (non-VEX) SSE
32284         instructions preserve the upper 128 bits of YMM registers;
32285         iow we can simply ignore the presence of the upper halves of
32286         these registers. */
32287      switch (esc) {
32288         case ESC_NONE:
32289            delta = dis_ESC_NONE( &dres, expect_CAS,
32290                                  resteerOkFn, resteerCisOk, callback_opaque,
32291                                  archinfo, vbi, pfx, sz, delta );
32292            break;
32293         case ESC_0F:
32294            delta = dis_ESC_0F  ( &dres, expect_CAS,
32295                                  resteerOkFn, resteerCisOk, callback_opaque,
32296                                  archinfo, vbi, pfx, sz, delta );
32297            break;
32298         case ESC_0F38:
32299            delta = dis_ESC_0F38( &dres,
32300                                  resteerOkFn, resteerCisOk, callback_opaque,
32301                                  archinfo, vbi, pfx, sz, delta );
32302            break;
32303         case ESC_0F3A:
32304            delta = dis_ESC_0F3A( &dres,
32305                                  resteerOkFn, resteerCisOk, callback_opaque,
32306                                  archinfo, vbi, pfx, sz, delta );
32307            break;
32308         default:
32309            vassert(0);
32310      }
32311   } else {
32312      /* VEX prefixed instruction */
32313      /* Sloppy Intel wording: "An instruction encoded with a VEX.128
32314         prefix that loads a YMM register operand ..." zeroes out bits
32315         128 and above of the register. */
32316      Bool uses_vvvv = False;
32317      switch (esc) {
32318         case ESC_0F:
32319            delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
32320                                      resteerOkFn, resteerCisOk,
32321                                      callback_opaque,
32322                                      archinfo, vbi, pfx, sz, delta );
32323            break;
32324         case ESC_0F38:
32325            delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
32326                                        resteerOkFn, resteerCisOk,
32327                                        callback_opaque,
32328                                        archinfo, vbi, pfx, sz, delta );
32329            break;
32330         case ESC_0F3A:
32331            delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
32332                                        resteerOkFn, resteerCisOk,
32333                                        callback_opaque,
32334                                        archinfo, vbi, pfx, sz, delta );
32335            break;
32336         case ESC_NONE:
32337            /* The presence of a VEX prefix, by Intel definition,
32338               always implies at least an 0F escape. */
32339            goto decode_failure;
32340         default:
32341            vassert(0);
32342      }
      /* If the insn doesn't use VEX.vvvv then that field must be all
         ones.  Check this. */
32345      if (!uses_vvvv) {
32346         if (getVexNvvvv(pfx) != 0)
32347            goto decode_failure;
32348      }
32349   }
32350
32351   vassert(delta - delta_at_primary_opcode >= 0);
32352   vassert(delta - delta_at_primary_opcode < 16/*let's say*/);
32353
   /* Use delta == delta_at_primary_opcode to denote decode failure.
      This implies that any successful decode must use up at least one
      byte. */
32357   if (delta == delta_at_primary_opcode)
32358      goto decode_failure;
32359   else
32360      goto decode_success; /* \o/ */
32361
32362
32363  decode_failure:
32364   /* All decode failures end up here. */
32365   if (sigill_diag) {
32366      vex_printf("vex amd64->IR: unhandled instruction bytes: "
32367                 "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
32368                 getUChar(delta_start+0),
32369                 getUChar(delta_start+1),
32370                 getUChar(delta_start+2),
32371                 getUChar(delta_start+3),
32372                 getUChar(delta_start+4),
32373                 getUChar(delta_start+5),
32374                 getUChar(delta_start+6),
32375                 getUChar(delta_start+7),
32376                 getUChar(delta_start+8),
32377                 getUChar(delta_start+9) );
32378      vex_printf("vex amd64->IR:   REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
32379                 haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
32380                 getRexX(pfx), getRexB(pfx));
32381      vex_printf("vex amd64->IR:   VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
32382                 haveVEX(pfx) ? 1 : 0, getVexL(pfx),
32383                 getVexNvvvv(pfx),
32384                 esc==ESC_NONE ? "NONE" :
32385                   esc==ESC_0F ? "0F" :
32386                   esc==ESC_0F38 ? "0F38" :
32387                   esc==ESC_0F3A ? "0F3A" : "???");
32388      vex_printf("vex amd64->IR:   PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
32389                 have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
32390                 haveF3(pfx) ? 1 : 0);
32391   }
32392
   /* Tell the dispatcher that this insn cannot be decoded, and so has
      not been executed, and (is currently) the next to be executed.
      RIP should be up-to-date, since it is made so at the start of
      each insn, but nevertheless be paranoid and update it again right
      now. */
32398   stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
32399   jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
32400   vassert(dres.whatNext == Dis_StopHere);
32401   dres.len = 0;
   /* We also need to say that a CAS is not expected now, regardless
      of what it might have been set to at the start of the function,
      since the IR that we've emitted just above (to synthesise a
      SIGILL) does not involve any CAS, and presumably no other IR has
      been emitted for this (non-decoded) insn. */
32407   *expect_CAS = False;
32408   return dres;
32409
32410
32411  decode_success:
32412   /* All decode successes end up here. */
32413   switch (dres.whatNext) {
32414      case Dis_Continue:
32415         stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
32416         break;
32417      case Dis_ResteerU:
32418      case Dis_ResteerC:
32419         stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
32420         break;
32421      case Dis_StopHere:
32422         break;
32423      default:
32424         vassert(0);
32425   }
32426
32427   DIP("\n");
32428   dres.len = toUInt(delta - delta_start);
32429   return dres;
32430}
32431
32432#undef DIP
32433#undef DIS
32434
32435
32436/*------------------------------------------------------------*/
32437/*--- Top-level fn                                         ---*/
32438/*------------------------------------------------------------*/
32439
32440/* Disassemble a single instruction into IR.  The instruction
32441   is located in host memory at &guest_code[delta]. */
32442
32443DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
32444                           Bool         (*resteerOkFn) ( void*, Addr ),
32445                           Bool         resteerCisOk,
32446                           void*        callback_opaque,
32447                           const UChar* guest_code_IN,
32448                           Long         delta,
32449                           Addr         guest_IP,
32450                           VexArch      guest_arch,
32451                           const VexArchInfo* archinfo,
32452                           const VexAbiInfo*  abiinfo,
32453                           VexEndness   host_endness_IN,
32454                           Bool         sigill_diag_IN )
32455{
32456   Int       i, x1, x2;
32457   Bool      expect_CAS, has_CAS;
32458   DisResult dres;
32459
32460   /* Set globals (see top of this file) */
32461   vassert(guest_arch == VexArchAMD64);
32462   guest_code           = guest_code_IN;
32463   irsb                 = irsb_IN;
32464   host_endness         = host_endness_IN;
32465   guest_RIP_curr_instr = guest_IP;
32466   guest_RIP_bbstart    = guest_IP - delta;
32467
32468   /* We'll consult these after doing disInstr_AMD64_WRK. */
32469   guest_RIP_next_assumed   = 0;
32470   guest_RIP_next_mustcheck = False;
32471
32472   x1 = irsb_IN->stmts_used;
32473   expect_CAS = False;
32474   dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
32475                               resteerCisOk,
32476                               callback_opaque,
32477                               delta, archinfo, abiinfo, sigill_diag_IN );
32478   x2 = irsb_IN->stmts_used;
32479   vassert(x2 >= x1);
32480
32481   /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
32482      got it right.  Failure of this assertion is serious and denotes
32483      a bug in disInstr. */
32484   if (guest_RIP_next_mustcheck
32485       && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
32486      vex_printf("\n");
32487      vex_printf("assumed next %%rip = 0x%llx\n",
32488                 guest_RIP_next_assumed );
32489      vex_printf(" actual next %%rip = 0x%llx\n",
32490                 guest_RIP_curr_instr + dres.len );
32491      vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
32492   }
32493
32494   /* See comment at the top of disInstr_AMD64_WRK for meaning of
32495      expect_CAS.  Here, we (sanity-)check for the presence/absence of
32496      IRCAS as directed by the returned expect_CAS value. */
32497   has_CAS = False;
32498   for (i = x1; i < x2; i++) {
32499      if (irsb_IN->stmts[i]->tag == Ist_CAS)
32500         has_CAS = True;
32501   }
32502
32503   if (expect_CAS != has_CAS) {
      /* Inconsistency detected.  Re-disassemble the instruction with
         front-end tracing enabled, so as to generate a useful error
         message; then panic. */
32506      vex_traceflags |= VEX_TRACE_FE;
32507      dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
32508                                  resteerCisOk,
32509                                  callback_opaque,
32510                                  delta, archinfo, abiinfo, sigill_diag_IN );
32511      for (i = x1; i < x2; i++) {
32512         vex_printf("\t\t");
32513         ppIRStmt(irsb_IN->stmts[i]);
32514         vex_printf("\n");
32515      }
32516      /* Failure of this assertion is serious and denotes a bug in
32517         disInstr. */
32518      vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
32519   }
32520
32521   return dres;
32522}
32523
32524
32525/*------------------------------------------------------------*/
32526/*--- Unused stuff                                         ---*/
32527/*------------------------------------------------------------*/
32528
32529// A potentially more Memcheck-friendly version of gen_LZCNT, if
32530// this should ever be needed.
32531//
32532//static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
32533//{
32534//   /* Scheme is simple: propagate the most significant 1-bit into all
32535//      lower positions in the word.  This gives a word of the form
32536//      0---01---1.  Now invert it, giving a word of the form
32537//      1---10---0, then do a population-count idiom (to count the 1s,
32538//      which is the number of leading zeroes, or the word size if the
32539//      original word was 0.
32540//   */
32541//   Int i;
32542//   IRTemp t[7];
32543//   for (i = 0; i < 7; i++) {
32544//      t[i] = newTemp(ty);
32545//   }
32546//   if (ty == Ity_I64) {
32547//      assign(t[0], binop(Iop_Or64, mkexpr(src),
32548//                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
32549//      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
32550//                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
32551//      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
32552//                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
32553//      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
32554//                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
32555//      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
32556//                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
32557//      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
32558//                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
32559//      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
32560//      return gen_POPCOUNT(ty, t[6]);
32561//   }
32562//   if (ty == Ity_I32) {
32563//      assign(t[0], binop(Iop_Or32, mkexpr(src),
32564//                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
32565//      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
32566//                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
32567//      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
32568//                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
32569//      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
32570//                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
32571//      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
32572//                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
32573//      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
32574//      return gen_POPCOUNT(ty, t[5]);
32575//   }
32576//   if (ty == Ity_I16) {
32577//      assign(t[0], binop(Iop_Or16, mkexpr(src),
32578//                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
32579//      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
32580//                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
32581//      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
32582//                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
32583//      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
32584//                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
32585//      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
32586//      return gen_POPCOUNT(ty, t[4]);
32587//   }
32588//   vassert(0);
32589//}
32590
32591
32592/*--------------------------------------------------------------------*/
32593/*--- end                                       guest_amd64_toIR.c ---*/
32594/*--------------------------------------------------------------------*/
32595