
/*--------------------------------------------------------------------*/
/*--- begin                                     guest_amd64_toIR.c ---*/
/*--------------------------------------------------------------------*/

/*
   This file is part of Valgrind, a dynamic binary instrumentation
   framework.

   Copyright (C) 2004-2013 OpenWorks LLP
      info@open-works.net

   This program is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as
   published by the Free Software Foundation; either version 2 of the
   License, or (at your option) any later version.

   This program is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301, USA.

   The GNU General Public License is contained in the file COPYING.

   Neither the names of the U.S. Department of Energy nor the
   University of California nor the names of its contributors may be
   used to endorse or promote products derived from this software
   without prior written permission.
*/

/* Translates AMD64 code to IR. */

/* TODO:

   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
   to ensure a 64-bit value is being written.

   x87 FP Limitations:

   * all arithmetic done at 64 bits

   * no FP exceptions, except for handling stack over/underflow

   * FP rounding mode observed only for float->int conversions and
     int->float conversions which could lose accuracy, and for
     float-to-float rounding.  For all other operations,
     round-to-nearest is used, regardless.

   * some of the FCOM cases could do with testing -- not convinced
     that the args are the right way round.

   * FSAVE does not re-initialise the FPU; it should do

   * FINIT not only initialises the FPU environment, it also zeroes
     all the FP registers.  It should leave the registers unchanged.

    SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
    per Intel docs this bit has no meaning anyway.  Since PUSHF is the
    only way to observe eflags[1], a proper fix would be to make that
    bit be set by PUSHF.

    This module uses global variables and so is not MT-safe (if that
    should ever become relevant).
*/

/* Notes re address size overrides (0x67).

   According to the AMD documentation (24594 Rev 3.09, Sept 2003,
   "AMD64 Architecture Programmer's Manual Volume 3: General-Purpose
   and System Instructions"), Section 1.2.3 ("Address-Size Override
   Prefix"):

   0x67 applies to all explicit memory references, causing the top
   32 bits of the effective address to become zero.

   0x67 has no effect on stack references (push/pop); these always
   use a 64-bit address.

   0x67 changes the interpretation of instructions which implicitly
   reference RCX/RSI/RDI, so that in fact ECX/ESI/EDI are used
   instead.  These are:

      cmp{s,sb,sw,sd,sq}
      in{s,sb,sw,sd}
      jcxz, jecxz, jrcxz
      lod{s,sb,sw,sd,sq}
      loop{,e,bz,be,z}
      mov{s,sb,sw,sd,sq}
      out{s,sb,sw,sd}
      rep{,e,ne,nz}
      sca{s,sb,sw,sd,sq}
      sto{s,sb,sw,sd,sq}
      xlat{,b} */
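
/* Illustrative example (not from the original sources): the byte
   sequence 67 8B 07 decodes as movl (%edi),%eax -- the 0x67 prefix
   causes the effective address to be the low 32 bits of %rdi,
   zero-extended to 64 bits, rather than the full 64-bit %rdi. */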

/* "Special" instructions.

   This instruction decoder can decode four special instructions
   which mean nothing natively (are no-ops as far as regs/mem are
   concerned) but have meaning for supporting Valgrind.  A special
   instruction is flagged by the 16-byte preamble 48C1C703 48C1C70D
   48C1C73D 48C1C733 (in the standard interpretation, that means: rolq
   $3, %rdi; rolq $13, %rdi; rolq $61, %rdi; rolq $51, %rdi).
   Following that, one of the following 4 is allowed (standard
   interpretation in parentheses):

      4887DB (xchgq %rbx,%rbx)   %RDX = client_request ( %RAX )
      4887C9 (xchgq %rcx,%rcx)   %RAX = guest_NRADDR
      4887D2 (xchgq %rdx,%rdx)   call-noredir *%RAX
      4887F6 (xchgq %rsi,%rsi)   IR injection
115
116   Any other bytes following the 16-byte preamble are illegal and
117   constitute a failure in instruction decoding.  This all assumes
118   that the preamble will never occur except in specific code
119   fragments designed for Valgrind to catch.
120
121   No prefixes may precede a "Special" instruction.
122*/
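
/* Illustrative example: a complete client-request marker is therefore
   the 19-byte sequence
      48 C1 C7 03  48 C1 C7 0D  48 C1 C7 3D  48 C1 C7 33  48 87 DB
   i.e. the 16-byte preamble followed by the xchgq %rbx,%rbx variant. */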

/* casLE (implementation of lock-prefixed insns) and rep-prefixed
   insns: the side-exit back to the start of the insn is done with
   Ijk_Boring.  This is quite wrong, it should be done with
   Ijk_NoRedir, since otherwise the side exit, which is intended to
   restart the instruction for whatever reason, could go somewhere
   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
   no-redir jumps performance critical, at least for rep-prefixed
   instructions, since all iterations thereof would involve such a
   jump.  It's not such a big deal with casLE since the side exit is
   only taken if the CAS fails, that is, the location is contended,
   which is relatively unlikely.

   Note also, the test for CAS success vs failure is done using
   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
   shouldn't definedness-check these comparisons.  See
   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
   background/rationale.
*/

/* LOCK prefixed instructions.  These are translated using IR-level
   CAS statements (IRCAS) and are believed to preserve atomicity, even
   from the point of view of some other process racing against a
   simulated one (presumably they communicate via a shared memory
   segment).

   Handlers which are aware of LOCK prefixes are:
      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
      dis_cmpxchg_G_E  (cmpxchg)
      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
      dis_Grp3         (not, neg)
      dis_Grp4         (inc, dec)
      dis_Grp5         (inc, dec)
      dis_Grp8_Imm     (bts, btc, btr)
      dis_bt_G_E       (bts, btc, btr)
      dis_xadd_G_E     (xadd)
*/
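
/* Sketch (illustrative only) of how such a handler might translate,
   say, "lock addl $1, (%rax)" in terms of the casLE() helper defined
   later in this file, where t_addr/t_old/t_new are hypothetical
   temporaries:

      assign( t_old, loadLE(Ity_I32, mkexpr(t_addr)) );
      assign( t_new, binop(Iop_Add32, mkexpr(t_old), mkU32(1)) );
      casLE( mkexpr(t_addr), mkexpr(t_old), mkexpr(t_new),
             guest_RIP_curr_instr );
      ... then build the flags thunk from t_old/t_new as usual ...

   If the CAS fails (the location was contended), the side exit
   restarts the instruction. */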


#include "libvex_basictypes.h"
#include "libvex_ir.h"
#include "libvex.h"
#include "libvex_guest_amd64.h"

#include "main_util.h"
#include "main_globals.h"
#include "guest_generic_bb_to_IR.h"
#include "guest_generic_x87.h"
#include "guest_amd64_defs.h"


/*------------------------------------------------------------*/
/*--- Globals                                              ---*/
/*------------------------------------------------------------*/

/* These are set at the start of the translation of an insn, right
   down in disInstr_AMD64, so that we don't have to pass them around
   endlessly.  They are all constant during the translation of any
   given insn. */

/* These are set at the start of the translation of a BB, so
   that we don't have to pass them around endlessly. */

/* We need to know this to do sub-register accesses correctly. */
static Bool host_is_bigendian;

/* Pointer to the guest code area (points to start of BB, not to the
   insn being processed). */
static UChar* guest_code;

/* The guest address corresponding to guest_code[0]. */
static Addr64 guest_RIP_bbstart;

/* The guest address for the instruction currently being
   translated. */
static Addr64 guest_RIP_curr_instr;

/* The IRSB* into which we're generating code. */
static IRSB* irsb;

/* For ensuring that %rip-relative addressing is done right.  A read
   of %rip generates the address of the next instruction.  It may be
   that we don't conveniently know that inside disAMode().  For sanity
   checking, if the next insn %rip is needed, we make a guess at what
   it is, record that guess here, and set the accompanying Bool to
   indicate that -- after this insn's decode is finished -- that guess
   needs to be checked.  */

/* At the start of each insn decode, is set to (0, False).
   After the decode, if _mustcheck is now True, _assumed is
   checked. */

static Addr64 guest_RIP_next_assumed;
static Bool   guest_RIP_next_mustcheck;


/*------------------------------------------------------------*/
/*--- Helpers for constructing IR.                         ---*/
/*------------------------------------------------------------*/

/* Generate a new temporary of the given type. */
static IRTemp newTemp ( IRType ty )
{
   vassert(isPlausibleIRType(ty));
   return newIRTemp( irsb->tyenv, ty );
}

/* Add a statement to the list held by "irsb". */
static void stmt ( IRStmt* st )
{
   addStmtToIRSB( irsb, st );
}

/* Generate a statement "dst := e". */
static void assign ( IRTemp dst, IRExpr* e )
{
   stmt( IRStmt_WrTmp(dst, e) );
}

static IRExpr* unop ( IROp op, IRExpr* a )
{
   return IRExpr_Unop(op, a);
}

static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
{
   return IRExpr_Binop(op, a1, a2);
}

static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
{
   return IRExpr_Triop(op, a1, a2, a3);
}

static IRExpr* mkexpr ( IRTemp tmp )
{
   return IRExpr_RdTmp(tmp);
}

static IRExpr* mkU8 ( ULong i )
{
   vassert(i < 256);
   return IRExpr_Const(IRConst_U8( (UChar)i ));
}

static IRExpr* mkU16 ( ULong i )
{
   vassert(i < 0x10000ULL);
   return IRExpr_Const(IRConst_U16( (UShort)i ));
}

static IRExpr* mkU32 ( ULong i )
{
   vassert(i < 0x100000000ULL);
   return IRExpr_Const(IRConst_U32( (UInt)i ));
}

static IRExpr* mkU64 ( ULong i )
{
   return IRExpr_Const(IRConst_U64(i));
}

static IRExpr* mkU ( IRType ty, ULong i )
{
   switch (ty) {
      case Ity_I8:  return mkU8(i);
      case Ity_I16: return mkU16(i);
      case Ity_I32: return mkU32(i);
      case Ity_I64: return mkU64(i);
      default: vpanic("mkU(amd64)");
   }
}

static void storeLE ( IRExpr* addr, IRExpr* data )
{
   stmt( IRStmt_Store(Iend_LE, addr, data) );
}

static IRExpr* loadLE ( IRType ty, IRExpr* addr )
{
   return IRExpr_Load(Iend_LE, ty, addr);
}

static IROp mkSizedOp ( IRType ty, IROp op8 )
{
   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
           || op8 == Iop_Mul8
           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
           || op8 == Iop_CasCmpNE8
           || op8 == Iop_Not8 );
   switch (ty) {
      case Ity_I8:  return 0 +op8;
      case Ity_I16: return 1 +op8;
      case Ity_I32: return 2 +op8;
      case Ity_I64: return 3 +op8;
      default: vpanic("mkSizedOp(amd64)");
   }
}
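
/* For example, mkSizedOp(Ity_I32, Iop_Add8) yields Iop_Add32.  This
   relies on the 8-, 16-, 32- and 64-bit variants of each operation
   occupying consecutive positions in the IROp enumeration. */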

static
IRExpr* doScalarWidening ( Int szSmall, Int szBig, Bool signd, IRExpr* src )
{
   if (szSmall == 1 && szBig == 4) {
      return unop(signd ? Iop_8Sto32 : Iop_8Uto32, src);
   }
   if (szSmall == 1 && szBig == 2) {
      return unop(signd ? Iop_8Sto16 : Iop_8Uto16, src);
   }
   if (szSmall == 2 && szBig == 4) {
      return unop(signd ? Iop_16Sto32 : Iop_16Uto32, src);
   }
   if (szSmall == 1 && szBig == 8 && !signd) {
      return unop(Iop_8Uto64, src);
   }
   if (szSmall == 1 && szBig == 8 && signd) {
      return unop(Iop_8Sto64, src);
   }
   if (szSmall == 2 && szBig == 8 && !signd) {
      return unop(Iop_16Uto64, src);
   }
   if (szSmall == 2 && szBig == 8 && signd) {
      return unop(Iop_16Sto64, src);
   }
   vpanic("doScalarWidening(amd64)");
}



/*------------------------------------------------------------*/
/*--- Debugging output                                     ---*/
/*------------------------------------------------------------*/

/* Bomb out if we can't handle something. */
__attribute__ ((noreturn))
static void unimplemented ( const HChar* str )
{
   vex_printf("amd64toIR: unimplemented feature\n");
   vpanic(str);
}

#define DIP(format, args...)           \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_printf(format, ## args)

#define DIS(buf, format, args...)      \
   if (vex_traceflags & VEX_TRACE_FE)  \
      vex_sprintf(buf, format, ## args)


/*------------------------------------------------------------*/
/*--- Offsets of various parts of the amd64 guest state.   ---*/
/*------------------------------------------------------------*/

#define OFFB_RAX       offsetof(VexGuestAMD64State,guest_RAX)
#define OFFB_RBX       offsetof(VexGuestAMD64State,guest_RBX)
#define OFFB_RCX       offsetof(VexGuestAMD64State,guest_RCX)
#define OFFB_RDX       offsetof(VexGuestAMD64State,guest_RDX)
#define OFFB_RSP       offsetof(VexGuestAMD64State,guest_RSP)
#define OFFB_RBP       offsetof(VexGuestAMD64State,guest_RBP)
#define OFFB_RSI       offsetof(VexGuestAMD64State,guest_RSI)
#define OFFB_RDI       offsetof(VexGuestAMD64State,guest_RDI)
#define OFFB_R8        offsetof(VexGuestAMD64State,guest_R8)
#define OFFB_R9        offsetof(VexGuestAMD64State,guest_R9)
#define OFFB_R10       offsetof(VexGuestAMD64State,guest_R10)
#define OFFB_R11       offsetof(VexGuestAMD64State,guest_R11)
#define OFFB_R12       offsetof(VexGuestAMD64State,guest_R12)
#define OFFB_R13       offsetof(VexGuestAMD64State,guest_R13)
#define OFFB_R14       offsetof(VexGuestAMD64State,guest_R14)
#define OFFB_R15       offsetof(VexGuestAMD64State,guest_R15)

#define OFFB_RIP       offsetof(VexGuestAMD64State,guest_RIP)

#define OFFB_FS_ZERO   offsetof(VexGuestAMD64State,guest_FS_ZERO)
#define OFFB_GS_0x60   offsetof(VexGuestAMD64State,guest_GS_0x60)

#define OFFB_CC_OP     offsetof(VexGuestAMD64State,guest_CC_OP)
#define OFFB_CC_DEP1   offsetof(VexGuestAMD64State,guest_CC_DEP1)
#define OFFB_CC_DEP2   offsetof(VexGuestAMD64State,guest_CC_DEP2)
#define OFFB_CC_NDEP   offsetof(VexGuestAMD64State,guest_CC_NDEP)

#define OFFB_FPREGS    offsetof(VexGuestAMD64State,guest_FPREG[0])
#define OFFB_FPTAGS    offsetof(VexGuestAMD64State,guest_FPTAG[0])
#define OFFB_DFLAG     offsetof(VexGuestAMD64State,guest_DFLAG)
#define OFFB_ACFLAG    offsetof(VexGuestAMD64State,guest_ACFLAG)
#define OFFB_IDFLAG    offsetof(VexGuestAMD64State,guest_IDFLAG)
#define OFFB_FTOP      offsetof(VexGuestAMD64State,guest_FTOP)
#define OFFB_FC3210    offsetof(VexGuestAMD64State,guest_FC3210)
#define OFFB_FPROUND   offsetof(VexGuestAMD64State,guest_FPROUND)

#define OFFB_SSEROUND  offsetof(VexGuestAMD64State,guest_SSEROUND)
#define OFFB_YMM0      offsetof(VexGuestAMD64State,guest_YMM0)
#define OFFB_YMM1      offsetof(VexGuestAMD64State,guest_YMM1)
#define OFFB_YMM2      offsetof(VexGuestAMD64State,guest_YMM2)
#define OFFB_YMM3      offsetof(VexGuestAMD64State,guest_YMM3)
#define OFFB_YMM4      offsetof(VexGuestAMD64State,guest_YMM4)
#define OFFB_YMM5      offsetof(VexGuestAMD64State,guest_YMM5)
#define OFFB_YMM6      offsetof(VexGuestAMD64State,guest_YMM6)
#define OFFB_YMM7      offsetof(VexGuestAMD64State,guest_YMM7)
#define OFFB_YMM8      offsetof(VexGuestAMD64State,guest_YMM8)
#define OFFB_YMM9      offsetof(VexGuestAMD64State,guest_YMM9)
#define OFFB_YMM10     offsetof(VexGuestAMD64State,guest_YMM10)
#define OFFB_YMM11     offsetof(VexGuestAMD64State,guest_YMM11)
#define OFFB_YMM12     offsetof(VexGuestAMD64State,guest_YMM12)
#define OFFB_YMM13     offsetof(VexGuestAMD64State,guest_YMM13)
#define OFFB_YMM14     offsetof(VexGuestAMD64State,guest_YMM14)
#define OFFB_YMM15     offsetof(VexGuestAMD64State,guest_YMM15)
#define OFFB_YMM16     offsetof(VexGuestAMD64State,guest_YMM16)

#define OFFB_EMNOTE    offsetof(VexGuestAMD64State,guest_EMNOTE)
#define OFFB_CMSTART   offsetof(VexGuestAMD64State,guest_CMSTART)
#define OFFB_CMLEN     offsetof(VexGuestAMD64State,guest_CMLEN)

#define OFFB_NRADDR    offsetof(VexGuestAMD64State,guest_NRADDR)


/*------------------------------------------------------------*/
/*--- Helper bits and pieces for deconstructing the        ---*/
/*--- amd64 insn stream.                                   ---*/
/*------------------------------------------------------------*/

/* This is the AMD64 register encoding -- integer regs. */
#define R_RAX 0
#define R_RCX 1
#define R_RDX 2
#define R_RBX 3
#define R_RSP 4
#define R_RBP 5
#define R_RSI 6
#define R_RDI 7
#define R_R8  8
#define R_R9  9
#define R_R10 10
#define R_R11 11
#define R_R12 12
#define R_R13 13
#define R_R14 14
#define R_R15 15

/* This is the Intel register encoding -- segment regs. */
#define R_ES 0
#define R_CS 1
#define R_SS 2
#define R_DS 3
#define R_FS 4
#define R_GS 5


/* Various simple conversions */

static ULong extend_s_8to64 ( UChar x )
{
   return (ULong)((((Long)x) << 56) >> 56);
}

static ULong extend_s_16to64 ( UShort x )
{
   return (ULong)((((Long)x) << 48) >> 48);
}

static ULong extend_s_32to64 ( UInt x )
{
   return (ULong)((((Long)x) << 32) >> 32);
}
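
/* For example, extend_s_8to64(0x80) == 0xFFFFFFFFFFFFFF80ULL, whereas
   extend_s_8to64(0x7F) == 0x7FULL. */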
489
490/* Figure out whether the mod and rm parts of a modRM byte refer to a
491   register or memory.  If so, the byte will have the form 11XXXYYY,
492   where YYY is the register number. */
493inline
494static Bool epartIsReg ( UChar mod_reg_rm )
495{
496   return toBool(0xC0 == (mod_reg_rm & 0xC0));
497}
498
499/* Extract the 'g' field from a modRM byte.  This only produces 3
500   bits, which is not a complete register number.  You should avoid
501   this function if at all possible. */
502inline
503static Int gregLO3ofRM ( UChar mod_reg_rm )
504{
505   return (Int)( (mod_reg_rm >> 3) & 7 );
506}
507
508/* Ditto the 'e' field of a modRM byte. */
509inline
510static Int eregLO3ofRM ( UChar mod_reg_rm )
511{
512   return (Int)(mod_reg_rm & 0x7);
513}
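
/* For example, the modRM byte 0xD8 == 11 011 000b: epartIsReg() is
   True, gregLO3ofRM() == 3 and eregLO3ofRM() == 0, denoting (absent
   any REX extension) %rbx and %rax respectively. */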

/* Get an 8/16/32-bit unsigned value out of the insn stream. */

static inline UChar getUChar ( Long delta )
{
   UChar v = guest_code[delta+0];
   return v;
}

static UInt getUDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v & 0xFFFF;
}

//.. static UInt getUDisp ( Int size, Long delta )
//.. {
//..    switch (size) {
//..       case 4: return getUDisp32(delta);
//..       case 2: return getUDisp16(delta);
//..       case 1: return getUChar(delta);
//..       default: vpanic("getUDisp(x86)");
//..    }
//..    return 0; /*notreached*/
//.. }


/* Get a byte value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp8 ( Long delta )
{
   return extend_s_8to64( guest_code[delta] );
}

/* Get a 16-bit value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp16 ( Long delta )
{
   UInt v = guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return extend_s_16to64( (UShort)v );
}

/* Get a 32-bit value out of the insn stream and sign-extend to 64
   bits. */
static Long getSDisp32 ( Long delta )
{
   UInt v = guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return extend_s_32to64( v );
}

/* Get a 64-bit value out of the insn stream. */
static Long getDisp64 ( Long delta )
{
   ULong v = 0;
   v |= guest_code[delta+7]; v <<= 8;
   v |= guest_code[delta+6]; v <<= 8;
   v |= guest_code[delta+5]; v <<= 8;
   v |= guest_code[delta+4]; v <<= 8;
   v |= guest_code[delta+3]; v <<= 8;
   v |= guest_code[delta+2]; v <<= 8;
   v |= guest_code[delta+1]; v <<= 8;
   v |= guest_code[delta+0];
   return v;
}

/* Note: because AMD64 doesn't allow 64-bit literals, it is an error
   if this is called with size==8.  Should not happen. */
static Long getSDisp ( Int size, Long delta )
{
   switch (size) {
      case 4: return getSDisp32(delta);
      case 2: return getSDisp16(delta);
      case 1: return getSDisp8(delta);
      default: vpanic("getSDisp(amd64)");
   }
}

static ULong mkSizeMask ( Int sz )
{
   switch (sz) {
      case 1: return 0x00000000000000FFULL;
      case 2: return 0x000000000000FFFFULL;
      case 4: return 0x00000000FFFFFFFFULL;
      case 8: return 0xFFFFFFFFFFFFFFFFULL;
      default: vpanic("mkSizeMask(amd64)");
   }
}

static Int imin ( Int a, Int b )
{
   return (a < b) ? a : b;
}

static IRType szToITy ( Int n )
{
   switch (n) {
      case 1: return Ity_I8;
      case 2: return Ity_I16;
      case 4: return Ity_I32;
      case 8: return Ity_I64;
      default: vex_printf("\nszToITy(%d)\n", n);
               vpanic("szToITy(amd64)");
   }
}


/*------------------------------------------------------------*/
/*--- For dealing with prefixes.                           ---*/
/*------------------------------------------------------------*/

/* The idea is to pass around an int holding a bitmask summarising
   info from the prefixes seen on the current instruction, including
   info from the REX byte.  This info is used in various places, but
   most especially when making sense of register fields in
   instructions.

   The top 8 bits of the prefix are 0x55, just as a hacky way to
   ensure it really is a valid prefix.

   Things you can safely assume about a well-formed prefix:
   * at most one segment-override bit (CS,DS,ES,FS,GS,SS) is set.
   * if REX is not present then REXW,REXR,REXX,REXB will read
     as zero.
   * F2 and F3 will not both be 1.
*/

typedef UInt  Prefix;

#define PFX_ASO    (1<<0)    /* address-size override present (0x67) */
#define PFX_66     (1<<1)    /* operand-size override-to-16 present (0x66) */
#define PFX_REX    (1<<2)    /* REX byte present (0x40 to 0x4F) */
#define PFX_REXW   (1<<3)    /* REX W bit, if REX present, else 0 */
#define PFX_REXR   (1<<4)    /* REX R bit, if REX present, else 0 */
#define PFX_REXX   (1<<5)    /* REX X bit, if REX present, else 0 */
#define PFX_REXB   (1<<6)    /* REX B bit, if REX present, else 0 */
#define PFX_LOCK   (1<<7)    /* bus LOCK prefix present (0xF0) */
#define PFX_F2     (1<<8)    /* REPNE/REPNZ prefix present (0xF2) */
#define PFX_F3     (1<<9)    /* REP/REPE/REPZ prefix present (0xF3) */
#define PFX_CS     (1<<10)   /* CS segment prefix present (0x2E) */
#define PFX_DS     (1<<11)   /* DS segment prefix present (0x3E) */
#define PFX_ES     (1<<12)   /* ES segment prefix present (0x26) */
#define PFX_FS     (1<<13)   /* FS segment prefix present (0x64) */
#define PFX_GS     (1<<14)   /* GS segment prefix present (0x65) */
#define PFX_SS     (1<<15)   /* SS segment prefix present (0x36) */
#define PFX_VEX    (1<<16)   /* VEX prefix present (0xC4 or 0xC5) */
#define PFX_VEXL   (1<<17)   /* VEX L bit, if VEX present, else 0 */
/* The extra register field VEX.vvvv is encoded (after not-ing it) as
   PFX_VEXnV3 .. PFX_VEXnV0, so these must occupy adjacent bit
   positions. */
#define PFX_VEXnV0 (1<<18)   /* ~VEX vvvv[0], if VEX present, else 0 */
#define PFX_VEXnV1 (1<<19)   /* ~VEX vvvv[1], if VEX present, else 0 */
#define PFX_VEXnV2 (1<<20)   /* ~VEX vvvv[2], if VEX present, else 0 */
#define PFX_VEXnV3 (1<<21)   /* ~VEX vvvv[3], if VEX present, else 0 */


#define PFX_EMPTY 0x55000000

static Bool IS_VALID_PFX ( Prefix pfx ) {
   return toBool((pfx & 0xFF000000) == PFX_EMPTY);
}
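
/* Illustrative example: an instruction carrying a 0x66 prefix and a
   REX byte with REX.B set (e.g. 66 41 ...) would be summarised as
   PFX_EMPTY | PFX_66 | PFX_REX | PFX_REXB == 0x55000046, which
   IS_VALID_PFX accepts. */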

static Bool haveREX ( Prefix pfx ) {
   return toBool(pfx & PFX_REX);
}

static Int getRexW ( Prefix pfx ) {
   return (pfx & PFX_REXW) ? 1 : 0;
}
static Int getRexR ( Prefix pfx ) {
   return (pfx & PFX_REXR) ? 1 : 0;
}
static Int getRexX ( Prefix pfx ) {
   return (pfx & PFX_REXX) ? 1 : 0;
}
static Int getRexB ( Prefix pfx ) {
   return (pfx & PFX_REXB) ? 1 : 0;
}

/* Check a prefix doesn't have F2 or F3 set in it, since usually that
   completely changes what instruction it really is. */
static Bool haveF2orF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) > 0);
}
static Bool haveF2andF3 ( Prefix pfx ) {
   return toBool((pfx & (PFX_F2|PFX_F3)) == (PFX_F2|PFX_F3));
}
static Bool haveF2 ( Prefix pfx ) {
   return toBool((pfx & PFX_F2) > 0);
}
static Bool haveF3 ( Prefix pfx ) {
   return toBool((pfx & PFX_F3) > 0);
}

static Bool have66 ( Prefix pfx ) {
   return toBool((pfx & PFX_66) > 0);
}
static Bool haveASO ( Prefix pfx ) {
   return toBool((pfx & PFX_ASO) > 0);
}
static Bool haveLOCK ( Prefix pfx ) {
   return toBool((pfx & PFX_LOCK) > 0);
}

/* Return True iff pfx has 66 set and F2 and F3 clear */
static Bool have66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_66);
}

/* Return True iff pfx has F2 set and 66 and F3 clear */
static Bool haveF2no66noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has F3 set and 66 and F2 clear */
static Bool haveF3no66noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F3 set and F2 clear */
static Bool haveF3noF2 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F3);
}

/* Return True iff pfx has F2 set and F3 clear */
static Bool haveF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_F2|PFX_F3)) == PFX_F2);
}

/* Return True iff pfx has 66, F2 and F3 clear */
static Bool haveNo66noF2noF3 ( Prefix pfx )
{
  return
     toBool((pfx & (PFX_66|PFX_F2|PFX_F3)) == 0);
}

/* Return True iff pfx has any of 66, F2 and F3 set */
static Bool have66orF2orF3 ( Prefix pfx )
{
  return toBool( ! haveNo66noF2noF3(pfx) );
}

/* Return True iff pfx has 66 or F3 set */
static Bool have66orF3 ( Prefix pfx )
{
   return toBool((pfx & (PFX_66|PFX_F3)) > 0);
}

/* Clear all the segment-override bits in a prefix. */
static Prefix clearSegBits ( Prefix p )
{
   return
      p & ~(PFX_CS | PFX_DS | PFX_ES | PFX_FS | PFX_GS | PFX_SS);
}

/* Get the (inverted, hence back to "normal") VEX.vvvv field. */
static UInt getVexNvvvv ( Prefix pfx ) {
   UInt r = (UInt)pfx;
   r /= (UInt)PFX_VEXnV0; /* pray this turns into a shift */
   return r & 0xF;
}

static Bool haveVEX ( Prefix pfx ) {
   return toBool(pfx & PFX_VEX);
}

static Int getVexL ( Prefix pfx ) {
   return (pfx & PFX_VEXL) ? 1 : 0;
}


/*------------------------------------------------------------*/
/*--- For dealing with escapes                             ---*/
/*------------------------------------------------------------*/


/* Escapes come after the prefixes, but before the primary opcode
   byte.  They escape the primary opcode byte into a bigger space.
   The 0xF0000000 isn't significant, except so as to make it not
   overlap valid Prefix values, for sanity checking.
*/

typedef
   enum {
      ESC_NONE=0xF0000000, // none
      ESC_0F,              // 0F
      ESC_0F38,            // 0F 38
      ESC_0F3A             // 0F 3A
   }
   Escape;


/*------------------------------------------------------------*/
/*--- For dealing with integer registers                   ---*/
/*------------------------------------------------------------*/

/* This is somewhat complex.  The rules are:

   For 64, 32 and 16 bit register references, the e or g fields in the
   modrm bytes supply the low 3 bits of the register number.  The
   fourth (most-significant) bit of the register number is supplied by
   the REX byte, if it is present; else that bit is taken to be zero.

   The REX.R bit supplies the high bit corresponding to the g register
   field, and the REX.B bit supplies the high bit corresponding to the
   e register field (when the mod part of modrm indicates that modrm's
   e component refers to a register and not to memory).

   The REX.X bit supplies a high register bit for certain registers
   in SIB address modes, and is generally rarely used.

   For 8 bit register references, the presence of the REX byte itself
   has significance.  If there is no REX present, then the 3-bit
   number extracted from the modrm e or g field is treated as an index
   into the sequence %al %cl %dl %bl %ah %ch %dh %bh -- that is, the
   old x86 encoding scheme.

   But if there is a REX present, the register reference is
   interpreted in the same way as for 64/32/16-bit references: a high
   bit is extracted from REX, giving a 4-bit number, and the denoted
   register is the lowest 8 bits of the 16 integer registers denoted
   by the number.  In particular, values 4 through 7 of this sequence
   do not refer to %ah %ch %dh %bh but instead to the lowest 8 bits of
   %rsp %rbp %rsi %rdi.

   The REX.W bit has no bearing at all on register numbers.  Instead
   its presence indicates that the operand size is to be overridden
   from its default value (32 bits) to 64 bits instead.  This is in
   the same fashion that an 0x66 prefix indicates the operand size is
   to be overridden from 32 bits down to 16 bits.  When both REX.W and
   0x66 are present there is a conflict, and REX.W takes precedence.

   Rather than try to handle this complexity using a single huge
   function, several smaller ones are provided.  The aim is to make it
   as difficult as possible to screw up register decoding in a subtle
   and hard-to-track-down way.

   Because these routines fish around in the host's memory (that is,
   in the guest state area) for sub-parts of guest registers, their
   correctness depends on the host's endianness.  So far these
   routines only work for little-endian hosts.  Those for which
   endianness is important have assertions to ensure sanity.
*/


/* About the simplest question you can ask: where do the 64-bit
   integer registers live (in the guest state) ? */

static Int integerGuestReg64Offset ( UInt reg )
{
   switch (reg) {
      case R_RAX: return OFFB_RAX;
      case R_RCX: return OFFB_RCX;
      case R_RDX: return OFFB_RDX;
      case R_RBX: return OFFB_RBX;
      case R_RSP: return OFFB_RSP;
      case R_RBP: return OFFB_RBP;
      case R_RSI: return OFFB_RSI;
      case R_RDI: return OFFB_RDI;
      case R_R8:  return OFFB_R8;
      case R_R9:  return OFFB_R9;
      case R_R10: return OFFB_R10;
      case R_R11: return OFFB_R11;
      case R_R12: return OFFB_R12;
      case R_R13: return OFFB_R13;
      case R_R14: return OFFB_R14;
      case R_R15: return OFFB_R15;
      default: vpanic("integerGuestReg64Offset(amd64)");
   }
}


/* Produce the name of an integer register, for printing purposes.
   reg is a number in the range 0 .. 15 that has been generated from a
   3-bit reg-field number and a REX extension bit.  irregular denotes
   the case where sz==1 and no REX byte is present. */

static
const HChar* nameIReg ( Int sz, UInt reg, Bool irregular )
{
   static const HChar* ireg64_names[16]
     = { "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
         "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15" };
   static const HChar* ireg32_names[16]
     = { "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
         "%r8d", "%r9d", "%r10d","%r11d","%r12d","%r13d","%r14d","%r15d" };
   static const HChar* ireg16_names[16]
     = { "%ax",  "%cx",  "%dx",  "%bx",  "%sp",  "%bp",  "%si",  "%di",
         "%r8w", "%r9w", "%r10w","%r11w","%r12w","%r13w","%r14w","%r15w" };
   static const HChar* ireg8_names[16]
     = { "%al",  "%cl",  "%dl",  "%bl",  "%spl", "%bpl", "%sil", "%dil",
         "%r8b", "%r9b", "%r10b","%r11b","%r12b","%r13b","%r14b","%r15b" };
   static const HChar* ireg8_irregular[8]
     = { "%al", "%cl", "%dl", "%bl", "%ah", "%ch", "%dh", "%bh" };

   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   switch (sz) {
      case 8: return ireg64_names[reg];
      case 4: return ireg32_names[reg];
      case 2: return ireg16_names[reg];
      case 1: if (irregular) {
                 return ireg8_irregular[reg];
              } else {
                 return ireg8_names[reg];
              }
      default: vpanic("nameIReg(amd64)");
   }
}

/* Using the same argument conventions as nameIReg, produce the
   guest state offset of an integer register. */

static
Int offsetIReg ( Int sz, UInt reg, Bool irregular )
{
   vassert(reg < 16);
   if (sz == 1) {
      if (irregular)
         vassert(reg < 8);
   } else {
      vassert(irregular == False);
   }

   /* Deal with irregular case -- sz==1 and no REX present */
   if (sz == 1 && irregular) {
      switch (reg) {
         case R_RSP: return 1+ OFFB_RAX;
         case R_RBP: return 1+ OFFB_RCX;
         case R_RSI: return 1+ OFFB_RDX;
         case R_RDI: return 1+ OFFB_RBX;
         default:    break; /* use the normal case */
      }
   }

   /* Normal case */
   return integerGuestReg64Offset(reg);
}
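
/* For example, offsetIReg(1, R_RBP, True) denotes %ch, which on a
   little-endian host lives at byte offset 1 within guest_RCX -- hence
   the "1+ OFFB_RCX" case above. */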


/* Read the %CL register :: Ity_I8, for shift/rotate operations. */

static IRExpr* getIRegCL ( void )
{
   vassert(!host_is_bigendian);
   return IRExpr_Get( OFFB_RCX, Ity_I8 );
}


/* Write to the %AH register. */

static void putIRegAH ( IRExpr* e )
{
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I8);
   stmt( IRStmt_Put( OFFB_RAX+1, e ) );
}


/* Read/write various widths of %RAX, as it has various
   special-purpose uses. */

static const HChar* nameIRegRAX ( Int sz )
{
   switch (sz) {
      case 1: return "%al";
      case 2: return "%ax";
      case 4: return "%eax";
      case 8: return "%rax";
      default: vpanic("nameIRegRAX(amd64)");
   }
}

static IRExpr* getIRegRAX ( Int sz )
{
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RAX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RAX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RAX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RAX, Ity_I64 );
      default: vpanic("getIRegRAX(amd64)");
   }
}

static void putIRegRAX ( Int sz, IRExpr* e )
{
   IRType ty = typeOfIRExpr(irsb->tyenv, e);
   vassert(!host_is_bigendian);
   switch (sz) {
      case 8: vassert(ty == Ity_I64);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 4: vassert(ty == Ity_I32);
              stmt( IRStmt_Put( OFFB_RAX, unop(Iop_32Uto64,e) ));
              break;
      case 2: vassert(ty == Ity_I16);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      case 1: vassert(ty == Ity_I8);
              stmt( IRStmt_Put( OFFB_RAX, e ));
              break;
      default: vpanic("putIRegRAX(amd64)");
   }
}
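
/* Note the asymmetry above: a 4-byte write is zero-extended to the
   full register, matching the AMD64 rule that 32-bit operations clear
   the upper 32 bits of the destination, whereas 1- and 2-byte writes
   leave the remaining bytes untouched. */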


/* Read/write various widths of %RDX, as it has various
   special-purpose uses. */

static const HChar* nameIRegRDX ( Int sz )
{
   switch (sz) {
      case 1: return "%dl";
      case 2: return "%dx";
      case 4: return "%edx";
      case 8: return "%rdx";
      default: vpanic("nameIRegRDX(amd64)");
   }
}

static IRExpr* getIRegRDX ( Int sz )
{
   vassert(!host_is_bigendian);
   switch (sz) {
      case 1: return IRExpr_Get( OFFB_RDX, Ity_I8 );
      case 2: return IRExpr_Get( OFFB_RDX, Ity_I16 );
      case 4: return unop(Iop_64to32, IRExpr_Get( OFFB_RDX, Ity_I64 ));
      case 8: return IRExpr_Get( OFFB_RDX, Ity_I64 );
      default: vpanic("getIRegRDX(amd64)");
   }
}

static void putIRegRDX ( Int sz, IRExpr* e )
{
   vassert(!host_is_bigendian);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   switch (sz) {
      case 8: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 4: stmt( IRStmt_Put( OFFB_RDX, unop(Iop_32Uto64,e) ));
              break;
      case 2: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      case 1: stmt( IRStmt_Put( OFFB_RDX, e ));
              break;
      default: vpanic("putIRegRDX(amd64)");
   }
}


/* Simplistic functions to deal with the integer registers as a
   straightforward bank of 16 64-bit regs. */

static IRExpr* getIReg64 ( UInt regno )
{
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I64 );
}

static void putIReg64 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno), e ) );
}

static const HChar* nameIReg64 ( UInt regno )
{
   return nameIReg( 8, regno, False );
}


/* Simplistic functions to deal with the lower halves of integer
   registers as a straightforward bank of 16 32-bit regs. */

static IRExpr* getIReg32 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return unop(Iop_64to32,
               IRExpr_Get( integerGuestReg64Offset(regno),
                           Ity_I64 ));
}

static void putIReg32 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_32Uto64,e) ) );
}

static const HChar* nameIReg32 ( UInt regno )
{
   return nameIReg( 4, regno, False );
}


/* Simplistic functions to deal with the lower quarters of integer
   registers as a straightforward bank of 16 16-bit regs. */

static IRExpr* getIReg16 ( UInt regno )
{
   vassert(!host_is_bigendian);
   return IRExpr_Get( integerGuestReg64Offset(regno),
                      Ity_I16 );
}

static void putIReg16 ( UInt regno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
   stmt( IRStmt_Put( integerGuestReg64Offset(regno),
                     unop(Iop_16Uto64,e) ) );
}

static const HChar* nameIReg16 ( UInt regno )
{
   return nameIReg( 2, regno, False );
}


/* Sometimes what we know is a 3-bit register number, a REX byte, and
   which field of the REX byte is to be used to extend to a 4-bit
   number.  These functions cater for that situation.
*/
static IRExpr* getIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return getIReg64( lo3bits | (getRexX(pfx) << 3) );
}

static const HChar* nameIReg64rexX ( Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   return nameIReg( 8, lo3bits | (getRexX(pfx) << 3), False );
}

static const HChar* nameIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   return nameIReg( sz, lo3bits | (getRexB(pfx) << 3),
                        toBool(sz==1 && !haveREX(pfx)) );
}

static IRExpr* getIRegRexB ( Int sz, Prefix pfx, UInt lo3bits )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get(
                     offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                     False/*!irregular*/ ),
                     szToITy(sz)
                 )
             );
   } else {
      return IRExpr_Get(
                offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                                toBool(sz==1 && !haveREX(pfx)) ),
                szToITy(sz)
             );
   }
}

static void putIRegRexB ( Int sz, Prefix pfx, UInt lo3bits, IRExpr* e )
{
   vassert(lo3bits < 8);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   vassert(typeOfIRExpr(irsb->tyenv, e) == szToITy(sz));
   stmt( IRStmt_Put(
            offsetIReg( sz, lo3bits | (getRexB(pfx) << 3),
                            toBool(sz==1 && !haveREX(pfx)) ),
            sz==4 ? unop(Iop_32Uto64,e) : e
   ));
}


/* Functions for getting register numbers from modrm bytes and REX
   when we don't have to consider the complexities of integer subreg
   accesses.
*/
/* Extract the g reg field from a modRM byte, and augment it using the
   REX.R bit from the supplied REX byte.  The R bit usually is
   associated with the g register field.
*/
static UInt gregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
{
   Int reg = (Int)( (mod_reg_rm >> 3) & 7 );
   reg += (pfx & PFX_REXR) ? 8 : 0;
   return reg;
}

/* Extract the e reg field from a modRM byte, and augment it using the
   REX.B bit from the supplied REX byte.  The B bit usually is
   associated with the e register field (when modrm indicates e is a
   register, that is).
*/
static UInt eregOfRexRM ( Prefix pfx, UChar mod_reg_rm )
{
   Int rm;
   vassert(epartIsReg(mod_reg_rm));
   rm = (Int)(mod_reg_rm & 0x7);
   rm += (pfx & PFX_REXB) ? 8 : 0;
   return rm;
}
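
/* For example, given a prefix with PFX_REXB set, the modRM byte 0xC1
   (mod=11, rm=001) gives eregOfRexRM() == 9, i.e. %r9. */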


/* General functions for dealing with integer register access. */

/* Produce the guest state offset for a reference to the 'g' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.
*/
static UInt offsetIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(!host_is_bigendian);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = gregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

static
IRExpr* getIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegG( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

static
void putIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegG( sz, pfx, mod_reg_rm ), e ) );
}

static
const HChar* nameIRegG ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, gregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


static
IRExpr* getIRegV ( Int sz, Prefix pfx )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIReg( sz, getVexNvvvv(pfx), False ),
                         szToITy(sz) );
   }
}

static
void putIRegV ( Int sz, Prefix pfx, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIReg( sz, getVexNvvvv(pfx), False ), e ) );
}

static
const HChar* nameIRegV ( Int sz, Prefix pfx )
{
   return nameIReg( sz, getVexNvvvv(pfx), False );
}



/* Produce the guest state offset for a reference to the 'e' register
   field in a modrm byte, taking into account REX (or its absence),
   and the size of the access.  eregOfRexRM will assert if mod_reg_rm
   denotes a memory access rather than a register access.
*/
static UInt offsetIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   UInt reg;
   vassert(!host_is_bigendian);
   vassert(IS_VALID_PFX(pfx));
   vassert(sz == 8 || sz == 4 || sz == 2 || sz == 1);
   reg = eregOfRexRM( pfx, mod_reg_rm );
   return offsetIReg( sz, reg, toBool(sz == 1 && !haveREX(pfx)) );
}

static
IRExpr* getIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   if (sz == 4) {
      sz = 8;
      return unop(Iop_64to32,
                  IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                              szToITy(sz) ));
   } else {
      return IRExpr_Get( offsetIRegE( sz, pfx, mod_reg_rm ),
                         szToITy(sz) );
   }
}

static
void putIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == szToITy(sz));
   if (sz == 4) {
      e = unop(Iop_32Uto64,e);
   }
   stmt( IRStmt_Put( offsetIRegE( sz, pfx, mod_reg_rm ), e ) );
}

static
const HChar* nameIRegE ( Int sz, Prefix pfx, UChar mod_reg_rm )
{
   return nameIReg( sz, eregOfRexRM(pfx,mod_reg_rm),
                        toBool(sz==1 && !haveREX(pfx)) );
}


/*------------------------------------------------------------*/
/*--- For dealing with XMM registers                       ---*/
/*------------------------------------------------------------*/

static Int ymmGuestRegOffset ( UInt ymmreg )
{
   switch (ymmreg) {
      case 0:  return OFFB_YMM0;
      case 1:  return OFFB_YMM1;
      case 2:  return OFFB_YMM2;
      case 3:  return OFFB_YMM3;
      case 4:  return OFFB_YMM4;
      case 5:  return OFFB_YMM5;
      case 6:  return OFFB_YMM6;
      case 7:  return OFFB_YMM7;
      case 8:  return OFFB_YMM8;
      case 9:  return OFFB_YMM9;
      case 10: return OFFB_YMM10;
      case 11: return OFFB_YMM11;
      case 12: return OFFB_YMM12;
      case 13: return OFFB_YMM13;
      case 14: return OFFB_YMM14;
      case 15: return OFFB_YMM15;
      default: vpanic("ymmGuestRegOffset(amd64)");
   }
}

static Int xmmGuestRegOffset ( UInt xmmreg )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   return ymmGuestRegOffset( xmmreg );
}
/* Lanes of vector registers are always numbered from zero, with zero
   being the least significant lane (rightmost in the register). */

static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
}

static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
}

static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
}

static Int ymmGuestRegLane128offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 2);
   return ymmGuestRegOffset( ymmreg ) + 16 * laneno;
}

static Int ymmGuestRegLane64offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 4);
   return ymmGuestRegOffset( ymmreg ) + 8 * laneno;
}

static Int ymmGuestRegLane32offset ( UInt ymmreg, Int laneno )
{
   /* Correct for little-endian host only. */
   vassert(!host_is_bigendian);
   vassert(laneno >= 0 && laneno < 8);
   return ymmGuestRegOffset( ymmreg ) + 4 * laneno;
}
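
/* For example, ymmGuestRegLane128offset(2, 1) == OFFB_YMM2 + 16,
   i.e. the upper 128 bits of %ymm2. */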

static IRExpr* getXMMReg ( UInt xmmreg )
{
   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
}

static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
}

static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
}

static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
}

static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
{
   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
}

static IRExpr* getXMMRegLane16 ( UInt xmmreg, Int laneno )
{
  return IRExpr_Get( xmmGuestRegLane16offset(xmmreg,laneno), Ity_I16 );
}

static void putXMMReg ( UInt xmmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
}

static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
}

static IRExpr* getYMMReg ( UInt xmmreg )
{
   return IRExpr_Get( ymmGuestRegOffset(xmmreg), Ity_V256 );
}

static IRExpr* getYMMRegLane128 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane128offset(ymmreg,laneno), Ity_V128 );
}

static IRExpr* getYMMRegLane64 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane64offset(ymmreg,laneno), Ity_I64 );
}

static IRExpr* getYMMRegLane32 ( UInt ymmreg, Int laneno )
{
   return IRExpr_Get( ymmGuestRegLane32offset(ymmreg,laneno), Ity_I32 );
}

static void putYMMReg ( UInt ymmreg, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V256);
   stmt( IRStmt_Put( ymmGuestRegOffset(ymmreg), e ) );
}

static void putYMMRegLane128 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
   stmt( IRStmt_Put( ymmGuestRegLane128offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane64F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane64 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
   stmt( IRStmt_Put( ymmGuestRegLane64offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane32F ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

static void putYMMRegLane32 ( UInt ymmreg, Int laneno, IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
   stmt( IRStmt_Put( ymmGuestRegLane32offset(ymmreg,laneno), e ) );
}

static IRExpr* mkV128 ( UShort mask )
{
   return IRExpr_Const(IRConst_V128(mask));
}

/* Write the low half of a YMM reg and zero out the upper half. */
static void putYMMRegLoAndZU ( UInt ymmreg, IRExpr* e )
{
   putYMMRegLane128( ymmreg, 0, e );
   putYMMRegLane128( ymmreg, 1, mkV128(0) );
}

static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
{
   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
   return unop(Iop_64to1,
               binop(Iop_And64,
                     unop(Iop_1Uto64,x),
                     unop(Iop_1Uto64,y)));
}

/* Generate a compare-and-swap operation, operating on memory at
   'addr'.  The expected value is 'expVal' and the new value is
   'newVal'.  If the operation fails, then transfer control (with a
   no-redir jump (XXX no -- see comment at top of this file)) to
   'restart_point', which is presumably the address of the guest
   instruction again -- retrying, essentially. */
static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
                    Addr64 restart_point )
{
   IRCAS* cas;
   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
   IRTemp oldTmp = newTemp(tyE);
   IRTemp expTmp = newTemp(tyE);
   vassert(tyE == tyN);
   vassert(tyE == Ity_I64 || tyE == Ity_I32
           || tyE == Ity_I16 || tyE == Ity_I8);
   assign(expTmp, expVal);
   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
                  NULL, mkexpr(expTmp), NULL, newVal );
   stmt( IRStmt_CAS(cas) );
   stmt( IRStmt_Exit(
            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
                   mkexpr(oldTmp), mkexpr(expTmp) ),
            Ijk_Boring, /*Ijk_NoRedir*/
            IRConst_U64( restart_point ),
            OFFB_RIP
         ));
}
1626
1627
1628/*------------------------------------------------------------*/
1629/*--- Helpers for %rflags.                                 ---*/
1630/*------------------------------------------------------------*/
1631
1632/* -------------- Evaluating the flags-thunk. -------------- */
1633
1634/* Build IR to calculate all the eflags from stored
1635   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
1636   Ity_I64. */
1637static IRExpr* mk_amd64g_calculate_rflags_all ( void )
1638{
1639   IRExpr** args
1640      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1641                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1642                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1643                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1644   IRExpr* call
1645      = mkIRExprCCall(
1646           Ity_I64,
1647           0/*regparm*/,
1648           "amd64g_calculate_rflags_all", &amd64g_calculate_rflags_all,
1649           args
1650        );
1651   /* Exclude OP and NDEP from definedness checking.  We're only
1652      interested in DEP1 and DEP2. */
1653   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1654   return call;
1655}
1656
1657/* Build IR to calculate some particular condition from stored
1658   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
   Ity_I1. */
1660static IRExpr* mk_amd64g_calculate_condition ( AMD64Condcode cond )
1661{
1662   IRExpr** args
1663      = mkIRExprVec_5( mkU64(cond),
1664                       IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1665                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1666                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1667                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1668   IRExpr* call
1669      = mkIRExprCCall(
1670           Ity_I64,
1671           0/*regparm*/,
1672           "amd64g_calculate_condition", &amd64g_calculate_condition,
1673           args
1674        );
1675   /* Exclude the requested condition, OP and NDEP from definedness
1676      checking.  We're only interested in DEP1 and DEP2. */
1677   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
1678   return unop(Iop_64to1, call);
1679}
1680
1681/* Build IR to calculate just the carry flag from stored
1682   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I64. */
1683static IRExpr* mk_amd64g_calculate_rflags_c ( void )
1684{
1685   IRExpr** args
1686      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I64),
1687                       IRExpr_Get(OFFB_CC_DEP1, Ity_I64),
1688                       IRExpr_Get(OFFB_CC_DEP2, Ity_I64),
1689                       IRExpr_Get(OFFB_CC_NDEP, Ity_I64) );
1690   IRExpr* call
1691      = mkIRExprCCall(
1692           Ity_I64,
1693           0/*regparm*/,
1694           "amd64g_calculate_rflags_c", &amd64g_calculate_rflags_c,
1695           args
1696        );
1697   /* Exclude OP and NDEP from definedness checking.  We're only
1698      interested in DEP1 and DEP2. */
1699   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
1700   return call;
1701}
1702
1703
1704/* -------------- Building the flags-thunk. -------------- */
1705
1706/* The machinery in this section builds the flag-thunk following a
1707   flag-setting operation.  Hence the various setFlags_* functions.
1708*/
1709
1710static Bool isAddSub ( IROp op8 )
1711{
1712   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
1713}
1714
1715static Bool isLogic ( IROp op8 )
1716{
1717   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
1718}
1719
1720/* U-widen 1/8/16/32/64 bit int expr to 64. */
1721static IRExpr* widenUto64 ( IRExpr* e )
1722{
1723   switch (typeOfIRExpr(irsb->tyenv,e)) {
1724      case Ity_I64: return e;
1725      case Ity_I32: return unop(Iop_32Uto64, e);
1726      case Ity_I16: return unop(Iop_16Uto64, e);
1727      case Ity_I8:  return unop(Iop_8Uto64, e);
1728      case Ity_I1:  return unop(Iop_1Uto64, e);
1729      default: vpanic("widenUto64");
1730   }
1731}
1732
/* S-widen 8/16/32/64 bit int expr to 64. */
1734static IRExpr* widenSto64 ( IRExpr* e )
1735{
1736   switch (typeOfIRExpr(irsb->tyenv,e)) {
1737      case Ity_I64: return e;
1738      case Ity_I32: return unop(Iop_32Sto64, e);
1739      case Ity_I16: return unop(Iop_16Sto64, e);
1740      case Ity_I8:  return unop(Iop_8Sto64, e);
1741      default: vpanic("widenSto64");
1742   }
1743}
1744
1745/* Narrow 8/16/32/64 bit int expr to 8/16/32/64.  Clearly only some
1746   of these combinations make sense. */
1747static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
1748{
1749   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
1750   if (src_ty == dst_ty)
1751      return e;
1752   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
1753      return unop(Iop_32to16, e);
1754   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
1755      return unop(Iop_32to8, e);
1756   if (src_ty == Ity_I64 && dst_ty == Ity_I32)
1757      return unop(Iop_64to32, e);
1758   if (src_ty == Ity_I64 && dst_ty == Ity_I16)
1759      return unop(Iop_64to16, e);
1760   if (src_ty == Ity_I64 && dst_ty == Ity_I8)
1761      return unop(Iop_64to8, e);
1762
1763   vex_printf("\nsrc, dst tys are: ");
1764   ppIRType(src_ty);
1765   vex_printf(", ");
1766   ppIRType(dst_ty);
1767   vex_printf("\n");
1768   vpanic("narrowTo(amd64)");
1769}
1770
1771
1772/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
1773   auto-sized up to the real op. */
1774
1775static
1776void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
1777{
1778   Int ccOp = 0;
1779   switch (ty) {
1780      case Ity_I8:  ccOp = 0; break;
1781      case Ity_I16: ccOp = 1; break;
1782      case Ity_I32: ccOp = 2; break;
1783      case Ity_I64: ccOp = 3; break;
1784      default: vassert(0);
1785   }
1786   switch (op8) {
1787      case Iop_Add8: ccOp += AMD64G_CC_OP_ADDB;   break;
1788      case Iop_Sub8: ccOp += AMD64G_CC_OP_SUBB;   break;
1789      default:       ppIROp(op8);
1790                     vpanic("setFlags_DEP1_DEP2(amd64)");
1791   }
1792   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
1793   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
1794   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(dep2))) );
1795}
1796
1797
1798/* Set the OP and DEP1 fields only, and write zero to DEP2. */
1799
1800static
1801void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
1802{
1803   Int ccOp = 0;
1804   switch (ty) {
1805      case Ity_I8:  ccOp = 0; break;
1806      case Ity_I16: ccOp = 1; break;
1807      case Ity_I32: ccOp = 2; break;
1808      case Ity_I64: ccOp = 3; break;
1809      default: vassert(0);
1810   }
1811   switch (op8) {
1812      case Iop_Or8:
1813      case Iop_And8:
1814      case Iop_Xor8: ccOp += AMD64G_CC_OP_LOGICB; break;
1815      default:       ppIROp(op8);
1816                     vpanic("setFlags_DEP1(amd64)");
1817   }
1818   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
1819   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dep1))) );
1820   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
1821}
1822
1823
/* For shift operations, we put in the result and the undershifted
   result.  If, however, the shift amount is zero, the thunk is left
   unchanged. */
1827
1828static void setFlags_DEP1_DEP2_shift ( IROp    op64,
1829                                       IRTemp  res,
1830                                       IRTemp  resUS,
1831                                       IRType  ty,
1832                                       IRTemp  guard )
1833{
1834   Int ccOp = 0;
1835   switch (ty) {
1836      case Ity_I8:  ccOp = 0; break;
1837      case Ity_I16: ccOp = 1; break;
1838      case Ity_I32: ccOp = 2; break;
1839      case Ity_I64: ccOp = 3; break;
1840      default: vassert(0);
1841   }
1842
1843   vassert(guard);
1844
1845   /* Both kinds of right shifts are handled by the same thunk
1846      operation. */
1847   switch (op64) {
1848      case Iop_Shr64:
1849      case Iop_Sar64: ccOp += AMD64G_CC_OP_SHRB; break;
1850      case Iop_Shl64: ccOp += AMD64G_CC_OP_SHLB; break;
1851      default:        ppIROp(op64);
1852                      vpanic("setFlags_DEP1_DEP2_shift(amd64)");
1853   }
1854
1855   /* guard :: Ity_I8.  We need to convert it to I1. */
1856   IRTemp guardB = newTemp(Ity_I1);
1857   assign( guardB, binop(Iop_CmpNE8, mkexpr(guard), mkU8(0)) );
1858
1859   /* DEP1 contains the result, DEP2 contains the undershifted value. */
1860   stmt( IRStmt_Put( OFFB_CC_OP,
1861                     IRExpr_ITE( mkexpr(guardB),
1862                                 mkU64(ccOp),
1863                                 IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
1864   stmt( IRStmt_Put( OFFB_CC_DEP1,
1865                     IRExpr_ITE( mkexpr(guardB),
1866                                 widenUto64(mkexpr(res)),
1867                                 IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
1868   stmt( IRStmt_Put( OFFB_CC_DEP2,
1869                     IRExpr_ITE( mkexpr(guardB),
1870                                 widenUto64(mkexpr(resUS)),
1871                                 IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
1872}
1873
1874
1875/* For the inc/dec case, we store in DEP1 the result value and in NDEP
1876   the former value of the carry flag, which unfortunately we have to
1877   compute. */
1878
1879static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
1880{
1881   Int ccOp = inc ? AMD64G_CC_OP_INCB : AMD64G_CC_OP_DECB;
1882
1883   switch (ty) {
1884      case Ity_I8:  ccOp += 0; break;
1885      case Ity_I16: ccOp += 1; break;
1886      case Ity_I32: ccOp += 2; break;
1887      case Ity_I64: ccOp += 3; break;
1888      default: vassert(0);
1889   }
1890
1891   /* This has to come first, because calculating the C flag
1892      may require reading all four thunk fields. */
1893   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_amd64g_calculate_rflags_c()) );
1894   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(ccOp)) );
1895   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(res))) );
1896   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
1897}
1898
1899
1900/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
1901   two arguments. */
1902
1903static
1904void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, ULong base_op )
1905{
1906   switch (ty) {
1907      case Ity_I8:
1908         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+0) ) );
1909         break;
1910      case Ity_I16:
1911         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+1) ) );
1912         break;
1913      case Ity_I32:
1914         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+2) ) );
1915         break;
1916      case Ity_I64:
1917         stmt( IRStmt_Put( OFFB_CC_OP, mkU64(base_op+3) ) );
1918         break;
1919      default:
1920         vpanic("setFlags_MUL(amd64)");
1921   }
1922   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(arg1)) ));
1923   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(arg2)) ));
1924}
1925
1926
1927/* -------------- Condition codes. -------------- */
1928
1929/* Condition codes, using the AMD encoding.  */
1930
1931static const HChar* name_AMD64Condcode ( AMD64Condcode cond )
1932{
1933   switch (cond) {
1934      case AMD64CondO:      return "o";
1935      case AMD64CondNO:     return "no";
1936      case AMD64CondB:      return "b";
1937      case AMD64CondNB:     return "ae"; /*"nb";*/
1938      case AMD64CondZ:      return "e"; /*"z";*/
1939      case AMD64CondNZ:     return "ne"; /*"nz";*/
1940      case AMD64CondBE:     return "be";
1941      case AMD64CondNBE:    return "a"; /*"nbe";*/
1942      case AMD64CondS:      return "s";
1943      case AMD64CondNS:     return "ns";
1944      case AMD64CondP:      return "p";
1945      case AMD64CondNP:     return "np";
1946      case AMD64CondL:      return "l";
1947      case AMD64CondNL:     return "ge"; /*"nl";*/
1948      case AMD64CondLE:     return "le";
1949      case AMD64CondNLE:    return "g"; /*"nle";*/
1950      case AMD64CondAlways: return "ALWAYS";
1951      default: vpanic("name_AMD64Condcode");
1952   }
1953}
1954
1955static
1956AMD64Condcode positiveIse_AMD64Condcode ( AMD64Condcode  cond,
1957                                          /*OUT*/Bool*   needInvert )
1958{
1959   vassert(cond >= AMD64CondO && cond <= AMD64CondNLE);
1960   if (cond & 1) {
1961      *needInvert = True;
1962      return cond-1;
1963   } else {
1964      *needInvert = False;
1965      return cond;
1966   }
1967}
1968
1969
1970/* -------------- Helpers for ADD/SUB with carry. -------------- */
1971
1972/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
1973   appropriately.
1974
1975   Optionally, generate a store for the 'tres' value.  This can either
1976   be a normal store, or it can be a cas-with-possible-failure style
1977   store:
1978
1979   if taddr is IRTemp_INVALID, then no store is generated.
1980
1981   if taddr is not IRTemp_INVALID, then a store (using taddr as
1982   the address) is generated:
1983
1984     if texpVal is IRTemp_INVALID then a normal store is
1985     generated, and restart_point must be zero (it is irrelevant).
1986
1987     if texpVal is not IRTemp_INVALID then a cas-style store is
1988     generated.  texpVal is the expected value, restart_point
1989     is the restart point if the store fails, and texpVal must
1990     have the same type as tres.
1991
1992*/
1993static void helper_ADC ( Int sz,
1994                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1995                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
1997{
1998   UInt    thunkOp;
1999   IRType  ty    = szToITy(sz);
2000   IRTemp  oldc  = newTemp(Ity_I64);
2001   IRTemp  oldcn = newTemp(ty);
2002   IROp    plus  = mkSizedOp(ty, Iop_Add8);
2003   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
2004
2005   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
2006
2007   switch (sz) {
2008      case 8:  thunkOp = AMD64G_CC_OP_ADCQ; break;
2009      case 4:  thunkOp = AMD64G_CC_OP_ADCL; break;
2010      case 2:  thunkOp = AMD64G_CC_OP_ADCW; break;
2011      case 1:  thunkOp = AMD64G_CC_OP_ADCB; break;
2012      default: vassert(0);
2013   }
2014
2015   /* oldc = old carry flag, 0 or 1 */
2016   assign( oldc,  binop(Iop_And64,
2017                        mk_amd64g_calculate_rflags_c(),
2018                        mkU64(1)) );
2019
2020   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
2021
2022   assign( tres, binop(plus,
2023                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
2024                       mkexpr(oldcn)) );
2025
2026   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
2027      start of this function. */
2028   if (taddr != IRTemp_INVALID) {
2029      if (texpVal == IRTemp_INVALID) {
2030         vassert(restart_point == 0);
2031         storeLE( mkexpr(taddr), mkexpr(tres) );
2032      } else {
2033         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
2034         /* .. and hence 'texpVal' has the same type as 'tres'. */
2035         casLE( mkexpr(taddr),
2036                mkexpr(texpVal), mkexpr(tres), restart_point );
2037      }
2038   }
2039
2040   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
2041   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1))  ));
2042   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
2043                                                         mkexpr(oldcn)) )) );
2044   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
2045}
2046
2047
2048/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
2049   appropriately.  As with helper_ADC, possibly generate a store of
2050   the result -- see comments on helper_ADC for details.
2051*/
2052static void helper_SBB ( Int sz,
2053                         IRTemp tres, IRTemp ta1, IRTemp ta2,
2054                         /* info about optional store: */
                         IRTemp taddr, IRTemp texpVal, Addr64 restart_point )
2056{
2057   UInt    thunkOp;
2058   IRType  ty    = szToITy(sz);
2059   IRTemp  oldc  = newTemp(Ity_I64);
2060   IRTemp  oldcn = newTemp(ty);
2061   IROp    minus = mkSizedOp(ty, Iop_Sub8);
2062   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
2063
2064   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
2065
2066   switch (sz) {
2067      case 8:  thunkOp = AMD64G_CC_OP_SBBQ; break;
2068      case 4:  thunkOp = AMD64G_CC_OP_SBBL; break;
2069      case 2:  thunkOp = AMD64G_CC_OP_SBBW; break;
2070      case 1:  thunkOp = AMD64G_CC_OP_SBBB; break;
2071      default: vassert(0);
2072   }
2073
2074   /* oldc = old carry flag, 0 or 1 */
2075   assign( oldc, binop(Iop_And64,
2076                       mk_amd64g_calculate_rflags_c(),
2077                       mkU64(1)) );
2078
2079   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
2080
2081   assign( tres, binop(minus,
2082                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
2083                       mkexpr(oldcn)) );
2084
2085   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
2086      start of this function. */
2087   if (taddr != IRTemp_INVALID) {
2088      if (texpVal == IRTemp_INVALID) {
2089         vassert(restart_point == 0);
2090         storeLE( mkexpr(taddr), mkexpr(tres) );
2091      } else {
2092         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
2093         /* .. and hence 'texpVal' has the same type as 'tres'. */
2094         casLE( mkexpr(taddr),
2095                mkexpr(texpVal), mkexpr(tres), restart_point );
2096      }
2097   }
2098
2099   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(thunkOp) ) );
2100   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(ta1) )) );
2101   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(binop(xor, mkexpr(ta2),
2102                                                         mkexpr(oldcn)) )) );
2103   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
2104}
2105
2106
2107/* -------------- Helpers for disassembly printing. -------------- */
2108
2109static const HChar* nameGrp1 ( Int opc_aux )
2110{
2111   static const HChar* grp1_names[8]
2112     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
2113   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(amd64)");
2114   return grp1_names[opc_aux];
2115}
2116
2117static const HChar* nameGrp2 ( Int opc_aux )
2118{
2119   static const HChar* grp2_names[8]
2120     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
2121   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(amd64)");
2122   return grp2_names[opc_aux];
2123}
2124
2125static const HChar* nameGrp4 ( Int opc_aux )
2126{
2127   static const HChar* grp4_names[8]
2128     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
2129   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(amd64)");
2130   return grp4_names[opc_aux];
2131}
2132
2133static const HChar* nameGrp5 ( Int opc_aux )
2134{
2135   static const HChar* grp5_names[8]
2136     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
2137   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(amd64)");
2138   return grp5_names[opc_aux];
2139}
2140
2141static const HChar* nameGrp8 ( Int opc_aux )
2142{
2143   static const HChar* grp8_names[8]
2144      = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
2145   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(amd64)");
2146   return grp8_names[opc_aux];
2147}
2148
2149//.. static const HChar* nameSReg ( UInt sreg )
2150//.. {
2151//..    switch (sreg) {
2152//..       case R_ES: return "%es";
2153//..       case R_CS: return "%cs";
2154//..       case R_SS: return "%ss";
2155//..       case R_DS: return "%ds";
2156//..       case R_FS: return "%fs";
2157//..       case R_GS: return "%gs";
2158//..       default: vpanic("nameSReg(x86)");
2159//..    }
2160//.. }
2161
2162static const HChar* nameMMXReg ( Int mmxreg )
2163{
2164   static const HChar* mmx_names[8]
2165     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
2166   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(amd64,guest)");
2167   return mmx_names[mmxreg];
2168}
2169
2170static const HChar* nameXMMReg ( Int xmmreg )
2171{
2172   static const HChar* xmm_names[16]
2173     = { "%xmm0",  "%xmm1",  "%xmm2",  "%xmm3",
2174         "%xmm4",  "%xmm5",  "%xmm6",  "%xmm7",
2175         "%xmm8",  "%xmm9",  "%xmm10", "%xmm11",
2176         "%xmm12", "%xmm13", "%xmm14", "%xmm15" };
2177   if (xmmreg < 0 || xmmreg > 15) vpanic("nameXMMReg(amd64)");
2178   return xmm_names[xmmreg];
2179}
2180
2181static const HChar* nameMMXGran ( Int gran )
2182{
2183   switch (gran) {
2184      case 0: return "b";
2185      case 1: return "w";
2186      case 2: return "d";
2187      case 3: return "q";
2188      default: vpanic("nameMMXGran(amd64,guest)");
2189   }
2190}
2191
2192static HChar nameISize ( Int size )
2193{
2194   switch (size) {
2195      case 8: return 'q';
2196      case 4: return 'l';
2197      case 2: return 'w';
2198      case 1: return 'b';
2199      default: vpanic("nameISize(amd64)");
2200   }
2201}
2202
2203static const HChar* nameYMMReg ( Int ymmreg )
2204{
2205   static const HChar* ymm_names[16]
2206     = { "%ymm0",  "%ymm1",  "%ymm2",  "%ymm3",
2207         "%ymm4",  "%ymm5",  "%ymm6",  "%ymm7",
2208         "%ymm8",  "%ymm9",  "%ymm10", "%ymm11",
2209         "%ymm12", "%ymm13", "%ymm14", "%ymm15" };
2210   if (ymmreg < 0 || ymmreg > 15) vpanic("nameYMMReg(amd64)");
2211   return ymm_names[ymmreg];
2212}
2213
2214
2215/*------------------------------------------------------------*/
2216/*--- JMP helpers                                          ---*/
2217/*------------------------------------------------------------*/
2218
2219static void jmp_lit( /*MOD*/DisResult* dres,
2220                     IRJumpKind kind, Addr64 d64 )
2221{
2222   vassert(dres->whatNext    == Dis_Continue);
2223   vassert(dres->len         == 0);
2224   vassert(dres->continueAt  == 0);
2225   vassert(dres->jk_StopHere == Ijk_INVALID);
2226   dres->whatNext    = Dis_StopHere;
2227   dres->jk_StopHere = kind;
2228   stmt( IRStmt_Put( OFFB_RIP, mkU64(d64) ) );
2229}
2230
2231static void jmp_treg( /*MOD*/DisResult* dres,
2232                      IRJumpKind kind, IRTemp t )
2233{
2234   vassert(dres->whatNext    == Dis_Continue);
2235   vassert(dres->len         == 0);
2236   vassert(dres->continueAt  == 0);
2237   vassert(dres->jk_StopHere == Ijk_INVALID);
2238   dres->whatNext    = Dis_StopHere;
2239   dres->jk_StopHere = kind;
2240   stmt( IRStmt_Put( OFFB_RIP, mkexpr(t) ) );
2241}
2242
2243static
2244void jcc_01 ( /*MOD*/DisResult* dres,
2245              AMD64Condcode cond, Addr64 d64_false, Addr64 d64_true )
2246{
2247   Bool          invert;
2248   AMD64Condcode condPos;
2249   vassert(dres->whatNext    == Dis_Continue);
2250   vassert(dres->len         == 0);
2251   vassert(dres->continueAt  == 0);
2252   vassert(dres->jk_StopHere == Ijk_INVALID);
2253   dres->whatNext    = Dis_StopHere;
2254   dres->jk_StopHere = Ijk_Boring;
2255   condPos = positiveIse_AMD64Condcode ( cond, &invert );
2256   if (invert) {
2257      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
2258                         Ijk_Boring,
2259                         IRConst_U64(d64_false),
2260                         OFFB_RIP ) );
2261      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_true) ) );
2262   } else {
2263      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(condPos),
2264                         Ijk_Boring,
2265                         IRConst_U64(d64_true),
2266                         OFFB_RIP ) );
2267      stmt( IRStmt_Put( OFFB_RIP, mkU64(d64_false) ) );
2268   }
2269}
2270
2271/* Let new_rsp be the %rsp value after a call/return.  Let nia be the
2272   guest address of the next instruction to be executed.
2273
2274   This function generates an AbiHint to say that -128(%rsp)
2275   .. -1(%rsp) should now be regarded as uninitialised.
2276*/
2277static
2278void make_redzone_AbiHint ( VexAbiInfo* vbi,
2279                            IRTemp new_rsp, IRTemp nia, const HChar* who )
2280{
2281   Int szB = vbi->guest_stack_redzone_size;
2282   vassert(szB >= 0);
2283
   /* A bit of a kludge.  Currently the only ABI we've guested AMD64
2285      for is ELF.  So just check it's the expected 128 value
2286      (paranoia). */
2287   vassert(szB == 128);
2288
2289   if (0) vex_printf("AbiHint: %s\n", who);
2290   vassert(typeOfIRTemp(irsb->tyenv, new_rsp) == Ity_I64);
2291   vassert(typeOfIRTemp(irsb->tyenv, nia) == Ity_I64);
2292   if (szB > 0)
2293      stmt( IRStmt_AbiHint(
2294               binop(Iop_Sub64, mkexpr(new_rsp), mkU64(szB)),
2295               szB,
2296               mkexpr(nia)
2297            ));
2298}
2299
2300
2301/*------------------------------------------------------------*/
2302/*--- Disassembling addressing modes                       ---*/
2303/*------------------------------------------------------------*/
2304
2305static
2306const HChar* segRegTxt ( Prefix pfx )
2307{
2308   if (pfx & PFX_CS) return "%cs:";
2309   if (pfx & PFX_DS) return "%ds:";
2310   if (pfx & PFX_ES) return "%es:";
2311   if (pfx & PFX_FS) return "%fs:";
2312   if (pfx & PFX_GS) return "%gs:";
2313   if (pfx & PFX_SS) return "%ss:";
2314   return ""; /* no override */
2315}
2316
2317
2318/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
2319   linear address by adding any required segment override as indicated
   by pfx, and also dealing with any address size override
2321   present. */
2322static
2323IRExpr* handleAddrOverrides ( VexAbiInfo* vbi,
2324                              Prefix pfx, IRExpr* virtual )
2325{
2326   /* --- segment overrides --- */
2327   if (pfx & PFX_FS) {
2328      if (vbi->guest_amd64_assume_fs_is_zero) {
2329         /* Note that this is a linux-kernel specific hack that relies
2330            on the assumption that %fs is always zero. */
2331         /* return virtual + guest_FS_ZERO. */
2332         virtual = binop(Iop_Add64, virtual,
2333                                    IRExpr_Get(OFFB_FS_ZERO, Ity_I64));
2334      } else {
2335         unimplemented("amd64 %fs segment override");
2336      }
2337   }
2338
2339   if (pfx & PFX_GS) {
2340      if (vbi->guest_amd64_assume_gs_is_0x60) {
2341         /* Note that this is a darwin-kernel specific hack that relies
2342            on the assumption that %gs is always 0x60. */
2343         /* return virtual + guest_GS_0x60. */
2344         virtual = binop(Iop_Add64, virtual,
2345                                    IRExpr_Get(OFFB_GS_0x60, Ity_I64));
2346      } else {
2347         unimplemented("amd64 %gs segment override");
2348      }
2349   }
2350
2351   /* cs, ds, es and ss are simply ignored in 64-bit mode. */
2352
2353   /* --- address size override --- */
2354   if (haveASO(pfx))
2355      virtual = unop(Iop_32Uto64, unop(Iop_64to32, virtual));
2356
2357   return virtual;
2358}
2359
2360//.. {
2361//..    Int    sreg;
2362//..    IRType hWordTy;
2363//..    IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
2364//..
2365//..    if (sorb == 0)
2366//..       /* the common case - no override */
2367//..       return virtual;
2368//..
2369//..    switch (sorb) {
2370//..       case 0x3E: sreg = R_DS; break;
2371//..       case 0x26: sreg = R_ES; break;
2372//..       case 0x64: sreg = R_FS; break;
2373//..       case 0x65: sreg = R_GS; break;
2374//..       default: vpanic("handleAddrOverrides(x86,guest)");
2375//..    }
2376//..
2377//..    hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
2378//..
2379//..    seg_selector = newTemp(Ity_I32);
2380//..    ldt_ptr      = newTemp(hWordTy);
2381//..    gdt_ptr      = newTemp(hWordTy);
2382//..    r64          = newTemp(Ity_I64);
2383//..
2384//..    assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
2385//..    assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
2386//..    assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
2387//..
2388//..    /*
2389//..    Call this to do the translation and limit checks:
2390//..    ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
2391//..                                  UInt seg_selector, UInt virtual_addr )
2392//..    */
2393//..    assign(
2394//..       r64,
2395//..       mkIRExprCCall(
2396//..          Ity_I64,
2397//..          0/*regparms*/,
2398//..          "x86g_use_seg_selector",
2399//..          &x86g_use_seg_selector,
2400//..          mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
2401//..                         mkexpr(seg_selector), virtual)
2402//..       )
2403//..    );
2404//..
2405//..    /* If the high 32 of the result are non-zero, there was a
2406//..       failure in address translation.  In which case, make a
2407//..       quick exit.
2408//..    */
2409//..    stmt(
2410//..       IRStmt_Exit(
2411//..          binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
2412//..          Ijk_MapFail,
2413//..          IRConst_U32( guest_eip_curr_instr )
2414//..       )
2415//..    );
2416//..
2417//..    /* otherwise, here's the translated result. */
2418//..    return unop(Iop_64to32, mkexpr(r64));
2419//.. }
2420
2421
2422/* Generate IR to calculate an address indicated by a ModRM and
2423   following SIB bytes.  The expression, and the number of bytes in
2424   the address mode, are returned (the latter in *len).  Note that
2425   this fn should not be called if the R/M part of the address denotes
   a register instead of memory.  Text of the addressing mode is
   placed in buf.
2428
2429   The computed address is stored in a new tempreg, and the
2430   identity of the tempreg is returned.
2431
2432   extra_bytes holds the number of bytes after the amode, as supplied
2433   by the caller.  This is needed to make sense of %rip-relative
2434   addresses.  Note that the value that *len is set to is only the
2435   length of the amode itself and does not include the value supplied
2436   in extra_bytes.
2437 */
2438
2439static IRTemp disAMode_copy2tmp ( IRExpr* addr64 )
2440{
2441   IRTemp tmp = newTemp(Ity_I64);
2442   assign( tmp, addr64 );
2443   return tmp;
2444}
2445
2446static
2447IRTemp disAMode ( /*OUT*/Int* len,
2448                  VexAbiInfo* vbi, Prefix pfx, Long delta,
2449                  /*OUT*/HChar* buf, Int extra_bytes )
2450{
2451   UChar mod_reg_rm = getUChar(delta);
2452   delta++;
2453
2454   buf[0] = (UChar)0;
2455   vassert(extra_bytes >= 0 && extra_bytes < 10);
2456
2457   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
2458      jump table seems a bit excessive.
2459   */
2460   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
2461   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
2462                                               /* is now XX0XXYYY */
2463   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
2464   switch (mod_reg_rm) {
2465
2466      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
2467         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
2468      */
2469      case 0x00: case 0x01: case 0x02: case 0x03:
2470      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
2471         { UChar rm = toUChar(mod_reg_rm & 7);
2472           DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
2473           *len = 1;
2474           return disAMode_copy2tmp(
2475                  handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,rm)));
2476         }
2477
2478      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
2479         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
2480      */
2481      case 0x08: case 0x09: case 0x0A: case 0x0B:
2482      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
2483         { UChar rm = toUChar(mod_reg_rm & 7);
2484           Long d   = getSDisp8(delta);
2485           if (d == 0) {
2486              DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,rm));
2487           } else {
2488              DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
2489           }
2490           *len = 2;
2491           return disAMode_copy2tmp(
2492                  handleAddrOverrides(vbi, pfx,
2493                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
2494         }
2495
2496      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
2497         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
2498      */
2499      case 0x10: case 0x11: case 0x12: case 0x13:
2500      /* ! 14 */ case 0x15: case 0x16: case 0x17:
2501         { UChar rm = toUChar(mod_reg_rm & 7);
2502           Long  d  = getSDisp32(delta);
2503           DIS(buf, "%s%lld(%s)", segRegTxt(pfx), d, nameIRegRexB(8,pfx,rm));
2504           *len = 5;
2505           return disAMode_copy2tmp(
2506                  handleAddrOverrides(vbi, pfx,
2507                     binop(Iop_Add64,getIRegRexB(8,pfx,rm),mkU64(d))));
2508         }
2509
2510      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
2512      case 0x18: case 0x19: case 0x1A: case 0x1B:
2513      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
2514         vpanic("disAMode(amd64): not an addr!");
2515
2516      /* RIP + disp32.  This assumes that guest_RIP_curr_instr is set
2517         correctly at the start of handling each instruction. */
2518      case 0x05:
2519         { Long d = getSDisp32(delta);
2520           *len = 5;
2521           DIS(buf, "%s%lld(%%rip)", segRegTxt(pfx), d);
2522           /* We need to know the next instruction's start address.
2523              Try and figure out what it is, record the guess, and ask
2524              the top-level driver logic (bbToIR_AMD64) to check we
2525              guessed right, after the instruction is completely
2526              decoded. */
2527           guest_RIP_next_mustcheck = True;
2528           guest_RIP_next_assumed = guest_RIP_bbstart
2529                                    + delta+4 + extra_bytes;
2530           return disAMode_copy2tmp(
2531                     handleAddrOverrides(vbi, pfx,
2532                        binop(Iop_Add64, mkU64(guest_RIP_next_assumed),
2533                                         mkU64(d))));
2534         }
2535
2536      case 0x04: {
2537         /* SIB, with no displacement.  Special cases:
2538            -- %rsp cannot act as an index value.
2539               If index_r indicates %rsp, zero is used for the index.
2540            -- when mod is zero and base indicates RBP or R13, base is
2541               instead a 32-bit sign-extended literal.
2542            It's all madness, I tell you.  Extract %index, %base and
2543            scale from the SIB byte.  The value denoted is then:
2544               | %index == %RSP && (%base == %RBP || %base == %R13)
2545               = d32 following SIB byte
2546               | %index == %RSP && !(%base == %RBP || %base == %R13)
2547               = %base
2548               | %index != %RSP && (%base == %RBP || %base == %R13)
2549               = d32 following SIB byte + (%index << scale)
2550               | %index != %RSP && !(%base == %RBP || %base == %R13)
2551               = %base + (%index << scale)
2552         */
2553         UChar sib     = getUChar(delta);
2554         UChar scale   = toUChar((sib >> 6) & 3);
2555         UChar index_r = toUChar((sib >> 3) & 7);
2556         UChar base_r  = toUChar(sib & 7);
2557         /* correct since #(R13) == 8 + #(RBP) */
2558         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
2559         Bool  index_is_SP    = toBool(index_r == R_RSP && 0==getRexX(pfx));
2560         delta++;
2561
2562         if ((!index_is_SP) && (!base_is_BPor13)) {
2563            if (scale == 0) {
2564               DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
2565                         nameIRegRexB(8,pfx,base_r),
2566                         nameIReg64rexX(pfx,index_r));
2567            } else {
2568               DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
2569                         nameIRegRexB(8,pfx,base_r),
2570                         nameIReg64rexX(pfx,index_r), 1<<scale);
2571            }
2572            *len = 2;
2573            return
2574               disAMode_copy2tmp(
2575               handleAddrOverrides(vbi, pfx,
2576                  binop(Iop_Add64,
2577                        getIRegRexB(8,pfx,base_r),
2578                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
2579                              mkU8(scale)))));
2580         }
2581
2582         if ((!index_is_SP) && base_is_BPor13) {
2583            Long d = getSDisp32(delta);
2584            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d,
2585                      nameIReg64rexX(pfx,index_r), 1<<scale);
2586            *len = 6;
2587            return
2588               disAMode_copy2tmp(
2589               handleAddrOverrides(vbi, pfx,
2590                  binop(Iop_Add64,
2591                        binop(Iop_Shl64, getIReg64rexX(pfx,index_r),
2592                                         mkU8(scale)),
2593                        mkU64(d))));
2594         }
2595
2596         if (index_is_SP && (!base_is_BPor13)) {
2597            DIS(buf, "%s(%s)", segRegTxt(pfx), nameIRegRexB(8,pfx,base_r));
2598            *len = 2;
2599            return disAMode_copy2tmp(
2600                   handleAddrOverrides(vbi, pfx, getIRegRexB(8,pfx,base_r)));
2601         }
2602
2603         if (index_is_SP && base_is_BPor13) {
2604            Long d = getSDisp32(delta);
2605            DIS(buf, "%s%lld", segRegTxt(pfx), d);
2606            *len = 6;
2607            return disAMode_copy2tmp(
2608                   handleAddrOverrides(vbi, pfx, mkU64(d)));
2609         }
2610
2611         vassert(0);
2612      }
2613
      /* SIB, with 8-bit displacement.  Special cases:
         -- %rsp cannot act as an index value.
            If index_r indicates %rsp, zero is used for the index.
         Denoted value is:
            | %index == %RSP
            = d8 + %base
            | %index != %RSP
            = d8 + %base + (%index << scale)
      */
2623      case 0x0C: {
2624         UChar sib     = getUChar(delta);
2625         UChar scale   = toUChar((sib >> 6) & 3);
2626         UChar index_r = toUChar((sib >> 3) & 7);
2627         UChar base_r  = toUChar(sib & 7);
2628         Long d        = getSDisp8(delta+1);
2629
2630         if (index_r == R_RSP && 0==getRexX(pfx)) {
2631            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
2632                                   d, nameIRegRexB(8,pfx,base_r));
2633            *len = 3;
2634            return disAMode_copy2tmp(
2635                   handleAddrOverrides(vbi, pfx,
2636                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
2637         } else {
2638            if (scale == 0) {
2639               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2640                         nameIRegRexB(8,pfx,base_r),
2641                         nameIReg64rexX(pfx,index_r));
2642            } else {
2643               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2644                         nameIRegRexB(8,pfx,base_r),
2645                         nameIReg64rexX(pfx,index_r), 1<<scale);
2646            }
2647            *len = 3;
2648            return
2649                disAMode_copy2tmp(
2650                handleAddrOverrides(vbi, pfx,
2651                  binop(Iop_Add64,
2652                        binop(Iop_Add64,
2653                              getIRegRexB(8,pfx,base_r),
2654                              binop(Iop_Shl64,
2655                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
2656                        mkU64(d))));
2657         }
2658         vassert(0); /*NOTREACHED*/
2659      }
2660
2661      /* SIB, with 32-bit displacement.  Special cases:
2662         -- %rsp cannot act as an index value.
2663            If index_r indicates %rsp, zero is used for the index.
2664         Denoted value is:
2665            | %index == %RSP
2666            = d32 + %base
2667            | %index != %RSP
2668            = d32 + %base + (%index << scale)
2669      */
2670      case 0x14: {
2671         UChar sib     = getUChar(delta);
2672         UChar scale   = toUChar((sib >> 6) & 3);
2673         UChar index_r = toUChar((sib >> 3) & 7);
2674         UChar base_r  = toUChar(sib & 7);
2675         Long d        = getSDisp32(delta+1);
2676
2677         if (index_r == R_RSP && 0==getRexX(pfx)) {
2678            DIS(buf, "%s%lld(%s)", segRegTxt(pfx),
2679                                   d, nameIRegRexB(8,pfx,base_r));
2680            *len = 6;
2681            return disAMode_copy2tmp(
2682                   handleAddrOverrides(vbi, pfx,
2683                      binop(Iop_Add64, getIRegRexB(8,pfx,base_r), mkU64(d)) ));
2684         } else {
2685            if (scale == 0) {
2686               DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2687                         nameIRegRexB(8,pfx,base_r),
2688                         nameIReg64rexX(pfx,index_r));
2689            } else {
2690               DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2691                         nameIRegRexB(8,pfx,base_r),
2692                         nameIReg64rexX(pfx,index_r), 1<<scale);
2693            }
2694            *len = 6;
2695            return
2696                disAMode_copy2tmp(
2697                handleAddrOverrides(vbi, pfx,
2698                  binop(Iop_Add64,
2699                        binop(Iop_Add64,
2700                              getIRegRexB(8,pfx,base_r),
2701                              binop(Iop_Shl64,
2702                                    getIReg64rexX(pfx,index_r), mkU8(scale))),
2703                        mkU64(d))));
2704         }
2705         vassert(0); /*NOTREACHED*/
2706      }
2707
2708      default:
2709         vpanic("disAMode(amd64)");
2710         return 0; /*notreached*/
2711   }
2712}
2713
2714
2715/* Similarly for VSIB addressing.  This returns just the addend,
2716   and fills in *rI and *vscale with the register number of the vector
2717   index and its multiplicand.  */
2718static
2719IRTemp disAVSIBMode ( /*OUT*/Int* len,
2720                      VexAbiInfo* vbi, Prefix pfx, Long delta,
2721                      /*OUT*/HChar* buf, /*OUT*/UInt* rI,
2722                      IRType ty, /*OUT*/Int* vscale )
2723{
2724   UChar mod_reg_rm = getUChar(delta);
2725   const HChar *vindex;
2726
2727   *len = 0;
2728   *rI = 0;
2729   *vscale = 0;
2730   buf[0] = (UChar)0;
2731   if ((mod_reg_rm & 7) != 4 || epartIsReg(mod_reg_rm))
2732      return IRTemp_INVALID;
2733
2734   UChar sib     = getUChar(delta+1);
2735   UChar scale   = toUChar((sib >> 6) & 3);
2736   UChar index_r = toUChar((sib >> 3) & 7);
2737   UChar base_r  = toUChar(sib & 7);
2738   Long  d       = 0;
2739   /* correct since #(R13) == 8 + #(RBP) */
2740   Bool  base_is_BPor13 = toBool(base_r == R_RBP);
2741   delta += 2;
2742   *len = 2;
2743
2744   *rI = index_r | (getRexX(pfx) << 3);
2745   if (ty == Ity_V128)
2746      vindex = nameXMMReg(*rI);
2747   else
2748      vindex = nameYMMReg(*rI);
2749   *vscale = 1<<scale;
2750
2751   switch (mod_reg_rm >> 6) {
2752   case 0:
2753      if (base_is_BPor13) {
2754         d = getSDisp32(delta);
2755         *len += 4;
2756         if (scale == 0) {
2757            DIS(buf, "%s%lld(,%s)", segRegTxt(pfx), d, vindex);
2758         } else {
2759            DIS(buf, "%s%lld(,%s,%d)", segRegTxt(pfx), d, vindex, 1<<scale);
2760         }
2761         return disAMode_copy2tmp( mkU64(d) );
2762      } else {
2763         if (scale == 0) {
2764            DIS(buf, "%s(%s,%s)", segRegTxt(pfx),
2765                     nameIRegRexB(8,pfx,base_r), vindex);
2766         } else {
2767            DIS(buf, "%s(%s,%s,%d)", segRegTxt(pfx),
2768                     nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
2769         }
2770      }
2771      break;
2772   case 1:
2773      d = getSDisp8(delta);
2774      *len += 1;
2775      goto have_disp;
2776   case 2:
2777      d = getSDisp32(delta);
2778      *len += 4;
2779   have_disp:
2780      if (scale == 0) {
2781         DIS(buf, "%s%lld(%s,%s)", segRegTxt(pfx), d,
2782                  nameIRegRexB(8,pfx,base_r), vindex);
2783      } else {
2784         DIS(buf, "%s%lld(%s,%s,%d)", segRegTxt(pfx), d,
2785                  nameIRegRexB(8,pfx,base_r), vindex, 1<<scale);
2786      }
2787      break;
2788   }
2789
2790   if (!d)
2791      return disAMode_copy2tmp( getIRegRexB(8,pfx,base_r) );
2792   return disAMode_copy2tmp( binop(Iop_Add64, getIRegRexB(8,pfx,base_r),
2793                                   mkU64(d)) );
2794}
2795
2796
/* Figure out the number of (insn-stream) bytes constituting the amode
   beginning at delta.  This is useful for getting hold of literals
   beyond the end of the amode before it has been disassembled. */
2800
2801static UInt lengthAMode ( Prefix pfx, Long delta )
2802{
2803   UChar mod_reg_rm = getUChar(delta);
2804   delta++;
2805
2806   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
2807      jump table seems a bit excessive.
2808   */
2809   mod_reg_rm &= 0xC7;                         /* is now XX000YYY */
2810   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
2811                                               /* is now XX0XXYYY */
2812   mod_reg_rm &= 0x1F;                         /* is now 000XXYYY */
2813   switch (mod_reg_rm) {
2814
2815      /* REX.B==0: (%rax) .. (%rdi), not including (%rsp) or (%rbp).
2816         REX.B==1: (%r8)  .. (%r15), not including (%r12) or (%r13).
2817      */
2818      case 0x00: case 0x01: case 0x02: case 0x03:
2819      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
2820         return 1;
2821
2822      /* REX.B==0: d8(%rax) ... d8(%rdi), not including d8(%rsp)
2823         REX.B==1: d8(%r8)  ... d8(%r15), not including d8(%r12)
2824      */
2825      case 0x08: case 0x09: case 0x0A: case 0x0B:
2826      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
2827         return 2;
2828
2829      /* REX.B==0: d32(%rax) ... d32(%rdi), not including d32(%rsp)
2830         REX.B==1: d32(%r8)  ... d32(%r15), not including d32(%r12)
2831      */
2832      case 0x10: case 0x11: case 0x12: case 0x13:
2833      /* ! 14 */ case 0x15: case 0x16: case 0x17:
2834         return 5;
2835
2836      /* REX.B==0: a register, %rax .. %rdi.  This shouldn't happen. */
      /* REX.B==1: a register, %r8  .. %r15.  This shouldn't happen. */
2838      /* Not an address, but still handled. */
2839      case 0x18: case 0x19: case 0x1A: case 0x1B:
2840      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
2841         return 1;
2842
2843      /* RIP + disp32. */
2844      case 0x05:
2845         return 5;
2846
2847      case 0x04: {
2848         /* SIB, with no displacement. */
2849         UChar sib     = getUChar(delta);
2850         UChar base_r  = toUChar(sib & 7);
2851         /* correct since #(R13) == 8 + #(RBP) */
2852         Bool  base_is_BPor13 = toBool(base_r == R_RBP);
2853
2854         if (base_is_BPor13) {
2855            return 6;
2856         } else {
2857            return 2;
2858         }
2859      }
2860
2861      /* SIB, with 8-bit displacement. */
2862      case 0x0C:
2863         return 3;
2864
2865      /* SIB, with 32-bit displacement. */
2866      case 0x14:
2867         return 6;
2868
2869      default:
2870         vpanic("lengthAMode(amd64)");
2871         return 0; /*notreached*/
2872   }
2873}
2874
2875
2876/*------------------------------------------------------------*/
2877/*--- Disassembling common idioms                          ---*/
2878/*------------------------------------------------------------*/
2879
2880/* Handle binary integer instructions of the form
2881      op E, G  meaning
2882      op reg-or-mem, reg
   Is passed a pointer to the modRM byte, the actual operation, and the
2884   data size.  Returns the address advanced completely over this
2885   instruction.
2886
2887   E(src) is reg-or-mem
2888   G(dst) is reg.
2889
2890   If E is reg, -->    GET %G,  tmp
2891                       OP %E,   tmp
2892                       PUT tmp, %G
2893
2894   If E is mem and OP is not reversible,
2895                -->    (getAddr E) -> tmpa
2896                       LD (tmpa), tmpa
2897                       GET %G, tmp2
2898                       OP tmpa, tmp2
2899                       PUT tmp2, %G
2900
2901   If E is mem and OP is reversible
2902                -->    (getAddr E) -> tmpa
2903                       LD (tmpa), tmpa
2904                       OP %G, tmpa
2905                       PUT tmpa, %G
2906*/
2907static
2908ULong dis_op2_E_G ( VexAbiInfo* vbi,
2909                    Prefix      pfx,
2910                    Bool        addSubCarry,
2911                    IROp        op8,
2912                    Bool        keep,
2913                    Int         size,
2914                    Long        delta0,
2915                    const HChar* t_amd64opc )
2916{
2917   HChar   dis_buf[50];
2918   Int     len;
2919   IRType  ty   = szToITy(size);
2920   IRTemp  dst1 = newTemp(ty);
2921   IRTemp  src  = newTemp(ty);
2922   IRTemp  dst0 = newTemp(ty);
2923   UChar   rm   = getUChar(delta0);
2924   IRTemp  addr = IRTemp_INVALID;
2925
2926   /* addSubCarry == True indicates the intended operation is
2927      add-with-carry or subtract-with-borrow. */
2928   if (addSubCarry) {
2929      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
2930      vassert(keep);
2931   }
2932
2933   if (epartIsReg(rm)) {
2934      /* Specially handle XOR reg,reg, because that doesn't really
2935         depend on reg, and doing the obvious thing potentially
2936         generates a spurious value check failure due to the bogus
2937         dependency. */
2938      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
2939          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
2940         if (False && op8 == Iop_Sub8)
2941            vex_printf("vex amd64->IR: sbb %%r,%%r optimisation(1)\n");
2942         putIRegG(size,pfx,rm, mkU(ty,0));
2943      }
2944
2945      assign( dst0, getIRegG(size,pfx,rm) );
2946      assign( src,  getIRegE(size,pfx,rm) );
2947
2948      if (addSubCarry && op8 == Iop_Add8) {
2949         helper_ADC( size, dst1, dst0, src,
2950                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2951         putIRegG(size, pfx, rm, mkexpr(dst1));
2952      } else
2953      if (addSubCarry && op8 == Iop_Sub8) {
2954         helper_SBB( size, dst1, dst0, src,
2955                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2956         putIRegG(size, pfx, rm, mkexpr(dst1));
2957      } else {
2958         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2959         if (isAddSub(op8))
2960            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2961         else
2962            setFlags_DEP1(op8, dst1, ty);
2963         if (keep)
2964            putIRegG(size, pfx, rm, mkexpr(dst1));
2965      }
2966
2967      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
2968                          nameIRegE(size,pfx,rm),
2969                          nameIRegG(size,pfx,rm));
2970      return 1+delta0;
2971   } else {
2972      /* E refers to memory */
2973      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
2974      assign( dst0, getIRegG(size,pfx,rm) );
2975      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
2976
2977      if (addSubCarry && op8 == Iop_Add8) {
2978         helper_ADC( size, dst1, dst0, src,
2979                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2980         putIRegG(size, pfx, rm, mkexpr(dst1));
2981      } else
2982      if (addSubCarry && op8 == Iop_Sub8) {
2983         helper_SBB( size, dst1, dst0, src,
2984                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2985         putIRegG(size, pfx, rm, mkexpr(dst1));
2986      } else {
2987         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2988         if (isAddSub(op8))
2989            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2990         else
2991            setFlags_DEP1(op8, dst1, ty);
2992         if (keep)
2993            putIRegG(size, pfx, rm, mkexpr(dst1));
2994      }
2995
2996      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
2997                          dis_buf, nameIRegG(size, pfx, rm));
2998      return len+delta0;
2999   }
3000}
3001
3002
3003
3004/* Handle binary integer instructions of the form
3005      op G, E  meaning
3006      op reg, reg-or-mem
   Is passed a pointer to the modRM byte, the actual operation, and the
3008   data size.  Returns the address advanced completely over this
3009   instruction.
3010
3011   G(src) is reg.
3012   E(dst) is reg-or-mem
3013
3014   If E is reg, -->    GET %E,  tmp
3015                       OP %G,   tmp
3016                       PUT tmp, %E
3017
3018   If E is mem, -->    (getAddr E) -> tmpa
3019                       LD (tmpa), tmpv
3020                       OP %G, tmpv
3021                       ST tmpv, (tmpa)
3022*/
3023static
3024ULong dis_op2_G_E ( VexAbiInfo* vbi,
3025                    Prefix      pfx,
3026                    Bool        addSubCarry,
3027                    IROp        op8,
3028                    Bool        keep,
3029                    Int         size,
3030                    Long        delta0,
3031                    const HChar* t_amd64opc )
3032{
3033   HChar   dis_buf[50];
3034   Int     len;
3035   IRType  ty   = szToITy(size);
3036   IRTemp  dst1 = newTemp(ty);
3037   IRTemp  src  = newTemp(ty);
3038   IRTemp  dst0 = newTemp(ty);
3039   UChar   rm   = getUChar(delta0);
3040   IRTemp  addr = IRTemp_INVALID;
3041
3042   /* addSubCarry == True indicates the intended operation is
3043      add-with-carry or subtract-with-borrow. */
3044   if (addSubCarry) {
3045      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
3046      vassert(keep);
3047   }
3048
3049   if (epartIsReg(rm)) {
3050      /* Specially handle XOR reg,reg, because that doesn't really
3051         depend on reg, and doing the obvious thing potentially
3052         generates a spurious value check failure due to the bogus
3053         dependency.  Ditto SBB reg,reg. */
3054      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
3055          && offsetIRegG(size,pfx,rm) == offsetIRegE(size,pfx,rm)) {
3056         putIRegE(size,pfx,rm, mkU(ty,0));
3057      }
3058
3059      assign(dst0, getIRegE(size,pfx,rm));
3060      assign(src,  getIRegG(size,pfx,rm));
3061
3062      if (addSubCarry && op8 == Iop_Add8) {
3063         helper_ADC( size, dst1, dst0, src,
3064                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3065         putIRegE(size, pfx, rm, mkexpr(dst1));
3066      } else
3067      if (addSubCarry && op8 == Iop_Sub8) {
3068         helper_SBB( size, dst1, dst0, src,
3069                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3070         putIRegE(size, pfx, rm, mkexpr(dst1));
3071      } else {
3072         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3073         if (isAddSub(op8))
3074            setFlags_DEP1_DEP2(op8, dst0, src, ty);
3075         else
3076            setFlags_DEP1(op8, dst1, ty);
3077         if (keep)
3078            putIRegE(size, pfx, rm, mkexpr(dst1));
3079      }
3080
3081      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3082                          nameIRegG(size,pfx,rm),
3083                          nameIRegE(size,pfx,rm));
3084      return 1+delta0;
3085   }
3086
3087   /* E refers to memory */
3088   {
3089      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3090      assign(dst0, loadLE(ty,mkexpr(addr)));
3091      assign(src,  getIRegG(size,pfx,rm));
3092
3093      if (addSubCarry && op8 == Iop_Add8) {
3094         if (haveLOCK(pfx)) {
3095            /* cas-style store */
3096            helper_ADC( size, dst1, dst0, src,
3097                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3098         } else {
3099            /* normal store */
3100            helper_ADC( size, dst1, dst0, src,
3101                        /*store*/addr, IRTemp_INVALID, 0 );
3102         }
3103      } else
3104      if (addSubCarry && op8 == Iop_Sub8) {
3105         if (haveLOCK(pfx)) {
3106            /* cas-style store */
3107            helper_SBB( size, dst1, dst0, src,
3108                        /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
3109         } else {
3110            /* normal store */
3111            helper_SBB( size, dst1, dst0, src,
3112                        /*store*/addr, IRTemp_INVALID, 0 );
3113         }
3114      } else {
3115         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
3116         if (keep) {
3117            if (haveLOCK(pfx)) {
3118               if (0) vex_printf("locked case\n" );
3119               casLE( mkexpr(addr),
3120                      mkexpr(dst0)/*expval*/,
3121                      mkexpr(dst1)/*newval*/, guest_RIP_curr_instr );
3122            } else {
3123               if (0) vex_printf("nonlocked case\n");
3124               storeLE(mkexpr(addr), mkexpr(dst1));
3125            }
3126         }
3127         if (isAddSub(op8))
3128            setFlags_DEP1_DEP2(op8, dst0, src, ty);
3129         else
3130            setFlags_DEP1(op8, dst1, ty);
3131      }
3132
3133      DIP("%s%c %s,%s\n", t_amd64opc, nameISize(size),
3134                          nameIRegG(size,pfx,rm), dis_buf);
3135      return len+delta0;
3136   }
3137}
3138
3139
3140/* Handle move instructions of the form
3141      mov E, G  meaning
3142      mov reg-or-mem, reg
   Is passed a pointer to the modRM byte, and the data size.  Returns
3144   the address advanced completely over this instruction.
3145
3146   E(src) is reg-or-mem
3147   G(dst) is reg.
3148
3149   If E is reg, -->    GET %E,  tmpv
3150                       PUT tmpv, %G
3151
3152   If E is mem  -->    (getAddr E) -> tmpa
3153                       LD (tmpa), tmpb
3154                       PUT tmpb, %G
3155*/
3156static
3157ULong dis_mov_E_G ( VexAbiInfo* vbi,
3158                    Prefix      pfx,
3159                    Int         size,
3160                    Long        delta0 )
3161{
3162   Int len;
3163   UChar rm = getUChar(delta0);
3164   HChar dis_buf[50];
3165
3166   if (epartIsReg(rm)) {
3167      putIRegG(size, pfx, rm, getIRegE(size, pfx, rm));
3168      DIP("mov%c %s,%s\n", nameISize(size),
3169                           nameIRegE(size,pfx,rm),
3170                           nameIRegG(size,pfx,rm));
3171      return 1+delta0;
3172   }
3173
3174   /* E refers to memory */
3175   {
3176      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3177      putIRegG(size, pfx, rm, loadLE(szToITy(size), mkexpr(addr)));
3178      DIP("mov%c %s,%s\n", nameISize(size),
3179                           dis_buf,
3180                           nameIRegG(size,pfx,rm));
3181      return delta0+len;
3182   }
3183}
3184
3185
3186/* Handle move instructions of the form
3187      mov G, E  meaning
3188      mov reg, reg-or-mem
3189   Is passed the a ptr to the modRM byte, and the data size.  Returns
3190   the address advanced completely over this instruction.
3191   We have to decide here whether F2 or F3 are acceptable.  F2 never is.
3192
3193   G(src) is reg.
3194   E(dst) is reg-or-mem
3195
3196   If E is reg, -->    GET %G,  tmp
3197                       PUT tmp, %E
3198
3199   If E is mem, -->    (getAddr E) -> tmpa
3200                       GET %G, tmpv
3201                       ST tmpv, (tmpa)
3202*/
3203static
3204ULong dis_mov_G_E ( VexAbiInfo*  vbi,
3205                    Prefix       pfx,
3206                    Int          size,
3207                    Long         delta0,
3208                    /*OUT*/Bool* ok )
3209{
3210   Int   len;
3211   UChar rm = getUChar(delta0);
3212   HChar dis_buf[50];
3213
3214   *ok = True;
3215
3216   if (epartIsReg(rm)) {
3217      if (haveF2orF3(pfx)) { *ok = False; return delta0; }
3218      putIRegE(size, pfx, rm, getIRegG(size, pfx, rm));
3219      DIP("mov%c %s,%s\n", nameISize(size),
3220                           nameIRegG(size,pfx,rm),
3221                           nameIRegE(size,pfx,rm));
3222      return 1+delta0;
3223   }
3224
3225   /* E refers to memory */
3226   {
3227      if (haveF2(pfx)) { *ok = False; return delta0; }
3228      /* F3(XRELEASE) is acceptable, though. */
3229      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
3230      storeLE( mkexpr(addr), getIRegG(size, pfx, rm) );
3231      DIP("mov%c %s,%s\n", nameISize(size),
3232                           nameIRegG(size,pfx,rm),
3233                           dis_buf);
3234      return len+delta0;
3235   }
3236}
3237
3238
3239/* op $immediate, AL/AX/EAX/RAX. */
3240static
3241ULong dis_op_imm_A ( Int    size,
3242                     Bool   carrying,
3243                     IROp   op8,
3244                     Bool   keep,
3245                     Long   delta,
3246                     const HChar* t_amd64opc )
3247{
3248   Int    size4 = imin(size,4);
3249   IRType ty    = szToITy(size);
3250   IRTemp dst0  = newTemp(ty);
3251   IRTemp src   = newTemp(ty);
3252   IRTemp dst1  = newTemp(ty);
3253   Long  lit    = getSDisp(size4,delta);
3254   assign(dst0, getIRegRAX(size));
3255   assign(src,  mkU(ty,lit & mkSizeMask(size)));
3256
3257   if (isAddSub(op8) && !carrying) {
3258      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3259      setFlags_DEP1_DEP2(op8, dst0, src, ty);
3260   }
3261   else
3262   if (isLogic(op8)) {
3263      vassert(!carrying);
3264      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
3265      setFlags_DEP1(op8, dst1, ty);
3266   }
3267   else
3268   if (op8 == Iop_Add8 && carrying) {
3269      helper_ADC( size, dst1, dst0, src,
3270                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3271   }
3272   else
3273   if (op8 == Iop_Sub8 && carrying) {
3274      helper_SBB( size, dst1, dst0, src,
3275                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
3276   }
3277   else
3278      vpanic("dis_op_imm_A(amd64,guest)");
3279
3280   if (keep)
3281      putIRegRAX(size, mkexpr(dst1));
3282
3283   DIP("%s%c $%lld, %s\n", t_amd64opc, nameISize(size),
3284                           lit, nameIRegRAX(size));
3285   return delta+size4;
3286}
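
/* Example (illustrative): "addq $-1, %rax" is encoded with a 4-byte
   immediate, so size4 == imin(8,4) == 4.  getSDisp sign-extends the
   literal to 0xFFFFFFFFFFFFFFFF, and since mkSizeMask(8) is all ones,
   the masked value fed to mkU above is the full 64-bit -1 -- matching
   the hardware's imm32 -> 64 sign extension. */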


/* Sign- and Zero-extending moves. */
static
ULong dis_movx_E_G ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, Int szs, Int szd, Bool sign_extend )
{
   UChar rm = getUChar(delta);
   if (epartIsReg(rm)) {
      putIRegG(szd, pfx, rm,
                    doScalarWidening(
                       szs,szd,sign_extend,
                       getIRegE(szs,pfx,rm)));
      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
                               nameISize(szs),
                               nameISize(szd),
                               nameIRegE(szs,pfx,rm),
                               nameIRegG(szd,pfx,rm));
      return 1+delta;
   }

   /* E refers to memory */
   {
      Int    len;
      HChar  dis_buf[50];
      IRTemp addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      putIRegG(szd, pfx, rm,
                    doScalarWidening(
                       szs,szd,sign_extend,
                       loadLE(szToITy(szs),mkexpr(addr))));
      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
                               nameISize(szs),
                               nameISize(szd),
                               dis_buf,
                               nameIRegG(szd,pfx,rm));
      return len+delta;
   }
}


/* Generate code to divide ArchRegs RDX:RAX / EDX:EAX / DX:AX / AX by
   the 64 / 32 / 16 / 8 bit quantity in the given IRTemp.  */
static
void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
{
   /* special-case the 64-bit case */
   if (sz == 8) {
      IROp   op     = signed_divide ? Iop_DivModS128to64
                                    : Iop_DivModU128to64;
      IRTemp src128 = newTemp(Ity_I128);
      IRTemp dst128 = newTemp(Ity_I128);
      assign( src128, binop(Iop_64HLto128,
                            getIReg64(R_RDX),
                            getIReg64(R_RAX)) );
      assign( dst128, binop(op, mkexpr(src128), mkexpr(t)) );
      putIReg64( R_RAX, unop(Iop_128to64,mkexpr(dst128)) );
      putIReg64( R_RDX, unop(Iop_128HIto64,mkexpr(dst128)) );
   } else {
      IROp   op    = signed_divide ? Iop_DivModS64to32
                                   : Iop_DivModU64to32;
      IRTemp src64 = newTemp(Ity_I64);
      IRTemp dst64 = newTemp(Ity_I64);
      switch (sz) {
      case 4:
         assign( src64,
                 binop(Iop_32HLto64, getIRegRDX(4), getIRegRAX(4)) );
         assign( dst64,
                 binop(op, mkexpr(src64), mkexpr(t)) );
         putIRegRAX( 4, unop(Iop_64to32,mkexpr(dst64)) );
         putIRegRDX( 4, unop(Iop_64HIto32,mkexpr(dst64)) );
         break;
      case 2: {
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         assign( src64, unop(widen3264,
                             binop(Iop_16HLto32,
                                   getIRegRDX(2),
                                   getIRegRAX(2))) );
         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
         putIRegRAX( 2, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
         putIRegRDX( 2, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
         break;
      }
      case 1: {
         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
         assign( src64, unop(widen3264,
                        unop(widen1632, getIRegRAX(2))) );
         assign( dst64,
                 binop(op, mkexpr(src64),
                           unop(widen1632, unop(widen816, mkexpr(t)))) );
         putIRegRAX( 1, unop(Iop_16to8,
                        unop(Iop_32to16,
                        unop(Iop_64to32,mkexpr(dst64)))) );
         putIRegAH( unop(Iop_16to8,
                    unop(Iop_32to16,
                    unop(Iop_64HIto32,mkexpr(dst64)))) );
         break;
      }
      default:
         vpanic("codegen_div(amd64)");
      }
   }
}
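
/* Illustrative sketch of the IR shape for the 32-bit unsigned case
   ("divl %ebx", say):

      t_src64 = 32HLto64( GET:I32(rdx), GET:I32(rax) )
      t_dst64 = DivModU64to32( t_src64, t )
      PUT(rax) = 64to32( t_dst64 )      -- quotient
      PUT(rdx) = 64HIto32( t_dst64 )    -- remainder

   The DivMod ops deliver the quotient in the low half and the
   remainder in the high half of the result, which is why the
   ...to32 / ...HIto32 narrowings map onto RAX and RDX respectively. */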

static
ULong dis_Grp1 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, Long d64 )
{
   Int     len;
   HChar   dis_buf[50];
   IRType  ty   = szToITy(sz);
   IRTemp  dst1 = newTemp(ty);
   IRTemp  src  = newTemp(ty);
   IRTemp  dst0 = newTemp(ty);
   IRTemp  addr = IRTemp_INVALID;
   IROp    op8  = Iop_INVALID;
   ULong   mask = mkSizeMask(sz);

   switch (gregLO3ofRM(modrm)) {
      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
      case 2: break;  // ADC
      case 3: break;  // SBB
      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
      /*NOTREACHED*/
      default: vpanic("dis_Grp1(amd64): unhandled case");
   }

   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);

      assign(dst0, getIRegE(sz,pfx,modrm));
      assign(src,  mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         helper_ADC( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         helper_SBB( sz, dst1, dst0, src,
                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      if (gregLO3ofRM(modrm) < 7)
         putIRegE(sz, pfx, modrm, mkexpr(dst1));

      delta += (am_sz + d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz), d64,
          nameIRegE(sz,pfx,modrm));
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );

      assign(dst0, loadLE(ty,mkexpr(addr)));
      assign(src, mkU(ty,d64 & mask));

      if (gregLO3ofRM(modrm) == 2 /* ADC */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_ADC( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_ADC( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else
      if (gregLO3ofRM(modrm) == 3 /* SBB */) {
         if (haveLOCK(pfx)) {
            /* cas-style store */
            helper_SBB( sz, dst1, dst0, src,
                       /*store*/addr, dst0/*expVal*/, guest_RIP_curr_instr );
         } else {
            /* normal store */
            helper_SBB( sz, dst1, dst0, src,
                        /*store*/addr, IRTemp_INVALID, 0 );
         }
      } else {
         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
         if (gregLO3ofRM(modrm) < 7) {
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
                                    mkexpr(dst1)/*newVal*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr), mkexpr(dst1));
            }
         }
         if (isAddSub(op8))
            setFlags_DEP1_DEP2(op8, dst0, src, ty);
         else
            setFlags_DEP1(op8, dst1, ty);
      }

      delta += (len+d_sz);
      DIP("%s%c $%lld, %s\n",
          nameGrp1(gregLO3ofRM(modrm)), nameISize(sz),
          d64, dis_buf);
   }
   return delta;
}
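
/* Illustrative example: for "lock addl $1, (%rbx)" the memory branch
   above emits, roughly:

      t_addr = GET:I64(rbx)
      t_dst0 = LDle:I32(t_addr)
      t_dst1 = Add32(t_dst0, 0x1:I32)
      CASle(t_addr :: t_dst0 -> t_dst1)
      ...flags thunk built from t_dst0 and the immediate...

   casLE arranges for the guest instruction to be restarted if the
   compare-and-swap fails (someone else wrote the location in the
   meantime), which is what gives the LOCK prefix its atomicity. */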


/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
   expression. */

static
ULong dis_Grp2 ( VexAbiInfo* vbi,
                 Prefix pfx,
                 Long delta, UChar modrm,
                 Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
                 const HChar* shift_expr_txt, Bool* decode_OK )
{
   /* delta on entry points at the modrm byte. */
   HChar  dis_buf[50];
   Int    len;
   Bool   isShift, isRotate, isRotateC;
   IRType ty    = szToITy(sz);
   IRTemp dst0  = newTemp(ty);
   IRTemp dst1  = newTemp(ty);
   IRTemp addr  = IRTemp_INVALID;

   *decode_OK = True;

   vassert(sz == 1 || sz == 2 || sz == 4 || sz == 8);

   /* Put value to shift/rotate in dst0. */
   if (epartIsReg(modrm)) {
      assign(dst0, getIRegE(sz, pfx, modrm));
      delta += (am_sz + d_sz);
   } else {
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, /*xtra*/d_sz );
      assign(dst0, loadLE(ty,mkexpr(addr)));
      delta += len + d_sz;
   }

   isShift = False;
   switch (gregLO3ofRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }

   isRotate = False;
   switch (gregLO3ofRM(modrm)) { case 0: case 1: isRotate = True; }

   isRotateC = False;
   switch (gregLO3ofRM(modrm)) { case 2: case 3: isRotateC = True; }

   if (!isShift && !isRotate && !isRotateC) {
      /*NOTREACHED*/
      vpanic("dis_Grp2(Reg): unhandled case(amd64)");
   }

   if (isRotateC) {
      /* Call a helper; this insn is so ridiculous it does not deserve
         better.  One problem is, the helper has to calculate both the
         new value and the new flags.  This is more than 64 bits, and
         there is no way to return more than 64 bits from the helper.
         Hence the crude and obvious solution is to call it twice,
         using the sign of the sz field to indicate whether it is the
         value or rflags result we want.
      */
      Bool     left = toBool(gregLO3ofRM(modrm) == 2);
      IRExpr** argsVALUE;
      IRExpr** argsRFLAGS;

      IRTemp new_value  = newTemp(Ity_I64);
      IRTemp new_rflags = newTemp(Ity_I64);
      IRTemp old_rflags = newTemp(Ity_I64);

      assign( old_rflags, widenUto64(mk_amd64g_calculate_rflags_all()) );

      argsVALUE
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(sz) );
      assign( new_value,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsVALUE
                 )
            );

      argsRFLAGS
         = mkIRExprVec_4( widenUto64(mkexpr(dst0)), /* thing to rotate */
                          widenUto64(shift_expr),   /* rotate amount */
                          mkexpr(old_rflags),
                          mkU64(-sz) );
      assign( new_rflags,
                 mkIRExprCCall(
                    Ity_I64,
                    0/*regparm*/,
                    left ? "amd64g_calculate_RCL" : "amd64g_calculate_RCR",
                    left ? &amd64g_calculate_RCL  : &amd64g_calculate_RCR,
                    argsRFLAGS
                 )
            );

      assign( dst1, narrowTo(ty, mkexpr(new_value)) );
      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(new_rflags) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
   }

   else
   if (isShift) {

      IRTemp pre64     = newTemp(Ity_I64);
      IRTemp res64     = newTemp(Ity_I64);
      IRTemp res64ss   = newTemp(Ity_I64);
      IRTemp shift_amt = newTemp(Ity_I8);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);
      IROp   op64;

      switch (gregLO3ofRM(modrm)) {
         case 4: op64 = Iop_Shl64; break;
         case 5: op64 = Iop_Shr64; break;
         case 6: op64 = Iop_Shl64; break;
         case 7: op64 = Iop_Sar64; break;
         /*NOTREACHED*/
         default: vpanic("dis_Grp2:shift"); break;
      }

      /* Widen the value to be shifted to 64 bits, do the shift, and
         narrow back down.  This seems surprisingly long-winded, but
         unfortunately the AMD semantics requires that 8/16/32-bit
         shifts give defined results for shift values all the way up
         to 32, and this seems the simplest way to do it.  It has the
         advantage that the only IR level shifts generated are of 64
         bit values, and the shift amount is guaranteed to be in the
         range 0 .. 63, thereby observing the IR semantics requiring
         all shift values to be in the range 0 .. 2^word_size-1.

         Therefore the shift amount is masked with 63 for 64-bit shifts
         and 31 for all others.
      */
      /* shift_amt = shift_expr & MASK, regardless of operation size */
      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(mask)) );

      /* suitably widen the value to be shifted to 64 bits. */
      assign( pre64, op64==Iop_Sar64 ? widenSto64(mkexpr(dst0))
                                     : widenUto64(mkexpr(dst0)) );

      /* res64 = pre64 `shift` shift_amt */
      assign( res64, binop(op64, mkexpr(pre64), mkexpr(shift_amt)) );

      /* res64ss = pre64 `shift` ((shift_amt - 1) & MASK); the flags
         helper uses this shifted-by-one-less value to recover the
         last bit shifted out, i.e. the new carry flag. */
      assign( res64ss,
              binop(op64,
                    mkexpr(pre64),
                    binop(Iop_And8,
                          binop(Iop_Sub8,
                                mkexpr(shift_amt), mkU8(1)),
                          mkU8(mask))) );

      /* Build the flags thunk. */
      setFlags_DEP1_DEP2_shift(op64, res64, res64ss, ty, shift_amt);

      /* Narrow the result back down. */
      assign( dst1, narrowTo(ty, mkexpr(res64)) );

   } /* if (isShift) */

   else
   if (isRotate) {
      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1
                                        : (ty==Ity_I32 ? 2 : 3));
      Bool   left      = toBool(gregLO3ofRM(modrm) == 0);
      IRTemp rot_amt   = newTemp(Ity_I8);
      IRTemp rot_amt64 = newTemp(Ity_I8);
      IRTemp oldFlags  = newTemp(Ity_I64);
      UChar  mask      = toUChar(sz==8 ? 63 : 31);

      /* rot_amt = shift_expr & mask */
      /* By masking the rotate amount thusly, the IR-level Shl/Shr
         expressions never shift beyond the word size and thus remain
         well defined. */
      assign(rot_amt64, binop(Iop_And8, shift_expr, mkU8(mask)));

      if (ty == Ity_I64)
         assign(rot_amt, mkexpr(rot_amt64));
      else
         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt64), mkU8(8*sz-1)));

      if (left) {

         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_ROLB;

      } else { /* right */

         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
         assign(dst1,
            binop( mkSizedOp(ty,Iop_Or8),
                   binop( mkSizedOp(ty,Iop_Shr8),
                          mkexpr(dst0),
                          mkexpr(rot_amt)
                   ),
                   binop( mkSizedOp(ty,Iop_Shl8),
                          mkexpr(dst0),
                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
                   )
            )
         );
         ccOp += AMD64G_CC_OP_RORB;

      }

      /* dst1 now holds the rotated value.  Build flag thunk.  We
         need the resulting value for this, and the previous flags.
         Except don't set it if the rotate count is zero. */

      assign(oldFlags, mk_amd64g_calculate_rflags_all());

      /* rot_amt64 :: Ity_I8.  We need to convert it to I1. */
      IRTemp rot_amt64b = newTemp(Ity_I1);
      assign(rot_amt64b, binop(Iop_CmpNE8, mkexpr(rot_amt64), mkU8(0)) );

      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
      stmt( IRStmt_Put( OFFB_CC_OP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(ccOp),
                                    IRExpr_Get(OFFB_CC_OP,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP1,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    widenUto64(mkexpr(dst1)),
                                    IRExpr_Get(OFFB_CC_DEP1,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_DEP2,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkU64(0),
                                    IRExpr_Get(OFFB_CC_DEP2,Ity_I64) ) ));
      stmt( IRStmt_Put( OFFB_CC_NDEP,
                        IRExpr_ITE( mkexpr(rot_amt64b),
                                    mkexpr(oldFlags),
                                    IRExpr_Get(OFFB_CC_NDEP,Ity_I64) ) ));
   } /* if (isRotate) */

   /* Save result, and finish up. */
   if (epartIsReg(modrm)) {
      putIRegE(sz, pfx, modrm, mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", nameIRegE(sz,pfx,modrm));
      }
   } else {
      storeLE(mkexpr(addr), mkexpr(dst1));
      if (vex_traceflags & VEX_TRACE_FE) {
         vex_printf("%s%c ",
                    nameGrp2(gregLO3ofRM(modrm)), nameISize(sz) );
         if (shift_expr_txt)
            vex_printf("%s", shift_expr_txt);
         else
            ppIRExpr(shift_expr);
         vex_printf(", %s\n", dis_buf);
      }
   }
   return delta;
}
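
/* Note on the isRotateC path above (illustrative): the same helper is
   invoked twice with final argument sz and -sz respectively; e.g. for
   "rclb $1, %al" the calls are, schematically,

      new_value  = amd64g_calculate_RCL(al, 1, rflags,  1)
      new_rflags = amd64g_calculate_RCL(al, 1, rflags, -1)

   and the helper inspects the sign of that argument to decide which
   of its two 64-bit results to hand back, sidestepping the
   one-return-value limitation mentioned in the comment. */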


/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
static
ULong dis_Grp8_Imm ( VexAbiInfo* vbi,
                     Prefix pfx,
                     Long delta, UChar modrm,
                     Int am_sz, Int sz, ULong src_val,
                     Bool* decode_OK )
{
   /* src_val denotes a d8.
      And delta on entry points at the modrm byte. */

   IRType ty     = szToITy(sz);
   IRTemp t2     = newTemp(Ity_I64);
   IRTemp t2m    = newTemp(Ity_I64);
   IRTemp t_addr = IRTemp_INVALID;
   HChar  dis_buf[50];
   ULong  mask;

   /* we're optimists :-) */
   *decode_OK = True;

   /* Check whether F2 or F3 are acceptable. */
   if (epartIsReg(modrm)) {
      /* F2 or F3 are not allowed in the register case. */
      if (haveF2orF3(pfx)) {
         *decode_OK = False;
         return delta;
      }
   } else {
      /* F2 or F3 (but not both) are allowable provided LOCK is also
         present. */
      if (haveF2orF3(pfx)) {
         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
            *decode_OK = False;
            return delta;
         }
      }
   }

   /* Limit src_val -- the bit offset -- to something within a word.
      The Intel docs say that literal offsets larger than a word are
      masked in this way. */
   switch (sz) {
      case 2:  src_val &= 15; break;
      case 4:  src_val &= 31; break;
      case 8:  src_val &= 63; break;
      default: *decode_OK = False; return delta;
   }

   /* Invent a mask suitable for the operation. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */  mask = 0;                  break;
      case 5: /* BTS */ mask = 1ULL << src_val;    break;
      case 6: /* BTR */ mask = ~(1ULL << src_val); break;
      case 7: /* BTC */ mask = 1ULL << src_val;    break;
         /* If this needs to be extended, probably simplest to make a
            new function to handle the other cases (0 .. 3).  The
            Intel docs do not, however, indicate any use for 0 .. 3,
            so we don't expect this to happen. */
      default: *decode_OK = False; return delta;
   }

   /* Fetch the value to be tested and modified into t2, which is
      64-bits wide regardless of sz. */
   if (epartIsReg(modrm)) {
      vassert(am_sz == 1);
      assign( t2, widenUto64(getIRegE(sz, pfx, modrm)) );
      delta += (am_sz + 1);
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, nameIRegE(sz,pfx,modrm));
   } else {
      Int len;
      t_addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 1 );
      delta  += (len+1);
      assign( t2, widenUto64(loadLE(ty, mkexpr(t_addr))) );
      DIP("%s%c $0x%llx, %s\n", nameGrp8(gregLO3ofRM(modrm)),
                                nameISize(sz),
                                src_val, dis_buf);
   }

   /* Compute the new value into t2m, if non-BT. */
   switch (gregLO3ofRM(modrm)) {
      case 4: /* BT */
         break;
      case 5: /* BTS */
         assign( t2m, binop(Iop_Or64, mkU64(mask), mkexpr(t2)) );
         break;
      case 6: /* BTR */
         assign( t2m, binop(Iop_And64, mkU64(mask), mkexpr(t2)) );
         break;
      case 7: /* BTC */
         assign( t2m, binop(Iop_Xor64, mkU64(mask), mkexpr(t2)) );
         break;
      default:
         /*NOTREACHED*/ /*the previous switch guards this*/
         vassert(0);
   }

   /* Write the result back, if non-BT. */
   if (gregLO3ofRM(modrm) != 4 /* BT */) {
      if (epartIsReg(modrm)) {
         putIRegE(sz, pfx, modrm, narrowTo(ty, mkexpr(t2m)));
      } else {
         if (haveLOCK(pfx)) {
            casLE( mkexpr(t_addr),
                   narrowTo(ty, mkexpr(t2))/*expd*/,
                   narrowTo(ty, mkexpr(t2m))/*new*/,
                   guest_RIP_curr_instr );
         } else {
            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
         }
      }
   }

   /* Copy relevant bit from t2 into the carry flag. */
   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64,
                  binop(Iop_Shr64, mkexpr(t2), mkU8(src_val)),
                  mkU64(1))
       ));
   /* Set NDEP even though it isn't used.  This makes redundant-PUT
      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}
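
/* Example (illustrative): for "btsq $70, (%rax)" the bit offset is
   first masked to 70 & 63 == 6, so mask == 1 << 6.  The new value
   t2m = t2 | mask is written back (via casLE if LOCKed), and bit 6 of
   the original value ends up in CF through the Shr64/And64 pair
   above. */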


/* Signed/unsigned widening multiply.  Generate IR to multiply the
   value in RAX/EAX/AX/AL by the given IRTemp, and park the result in
   RDX:RAX/EDX:EAX/DX:AX/AX.
*/
static void codegen_mulL_A_D ( Int sz, Bool syned,
                               IRTemp tmp, const HChar* tmp_txt )
{
   IRType ty = szToITy(sz);
   IRTemp t1 = newTemp(ty);

   assign( t1, getIRegRAX(sz) );

   switch (ty) {
      case Ity_I64: {
         IRTemp res128  = newTemp(Ity_I128);
         IRTemp resHi   = newTemp(Ity_I64);
         IRTemp resLo   = newTemp(Ity_I64);
         IROp   mulOp   = syned ? Iop_MullS64 : Iop_MullU64;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I64, t1, tmp, tBaseOp );
         assign( res128, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_128HIto64,mkexpr(res128)));
         assign( resLo, unop(Iop_128to64,mkexpr(res128)));
         putIReg64(R_RDX, mkexpr(resHi));
         putIReg64(R_RAX, mkexpr(resLo));
         break;
      }
      case Ity_I32: {
         IRTemp res64   = newTemp(Ity_I64);
         IRTemp resHi   = newTemp(Ity_I32);
         IRTemp resLo   = newTemp(Ity_I32);
         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
         putIRegRDX(4, mkexpr(resHi));
         putIRegRAX(4, mkexpr(resLo));
         break;
      }
      case Ity_I16: {
         IRTemp res32   = newTemp(Ity_I32);
         IRTemp resHi   = newTemp(Ity_I16);
         IRTemp resLo   = newTemp(Ity_I16);
         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
         putIRegRDX(2, mkexpr(resHi));
         putIRegRAX(2, mkexpr(resLo));
         break;
      }
      case Ity_I8: {
         IRTemp res16   = newTemp(Ity_I16);
         IRTemp resHi   = newTemp(Ity_I8);
         IRTemp resLo   = newTemp(Ity_I8);
         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
         UInt   tBaseOp = syned ? AMD64G_CC_OP_SMULB : AMD64G_CC_OP_UMULB;
         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
         putIRegRAX(2, mkexpr(res16));
         break;
      }
      default:
         ppIRType(ty);
         vpanic("codegen_mulL_A_D(amd64)");
   }
   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
}
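
/* Illustrative sketch of the 64-bit case: "mulq %rbx" becomes roughly

      t1       = GET:I64(rax)
      t_res    = MullU64( t1, GET:I64(rbx) )   -- an I128
      PUT(rdx) = 128HIto64(t_res)
      PUT(rax) = 128to64(t_res)

   together with a flags thunk whose cc_op encodes both the operand
   size and the signedness, so that OF/CF can later be derived from
   whether the high half is merely the sign/zero extension of the low
   half. */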


/* Group 3 extended opcodes.  We have to decide here whether F2 and F3
   might be valid. */
static
ULong dis_Grp3 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta, Bool* decode_OK )
{
   Long    d64;
   UChar   modrm;
   HChar   dis_buf[50];
   Int     len;
   IRTemp  addr;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp  dst1, src, dst0;
   *decode_OK = True;
   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            delta++;
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               getIRegE(sz,pfx,modrm),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n",
                nameISize(sz), d64,
                nameIRegE(sz, pfx, modrm));
            break;
         }
         case 1:
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            delta++;
            putIRegE(sz, pfx, modrm,
                              unop(mkSizedOp(ty,Iop_Not8),
                                   getIRegE(sz, pfx, modrm)));
            DIP("not%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 3: /* NEG */
            delta++;
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  getIRegE(sz, pfx, modrm));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            putIRegE(sz, pfx, modrm, mkexpr(dst1));
            DIP("neg%c %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm));
            break;
         case 4: /* MUL (unsigned widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, False, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 5: /* IMUL (signed widening) */
            delta++;
            src = newTemp(ty);
            assign(src, getIRegE(sz,pfx,modrm));
            codegen_mulL_A_D ( sz, True, src,
                               nameIRegE(sz,pfx,modrm) );
            break;
         case 6: /* DIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz),
                              nameIRegE(sz, pfx, modrm));
            break;
         case 7: /* IDIV */
            delta++;
            assign( t1, getIRegE(sz, pfx, modrm) );
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz),
                               nameIRegE(sz, pfx, modrm));
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,R)");
      }
   } else {
      /* Decide if F2/XACQ or F3/XREL might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 3/*NEG*/ || gregLO3ofRM(modrm) == 2/*NOT*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
                        /* we have to inform disAMode of any immediate
                           bytes used */
                        gregLO3ofRM(modrm)==0/*TEST*/
                           ? imin(4,sz)
                           : 0
                      );
      t1   = newTemp(ty);
      delta += len;
      assign(t1, loadLE(ty,mkexpr(addr)));
      switch (gregLO3ofRM(modrm)) {
         case 0: { /* TEST */
            d64 = getSDisp(imin(4,sz), delta);
            delta += imin(4,sz);
            dst1 = newTemp(ty);
            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
                               mkexpr(t1),
                               mkU(ty, d64 & mkSizeMask(sz))));
            setFlags_DEP1( Iop_And8, dst1, ty );
            DIP("test%c $%lld, %s\n", nameISize(sz), d64, dis_buf);
            break;
         }
         case 1:
            *decode_OK = False;
            return delta;
         case 2: /* NOT */
            dst1 = newTemp(ty);
            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            DIP("not%c %s\n", nameISize(sz), dis_buf);
            break;
         case 3: /* NEG */
            dst0 = newTemp(ty);
            src  = newTemp(ty);
            dst1 = newTemp(ty);
            assign(dst0, mkU(ty,0));
            assign(src,  mkexpr(t1));
            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0),
                                                       mkexpr(src)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
                                    guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(dst1) );
            }
            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
            DIP("neg%c %s\n", nameISize(sz), dis_buf);
            break;
         case 4: /* MUL (unsigned widening) */
            codegen_mulL_A_D ( sz, False, t1, dis_buf );
            break;
         case 5: /* IMUL */
            codegen_mulL_A_D ( sz, True, t1, dis_buf );
            break;
         case 6: /* DIV */
            codegen_div ( sz, t1, False );
            DIP("div%c %s\n", nameISize(sz), dis_buf);
            break;
         case 7: /* IDIV */
            codegen_div ( sz, t1, True );
            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
            break;
         default:
            /*NOTREACHED*/
            vpanic("Grp3(amd64,M)");
      }
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}
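
/* Note (illustrative) on the disAMode "xtra" argument used above for
   TEST: a memory operand may be RIP-relative, and the displacement is
   relative to the end of the whole instruction.  Telling disAMode
   that imin(4,sz) immediate bytes still follow lets it compute the
   correct effective address for, say, "testl $0x80, 4(%rip)". */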


/* Group 4 extended opcodes.  We have to decide here whether F2 and F3
   might be valid. */
static
ULong dis_Grp4 ( VexAbiInfo* vbi,
                 Prefix pfx, Long delta, Bool* decode_OK )
{
   Int   alen;
   UChar modrm;
   HChar dis_buf[50];
   IRType ty = Ity_I8;
   IRTemp t1 = newTemp(ty);
   IRTemp t2 = newTemp(ty);

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case. */
      if (haveF2orF3(pfx)) goto unhandled;
      assign(t1, getIRegE(1, pfx, modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            putIRegE(1, pfx, modrm, mkexpr(t2));
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)),
                      nameIRegE(1, pfx, modrm));
   } else {
      /* Decide if F2/XACQ or F3/XREL might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandled;
      /* */
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( t1, loadLE(ty, mkexpr(addr)) );
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
                      guest_RIP_curr_instr );
            } else {
               storeLE( mkexpr(addr), mkexpr(t2) );
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         default:
            *decode_OK = False;
            return delta;
      }
      delta += alen;
      DIP("%sb %s\n", nameGrp4(gregLO3ofRM(modrm)), dis_buf);
   }
   return delta;
  unhandled:
   *decode_OK = False;
   return delta;
}


/* Group 5 extended opcodes.  We have to decide here whether F2 and F3
   might be valid. */
static
ULong dis_Grp5 ( VexAbiInfo* vbi,
                 Prefix pfx, Int sz, Long delta,
                 /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
{
   Int     len;
   UChar   modrm;
   HChar   dis_buf[50];
   IRTemp  addr = IRTemp_INVALID;
   IRType  ty = szToITy(sz);
   IRTemp  t1 = newTemp(ty);
   IRTemp  t2 = IRTemp_INVALID;
   IRTemp  t3 = IRTemp_INVALID;
   Bool    showSz = True;

   *decode_OK = True;

   modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      /* F2/XACQ and F3/XREL are always invalid in the non-mem case.
         F2/CALL and F2/JMP may have bnd prefix. */
      if (haveF2orF3(pfx)
          && ! (haveF2(pfx)
                && (gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)))
         goto unhandledR;
      assign(t1, getIRegE(sz,pfx,modrm));
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( True, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            setFlags_INC_DEC( False, t2, ty );
            putIRegE(sz,pfx,modrm, mkexpr(t2));
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+1));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(reg)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* jmp Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledR;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, getIRegE(sz,pfx,modrm));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, getIRegE(sz,pfx,modrm));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledR; /* awaiting test case */
            }
         default:
         unhandledR:
            *decode_OK = False;
            return delta;
      }
      delta++;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       nameIRegE(sz, pfx, modrm));
   } else {
      /* Decide if F2/XACQ, F3/XREL, F2/CALL or F2/JMP might be valid. */
      Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
      if ((gregLO3ofRM(modrm) == 0/*INC*/ || gregLO3ofRM(modrm) == 1/*DEC*/)
          && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
         validF2orF3 = True;
      } else if ((gregLO3ofRM(modrm) == 2 || gregLO3ofRM(modrm) == 4)
                 && (haveF2(pfx) && !haveF3(pfx))) {
         validF2orF3 = True;
      }
      if (!validF2orF3) goto unhandledM;
      /* */
      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
      if (gregLO3ofRM(modrm) != 2 && gregLO3ofRM(modrm) != 4
                                  && gregLO3ofRM(modrm) != 6) {
         assign(t1, loadLE(ty,mkexpr(addr)));
      }
      switch (gregLO3ofRM(modrm)) {
         case 0: /* INC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
                             mkexpr(t1), mkU(ty,1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( True, t2, ty );
            break;
         case 1: /* DEC */
            t2 = newTemp(ty);
            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
                             mkexpr(t1), mkU(ty,1)));
            if (haveLOCK(pfx)) {
               casLE( mkexpr(addr),
                      mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
            } else {
               storeLE(mkexpr(addr),mkexpr(t2));
            }
            setFlags_INC_DEC( False, t2, ty );
            break;
         case 2: /* call Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            t2 = newTemp(Ity_I64);
            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
            putIReg64(R_RSP, mkexpr(t2));
            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta+len));
            make_redzone_AbiHint(vbi, t2, t3/*nia*/, "call-Ev(mem)");
            jmp_treg(dres, Ijk_Call, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 4: /* JMP Ev */
            /* Ignore any sz value and operate as if sz==8. */
            if (!(sz == 4 || sz == 8)) goto unhandledM;
            if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
            sz = 8;
            t3 = newTemp(Ity_I64);
            assign(t3, loadLE(Ity_I64,mkexpr(addr)));
            jmp_treg(dres, Ijk_Boring, t3);
            vassert(dres->whatNext == Dis_StopHere);
            showSz = False;
            break;
         case 6: /* PUSH Ev */
            /* There is no encoding for 32-bit operand size; hence ... */
            if (sz == 4) sz = 8;
            if (sz == 8 || sz == 2) {
               ty = szToITy(sz); /* redo it, since sz might have changed */
               t3 = newTemp(ty);
               assign(t3, loadLE(ty,mkexpr(addr)));
               t2 = newTemp(Ity_I64);
               assign( t2, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
               putIReg64(R_RSP, mkexpr(t2) );
               storeLE( mkexpr(t2), mkexpr(t3) );
               break;
            } else {
               goto unhandledM; /* awaiting test case */
            }
         default:
         unhandledM:
            *decode_OK = False;
            return delta;
      }
      delta += len;
      DIP("%s%c %s\n", nameGrp5(gregLO3ofRM(modrm)),
                       showSz ? nameISize(sz) : ' ',
                       dis_buf);
   }
   return delta;
}
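
/* Illustrative sketch for the mem case of "call *(%rax)" (greg == 2):

      t3 = LDle:I64( <ea of (%rax)> )   -- target, loaded first
      t2 = Sub64( GET:I64(rsp), 0x8:I64 )
      PUT(rsp) = t2
      STle(t2) = <guest addr of next insn>
      goto {Call} t3

   Note the target is loaded before RSP is moved; this matters if the
   operand's effective address is itself RSP-relative, since the
   hardware computes it using the pre-push RSP. */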


/*------------------------------------------------------------*/
/*--- Disassembling string ops (including REP prefixes)    ---*/
/*------------------------------------------------------------*/

/* Code shared by all the string ops */
static
void dis_string_op_increment ( Int sz, IRTemp t_inc )
{
   UChar logSz;
   if (sz == 8 || sz == 4 || sz == 2) {
      logSz = 1;
      if (sz == 4) logSz = 2;
      if (sz == 8) logSz = 3;
      assign( t_inc,
              binop(Iop_Shl64, IRExpr_Get( OFFB_DFLAG, Ity_I64 ),
                               mkU8(logSz) ) );
   } else {
      assign( t_inc,
              IRExpr_Get( OFFB_DFLAG, Ity_I64 ) );
   }
}
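
/* Example (illustrative): the guest DFLAG field holds 1 when DF is
   clear and -1 (all ones) when DF is set.  For a 4-byte string op,
   logSz == 2, so t_inc becomes 1 << 2 == 4 or -1 << 2 == -4
   respectively -- exactly the forward or backward stride the string
   instructions need. */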
4461
4462static
4463void dis_string_op( void (*dis_OP)( Int, IRTemp, Prefix pfx ),
4464                    Int sz, const HChar* name, Prefix pfx )
4465{
4466   IRTemp t_inc = newTemp(Ity_I64);
4467   /* Really we ought to inspect the override prefixes, but we don't.
4468      The following assertion catches any resulting sillyness. */
4469   vassert(pfx == clearSegBits(pfx));
4470   dis_string_op_increment(sz, t_inc);
4471   dis_OP( sz, t_inc, pfx );
4472   DIP("%s%c\n", name, nameISize(sz));
4473}
4474
4475static
4476void dis_MOVS ( Int sz, IRTemp t_inc, Prefix pfx )
4477{
4478   IRType ty = szToITy(sz);
4479   IRTemp td = newTemp(Ity_I64);   /* RDI */
4480   IRTemp ts = newTemp(Ity_I64);   /* RSI */
4481   IRExpr *incd, *incs;
4482
4483   if (haveASO(pfx)) {
4484      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4485      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4486   } else {
4487      assign( td, getIReg64(R_RDI) );
4488      assign( ts, getIReg64(R_RSI) );
4489   }
4490
4491   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
4492
4493   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4494   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4495   if (haveASO(pfx)) {
4496      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4497      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4498   }
4499   putIReg64( R_RDI, incd );
4500   putIReg64( R_RSI, incs );
4501}
4502
4503static
4504void dis_LODS ( Int sz, IRTemp t_inc, Prefix pfx )
4505{
4506   IRType ty = szToITy(sz);
4507   IRTemp ts = newTemp(Ity_I64);   /* RSI */
4508   IRExpr *incs;
4509
4510   if (haveASO(pfx))
4511      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4512   else
4513      assign( ts, getIReg64(R_RSI) );
4514
4515   putIRegRAX ( sz, loadLE(ty, mkexpr(ts)) );
4516
4517   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4518   if (haveASO(pfx))
4519      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4520   putIReg64( R_RSI, incs );
4521}
4522
4523static
4524void dis_STOS ( Int sz, IRTemp t_inc, Prefix pfx )
4525{
4526   IRType ty = szToITy(sz);
4527   IRTemp ta = newTemp(ty);        /* rAX */
4528   IRTemp td = newTemp(Ity_I64);   /* RDI */
4529   IRExpr *incd;
4530
4531   assign( ta, getIRegRAX(sz) );
4532
4533   if (haveASO(pfx))
4534      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4535   else
4536      assign( td, getIReg64(R_RDI) );
4537
4538   storeLE( mkexpr(td), mkexpr(ta) );
4539
4540   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4541   if (haveASO(pfx))
4542      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4543   putIReg64( R_RDI, incd );
4544}
4545
4546static
4547void dis_CMPS ( Int sz, IRTemp t_inc, Prefix pfx )
4548{
4549   IRType ty  = szToITy(sz);
4550   IRTemp tdv = newTemp(ty);      /* (RDI) */
4551   IRTemp tsv = newTemp(ty);      /* (RSI) */
4552   IRTemp td  = newTemp(Ity_I64); /*  RDI  */
4553   IRTemp ts  = newTemp(Ity_I64); /*  RSI  */
4554   IRExpr *incd, *incs;
4555
4556   if (haveASO(pfx)) {
4557      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4558      assign( ts, unop(Iop_32Uto64, getIReg32(R_RSI)) );
4559   } else {
4560      assign( td, getIReg64(R_RDI) );
4561      assign( ts, getIReg64(R_RSI) );
4562   }
4563
4564   assign( tdv, loadLE(ty,mkexpr(td)) );
4565
4566   assign( tsv, loadLE(ty,mkexpr(ts)) );
4567
4568   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
4569
4570   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4571   incs = binop(Iop_Add64, mkexpr(ts), mkexpr(t_inc));
4572   if (haveASO(pfx)) {
4573      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4574      incs = unop(Iop_32Uto64, unop(Iop_64to32, incs));
4575   }
4576   putIReg64( R_RDI, incd );
4577   putIReg64( R_RSI, incs );
4578}
4579
4580static
4581void dis_SCAS ( Int sz, IRTemp t_inc, Prefix pfx )
4582{
4583   IRType ty  = szToITy(sz);
4584   IRTemp ta  = newTemp(ty);       /*  rAX  */
4585   IRTemp td  = newTemp(Ity_I64);  /*  RDI  */
4586   IRTemp tdv = newTemp(ty);       /* (RDI) */
4587   IRExpr *incd;
4588
4589   assign( ta, getIRegRAX(sz) );
4590
4591   if (haveASO(pfx))
4592      assign( td, unop(Iop_32Uto64, getIReg32(R_RDI)) );
4593   else
4594      assign( td, getIReg64(R_RDI) );
4595
4596   assign( tdv, loadLE(ty,mkexpr(td)) );
4597
4598   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
4599
4600   incd = binop(Iop_Add64, mkexpr(td), mkexpr(t_inc));
4601   if (haveASO(pfx))
4602      incd = unop(Iop_32Uto64, unop(Iop_64to32, incd));
4603   putIReg64( R_RDI, incd );
4604}


/* Wrap the appropriate string op inside a REP/REPE/REPNE.  We assume
   the insn is the last one in the basic block, and so emit a jump to
   the next insn, rather than just falling through. */
static
void dis_REP_op ( /*MOD*/DisResult* dres,
                  AMD64Condcode cond,
                  void (*dis_OP)(Int, IRTemp, Prefix),
                  Int sz, Addr64 rip, Addr64 rip_next, const HChar* name,
                  Prefix pfx )
{
   IRTemp t_inc = newTemp(Ity_I64);
   IRTemp tc;
   IRExpr* cmp;

   /* Really we ought to inspect the override prefixes, but we don't.
      The following assertion catches any resulting silliness. */
   vassert(pfx == clearSegBits(pfx));

   if (haveASO(pfx)) {
      tc = newTemp(Ity_I32);  /*  ECX  */
      assign( tc, getIReg32(R_RCX) );
      cmp = binop(Iop_CmpEQ32, mkexpr(tc), mkU32(0));
   } else {
      tc = newTemp(Ity_I64);  /*  RCX  */
      assign( tc, getIReg64(R_RCX) );
      cmp = binop(Iop_CmpEQ64, mkexpr(tc), mkU64(0));
   }

   stmt( IRStmt_Exit( cmp, Ijk_Boring,
                      IRConst_U64(rip_next), OFFB_RIP ) );

   if (haveASO(pfx))
      putIReg32(R_RCX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
   else
      putIReg64(R_RCX, binop(Iop_Sub64, mkexpr(tc), mkU64(1)) );

   dis_string_op_increment(sz, t_inc);
   dis_OP (sz, t_inc, pfx);

   if (cond == AMD64CondAlways) {
      jmp_lit(dres, Ijk_Boring, rip);
      vassert(dres->whatNext == Dis_StopHere);
   } else {
      stmt( IRStmt_Exit( mk_amd64g_calculate_condition(cond),
                         Ijk_Boring,
                         IRConst_U64(rip),
                         OFFB_RIP ) );
      jmp_lit(dres, Ijk_Boring, rip_next);
      vassert(dres->whatNext == Dis_StopHere);
   }
   DIP("%s%c\n", name, nameISize(sz));
}
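
/* For concreteness, a sketch of what the above generates for
   "rep movsb" at address rip, with the next insn at rip_next:

      if (RCX == 0) goto rip_next;
      RCX = RCX - 1;
      <one MOVSB step, advancing RSI/RDI by the increment>;
      goto rip;    -- re-decode; one iteration per block execution

   The REPE/REPNE variants (cond != AMD64CondAlways) instead loop
   back to rip only while the condition, computed from the just-set
   flags, holds, and otherwise fall through to rip_next. */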


/*------------------------------------------------------------*/
/*--- Arithmetic, etc.                                     ---*/
/*------------------------------------------------------------*/

/* IMUL E, G.  Supplied rip points to the modR/M byte. */
static
ULong dis_mul_E_G ( VexAbiInfo* vbi,
                    Prefix      pfx,
                    Int         size,
                    Long        delta0 )
{
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta0);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);
   IRTemp tg = newTemp(ty);
   IRTemp resLo = newTemp(ty);

   assign( tg, getIRegG(size, pfx, rm) );
   if (epartIsReg(rm)) {
      assign( te, getIRegE(size, pfx, rm) );
   } else {
      IRTemp addr = disAMode( &alen, vbi, pfx, delta0, dis_buf, 0 );
      assign( te, loadLE(ty,mkexpr(addr)) );
   }

   setFlags_MUL ( ty, te, tg, AMD64G_CC_OP_SMULB );

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );

   putIRegG(size, pfx, rm, mkexpr(resLo) );

   if (epartIsReg(rm)) {
      DIP("imul%c %s, %s\n", nameISize(size),
                             nameIRegE(size,pfx,rm),
                             nameIRegG(size,pfx,rm));
      return 1+delta0;
   } else {
      DIP("imul%c %s, %s\n", nameISize(size),
                             dis_buf,
                             nameIRegG(size,pfx,rm));
      return alen+delta0;
   }
}


/* IMUL I * E -> G.  Supplied rip points to the modR/M byte. */
static
ULong dis_imul_I_E_G ( VexAbiInfo* vbi,
                       Prefix      pfx,
                       Int         size,
                       Long        delta,
                       Int         litsize )
{
   Long   d64;
   Int    alen;
   HChar  dis_buf[50];
   UChar  rm = getUChar(delta);
   IRType ty = szToITy(size);
   IRTemp te = newTemp(ty);
   IRTemp tl = newTemp(ty);
   IRTemp resLo = newTemp(ty);

   vassert(/*size == 1 ||*/ size == 2 || size == 4 || size == 8);

   if (epartIsReg(rm)) {
      assign(te, getIRegE(size, pfx, rm));
      delta++;
   } else {
      IRTemp addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                              imin(4,litsize) );
      assign(te, loadLE(ty, mkexpr(addr)));
      delta += alen;
   }
   d64 = getSDisp(imin(4,litsize),delta);
   delta += imin(4,litsize);

   d64 &= mkSizeMask(size);
   assign(tl, mkU(ty,d64));

   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));

   setFlags_MUL ( ty, te, tl, AMD64G_CC_OP_SMULB );

   putIRegG(size, pfx, rm, mkexpr(resLo));

   DIP("imul%c $%lld, %s, %s\n",
       nameISize(size), d64,
       ( epartIsReg(rm) ? nameIRegE(size,pfx,rm) : dis_buf ),
       nameIRegG(size,pfx,rm) );
   return delta;
}
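
/* Note the immediate is never wider than 4 bytes, even for 8-byte
   operations (hence imin(4,litsize)): getSDisp sign-extends it and
   the mkSizeMask step truncates it to the operand size.  So, for
   example, "imulq $-1, ..." reads the 4-byte immediate 0xFFFFFFFF
   and effectively multiplies by 0xFFFFFFFFFFFFFFFF. */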


/* Generate an IR sequence to do a popcount operation on the supplied
   IRTemp, and return a new IRTemp holding the result.  'ty' may be
   Ity_I16, Ity_I32 or Ity_I64 only. */
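/* The code below uses the standard divide-and-conquer scheme.  For
   the 16-bit case the steps are, written as ordinary C:

      x = (x & 0x5555) + ((x >> 1) & 0x5555);  // 8 two-bit counts
      x = (x & 0x3333) + ((x >> 2) & 0x3333);  // 4 four-bit counts
      x = (x & 0x0F0F) + ((x >> 4) & 0x0F0F);  // 2 byte counts
      x = (x & 0x00FF) + ((x >> 8) & 0x00FF);  // the final count

   The 32- and 64-bit cases are the same idea with one and two extra
   halving steps respectively. */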
static IRTemp gen_POPCOUNT ( IRType ty, IRTemp src )
{
   Int i;
   if (ty == Ity_I16) {
      IRTemp old = IRTemp_INVALID;
      IRTemp nyu = IRTemp_INVALID;
      IRTemp mask[4];
      Int    shift[4];
      for (i = 0; i < 4; i++) {
         mask[i]  = newTemp(ty);
         shift[i] = 1 << i;
      }
      assign(mask[0], mkU16(0x5555));
      assign(mask[1], mkU16(0x3333));
      assign(mask[2], mkU16(0x0F0F));
      assign(mask[3], mkU16(0x00FF));
      old = src;
      for (i = 0; i < 4; i++) {
         nyu = newTemp(ty);
         assign(nyu,
                binop(Iop_Add16,
                      binop(Iop_And16,
                            mkexpr(old),
                            mkexpr(mask[i])),
                      binop(Iop_And16,
                            binop(Iop_Shr16, mkexpr(old), mkU8(shift[i])),
                            mkexpr(mask[i]))));
         old = nyu;
      }
      return nyu;
   }
   if (ty == Ity_I32) {
      IRTemp old = IRTemp_INVALID;
      IRTemp nyu = IRTemp_INVALID;
      IRTemp mask[5];
      Int    shift[5];
      for (i = 0; i < 5; i++) {
         mask[i]  = newTemp(ty);
         shift[i] = 1 << i;
      }
      assign(mask[0], mkU32(0x55555555));
      assign(mask[1], mkU32(0x33333333));
      assign(mask[2], mkU32(0x0F0F0F0F));
      assign(mask[3], mkU32(0x00FF00FF));
      assign(mask[4], mkU32(0x0000FFFF));
      old = src;
      for (i = 0; i < 5; i++) {
         nyu = newTemp(ty);
         assign(nyu,
                binop(Iop_Add32,
                      binop(Iop_And32,
                            mkexpr(old),
                            mkexpr(mask[i])),
                      binop(Iop_And32,
                            binop(Iop_Shr32, mkexpr(old), mkU8(shift[i])),
                            mkexpr(mask[i]))));
         old = nyu;
      }
      return nyu;
   }
   if (ty == Ity_I64) {
      IRTemp old = IRTemp_INVALID;
      IRTemp nyu = IRTemp_INVALID;
      IRTemp mask[6];
      Int    shift[6];
      for (i = 0; i < 6; i++) {
         mask[i]  = newTemp(ty);
         shift[i] = 1 << i;
      }
      assign(mask[0], mkU64(0x5555555555555555ULL));
      assign(mask[1], mkU64(0x3333333333333333ULL));
      assign(mask[2], mkU64(0x0F0F0F0F0F0F0F0FULL));
      assign(mask[3], mkU64(0x00FF00FF00FF00FFULL));
      assign(mask[4], mkU64(0x0000FFFF0000FFFFULL));
      assign(mask[5], mkU64(0x00000000FFFFFFFFULL));
      old = src;
      for (i = 0; i < 6; i++) {
         nyu = newTemp(ty);
         assign(nyu,
                binop(Iop_Add64,
                      binop(Iop_And64,
                            mkexpr(old),
                            mkexpr(mask[i])),
                      binop(Iop_And64,
                            binop(Iop_Shr64, mkexpr(old), mkU8(shift[i])),
                            mkexpr(mask[i]))));
         old = nyu;
      }
      return nyu;
   }
   /*NOTREACHED*/
   vassert(0);
}


/* Generate an IR sequence to do a count-leading-zeroes operation on
   the supplied IRTemp, and return a new IRTemp holding the result.
   'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   the argument is zero, return the number of bits in the word (the
   natural semantics). */
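/* The narrower cases are handled by shifting the value left so that
   its MSB lands in bit 63 of an I64; Clz64 then gives the count
   directly.  E.g. for Ity_I16, lzcnt(0x0001) == 15. */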
static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
{
   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);

   IRTemp src64 = newTemp(Ity_I64);
   assign(src64, widenUto64( mkexpr(src) ));

   IRTemp src64x = newTemp(Ity_I64);
   assign(src64x,
          binop(Iop_Shl64, mkexpr(src64),
                           mkU8(64 - 8 * sizeofIRType(ty))));

   // Clz64 has undefined semantics when its input is zero, so
   // special-case around that.
   IRTemp res64 = newTemp(Ity_I64);
   assign(res64,
          IRExpr_ITE(
             binop(Iop_CmpEQ64, mkexpr(src64x), mkU64(0)),
             mkU64(8 * sizeofIRType(ty)),
             unop(Iop_Clz64, mkexpr(src64x))
   ));

   IRTemp res = newTemp(ty);
   assign(res, narrowTo(ty, mkexpr(res64)));
   return res;
}


/* Generate an IR sequence to do a count-trailing-zeroes operation on
   the supplied IRTemp, and return a new IRTemp holding the result.
   'ty' may be Ity_I16, Ity_I32 or Ity_I64 only.  In the case where
   the argument is zero, return the number of bits in the word (the
   natural semantics). */
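/* Unlike the LZCNT case, no pre-shift is needed: zero-extending to
   64 bits preserves the trailing-zero count, and the all-zeroes case
   is handled explicitly.  E.g. for Ity_I16, tzcnt(0x8000) == 15 and
   tzcnt(0) == 16. */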
static IRTemp gen_TZCNT ( IRType ty, IRTemp src )
{
   vassert(ty == Ity_I64 || ty == Ity_I32 || ty == Ity_I16);

   IRTemp src64 = newTemp(Ity_I64);
   assign(src64, widenUto64( mkexpr(src) ));

   // Ctz64 has undefined semantics when its input is zero, so
   // special-case around that.
   IRTemp res64 = newTemp(Ity_I64);
   assign(res64,
          IRExpr_ITE(
             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0)),
             mkU64(8 * sizeofIRType(ty)),
             unop(Iop_Ctz64, mkexpr(src64))
   ));

   IRTemp res = newTemp(ty);
   assign(res, narrowTo(ty, mkexpr(res64)));
   return res;
}


/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

/* --- Helper functions for dealing with the register stack. --- */

/* --- Set the emulation-warning pseudo-register. --- */

static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_EMNOTE, e ) );
}

/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */

static IRExpr* mkQNaN64 ( void )
{
   /* QNaN: sign 0, exponent 2047 (0b11111111111), mantissa MSB 1
      followed by 51 zero bits
      == 0x7FF8 0000 0000 0000
   */
   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
}

/* --------- Get/put the top-of-stack pointer :: Ity_I32 --------- */

static IRExpr* get_ftop ( void )
{
   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
}

static void put_ftop ( IRExpr* e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FTOP, e ) );
}

/* --------- Get/put the C3210 bits. --------- */

static IRExpr*  /* :: Ity_I64 */ get_C3210 ( void )
{
   return IRExpr_Get( OFFB_FC3210, Ity_I64 );
}

static void put_C3210 ( IRExpr* e  /* :: Ity_I64 */ )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I64);
   stmt( IRStmt_Put( OFFB_FC3210, e ) );
}

/* --------- Get/put the FPU rounding mode. --------- */
static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
{
   return unop(Iop_64to32, IRExpr_Get( OFFB_FPROUND, Ity_I64 ));
}

static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
{
   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
   stmt( IRStmt_Put( OFFB_FPROUND, unop(Iop_32Uto64,e) ) );
}


/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
/* Produces a value in 0 .. 3, which is encoded as per the type
   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
   per IRRoundingMode, we merely need to get it and mask it for
   safety.
*/
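/* (For reference: IRRoundingMode encodes 0 = to-nearest, 1 = to
   -infinity, 2 = to +infinity, 3 = to-zero, which coincides with the
   x87 RC field encoding.) */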
static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
{
   return binop( Iop_And32, get_fpround(), mkU32(3) );
}

static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
{
   return mkU32(Irrm_NEAREST);
}


/* --------- Get/set FP register tag bytes. --------- */

/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */

static void put_ST_TAG ( Int i, IRExpr* value )
{
   IRRegArray* descr;
   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
}

/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */

static IRExpr* get_ST_TAG ( Int i )
{
   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}


/* --------- Get/set FP registers. --------- */

/* Given i, and some expression e, emit 'ST(i) = e' and set the
   register's tag to indicate the register is full.  The previous
   state of the register is not checked. */

static void put_ST_UNCHECKED ( Int i, IRExpr* value )
{
   IRRegArray* descr;
   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
   /* Mark the register as in-use. */
   put_ST_TAG(i, mkU8(1));
}

/* Given i, and some expression e, emit
      ST(i) = is_full(i) ? NaN : e
   and set the tag accordingly.
*/

static void put_ST ( Int i, IRExpr* value )
{
   put_ST_UNCHECKED(
      i,
      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
                  /* non-0 means full */
                  mkQNaN64(),
                  /* 0 means empty */
                  value
      )
   );
}


/* Given i, generate an expression yielding 'ST(i)'. */

static IRExpr* get_ST_UNCHECKED ( Int i )
{
   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
   return IRExpr_GetI( descr, get_ftop(), i );
}


/* Given i, generate an expression yielding
      is_full(i) ? ST(i) : NaN
*/

static IRExpr* get_ST ( Int i )
{
   return
      IRExpr_ITE( binop(Iop_CmpNE8, get_ST_TAG(i), mkU8(0)),
                  /* non-0 means full */
                  get_ST_UNCHECKED(i),
                  /* 0 means empty */
                  mkQNaN64());
}


/* Given i, and some expression e, and a condition cond, generate IR
   which has the same effect as put_ST(i,e) when cond is true and has
   no effect when cond is false.  Given the lack of proper
   if-then-else in the IR, this is pretty tricky.
*/

static void maybe_put_ST ( IRTemp cond, Int i, IRExpr* value )
{
   // new_tag = if cond then FULL else old_tag
   // new_val = if cond then (if old_tag==FULL then NaN else val)
   //                   else old_val

   IRTemp old_tag = newTemp(Ity_I8);
   assign(old_tag, get_ST_TAG(i));
   IRTemp new_tag = newTemp(Ity_I8);
   assign(new_tag,
          IRExpr_ITE(mkexpr(cond), mkU8(1)/*FULL*/, mkexpr(old_tag)));

   IRTemp old_val = newTemp(Ity_F64);
   assign(old_val, get_ST_UNCHECKED(i));
   IRTemp new_val = newTemp(Ity_F64);
   assign(new_val,
          IRExpr_ITE(mkexpr(cond),
                     IRExpr_ITE(binop(Iop_CmpNE8, mkexpr(old_tag), mkU8(0)),
                                /* non-0 means full */
                                mkQNaN64(),
                                /* 0 means empty */
                                value),
                     mkexpr(old_val)));

   put_ST_UNCHECKED(i, mkexpr(new_val));
   // put_ST_UNCHECKED incorrectly sets tag(i) to always be FULL.  So
   // now set it to new_tag instead.
   put_ST_TAG(i, mkexpr(new_tag));
}

/* Adjust FTOP downwards by one register. */

static void fp_push ( void )
{
   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
}

/* Adjust FTOP downwards by one register when COND is 1:I1.  Else
   don't change it. */

static void maybe_fp_push ( IRTemp cond )
{
   put_ftop( binop(Iop_Sub32, get_ftop(), unop(Iop_1Uto32,mkexpr(cond))) );
}

/* Adjust FTOP upwards by one register, and mark the vacated register
   as empty.  */

static void fp_pop ( void )
{
   put_ST_TAG(0, mkU8(0));
   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
}

/* Set the C2 bit of the FPU status register to e[0].  Assumes that
   e[63:1] == 0.
*/
static void set_C2 ( IRExpr* e )
{
   IRExpr* cleared = binop(Iop_And64, get_C3210(), mkU64(~AMD64G_FC_MASK_C2));
   put_C3210( binop(Iop_Or64,
                    cleared,
                    binop(Iop_Shl64, e, mkU8(AMD64G_FC_SHIFT_C2))) );
}

/* Generate code to check that abs(d64) < 2^63 and is finite.  This is
   used to do the range checks for FSIN, FCOS, FSINCOS and FPTAN.  The
   test is simple, but the derivation of it is not so simple.

   The exponent field for an IEEE754 double is 11 bits.  That means it
   can take values 0 through 0x7FF.  If the exponent has value 0x7FF,
   the number is either a NaN or an Infinity and so is not finite.
   Furthermore, a finite value of exactly 2^63 is the smallest value
   that has exponent value 0x43E.  Hence, what we need to do is
   extract the exponent, ignoring the sign bit and mantissa, and check
   it is < 0x43E, or <= 0x43D.

   To make this easily applicable to 32- and 64-bit targets, a
   roundabout approach is used.  First the number is converted to I64,
   then the top 32 bits are taken.  Shifting them right by 20 bits
   places the sign bit and exponent in the bottom 12 bits.  Anding
   with 0x7FF gets rid of the sign bit, leaving just the exponent
   available for comparison.
*/
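/* As a sanity check on the bound: 2^63 has biased exponent
   1023 + 63 = 1086 = 0x43E, so exponents <= 0x43D denote finite
   values strictly below 2^63 in magnitude, while NaNs and Infinities
   (exponent 0x7FF) fail the comparison as required. */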
static IRTemp math_IS_TRIG_ARG_FINITE_AND_IN_RANGE ( IRTemp d64 )
{
   IRTemp i64 = newTemp(Ity_I64);
   assign(i64, unop(Iop_ReinterpF64asI64, mkexpr(d64)) );
   IRTemp exponent = newTemp(Ity_I32);
   assign(exponent,
          binop(Iop_And32,
                binop(Iop_Shr32, unop(Iop_64HIto32, mkexpr(i64)), mkU8(20)),
                mkU32(0x7FF)));
   IRTemp in_range_and_finite = newTemp(Ity_I1);
   assign(in_range_and_finite,
          binop(Iop_CmpLE32U, mkexpr(exponent), mkU32(0x43D)));
   return in_range_and_finite;
}

/* Invent a plausible-looking FPU status word value:
      ((ftop & 7) << 11) | (c3210 & 0x4700)
 */
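/* (0x4700 keeps C3 (bit 14) and C2/C1/C0 (bits 10..8), and ftop is
   placed in bits 13..11, matching the layout of the real x87 status
   word.) */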
static IRExpr* get_FPU_sw ( void )
{
   return
      unop(Iop_32to16,
           binop(Iop_Or32,
                 binop(Iop_Shl32,
                       binop(Iop_And32, get_ftop(), mkU32(7)),
                       mkU8(11)),
                 binop(Iop_And32, unop(Iop_64to32, get_C3210()),
                       mkU32(0x4700))
      ));
}


/* ------------------------------------------------------- */
/* Given all that stack-mangling junk, we can now go ahead
   and describe FP instructions.
*/

/* ST(0) = ST(0) `op` mem64/32(addr)
   Need to check ST(0)'s tag on read, but not on write.
*/
static
void fp_do_op_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
                         IROp op, Bool dbl )
{
   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   if (dbl) {
      put_ST_UNCHECKED(0,
         triop( op,
                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                get_ST(0),
                loadLE(Ity_F64,mkexpr(addr))
         ));
   } else {
      put_ST_UNCHECKED(0,
         triop( op,
                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                get_ST(0),
                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
         ));
   }
}


/* ST(0) = mem64/32(addr) `op` ST(0)
   Need to check ST(0)'s tag on read, but not on write.
*/
static
void fp_do_oprev_mem_ST_0 ( IRTemp addr, const HChar* op_txt, HChar* dis_buf,
                            IROp op, Bool dbl )
{
   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
   if (dbl) {
      put_ST_UNCHECKED(0,
         triop( op,
                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                loadLE(Ity_F64,mkexpr(addr)),
                get_ST(0)
         ));
   } else {
      put_ST_UNCHECKED(0,
         triop( op,
                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
                get_ST(0)
         ));
   }
}


/* ST(dst) = ST(dst) `op` ST(src).
   Check dst and src tags on read, but not on write.
*/
static
void fp_do_op_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
                      Bool pop_after )
{
   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   put_ST_UNCHECKED(
      st_dst,
      triop( op,
             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
             get_ST(st_dst),
             get_ST(st_src) )
   );
   if (pop_after)
      fp_pop();
}

/* ST(dst) = ST(src) `op` ST(dst).
   Check dst and src tags on read, but not on write.
*/
static
void fp_do_oprev_ST_ST ( const HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
                         Bool pop_after )
{
   DIP("f%s%s st(%u), st(%u)\n", op_txt, pop_after?"p":"", st_src, st_dst );
   put_ST_UNCHECKED(
      st_dst,
      triop( op,
             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
             get_ST(st_src),
             get_ST(st_dst) )
   );
   if (pop_after)
      fp_pop();
}

/* %rflags(Z,P,C) = UCOMI( st(0), st(i) ) */
static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
{
   DIP("fucomi%s %%st(0),%%st(%u)\n", pop_after ? "p" : "", i);
   /* This is a bit of a hack (and isn't really right).  It sets
      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
      documentation implies A and S are unchanged.
   */
   /* It's also fishy in that it is used both for COMIP and
      UCOMIP, and they aren't the same (although similar). */
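   /* The 0x45 mask below relies on Iop_CmpF64's result encoding:
      0x00 (GT), 0x01 (LT), 0x40 (EQ) or 0x45 (unordered), whose set
      bits land exactly in the C (bit 0), P (bit 2) and Z (bit 6)
      positions of rflags. */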
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, get_ST(0), get_ST(i))),
                   mkU64(0x45)
        )));
   if (pop_after)
      fp_pop();
}


/* returns
   32to16( if e32 <s -32768 || e32 >s 32767 then -32768 else e32 )
*/
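/* The range test is done branchlessly: e32 + 32768 fits in
   [0, 65536) exactly when e32 is in [-32768, 32767], so a single
   unsigned comparison suffices. */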
static IRExpr* x87ishly_qnarrow_32_to_16 ( IRExpr* e32 )
{
   IRTemp t32 = newTemp(Ity_I32);
   assign( t32, e32 );
   return
      IRExpr_ITE(
         binop(Iop_CmpLT64U,
               unop(Iop_32Uto64,
                    binop(Iop_Add32, mkexpr(t32), mkU32(32768))),
               mkU64(65536)),
         unop(Iop_32to16, mkexpr(t32)),
         mkU16( 0x8000 ) );
}


static
ULong dis_FPU ( /*OUT*/Bool* decode_ok,
                VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   Int    len;
   UInt   r_src, r_dst;
   HChar  dis_buf[50];
   IRTemp t1, t2;

   /* On entry, delta points at the second byte of the insn (the modrm
      byte). */
   UChar first_opcode = getUChar(delta-1);
   UChar modrm        = getUChar(delta+0);

   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */

   if (first_opcode == 0xD8) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FADD single-real */
               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
               break;

            case 1: /* FMUL single-real */
               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
               break;

            case 2: /* FCOM single-real */
               DIP("fcoms %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               /* The AMD documentation suggests that forcing C1 to
                  zero is correct (Eliot Moss) */
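               /* (The CmpF64 result, shifted left by 8, lands in the
                  C0/C2/C3 positions of the status word; hence the
                  0x4500 mask.) */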
               put_C3210(
                   unop( Iop_32Uto64,
                       binop( Iop_And32,
                              binop(Iop_Shl32,
                                    binop(Iop_CmpF64,
                                          get_ST(0),
                                          unop(Iop_F32toF64,
                                               loadLE(Ity_F32,mkexpr(addr)))),
                                    mkU8(8)),
                              mkU32(0x4500)
                   )));
               break;

            case 3: /* FCOMP single-real */
               /* The AMD documentation suggests that forcing C1 to
                  zero is correct (Eliot Moss) */
               DIP("fcomps %s\n", dis_buf);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop( Iop_32Uto64,
                       binop( Iop_And32,
                              binop(Iop_Shl32,
                                    binop(Iop_CmpF64,
                                          get_ST(0),
                                          unop(Iop_F32toF64,
                                               loadLE(Ity_F32,mkexpr(addr)))),
                                    mkU8(8)),
                              mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 4: /* FSUB single-real */
               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
               break;

            case 5: /* FSUBR single-real */
               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
               break;

            case 6: /* FDIV single-real */
               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
               break;

            case 7: /* FDIVR single-real */
               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xD8\n");
               goto decode_fail;
         }
      } else {
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
               break;

            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
               break;

            /* Dunno if this is right */
            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
               r_dst = (UInt)modrm - 0xD0;
               DIP("fcom %%st(0),%%st(%u)\n", r_dst);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               break;

            /* Dunno if this is right */
            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
               r_dst = (UInt)modrm - 0xD8;
               DIP("fcomp %%st(0),%%st(%u)\n", r_dst);
               /* This forces C1 to zero, which isn't right. */
               put_C3210(
                   unop(Iop_32Uto64,
                   binop( Iop_And32,
                          binop(Iop_Shl32,
                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
                                mkU8(8)),
                          mkU32(0x4500)
                   )));
               fp_pop();
               break;

            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
               break;

            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
               break;

            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
               break;

            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
               break;

            default:
               goto decode_fail;
         }
      }
   }

   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
   else
   if (first_opcode == 0xD9) {
      if (modrm < 0xC0) {

         /* bits 5,4,3 are an opcode extension, and the modRM also
            specifies an address. */
         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
         delta += len;

         switch (gregLO3ofRM(modrm)) {

            case 0: /* FLD single-real */
               DIP("flds %s\n", dis_buf);
               fp_push();
               put_ST(0, unop(Iop_F32toF64,
                              loadLE(Ity_F32, mkexpr(addr))));
               break;

            case 2: /* FST single-real */
               DIP("fsts %s\n", dis_buf);
               storeLE(mkexpr(addr),
                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
               break;

            case 3: /* FSTP single-real */
               DIP("fstps %s\n", dis_buf);
               storeLE(mkexpr(addr),
                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
               fp_pop();
               break;

            case 4: { /* FLDENV m28 */
               /* Uses dirty helper:
                     VexEmNote amd64g_dirtyhelper_FLDENV
                                  ( VexGuestAMD64State*, HWord ) */
               IRTemp    ew = newTemp(Ity_I32);
               IRTemp   w64 = newTemp(Ity_I64);
               IRDirty*   d = unsafeIRDirty_0_N (
                                 0/*regparms*/,
                                 "amd64g_dirtyhelper_FLDENV",
                                 &amd64g_dirtyhelper_FLDENV,
                                 mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                              );
               d->tmp       = w64;
               /* declare we're reading memory */
               d->mFx   = Ifx_Read;
               d->mAddr = mkexpr(addr);
               d->mSize = 28;

               /* declare we're writing guest state */
               d->nFxState = 4;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Write;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Write;
               d->fxState[1].offset = OFFB_FPTAGS;
               d->fxState[1].size   = 8 * sizeof(UChar);

               d->fxState[2].fx     = Ifx_Write;
               d->fxState[2].offset = OFFB_FPROUND;
               d->fxState[2].size   = sizeof(ULong);

               d->fxState[3].fx     = Ifx_Write;
               d->fxState[3].offset = OFFB_FC3210;
               d->fxState[3].size   = sizeof(ULong);

               stmt( IRStmt_Dirty(d) );

               /* ew contains any emulation warning we may need to
                  issue.  If needed, side-exit to the next insn,
                  reporting the warning, so that Valgrind's dispatcher
                  sees the warning. */
               assign(ew, unop(Iop_64to32,mkexpr(w64)) );
               put_emwarn( mkexpr(ew) );
               stmt(
                  IRStmt_Exit(
                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                     Ijk_EmWarn,
                     IRConst_U64( guest_RIP_bbstart+delta ),
                     OFFB_RIP
                  )
               );

               DIP("fldenv %s\n", dis_buf);
               break;
            }

            case 5: { /* FLDCW */
               /* The only thing we observe in the control word is the
                  rounding mode.  Therefore, pass the 16-bit value
                  (x87 native-format control word) to a clean helper,
                  getting back a 64-bit value, the lower half of which
                  is the FPROUND value to store, and the upper half of
                  which is the emulation-warning token which may be
                  generated.
               */
               /* ULong amd64g_check_fldcw ( ULong ); */
               IRTemp t64 = newTemp(Ity_I64);
               IRTemp ew = newTemp(Ity_I32);
               DIP("fldcw %s\n", dis_buf);
               assign( t64, mkIRExprCCall(
                               Ity_I64, 0/*regparms*/,
                               "amd64g_check_fldcw",
                               &amd64g_check_fldcw,
                               mkIRExprVec_1(
                                  unop( Iop_16Uto64,
                                        loadLE(Ity_I16, mkexpr(addr)))
                               )
                            )
                     );

               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
               put_emwarn( mkexpr(ew) );
               /* Finally, if an emulation warning was reported,
                  side-exit to the next insn, reporting the warning,
                  so that Valgrind's dispatcher sees the warning. */
               stmt(
                  IRStmt_Exit(
                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
                     Ijk_EmWarn,
                     IRConst_U64( guest_RIP_bbstart+delta ),
                     OFFB_RIP
                  )
               );
               break;
            }

            case 6: { /* FNSTENV m28 */
               /* Uses dirty helper:
                     void amd64g_dirtyhelper_FSTENV
                             ( VexGuestAMD64State*, HWord ) */
               IRDirty* d = unsafeIRDirty_0_N (
                               0/*regparms*/,
                               "amd64g_dirtyhelper_FSTENV",
                               &amd64g_dirtyhelper_FSTENV,
                               mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                            );
               /* declare we're writing memory */
               d->mFx   = Ifx_Write;
               d->mAddr = mkexpr(addr);
               d->mSize = 28;

               /* declare we're reading guest state */
               d->nFxState = 4;
               vex_bzero(&d->fxState, sizeof(d->fxState));

               d->fxState[0].fx     = Ifx_Read;
               d->fxState[0].offset = OFFB_FTOP;
               d->fxState[0].size   = sizeof(UInt);

               d->fxState[1].fx     = Ifx_Read;
               d->fxState[1].offset = OFFB_FPTAGS;
               d->fxState[1].size   = 8 * sizeof(UChar);

               d->fxState[2].fx     = Ifx_Read;
               d->fxState[2].offset = OFFB_FPROUND;
               d->fxState[2].size   = sizeof(ULong);

               d->fxState[3].fx     = Ifx_Read;
               d->fxState[3].offset = OFFB_FC3210;
               d->fxState[3].size   = sizeof(ULong);

               stmt( IRStmt_Dirty(d) );

               DIP("fnstenv %s\n", dis_buf);
               break;
            }

            case 7: /* FNSTCW */
               /* Fake up a native x87 FPU control word.  The only
                  thing it depends on is FPROUND[1:0], so call a clean
                  helper to cook it up. */
               /* ULong amd64g_create_fpucw ( ULong fpround ) */
               DIP("fnstcw %s\n", dis_buf);
               storeLE(
                  mkexpr(addr),
                  unop( Iop_64to16,
                        mkIRExprCCall(
                           Ity_I64, 0/*regp*/,
                           "amd64g_create_fpucw", &amd64g_create_fpucw,
                           mkIRExprVec_1( unop(Iop_32Uto64, get_fpround()) )
                        )
                  )
               );
               break;

            default:
               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
               vex_printf("first_opcode == 0xD9\n");
               goto decode_fail;
         }

      } else {
         delta++;
         switch (modrm) {

            case 0xC0 ... 0xC7: /* FLD %st(?) */
               r_src = (UInt)modrm - 0xC0;
               DIP("fld %%st(%u)\n", r_src);
               t1 = newTemp(Ity_F64);
               assign(t1, get_ST(r_src));
               fp_push();
               put_ST(0, mkexpr(t1));
               break;

            case 0xC8 ... 0xCF: /* FXCH %st(?) */
               r_src = (UInt)modrm - 0xC8;
               DIP("fxch %%st(%u)\n", r_src);
               t1 = newTemp(Ity_F64);
               t2 = newTemp(Ity_F64);
               assign(t1, get_ST(0));
               assign(t2, get_ST(r_src));
               put_ST_UNCHECKED(0, mkexpr(t2));
               put_ST_UNCHECKED(r_src, mkexpr(t1));
               break;

            case 0xE0: /* FCHS */
               DIP("fchs\n");
               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
               break;

            case 0xE1: /* FABS */
               DIP("fabs\n");
               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
               break;

            case 0xE5: { /* FXAM */
               /* This is an interesting one.  It examines %st(0),
                  regardless of whether the tag says it's empty or not.
                  Here, just pass both the tag (in our format) and the
                  value (as a double, actually a ULong) to a helper
                  function. */
               IRExpr** args
                  = mkIRExprVec_2( unop(Iop_8Uto64, get_ST_TAG(0)),
                                   unop(Iop_ReinterpF64asI64,
                                        get_ST_UNCHECKED(0)) );
               put_C3210(mkIRExprCCall(
                            Ity_I64,
                            0/*regparm*/,
                            "amd64g_calculate_FXAM", &amd64g_calculate_FXAM,
                            args
                        ));
               DIP("fxam\n");
               break;
            }

            case 0xE8: /* FLD1 */
               DIP("fld1\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
               break;

            case 0xE9: /* FLDL2T */
               DIP("fldl2t\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
               break;

            case 0xEA: /* FLDL2E */
               DIP("fldl2e\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
               break;

            case 0xEB: /* FLDPI */
               DIP("fldpi\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
               break;

            case 0xEC: /* FLDLG2 */
               DIP("fldlg2\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
               break;

            case 0xED: /* FLDLN2 */
               DIP("fldln2\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
               break;

            case 0xEE: /* FLDZ */
               DIP("fldz\n");
               fp_push();
               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
               break;

            case 0xF0: /* F2XM1 */
               DIP("f2xm1\n");
               put_ST_UNCHECKED(0,
                  binop(Iop_2xm1F64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(0)));
               break;

            case 0xF1: /* FYL2X */
               DIP("fyl2x\n");
               put_ST_UNCHECKED(1,
                  triop(Iop_Yl2xF64,
                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                        get_ST(1),
                        get_ST(0)));
               fp_pop();
               break;

            case 0xF2: { /* FPTAN */
               DIP("fptan\n");
               IRTemp argD = newTemp(Ity_F64);
               assign(argD, get_ST(0));
               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
               IRTemp resD = newTemp(Ity_F64);
               assign(resD,
                  IRExpr_ITE(
                     mkexpr(argOK),
                     binop(Iop_TanF64,
                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
                           mkexpr(argD)),
                     mkexpr(argD))
               );
               put_ST_UNCHECKED(0, mkexpr(resD));
               /* Conditionally push 1.0 on the stack, if the arg is
                  in range */
               maybe_fp_push(argOK);
               maybe_put_ST(argOK, 0,
                            IRExpr_Const(IRConst_F64(1.0)));
               set_C2( binop(Iop_Xor64,
                             unop(Iop_1Uto64, mkexpr(argOK)),
                             mkU64(1)) );
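               /* i.e. C2 = 1 exactly when the arg was out of range
                  and ST(0) was left unchanged, as on real
                  hardware. */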
5831               break;
5832            }
5833
5834            case 0xF3: /* FPATAN */
5835               DIP("fpatan\n");
5836               put_ST_UNCHECKED(1,
5837                  triop(Iop_AtanF64,
5838                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5839                        get_ST(1),
5840                        get_ST(0)));
5841               fp_pop();
5842               break;
5843
5844            case 0xF4: { /* FXTRACT */
5845               IRTemp argF = newTemp(Ity_F64);
5846               IRTemp sigF = newTemp(Ity_F64);
5847               IRTemp expF = newTemp(Ity_F64);
5848               IRTemp argI = newTemp(Ity_I64);
5849               IRTemp sigI = newTemp(Ity_I64);
5850               IRTemp expI = newTemp(Ity_I64);
5851               DIP("fxtract\n");
5852               assign( argF, get_ST(0) );
5853               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
5854               assign( sigI,
5855                       mkIRExprCCall(
5856                          Ity_I64, 0/*regparms*/,
5857                          "x86amd64g_calculate_FXTRACT",
5858                          &x86amd64g_calculate_FXTRACT,
5859                          mkIRExprVec_2( mkexpr(argI),
5860                                         mkIRExpr_HWord(0)/*sig*/ ))
5861               );
5862               assign( expI,
5863                       mkIRExprCCall(
5864                          Ity_I64, 0/*regparms*/,
5865                          "x86amd64g_calculate_FXTRACT",
5866                          &x86amd64g_calculate_FXTRACT,
5867                          mkIRExprVec_2( mkexpr(argI),
5868                                         mkIRExpr_HWord(1)/*exp*/ ))
5869               );
5870               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
5871               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
5872               /* exponent */
5873               put_ST_UNCHECKED(0, mkexpr(expF) );
5874               fp_push();
5875               /* significand */
5876               put_ST(0, mkexpr(sigF) );
5877               break;
5878            }
5879
5880            case 0xF5: { /* FPREM1 -- IEEE compliant */
5881               IRTemp a1 = newTemp(Ity_F64);
5882               IRTemp a2 = newTemp(Ity_F64);
5883               DIP("fprem1\n");
5884               /* Do FPREM1 twice, once to get the remainder, and once
5885                  to get the C3210 flag values. */
5886               assign( a1, get_ST(0) );
5887               assign( a2, get_ST(1) );
5888               put_ST_UNCHECKED(0,
5889                  triop(Iop_PRem1F64,
5890                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5891                        mkexpr(a1),
5892                        mkexpr(a2)));
5893               put_C3210(
5894                  unop(Iop_32Uto64,
5895                  triop(Iop_PRem1C3210F64,
5896                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5897                        mkexpr(a1),
5898                        mkexpr(a2)) ));
5899               break;
5900            }
5901
5902            case 0xF7: /* FINCSTP */
5903               DIP("fincstp\n");
5904               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
5905               break;
5906
5907            case 0xF8: { /* FPREM -- not IEEE compliant */
5908               IRTemp a1 = newTemp(Ity_F64);
5909               IRTemp a2 = newTemp(Ity_F64);
5910               DIP("fprem\n");
5911               /* Do FPREM twice, once to get the remainder, and once
5912                  to get the C3210 flag values. */
5913               assign( a1, get_ST(0) );
5914               assign( a2, get_ST(1) );
5915               put_ST_UNCHECKED(0,
5916                  triop(Iop_PRemF64,
5917                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5918                        mkexpr(a1),
5919                        mkexpr(a2)));
5920               put_C3210(
5921                  unop(Iop_32Uto64,
5922                  triop(Iop_PRemC3210F64,
5923                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5924                        mkexpr(a1),
5925                        mkexpr(a2)) ));
5926               break;
5927            }
5928
5929            case 0xF9: /* FYL2XP1 */
5930               DIP("fyl2xp1\n");
5931               put_ST_UNCHECKED(1,
5932                  triop(Iop_Yl2xp1F64,
5933                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5934                        get_ST(1),
5935                        get_ST(0)));
5936               fp_pop();
5937               break;
5938
5939            case 0xFA: /* FSQRT */
5940               DIP("fsqrt\n");
5941               put_ST_UNCHECKED(0,
5942                  binop(Iop_SqrtF64,
5943                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5944                        get_ST(0)));
5945               break;
5946
5947            case 0xFB: { /* FSINCOS */
5948               DIP("fsincos\n");
5949               IRTemp argD = newTemp(Ity_F64);
5950               assign(argD, get_ST(0));
5951               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
5952               IRTemp resD = newTemp(Ity_F64);
5953               assign(resD,
5954                  IRExpr_ITE(
5955                     mkexpr(argOK),
5956                     binop(Iop_SinF64,
5957                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5958                           mkexpr(argD)),
5959                     mkexpr(argD))
5960               );
5961               put_ST_UNCHECKED(0, mkexpr(resD));
5962               /* Conditionally push the cos value on the stack, if
5963                  the arg is in range */
5964               maybe_fp_push(argOK);
5965               maybe_put_ST(argOK, 0,
5966                  binop(Iop_CosF64,
5967                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5968                        mkexpr(argD)));
5969               set_C2( binop(Iop_Xor64,
5970                             unop(Iop_1Uto64, mkexpr(argOK)),
5971                             mkU64(1)) );
5972               break;
5973            }
5974
5975            case 0xFC: /* FRNDINT */
5976               DIP("frndint\n");
5977               put_ST_UNCHECKED(0,
5978                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
5979               break;
5980
5981            case 0xFD: /* FSCALE */
5982               DIP("fscale\n");
5983               put_ST_UNCHECKED(0,
5984                  triop(Iop_ScaleF64,
5985                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5986                        get_ST(0),
5987                        get_ST(1)));
5988               break;
5989
5990            case 0xFE:   /* FSIN */
5991            case 0xFF: { /* FCOS */
5992               Bool isSIN = modrm == 0xFE;
5993               DIP("%s\n", isSIN ? "fsin" : "fcos");
5994               IRTemp argD = newTemp(Ity_F64);
5995               assign(argD, get_ST(0));
5996               IRTemp argOK = math_IS_TRIG_ARG_FINITE_AND_IN_RANGE(argD);
5997               IRTemp resD = newTemp(Ity_F64);
5998               assign(resD,
5999                  IRExpr_ITE(
6000                     mkexpr(argOK),
6001                     binop(isSIN ? Iop_SinF64 : Iop_CosF64,
6002                           get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6003                           mkexpr(argD)),
6004                     mkexpr(argD))
6005               );
6006               put_ST_UNCHECKED(0, mkexpr(resD));
6007               set_C2( binop(Iop_Xor64,
6008                             unop(Iop_1Uto64, mkexpr(argOK)),
6009                             mkU64(1)) );
6010               break;
6011            }
6012
6013            default:
6014               goto decode_fail;
6015         }
6016      }
6017   }
6018
6019   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
6020   else
6021   if (first_opcode == 0xDA) {
6022
6023      if (modrm < 0xC0) {
6024
6025         /* bits 5,4,3 are an opcode extension, and the modRM also
6026            specifies an address. */
6027         IROp   fop;
6028         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6029         delta += len;
6030         switch (gregLO3ofRM(modrm)) {
6031
6032            case 0: /* FIADD m32int */ /* ST(0) += m32int */
6033               DIP("fiaddl %s\n", dis_buf);
6034               fop = Iop_AddF64;
6035               goto do_fop_m32;
6036
6037            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
6038               DIP("fimull %s\n", dis_buf);
6039               fop = Iop_MulF64;
6040               goto do_fop_m32;
6041
6042            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
6043               DIP("fisubl %s\n", dis_buf);
6044               fop = Iop_SubF64;
6045               goto do_fop_m32;
6046
6047            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
6048               DIP("fisubrl %s\n", dis_buf);
6049               fop = Iop_SubF64;
6050               goto do_foprev_m32;
6051
6052            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
6053               DIP("fisubl %s\n", dis_buf);
6054               fop = Iop_DivF64;
6055               goto do_fop_m32;
6056
6057            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
6058               DIP("fidivrl %s\n", dis_buf);
6059               fop = Iop_DivF64;
6060               goto do_foprev_m32;
6061
6062            do_fop_m32:
6063               put_ST_UNCHECKED(0,
6064                  triop(fop,
6065                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6066                        get_ST(0),
6067                        unop(Iop_I32StoF64,
6068                             loadLE(Ity_I32, mkexpr(addr)))));
6069               break;
6070
6071            do_foprev_m32:
6072               put_ST_UNCHECKED(0,
6073                  triop(fop,
6074                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6075                        unop(Iop_I32StoF64,
6076                             loadLE(Ity_I32, mkexpr(addr))),
6077                        get_ST(0)));
6078               break;
6079
6080            default:
6081               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6082               vex_printf("first_opcode == 0xDA\n");
6083               goto decode_fail;
6084         }
6085
6086      } else {
6087
6088         delta++;
6089         switch (modrm) {
6090
6091            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
6092               r_src = (UInt)modrm - 0xC0;
6093               DIP("fcmovb %%st(%u), %%st(0)\n", r_src);
6094               put_ST_UNCHECKED(0,
6095                                IRExpr_ITE(
6096                                    mk_amd64g_calculate_condition(AMD64CondB),
6097                                    get_ST(r_src), get_ST(0)) );
6098               break;
6099
6100            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
6101               r_src = (UInt)modrm - 0xC8;
6102               DIP("fcmovz %%st(%u), %%st(0)\n", r_src);
6103               put_ST_UNCHECKED(0,
6104                                IRExpr_ITE(
6105                                    mk_amd64g_calculate_condition(AMD64CondZ),
6106                                    get_ST(r_src), get_ST(0)) );
6107               break;
6108
6109            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
6110               r_src = (UInt)modrm - 0xD0;
6111               DIP("fcmovbe %%st(%u), %%st(0)\n", r_src);
6112               put_ST_UNCHECKED(0,
6113                                IRExpr_ITE(
6114                                    mk_amd64g_calculate_condition(AMD64CondBE),
6115                                    get_ST(r_src), get_ST(0)) );
6116               break;
6117
6118            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
6119               r_src = (UInt)modrm - 0xD8;
6120               DIP("fcmovu %%st(%u), %%st(0)\n", r_src);
6121               put_ST_UNCHECKED(0,
6122                                IRExpr_ITE(
6123                                    mk_amd64g_calculate_condition(AMD64CondP),
6124                                    get_ST(r_src), get_ST(0)) );
6125               break;
6126
6127            case 0xE9: /* FUCOMPP %st(0),%st(1) */
6128               DIP("fucompp %%st(0),%%st(1)\n");
6129               /* This forces C1 to zero, which isn't right. */
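               /* Iop_CmpF64 yields an IRCmpF64Result: 0x00 (GT),
                  0x01 (LT), 0x40 (EQ) or 0x45 (UN).  Shifting it
                  left by 8 lines it up with the x87 condition bits
                  C0 (bit 8), C2 (bit 10) and C3 (bit 14), and the
                  mask 0x4500 keeps exactly those three bits. */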
6130               put_C3210(
6131                   unop(Iop_32Uto64,
6132                   binop( Iop_And32,
6133                          binop(Iop_Shl32,
6134                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
6135                                mkU8(8)),
6136                          mkU32(0x4500)
6137                   )));
6138               fp_pop();
6139               fp_pop();
6140               break;
6141
6142            default:
6143               goto decode_fail;
6144         }
6145
6146      }
6147   }
6148
6149   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
6150   else
6151   if (first_opcode == 0xDB) {
6152      if (modrm < 0xC0) {
6153
6154         /* bits 5,4,3 are an opcode extension, and the modRM also
6155            specifies an address. */
6156         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6157         delta += len;
6158
6159         switch (gregLO3ofRM(modrm)) {
6160
6161            case 0: /* FILD m32int */
6162               DIP("fildl %s\n", dis_buf);
6163               fp_push();
6164               put_ST(0, unop(Iop_I32StoF64,
6165                              loadLE(Ity_I32, mkexpr(addr))));
6166               break;
6167
6168            case 1: /* FISTTPL m32 (SSE3) */
6169               DIP("fisttpl %s\n", dis_buf);
6170               storeLE( mkexpr(addr),
6171                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
6172               fp_pop();
6173               break;
6174
6175            case 2: /* FIST m32 */
6176               DIP("fistl %s\n", dis_buf);
6177               storeLE( mkexpr(addr),
6178                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
6179               break;
6180
6181            case 3: /* FISTP m32 */
6182               DIP("fistpl %s\n", dis_buf);
6183               storeLE( mkexpr(addr),
6184                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
6185               fp_pop();
6186               break;
6187
6188            case 5: { /* FLD extended-real */
6189               /* Uses dirty helper:
                     ULong amd64g_dirtyhelper_loadF80le ( ULong )
6191                  addr holds the address.  First, do a dirty call to
6192                  get hold of the data. */
6193               IRTemp   val  = newTemp(Ity_I64);
6194               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
6195
6196               IRDirty* d = unsafeIRDirty_1_N (
6197                               val,
6198                               0/*regparms*/,
6199                               "amd64g_dirtyhelper_loadF80le",
6200                               &amd64g_dirtyhelper_loadF80le,
6201                               args
6202                            );
6203               /* declare that we're reading memory */
6204               d->mFx   = Ifx_Read;
6205               d->mAddr = mkexpr(addr);
6206               d->mSize = 10;
6207
6208               /* execute the dirty call, dumping the result in val. */
6209               stmt( IRStmt_Dirty(d) );
6210               fp_push();
6211               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
6212
6213               DIP("fldt %s\n", dis_buf);
6214               break;
6215            }
6216
6217            case 7: { /* FSTP extended-real */
6218               /* Uses dirty helper:
                     void amd64g_dirtyhelper_storeF80le ( ULong addr, ULong data )
6220               */
6221               IRExpr** args
6222                  = mkIRExprVec_2( mkexpr(addr),
6223                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );
6224
6225               IRDirty* d = unsafeIRDirty_0_N (
6226                               0/*regparms*/,
6227                               "amd64g_dirtyhelper_storeF80le",
6228                               &amd64g_dirtyhelper_storeF80le,
6229                               args
6230                            );
6231               /* declare we're writing memory */
6232               d->mFx   = Ifx_Write;
6233               d->mAddr = mkexpr(addr);
6234               d->mSize = 10;
6235
6236               /* execute the dirty call. */
6237               stmt( IRStmt_Dirty(d) );
6238               fp_pop();
6239
               DIP("fstpt %s\n", dis_buf);
6241               break;
6242            }
6243
6244            default:
6245               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6246               vex_printf("first_opcode == 0xDB\n");
6247               goto decode_fail;
6248         }
6249
6250      } else {
6251
6252         delta++;
6253         switch (modrm) {
6254
6255            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
6256               r_src = (UInt)modrm - 0xC0;
6257               DIP("fcmovnb %%st(%u), %%st(0)\n", r_src);
6258               put_ST_UNCHECKED(0,
6259                                IRExpr_ITE(
6260                                    mk_amd64g_calculate_condition(AMD64CondNB),
6261                                    get_ST(r_src), get_ST(0)) );
6262               break;
6263
6264            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
6265               r_src = (UInt)modrm - 0xC8;
6266               DIP("fcmovnz %%st(%u), %%st(0)\n", r_src);
6267               put_ST_UNCHECKED(
6268                  0,
6269                  IRExpr_ITE(
6270                     mk_amd64g_calculate_condition(AMD64CondNZ),
6271                     get_ST(r_src),
6272                     get_ST(0)
6273                  )
6274               );
6275               break;
6276
6277            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
6278               r_src = (UInt)modrm - 0xD0;
6279               DIP("fcmovnbe %%st(%u), %%st(0)\n", r_src);
6280               put_ST_UNCHECKED(
6281                  0,
6282                  IRExpr_ITE(
6283                     mk_amd64g_calculate_condition(AMD64CondNBE),
6284                     get_ST(r_src),
6285                     get_ST(0)
6286                  )
6287               );
6288               break;
6289
6290            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
6291               r_src = (UInt)modrm - 0xD8;
6292               DIP("fcmovnu %%st(%u), %%st(0)\n", r_src);
6293               put_ST_UNCHECKED(
6294                  0,
6295                  IRExpr_ITE(
6296                     mk_amd64g_calculate_condition(AMD64CondNP),
6297                     get_ST(r_src),
6298                     get_ST(0)
6299                  )
6300               );
6301               break;
6302
6303            case 0xE2:
6304               DIP("fnclex\n");
6305               break;
6306
6307            case 0xE3: {
6308               /* Uses dirty helper:
                     void amd64g_dirtyhelper_FINIT ( VexGuestAMD64State* ) */
6310               IRDirty* d  = unsafeIRDirty_0_N (
6311                                0/*regparms*/,
6312                                "amd64g_dirtyhelper_FINIT",
6313                                &amd64g_dirtyhelper_FINIT,
6314                                mkIRExprVec_1( IRExpr_BBPTR() )
6315                             );
6316
6317               /* declare we're writing guest state */
6318               d->nFxState = 5;
6319               vex_bzero(&d->fxState, sizeof(d->fxState));
6320
6321               d->fxState[0].fx     = Ifx_Write;
6322               d->fxState[0].offset = OFFB_FTOP;
6323               d->fxState[0].size   = sizeof(UInt);
6324
6325               d->fxState[1].fx     = Ifx_Write;
6326               d->fxState[1].offset = OFFB_FPREGS;
6327               d->fxState[1].size   = 8 * sizeof(ULong);
6328
6329               d->fxState[2].fx     = Ifx_Write;
6330               d->fxState[2].offset = OFFB_FPTAGS;
6331               d->fxState[2].size   = 8 * sizeof(UChar);
6332
6333               d->fxState[3].fx     = Ifx_Write;
6334               d->fxState[3].offset = OFFB_FPROUND;
6335               d->fxState[3].size   = sizeof(ULong);
6336
6337               d->fxState[4].fx     = Ifx_Write;
6338               d->fxState[4].offset = OFFB_FC3210;
6339               d->fxState[4].size   = sizeof(ULong);
6340
6341               stmt( IRStmt_Dirty(d) );
6342
6343               DIP("fninit\n");
6344               break;
6345            }
6346
6347            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
6348               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
6349               break;
6350
6351            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
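               /* not really right since COMI != UCOMI */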
6352               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
6353               break;
6354
6355            default:
6356               goto decode_fail;
6357         }
6358      }
6359   }
6360
6361   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
6362   else
6363   if (first_opcode == 0xDC) {
6364      if (modrm < 0xC0) {
6365
6366         /* bits 5,4,3 are an opcode extension, and the modRM also
6367            specifies an address. */
6368         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6369         delta += len;
6370
6371         switch (gregLO3ofRM(modrm)) {
6372
6373            case 0: /* FADD double-real */
6374               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
6375               break;
6376
6377            case 1: /* FMUL double-real */
6378               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
6379               break;
6380
6381//..             case 2: /* FCOM double-real */
6382//..                DIP("fcoml %s\n", dis_buf);
6383//..                /* This forces C1 to zero, which isn't right. */
6384//..                put_C3210(
6385//..                    binop( Iop_And32,
6386//..                           binop(Iop_Shl32,
6387//..                                 binop(Iop_CmpF64,
6388//..                                       get_ST(0),
6389//..                                       loadLE(Ity_F64,mkexpr(addr))),
6390//..                                 mkU8(8)),
6391//..                           mkU32(0x4500)
6392//..                    ));
6393//..                break;
6394
6395            case 3: /* FCOMP double-real */
6396               DIP("fcompl %s\n", dis_buf);
6397               /* This forces C1 to zero, which isn't right. */
6398               put_C3210(
6399                   unop(Iop_32Uto64,
6400                   binop( Iop_And32,
6401                          binop(Iop_Shl32,
6402                                binop(Iop_CmpF64,
6403                                      get_ST(0),
6404                                      loadLE(Ity_F64,mkexpr(addr))),
6405                                mkU8(8)),
6406                          mkU32(0x4500)
6407                   )));
6408               fp_pop();
6409               break;
6410
6411            case 4: /* FSUB double-real */
6412               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
6413               break;
6414
6415            case 5: /* FSUBR double-real */
6416               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
6417               break;
6418
6419            case 6: /* FDIV double-real */
6420               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
6421               break;
6422
6423            case 7: /* FDIVR double-real */
6424               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
6425               break;
6426
6427            default:
6428               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6429               vex_printf("first_opcode == 0xDC\n");
6430               goto decode_fail;
6431         }
6432
6433      } else {
6434
6435         delta++;
6436         switch (modrm) {
6437
6438            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
6439               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
6440               break;
6441
6442            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
6443               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
6444               break;
6445
6446            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
6447               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
6448               break;
6449
6450            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
6451               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
6452               break;
6453
6454            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
6455               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
6456               break;
6457
6458            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
6459               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
6460               break;
6461
6462            default:
6463               goto decode_fail;
6464         }
6465
6466      }
6467   }
6468
6469   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
6470   else
6471   if (first_opcode == 0xDD) {
6472
6473      if (modrm < 0xC0) {
6474
6475         /* bits 5,4,3 are an opcode extension, and the modRM also
6476            specifies an address. */
6477         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6478         delta += len;
6479
6480         switch (gregLO3ofRM(modrm)) {
6481
6482            case 0: /* FLD double-real */
6483               DIP("fldl %s\n", dis_buf);
6484               fp_push();
6485               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
6486               break;
6487
6488            case 1: /* FISTTPQ m64 (SSE3) */
               DIP("fisttpll %s\n", dis_buf);
6490               storeLE( mkexpr(addr),
6491                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
6492               fp_pop();
6493               break;
6494
6495            case 2: /* FST double-real */
6496               DIP("fstl %s\n", dis_buf);
6497               storeLE(mkexpr(addr), get_ST(0));
6498               break;
6499
6500            case 3: /* FSTP double-real */
6501               DIP("fstpl %s\n", dis_buf);
6502               storeLE(mkexpr(addr), get_ST(0));
6503               fp_pop();
6504               break;
6505
6506            case 4: { /* FRSTOR m94/m108 */
6507               IRTemp   ew = newTemp(Ity_I32);
6508               IRTemp  w64 = newTemp(Ity_I64);
6509               IRDirty*  d;
6510               if ( have66(pfx) ) {
6511                  /* Uses dirty helper:
6512                     VexEmNote amd64g_dirtyhelper_FRSTORS
6513                                  ( VexGuestAMD64State*, HWord ) */
                  d = unsafeIRDirty_1_N (
                         w64,
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FRSTORS",
                         &amd64g_dirtyhelper_FRSTORS,
                         mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                      );
6520                  d->mSize = 94;
6521               } else {
6522                  /* Uses dirty helper:
6523                     VexEmNote amd64g_dirtyhelper_FRSTOR
6524                                  ( VexGuestAMD64State*, HWord ) */
                  d = unsafeIRDirty_1_N (
                         w64,
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FRSTOR",
                         &amd64g_dirtyhelper_FRSTOR,
                         mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                      );
6531                  d->mSize = 108;
6532               }
6533
6535               /* declare we're reading memory */
6536               d->mFx   = Ifx_Read;
6537               d->mAddr = mkexpr(addr);
6538               /* d->mSize set above */
6539
6540               /* declare we're writing guest state */
6541               d->nFxState = 5;
6542               vex_bzero(&d->fxState, sizeof(d->fxState));
6543
6544               d->fxState[0].fx     = Ifx_Write;
6545               d->fxState[0].offset = OFFB_FTOP;
6546               d->fxState[0].size   = sizeof(UInt);
6547
6548               d->fxState[1].fx     = Ifx_Write;
6549               d->fxState[1].offset = OFFB_FPREGS;
6550               d->fxState[1].size   = 8 * sizeof(ULong);
6551
6552               d->fxState[2].fx     = Ifx_Write;
6553               d->fxState[2].offset = OFFB_FPTAGS;
6554               d->fxState[2].size   = 8 * sizeof(UChar);
6555
6556               d->fxState[3].fx     = Ifx_Write;
6557               d->fxState[3].offset = OFFB_FPROUND;
6558               d->fxState[3].size   = sizeof(ULong);
6559
6560               d->fxState[4].fx     = Ifx_Write;
6561               d->fxState[4].offset = OFFB_FC3210;
6562               d->fxState[4].size   = sizeof(ULong);
6563
6564               stmt( IRStmt_Dirty(d) );
6565
6566               /* ew contains any emulation warning we may need to
6567                  issue.  If needed, side-exit to the next insn,
6568                  reporting the warning, so that Valgrind's dispatcher
6569                  sees the warning. */
6570               assign(ew, unop(Iop_64to32,mkexpr(w64)) );
6571               put_emwarn( mkexpr(ew) );
6572               stmt(
6573                  IRStmt_Exit(
6574                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
6575                     Ijk_EmWarn,
6576                     IRConst_U64( guest_RIP_bbstart+delta ),
6577                     OFFB_RIP
6578                  )
6579               );
6580
6581               if ( have66(pfx) ) {
6582                  DIP("frstors %s\n", dis_buf);
6583               } else {
6584                  DIP("frstor %s\n", dis_buf);
6585               }
6586               break;
6587            }
6588
6589            case 6: { /* FNSAVE m94/m108 */
6590               IRDirty *d;
6591               if ( have66(pfx) ) {
6592                 /* Uses dirty helper:
6593                    void amd64g_dirtyhelper_FNSAVES ( VexGuestAMD64State*,
6594                                                      HWord ) */
                  d = unsafeIRDirty_0_N (
                         0/*regparms*/,
                         "amd64g_dirtyhelper_FNSAVES",
                         &amd64g_dirtyhelper_FNSAVES,
                         mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
                      );
6601                  d->mSize = 94;
6602               } else {
6603                 /* Uses dirty helper:
6604                    void amd64g_dirtyhelper_FNSAVE ( VexGuestAMD64State*,
6605                                                     HWord ) */
6606                  d = unsafeIRDirty_0_N (
6607                         0/*regparms*/,
6608                         "amd64g_dirtyhelper_FNSAVE",
6609                         &amd64g_dirtyhelper_FNSAVE,
6610                         mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
6611                      );
6612                  d->mSize = 108;
6613               }
6614
6615               /* declare we're writing memory */
6616               d->mFx   = Ifx_Write;
6617               d->mAddr = mkexpr(addr);
6618               /* d->mSize set above */
6619
6620               /* declare we're reading guest state */
6621               d->nFxState = 5;
6622               vex_bzero(&d->fxState, sizeof(d->fxState));
6623
6624               d->fxState[0].fx     = Ifx_Read;
6625               d->fxState[0].offset = OFFB_FTOP;
6626               d->fxState[0].size   = sizeof(UInt);
6627
6628               d->fxState[1].fx     = Ifx_Read;
6629               d->fxState[1].offset = OFFB_FPREGS;
6630               d->fxState[1].size   = 8 * sizeof(ULong);
6631
6632               d->fxState[2].fx     = Ifx_Read;
6633               d->fxState[2].offset = OFFB_FPTAGS;
6634               d->fxState[2].size   = 8 * sizeof(UChar);
6635
6636               d->fxState[3].fx     = Ifx_Read;
6637               d->fxState[3].offset = OFFB_FPROUND;
6638               d->fxState[3].size   = sizeof(ULong);
6639
6640               d->fxState[4].fx     = Ifx_Read;
6641               d->fxState[4].offset = OFFB_FC3210;
6642               d->fxState[4].size   = sizeof(ULong);
6643
6644               stmt( IRStmt_Dirty(d) );
6645
6646               if ( have66(pfx) ) {
6647                 DIP("fnsaves %s\n", dis_buf);
6648               } else {
6649                 DIP("fnsave %s\n", dis_buf);
6650               }
6651               break;
6652            }
6653
6654            case 7: { /* FNSTSW m16 */
6655               IRExpr* sw = get_FPU_sw();
6656               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
6657               storeLE( mkexpr(addr), sw );
6658               DIP("fnstsw %s\n", dis_buf);
6659               break;
6660            }
6661
6662            default:
6663               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6664               vex_printf("first_opcode == 0xDD\n");
6665               goto decode_fail;
6666         }
6667      } else {
6668         delta++;
6669         switch (modrm) {
6670
6671            case 0xC0 ... 0xC7: /* FFREE %st(?) */
6672               r_dst = (UInt)modrm - 0xC0;
6673               DIP("ffree %%st(%u)\n", r_dst);
6674               put_ST_TAG ( r_dst, mkU8(0) );
6675               break;
6676
6677            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
6678               r_dst = (UInt)modrm - 0xD0;
6679               DIP("fst %%st(0),%%st(%u)\n", r_dst);
               /* P4 manual says: "If the destination operand is a
                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
6683               put_ST_UNCHECKED(r_dst, get_ST(0));
6684               break;
6685
6686            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
6687               r_dst = (UInt)modrm - 0xD8;
6688               DIP("fstp %%st(0),%%st(%u)\n", r_dst);
               /* P4 manual says: "If the destination operand is a
                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
6692               put_ST_UNCHECKED(r_dst, get_ST(0));
6693               fp_pop();
6694               break;
6695
6696            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
6697               r_dst = (UInt)modrm - 0xE0;
6698               DIP("fucom %%st(0),%%st(%u)\n", r_dst);
6699               /* This forces C1 to zero, which isn't right. */
6700               put_C3210(
6701                   unop(Iop_32Uto64,
6702                   binop( Iop_And32,
6703                          binop(Iop_Shl32,
6704                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
6705                                mkU8(8)),
6706                          mkU32(0x4500)
6707                   )));
6708               break;
6709
6710            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
6711               r_dst = (UInt)modrm - 0xE8;
6712               DIP("fucomp %%st(0),%%st(%u)\n", r_dst);
6713               /* This forces C1 to zero, which isn't right. */
6714               put_C3210(
6715                   unop(Iop_32Uto64,
6716                   binop( Iop_And32,
6717                          binop(Iop_Shl32,
6718                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
6719                                mkU8(8)),
6720                          mkU32(0x4500)
6721                   )));
6722               fp_pop();
6723               break;
6724
6725            default:
6726               goto decode_fail;
6727         }
6728      }
6729   }
6730
6731   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
6732   else
6733   if (first_opcode == 0xDE) {
6734
6735      if (modrm < 0xC0) {
6736
6737         /* bits 5,4,3 are an opcode extension, and the modRM also
6738            specifies an address. */
6739         IROp   fop;
6740         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6741         delta += len;
6742
6743         switch (gregLO3ofRM(modrm)) {
6744
6745            case 0: /* FIADD m16int */ /* ST(0) += m16int */
6746               DIP("fiaddw %s\n", dis_buf);
6747               fop = Iop_AddF64;
6748               goto do_fop_m16;
6749
6750            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
6751               DIP("fimulw %s\n", dis_buf);
6752               fop = Iop_MulF64;
6753               goto do_fop_m16;
6754
6755            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
6756               DIP("fisubw %s\n", dis_buf);
6757               fop = Iop_SubF64;
6758               goto do_fop_m16;
6759
6760            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
6761               DIP("fisubrw %s\n", dis_buf);
6762               fop = Iop_SubF64;
6763               goto do_foprev_m16;
6764
6765            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
               DIP("fidivw %s\n", dis_buf);
6767               fop = Iop_DivF64;
6768               goto do_fop_m16;
6769
6770            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
6771               DIP("fidivrw %s\n", dis_buf);
6772               fop = Iop_DivF64;
6773               goto do_foprev_m16;
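
            /* Shared tails, as for the 0xDA cases above, but with a
               sign-extended 16-bit integer operand. */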
6774
6775            do_fop_m16:
6776               put_ST_UNCHECKED(0,
6777                  triop(fop,
6778                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6779                        get_ST(0),
6780                        unop(Iop_I32StoF64,
6781                             unop(Iop_16Sto32,
6782                                  loadLE(Ity_I16, mkexpr(addr))))));
6783               break;
6784
6785            do_foprev_m16:
6786               put_ST_UNCHECKED(0,
6787                  triop(fop,
6788                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
6789                        unop(Iop_I32StoF64,
6790                             unop(Iop_16Sto32,
6791                                  loadLE(Ity_I16, mkexpr(addr)))),
6792                        get_ST(0)));
6793               break;
6794
6795            default:
6796               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6797               vex_printf("first_opcode == 0xDE\n");
6798               goto decode_fail;
6799         }
6800
6801      } else {
6802
6803         delta++;
6804         switch (modrm) {
6805
6806            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
6807               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
6808               break;
6809
6810            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
6811               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
6812               break;
6813
6814            case 0xD9: /* FCOMPP %st(0),%st(1) */
6815               DIP("fcompp %%st(0),%%st(1)\n");
6816               /* This forces C1 to zero, which isn't right. */
6817               put_C3210(
6818                   unop(Iop_32Uto64,
6819                   binop( Iop_And32,
6820                          binop(Iop_Shl32,
6821                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
6822                                mkU8(8)),
6823                          mkU32(0x4500)
6824                   )));
6825               fp_pop();
6826               fp_pop();
6827               break;
6828
6829            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
6830               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
6831               break;
6832
6833            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
6834               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
6835               break;
6836
6837            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
6838               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
6839               break;
6840
6841            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
6842               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
6843               break;
6844
6845            default:
6846               goto decode_fail;
6847         }
6848
6849      }
6850   }
6851
6852   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
6853   else
6854   if (first_opcode == 0xDF) {
6855
6856      if (modrm < 0xC0) {
6857
6858         /* bits 5,4,3 are an opcode extension, and the modRM also
6859            specifies an address. */
6860         IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
6861         delta += len;
6862
6863         switch (gregLO3ofRM(modrm)) {
6864
6865            case 0: /* FILD m16int */
6866               DIP("fildw %s\n", dis_buf);
6867               fp_push();
6868               put_ST(0, unop(Iop_I32StoF64,
6869                              unop(Iop_16Sto32,
6870                                   loadLE(Ity_I16, mkexpr(addr)))));
6871               break;
6872
6873            case 1: /* FISTTPS m16 (SSE3) */
6874               DIP("fisttps %s\n", dis_buf);
6875               storeLE( mkexpr(addr),
6876                        x87ishly_qnarrow_32_to_16(
6877                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) ));
6878               fp_pop();
6879               break;
6880
6881            case 2: /* FIST m16 */
6882               DIP("fists %s\n", dis_buf);
6883               storeLE( mkexpr(addr),
6884                        x87ishly_qnarrow_32_to_16(
6885                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
6886               break;
6887
6888            case 3: /* FISTP m16 */
6889               DIP("fistps %s\n", dis_buf);
6890               storeLE( mkexpr(addr),
6891                        x87ishly_qnarrow_32_to_16(
6892                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) ));
6893               fp_pop();
6894               break;
6895
6896            case 5: /* FILD m64 */
6897               DIP("fildll %s\n", dis_buf);
6898               fp_push();
6899               put_ST(0, binop(Iop_I64StoF64,
6900                               get_roundingmode(),
6901                               loadLE(Ity_I64, mkexpr(addr))));
6902               break;
6903
6904            case 7: /* FISTP m64 */
6905               DIP("fistpll %s\n", dis_buf);
6906               storeLE( mkexpr(addr),
6907                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
6908               fp_pop();
6909               break;
6910
6911            default:
6912               vex_printf("unhandled opc_aux = 0x%2x\n", gregLO3ofRM(modrm));
6913               vex_printf("first_opcode == 0xDF\n");
6914               goto decode_fail;
6915         }
6916
6917      } else {
6918
6919         delta++;
6920         switch (modrm) {
6921
6922            case 0xC0: /* FFREEP %st(0) */
6923               DIP("ffreep %%st(%d)\n", 0);
6924               put_ST_TAG ( 0, mkU8(0) );
6925               fp_pop();
6926               break;
6927
6928            case 0xE0: /* FNSTSW %ax */
6929               DIP("fnstsw %%ax\n");
6930               /* Invent a plausible-looking FPU status word value and
6931                  dump it in %AX:
6932                     ((ftop & 7) << 11) | (c3210 & 0x4700)
6933               */
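               /* Worked example: with FTOP = 5 and C3210 = 0x4000
                  (just C3 set), this computes
                  ((5 & 7) << 11) | (0x4000 & 0x4700)
                  = 0x2800 | 0x4000 = 0x6800 in %ax. */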
6934               putIRegRAX(
6935                  2,
6936                  unop(Iop_32to16,
6937                       binop(Iop_Or32,
6938                             binop(Iop_Shl32,
6939                                   binop(Iop_And32, get_ftop(), mkU32(7)),
6940                                   mkU8(11)),
6941                             binop(Iop_And32,
6942                                   unop(Iop_64to32, get_C3210()),
6943                                   mkU32(0x4700))
6944               )));
6945               break;
6946
6947            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
6948               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
6949               break;
6950
6951            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
6952               /* not really right since COMIP != UCOMIP */
6953               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
6954               break;
6955
6956            default:
6957               goto decode_fail;
6958         }
6959      }
6960
6961   }
6962
6963   else
6964      goto decode_fail;
6965
6966   *decode_ok = True;
6967   return delta;
6968
6969  decode_fail:
6970   *decode_ok = False;
6971   return delta;
6972}
6973
6974
6975/*------------------------------------------------------------*/
6976/*---                                                      ---*/
6977/*--- MMX INSTRUCTIONS                                     ---*/
6978/*---                                                      ---*/
6979/*------------------------------------------------------------*/
6980
6981/* Effect of MMX insns on x87 FPU state (table 11-2 of
6982   IA32 arch manual, volume 3):
6983
6984   Read from, or write to MMX register (viz, any insn except EMMS):
6985   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
6986   * FP stack pointer set to zero
6987
6988   EMMS:
6989   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
6990   * FP stack pointer set to zero
6991*/
6992
6993static void do_MMX_preamble ( void )
6994{
6995   Int         i;
6996   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
6997   IRExpr*     zero  = mkU32(0);
6998   IRExpr*     tag1  = mkU8(1);
6999   put_ftop(zero);
7000   for (i = 0; i < 8; i++)
7001      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
7002}
7003
7004static void do_EMMS_preamble ( void )
7005{
7006   Int         i;
7007   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
7008   IRExpr*     zero  = mkU32(0);
7009   IRExpr*     tag0  = mkU8(0);
7010   put_ftop(zero);
7011   for (i = 0; i < 8; i++)
7012      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
7013}
7014
7015
7016static IRExpr* getMMXReg ( UInt archreg )
7017{
7018   vassert(archreg < 8);
7019   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
7020}
7021
7022
7023static void putMMXReg ( UInt archreg, IRExpr* e )
7024{
7025   vassert(archreg < 8);
7026   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
7027   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
7028}
7029
7030
7031/* Helper for non-shift MMX insns.  Note this is incomplete in the
7032   sense that it does not first call do_MMX_preamble() -- that is the
7033   responsibility of its caller. */
7034
7035static
7036ULong dis_MMXop_regmem_to_reg ( VexAbiInfo* vbi,
7037                                Prefix      pfx,
7038                                Long        delta,
7039                                UChar       opc,
7040                                const HChar* name,
7041                                Bool        show_granularity )
7042{
7043   HChar   dis_buf[50];
7044   UChar   modrm = getUChar(delta);
7045   Bool    isReg = epartIsReg(modrm);
7046   IRExpr* argL  = NULL;
7047   IRExpr* argR  = NULL;
7048   IRExpr* argG  = NULL;
7049   IRExpr* argE  = NULL;
7050   IRTemp  res   = newTemp(Ity_I64);
7051
7052   Bool    invG  = False;
7053   IROp    op    = Iop_INVALID;
7054   void*   hAddr = NULL;
7055   const HChar*  hName = NULL;
7056   Bool    eLeft = False;
7057
7058#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
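
   /* Opcodes with no single IROp equivalent (pmaddwd, psadbw) are
      routed via XXX to a clean helper; all the others are given a
      real IROp and are computed directly with a binop below. */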
7059
7060   switch (opc) {
7061      /* Original MMX ones */
7062      case 0xFC: op = Iop_Add8x8; break;
7063      case 0xFD: op = Iop_Add16x4; break;
7064      case 0xFE: op = Iop_Add32x2; break;
7065
7066      case 0xEC: op = Iop_QAdd8Sx8; break;
7067      case 0xED: op = Iop_QAdd16Sx4; break;
7068
7069      case 0xDC: op = Iop_QAdd8Ux8; break;
7070      case 0xDD: op = Iop_QAdd16Ux4; break;
7071
7072      case 0xF8: op = Iop_Sub8x8;  break;
7073      case 0xF9: op = Iop_Sub16x4; break;
7074      case 0xFA: op = Iop_Sub32x2; break;
7075
7076      case 0xE8: op = Iop_QSub8Sx8; break;
7077      case 0xE9: op = Iop_QSub16Sx4; break;
7078
7079      case 0xD8: op = Iop_QSub8Ux8; break;
7080      case 0xD9: op = Iop_QSub16Ux4; break;
7081
7082      case 0xE5: op = Iop_MulHi16Sx4; break;
7083      case 0xD5: op = Iop_Mul16x4; break;
7084      case 0xF5: XXX(amd64g_calculate_mmx_pmaddwd); break;
7085
7086      case 0x74: op = Iop_CmpEQ8x8; break;
7087      case 0x75: op = Iop_CmpEQ16x4; break;
7088      case 0x76: op = Iop_CmpEQ32x2; break;
7089
7090      case 0x64: op = Iop_CmpGT8Sx8; break;
7091      case 0x65: op = Iop_CmpGT16Sx4; break;
7092      case 0x66: op = Iop_CmpGT32Sx2; break;
7093
7094      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
7095      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
7096      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
7097
7098      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
7099      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
7100      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
7101
7102      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
7103      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
7104      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
7105
7106      case 0xDB: op = Iop_And64; break;
7107      case 0xDF: op = Iop_And64; invG = True; break;
7108      case 0xEB: op = Iop_Or64; break;
7109      case 0xEF: /* Possibly do better here if argL and argR are the
7110                    same reg */
7111                 op = Iop_Xor64; break;
7112
7113      /* Introduced in SSE1 */
7114      case 0xE0: op = Iop_Avg8Ux8;    break;
7115      case 0xE3: op = Iop_Avg16Ux4;   break;
7116      case 0xEE: op = Iop_Max16Sx4;   break;
7117      case 0xDE: op = Iop_Max8Ux8;    break;
7118      case 0xEA: op = Iop_Min16Sx4;   break;
7119      case 0xDA: op = Iop_Min8Ux8;    break;
7120      case 0xE4: op = Iop_MulHi16Ux4; break;
7121      case 0xF6: XXX(amd64g_calculate_mmx_psadbw); break;
7122
7123      /* Introduced in SSE2 */
7124      case 0xD4: op = Iop_Add64; break;
7125      case 0xFB: op = Iop_Sub64; break;
7126
7127      default:
7128         vex_printf("\n0x%x\n", (Int)opc);
7129         vpanic("dis_MMXop_regmem_to_reg");
7130   }
7131
7132#  undef XXX
7133
7134   argG = getMMXReg(gregLO3ofRM(modrm));
7135   if (invG)
7136      argG = unop(Iop_Not64, argG);
7137
7138   if (isReg) {
7139      delta++;
7140      argE = getMMXReg(eregLO3ofRM(modrm));
7141   } else {
7142      Int    len;
7143      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7144      delta += len;
7145      argE = loadLE(Ity_I64, mkexpr(addr));
7146   }
7147
7148   if (eLeft) {
7149      argL = argE;
7150      argR = argG;
7151   } else {
7152      argL = argG;
7153      argR = argE;
7154   }
7155
7156   if (op != Iop_INVALID) {
7157      vassert(hName == NULL);
7158      vassert(hAddr == NULL);
7159      assign(res, binop(op, argL, argR));
7160   } else {
7161      vassert(hName != NULL);
7162      vassert(hAddr != NULL);
7163      assign( res,
7164              mkIRExprCCall(
7165                 Ity_I64,
7166                 0/*regparms*/, hName, hAddr,
7167                 mkIRExprVec_2( argL, argR )
7168              )
7169            );
7170   }
7171
7172   putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
7173
7174   DIP("%s%s %s, %s\n",
7175       name, show_granularity ? nameMMXGran(opc & 3) : "",
7176       ( isReg ? nameMMXReg(eregLO3ofRM(modrm)) : dis_buf ),
7177       nameMMXReg(gregLO3ofRM(modrm)) );
7178
7179   return delta;
7180}
7181
7182
7183/* Vector by scalar shift of G by the amount specified at the bottom
7184   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
7185
7186static ULong dis_MMX_shiftG_byE ( VexAbiInfo* vbi,
7187                                  Prefix pfx, Long delta,
7188                                  const HChar* opname, IROp op )
7189{
7190   HChar   dis_buf[50];
7191   Int     alen, size;
7192   IRTemp  addr;
7193   Bool    shl, shr, sar;
7194   UChar   rm   = getUChar(delta);
7195   IRTemp  g0   = newTemp(Ity_I64);
7196   IRTemp  g1   = newTemp(Ity_I64);
7197   IRTemp  amt  = newTemp(Ity_I64);
7198   IRTemp  amt8 = newTemp(Ity_I8);
7199
7200   if (epartIsReg(rm)) {
7201      assign( amt, getMMXReg(eregLO3ofRM(rm)) );
7202      DIP("%s %s,%s\n", opname,
7203                        nameMMXReg(eregLO3ofRM(rm)),
7204                        nameMMXReg(gregLO3ofRM(rm)) );
7205      delta++;
7206   } else {
7207      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
7208      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
7209      DIP("%s %s,%s\n", opname,
7210                        dis_buf,
7211                        nameMMXReg(gregLO3ofRM(rm)) );
7212      delta += alen;
7213   }
7214   assign( g0,   getMMXReg(gregLO3ofRM(rm)) );
7215   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
7216
7217   shl = shr = sar = False;
7218   size = 0;
7219   switch (op) {
      case Iop_ShlN16x4: shl = True; size = 16; break;
7221      case Iop_ShlN32x2: shl = True; size = 32; break;
7222      case Iop_Shl64:    shl = True; size = 64; break;
7223      case Iop_ShrN16x4: shr = True; size = 16; break;
7224      case Iop_ShrN32x2: shr = True; size = 32; break;
7225      case Iop_Shr64:    shr = True; size = 64; break;
7226      case Iop_SarN16x4: sar = True; size = 16; break;
7227      case Iop_SarN32x2: sar = True; size = 32; break;
7228      default: vassert(0);
7229   }
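
   /* Shifts by an out-of-range amount (>= the lane or register
      width): the hardware zeroes the result for logical shifts, and
      for arithmetic shifts behaves as a shift by width-1, filling
      each lane with its sign bit.  The ITEs below guard the IR
      shift accordingly, since the IR shift itself is only defined
      for in-range amounts. */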
7230
7231   if (shl || shr) {
7232     assign(
7233        g1,
7234        IRExpr_ITE(
7235           binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
7236           binop(op, mkexpr(g0), mkexpr(amt8)),
7237           mkU64(0)
7238        )
7239     );
7240   } else
7241   if (sar) {
7242     assign(
7243        g1,
7244        IRExpr_ITE(
7245           binop(Iop_CmpLT64U,mkexpr(amt),mkU64(size)),
7246           binop(op, mkexpr(g0), mkexpr(amt8)),
7247           binop(op, mkexpr(g0), mkU8(size-1))
7248        )
7249     );
7250   } else {
7251      vassert(0);
7252   }
7253
7254   putMMXReg( gregLO3ofRM(rm), mkexpr(g1) );
7255   return delta;
7256}
7257
7258
7259/* Vector by scalar shift of E by an immediate byte.  This is a
7260   straight copy of dis_SSE_shiftE_imm. */
7261
7262static
7263ULong dis_MMX_shiftE_imm ( Long delta, const HChar* opname, IROp op )
7264{
7265   Bool    shl, shr, sar;
7266   UChar   rm   = getUChar(delta);
7267   IRTemp  e0   = newTemp(Ity_I64);
7268   IRTemp  e1   = newTemp(Ity_I64);
7269   UChar   amt, size;
7270   vassert(epartIsReg(rm));
7271   vassert(gregLO3ofRM(rm) == 2
7272           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
7273   amt = getUChar(delta+1);
7274   delta += 2;
7275   DIP("%s $%d,%s\n", opname,
7276                      (Int)amt,
7277                      nameMMXReg(eregLO3ofRM(rm)) );
7278
7279   assign( e0, getMMXReg(eregLO3ofRM(rm)) );
7280
7281   shl = shr = sar = False;
7282   size = 0;
7283   switch (op) {
7284      case Iop_ShlN16x4: shl = True; size = 16; break;
7285      case Iop_ShlN32x2: shl = True; size = 32; break;
7286      case Iop_Shl64:    shl = True; size = 64; break;
7287      case Iop_SarN16x4: sar = True; size = 16; break;
7288      case Iop_SarN32x2: sar = True; size = 32; break;
7289      case Iop_ShrN16x4: shr = True; size = 16; break;
7290      case Iop_ShrN32x2: shr = True; size = 32; break;
7291      case Iop_Shr64:    shr = True; size = 64; break;
7292      default: vassert(0);
7293   }
7294
7295   if (shl || shr) {
7296     assign( e1, amt >= size
7297                    ? mkU64(0)
7298                    : binop(op, mkexpr(e0), mkU8(amt))
7299     );
7300   } else
7301   if (sar) {
7302     assign( e1, amt >= size
7303                    ? binop(op, mkexpr(e0), mkU8(size-1))
7304                    : binop(op, mkexpr(e0), mkU8(amt))
7305     );
7306   } else {
7307      vassert(0);
7308   }
7309
7310   putMMXReg( eregLO3ofRM(rm), mkexpr(e1) );
7311   return delta;
7312}
7313
7314
7315/* Completely handle all MMX instructions except emms. */
7316
7317static
7318ULong dis_MMX ( Bool* decode_ok,
7319                VexAbiInfo* vbi, Prefix pfx, Int sz, Long delta )
7320{
7321   Int   len;
7322   UChar modrm;
7323   HChar dis_buf[50];
7324   UChar opc = getUChar(delta);
7325   delta++;
7326
7327   /* dis_MMX handles all insns except emms. */
7328   do_MMX_preamble();
7329
7330   switch (opc) {
7331
7332      case 0x6E:
7333         if (sz == 4) {
7334            /* MOVD (src)ireg32-or-mem32 (E), (dst)mmxreg (G)*/
7335            modrm = getUChar(delta);
7336            if (epartIsReg(modrm)) {
7337               delta++;
7338               putMMXReg(
7339                  gregLO3ofRM(modrm),
7340                  binop( Iop_32HLto64,
7341                         mkU32(0),
7342                         getIReg32(eregOfRexRM(pfx,modrm)) ) );
7343               DIP("movd %s, %s\n",
7344                   nameIReg32(eregOfRexRM(pfx,modrm)),
7345                   nameMMXReg(gregLO3ofRM(modrm)));
7346            } else {
7347               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7348               delta += len;
7349               putMMXReg(
7350                  gregLO3ofRM(modrm),
7351                  binop( Iop_32HLto64,
7352                         mkU32(0),
7353                         loadLE(Ity_I32, mkexpr(addr)) ) );
7354               DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7355            }
7356         }
7357         else
7358         if (sz == 8) {
7359            /* MOVD (src)ireg64-or-mem64 (E), (dst)mmxreg (G)*/
7360            modrm = getUChar(delta);
7361            if (epartIsReg(modrm)) {
7362               delta++;
7363               putMMXReg( gregLO3ofRM(modrm),
7364                          getIReg64(eregOfRexRM(pfx,modrm)) );
7365               DIP("movd %s, %s\n",
7366                   nameIReg64(eregOfRexRM(pfx,modrm)),
7367                   nameMMXReg(gregLO3ofRM(modrm)));
7368            } else {
7369               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7370               delta += len;
7371               putMMXReg( gregLO3ofRM(modrm),
7372                          loadLE(Ity_I64, mkexpr(addr)) );
7373               DIP("movd{64} %s, %s\n", dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7374            }
7375         }
7376         else {
7377            goto mmx_decode_failure;
7378         }
7379         break;
7380
7381      case 0x7E:
7382         if (sz == 4) {
7383            /* MOVD (src)mmxreg (G), (dst)ireg32-or-mem32 (E) */
7384            modrm = getUChar(delta);
7385            if (epartIsReg(modrm)) {
7386               delta++;
7387               putIReg32( eregOfRexRM(pfx,modrm),
7388                          unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
7389               DIP("movd %s, %s\n",
7390                   nameMMXReg(gregLO3ofRM(modrm)),
7391                   nameIReg32(eregOfRexRM(pfx,modrm)));
7392            } else {
7393               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7394               delta += len;
7395               storeLE( mkexpr(addr),
7396                        unop(Iop_64to32, getMMXReg(gregLO3ofRM(modrm)) ) );
7397               DIP("movd %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7398            }
7399         }
7400         else
7401         if (sz == 8) {
7402            /* MOVD (src)mmxreg (G), (dst)ireg64-or-mem64 (E) */
7403            modrm = getUChar(delta);
7404            if (epartIsReg(modrm)) {
7405               delta++;
7406               putIReg64( eregOfRexRM(pfx,modrm),
7407                          getMMXReg(gregLO3ofRM(modrm)) );
7408               DIP("movd %s, %s\n",
7409                   nameMMXReg(gregLO3ofRM(modrm)),
7410                   nameIReg64(eregOfRexRM(pfx,modrm)));
7411            } else {
7412               IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7413               delta += len;
7414               storeLE( mkexpr(addr),
7415                       getMMXReg(gregLO3ofRM(modrm)) );
7416               DIP("movd{64} %s, %s\n", nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7417            }
7418         } else {
7419            goto mmx_decode_failure;
7420         }
7421         break;
7422
7423      case 0x6F:
7424         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
7425         if (sz != 4
7426             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7427            goto mmx_decode_failure;
7428         modrm = getUChar(delta);
7429         if (epartIsReg(modrm)) {
7430            delta++;
7431            putMMXReg( gregLO3ofRM(modrm), getMMXReg(eregLO3ofRM(modrm)) );
7432            DIP("movq %s, %s\n",
7433                nameMMXReg(eregLO3ofRM(modrm)),
7434                nameMMXReg(gregLO3ofRM(modrm)));
7435         } else {
7436            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7437            delta += len;
7438            putMMXReg( gregLO3ofRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
7439            DIP("movq %s, %s\n",
7440                dis_buf, nameMMXReg(gregLO3ofRM(modrm)));
7441         }
7442         break;
7443
7444      case 0x7F:
7445         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
7446         if (sz != 4
7447             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7448            goto mmx_decode_failure;
7449         modrm = getUChar(delta);
7450         if (epartIsReg(modrm)) {
7451            delta++;
7452            putMMXReg( eregLO3ofRM(modrm), getMMXReg(gregLO3ofRM(modrm)) );
7453            DIP("movq %s, %s\n",
7454                nameMMXReg(gregLO3ofRM(modrm)),
7455                nameMMXReg(eregLO3ofRM(modrm)));
7456         } else {
7457            IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
7458            delta += len;
7459            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
7460            DIP("mov(nt)q %s, %s\n",
7461                nameMMXReg(gregLO3ofRM(modrm)), dis_buf);
7462         }
7463         break;
7464
7465      case 0xFC:
7466      case 0xFD:
7467      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
7468         if (sz != 4)
7469            goto mmx_decode_failure;
7470         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padd", True );
7471         break;
7472
7473      case 0xEC:
7474      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
7475         if (sz != 4
7476             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7477            goto mmx_decode_failure;
7478         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "padds", True );
7479         break;
7480
7481      case 0xDC:
7482      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
7483         if (sz != 4)
7484            goto mmx_decode_failure;
7485         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "paddus", True );
7486         break;
7487
7488      case 0xF8:
7489      case 0xF9:
7490      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
7491         if (sz != 4)
7492            goto mmx_decode_failure;
7493         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psub", True );
7494         break;
7495
7496      case 0xE8:
7497      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
7498         if (sz != 4)
7499            goto mmx_decode_failure;
7500         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubs", True );
7501         break;
7502
7503      case 0xD8:
7504      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
7505         if (sz != 4)
7506            goto mmx_decode_failure;
7507         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "psubus", True );
7508         break;
7509
7510      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
7511         if (sz != 4)
7512            goto mmx_decode_failure;
7513         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmulhw", False );
7514         break;
7515
7516      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
7517         if (sz != 4)
7518            goto mmx_decode_failure;
7519         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmullw", False );
7520         break;
7521
7522      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
7523         vassert(sz == 4);
7524         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pmaddwd", False );
7525         break;
7526
7527      case 0x74:
7528      case 0x75:
7529      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
7530         if (sz != 4)
7531            goto mmx_decode_failure;
7532         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpeq", True );
7533         break;
7534
7535      case 0x64:
7536      case 0x65:
7537      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
7538         if (sz != 4)
7539            goto mmx_decode_failure;
7540         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pcmpgt", True );
7541         break;
7542
7543      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
7544         if (sz != 4)
7545            goto mmx_decode_failure;
7546         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packssdw", False );
7547         break;
7548
7549      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
7550         if (sz != 4)
7551            goto mmx_decode_failure;
7552         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packsswb", False );
7553         break;
7554
7555      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
7556         if (sz != 4)
7557            goto mmx_decode_failure;
7558         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "packuswb", False );
7559         break;
7560
7561      case 0x68:
7562      case 0x69:
7563      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
7564         if (sz != 4
7565             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7566            goto mmx_decode_failure;
7567         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckh", True );
7568         break;
7569
7570      case 0x60:
7571      case 0x61:
7572      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
7573         if (sz != 4
7574             && /*ignore redundant REX.W*/!(sz==8 && haveNo66noF2noF3(pfx)))
7575            goto mmx_decode_failure;
7576         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "punpckl", True );
7577         break;
7578
7579      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
7580         if (sz != 4)
7581            goto mmx_decode_failure;
7582         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pand", False );
7583         break;
7584
7585      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
7586         if (sz != 4)
7587            goto mmx_decode_failure;
7588         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pandn", False );
7589         break;
7590
7591      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
7592         if (sz != 4)
7593            goto mmx_decode_failure;
7594         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "por", False );
7595         break;
7596
7597      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
7598         if (sz != 4)
7599            goto mmx_decode_failure;
7600         delta = dis_MMXop_regmem_to_reg ( vbi, pfx, delta, opc, "pxor", False );
7601         break;
7602
7603#     define SHIFT_BY_REG(_name,_op)                                     \
7604                delta = dis_MMX_shiftG_byE(vbi, pfx, delta, _name, _op); \
7605                break;
7606
7607      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
7608      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
7609      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
7610      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
7611
7612      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
7613      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
7614      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
7615      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
7616
7617      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
7618      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
7619      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
7620
7621#     undef SHIFT_BY_REG
7622
7623      case 0x71:
7624      case 0x72:
7625      case 0x73: {
7626         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
7627         UChar byte2, subopc;
7628         if (sz != 4)
7629            goto mmx_decode_failure;
7630         byte2  = getUChar(delta);      /* amode / sub-opcode */
7631         subopc = toUChar( (byte2 >> 3) & 7 );
7632
7633#        define SHIFT_BY_IMM(_name,_op)                        \
7634            do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
7635            } while (0)
7636
              if (subopc == 2 /*SRL*/ && opc == 0x71)
                  SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
         else if (subopc == 2 /*SRL*/ && opc == 0x72)
                  SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
         else if (subopc == 2 /*SRL*/ && opc == 0x73)
                  SHIFT_BY_IMM("psrlq", Iop_Shr64);

         else if (subopc == 4 /*SAR*/ && opc == 0x71)
                  SHIFT_BY_IMM("psraw", Iop_SarN16x4);
         else if (subopc == 4 /*SAR*/ && opc == 0x72)
                  SHIFT_BY_IMM("psrad", Iop_SarN32x2);

         else if (subopc == 6 /*SHL*/ && opc == 0x71)
                  SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
         else if (subopc == 6 /*SHL*/ && opc == 0x72)
                  SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
         else if (subopc == 6 /*SHL*/ && opc == 0x73)
                  SHIFT_BY_IMM("psllq", Iop_Shl64);

         else goto mmx_decode_failure;
7657
7658#        undef SHIFT_BY_IMM
7659         break;
7660      }
7661
7662      case 0xF7: {
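         /* MASKMOVQ (src)mmxreg, (dst)mmxreg: store the bytes of the
            G (data) register to [RDI], but only those bytes for which
            the corresponding byte of the E (mask) register has its
            top bit set. */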
7663         IRTemp addr    = newTemp(Ity_I64);
7664         IRTemp regD    = newTemp(Ity_I64);
7665         IRTemp regM    = newTemp(Ity_I64);
7666         IRTemp mask    = newTemp(Ity_I64);
7667         IRTemp olddata = newTemp(Ity_I64);
7668         IRTemp newdata = newTemp(Ity_I64);
7669
7670         modrm = getUChar(delta);
7671         if (sz != 4 || (!epartIsReg(modrm)))
7672            goto mmx_decode_failure;
7673         delta++;
7674
7675         assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
7676         assign( regM, getMMXReg( eregLO3ofRM(modrm) ));
7677         assign( regD, getMMXReg( gregLO3ofRM(modrm) ));
7678         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
7679         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
7680         assign( newdata,
7681                 binop(Iop_Or64,
7682                       binop(Iop_And64,
7683                             mkexpr(regD),
7684                             mkexpr(mask) ),
7685                       binop(Iop_And64,
7686                             mkexpr(olddata),
7687                             unop(Iop_Not64, mkexpr(mask)))) );
7688         storeLE( mkexpr(addr), mkexpr(newdata) );
7689         DIP("maskmovq %s,%s\n", nameMMXReg( eregLO3ofRM(modrm) ),
7690                                 nameMMXReg( gregLO3ofRM(modrm) ) );
7691         break;
7692      }
7693
7694      /* --- MMX decode failure --- */
7695      default:
7696      mmx_decode_failure:
7697         *decode_ok = False;
7698         return delta; /* ignored */
7699
7700   }
7701
7702   *decode_ok = True;
7703   return delta;
7704}
7705
7706
7707/*------------------------------------------------------------*/
7708/*--- More misc arithmetic and other obscure insns.        ---*/
7709/*------------------------------------------------------------*/
7710
7711/* Generate base << amt with vacated places filled with stuff
7712   from xtra.  amt guaranteed in 0 .. 63. */
7713static
7714IRExpr* shiftL64_with_extras ( IRTemp base, IRTemp xtra, IRTemp amt )
7715{
7716   /* if   amt == 0
7717      then base
7718      else (base << amt) | (xtra >>u (64-amt))
7719   */
7720   return
7721      IRExpr_ITE(
7722         binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
7723         binop(Iop_Or64,
7724               binop(Iop_Shl64, mkexpr(base), mkexpr(amt)),
7725               binop(Iop_Shr64, mkexpr(xtra),
7726                                binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
7727               ),
7728         mkexpr(base)
7729      );
7730}
7731
7732/* Generate base >>u amt with vacated places filled with stuff
7733   from xtra.  amt guaranteed in 0 .. 63. */
7734static
7735IRExpr* shiftR64_with_extras ( IRTemp xtra, IRTemp base, IRTemp amt )
7736{
7737   /* if   amt == 0
7738      then base
7739      else (base >>u amt) | (xtra << (64-amt))
7740   */
7741   return
7742      IRExpr_ITE(
7743         binop(Iop_CmpNE8, mkexpr(amt), mkU8(0)),
7744         binop(Iop_Or64,
7745               binop(Iop_Shr64, mkexpr(base), mkexpr(amt)),
7746               binop(Iop_Shl64, mkexpr(xtra),
7747                                binop(Iop_Sub8, mkU8(64), mkexpr(amt)))
7748               ),
7749         mkexpr(base)
7750      );
7751}
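
/* Illustrative example (not part of the logic): with
   base = 0x00000000000000FF, xtra = 0xFF00000000000000 and amt = 8,
   shiftL64_with_extras produces (base << 8) | (xtra >>u 56)
   = 0x000000000000FFFF -- the bytes vacated at the bottom are
   refilled from the top of xtra.  The amt == 0 special case exists
   because otherwise the (64-amt) term would give a shift by 64,
   whose result is not defined. */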
7752
7753/* Double length left and right shifts.  Apparently only required in
7754   v-size (no b- variant). */
7755static
7756ULong dis_SHLRD_Gv_Ev ( VexAbiInfo* vbi,
7757                        Prefix pfx,
7758                        Long delta, UChar modrm,
7759                        Int sz,
7760                        IRExpr* shift_amt,
7761                        Bool amt_is_literal,
7762                        const HChar* shift_amt_txt,
7763                        Bool left_shift )
7764{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.  And delta on entry points at the modrm byte. */
7767   Int len;
7768   HChar dis_buf[50];
7769
7770   IRType ty     = szToITy(sz);
7771   IRTemp gsrc   = newTemp(ty);
7772   IRTemp esrc   = newTemp(ty);
7773   IRTemp addr   = IRTemp_INVALID;
7774   IRTemp tmpSH  = newTemp(Ity_I8);
7775   IRTemp tmpSS  = newTemp(Ity_I8);
7776   IRTemp tmp64  = IRTemp_INVALID;
7777   IRTemp res64  = IRTemp_INVALID;
7778   IRTemp rss64  = IRTemp_INVALID;
7779   IRTemp resTy  = IRTemp_INVALID;
7780   IRTemp rssTy  = IRTemp_INVALID;
7781   Int    mask   = sz==8 ? 63 : 31;
7782
7783   vassert(sz == 2 || sz == 4 || sz == 8);
7784
7785   /* The E-part is the destination; this is shifted.  The G-part
7786      supplies bits to be shifted into the E-part, but is not
7787      changed.
7788
7789      If shifting left, form a double-length word with E at the top
7790      and G at the bottom, and shift this left.  The result is then in
7791      the high part.
7792
7793      If shifting right, form a double-length word with G at the top
7794      and E at the bottom, and shift this right.  The result is then
7795      at the bottom.  */
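
   /* Worked layout (illustrative), for the 16-bit left-shift case
      below: tmp64 is [esrc'gsrc'gsrc'gsrc] from top to bottom, so
      (tmp64 << tmpSH) >>u 48 leaves the new E value in the bottom 16
      bits.  The repeated gsrc copies keep the result defined for all
      masked shift amounts up to 31. */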
7796
7797   /* Fetch the operands. */
7798
7799   assign( gsrc, getIRegG(sz, pfx, modrm) );
7800
7801   if (epartIsReg(modrm)) {
7802      delta++;
7803      assign( esrc, getIRegE(sz, pfx, modrm) );
7804      DIP("sh%cd%c %s, %s, %s\n",
7805          ( left_shift ? 'l' : 'r' ), nameISize(sz),
7806          shift_amt_txt,
7807          nameIRegG(sz, pfx, modrm), nameIRegE(sz, pfx, modrm));
7808   } else {
7809      addr = disAMode ( &len, vbi, pfx, delta, dis_buf,
7810                        /* # bytes following amode */
7811                        amt_is_literal ? 1 : 0 );
7812      delta += len;
7813      assign( esrc, loadLE(ty, mkexpr(addr)) );
7814      DIP("sh%cd%c %s, %s, %s\n",
7815          ( left_shift ? 'l' : 'r' ), nameISize(sz),
7816          shift_amt_txt,
7817          nameIRegG(sz, pfx, modrm), dis_buf);
7818   }
7819
7820   /* Calculate the masked shift amount (tmpSH), the masked subshift
7821      amount (tmpSS), the shifted value (res64) and the subshifted
7822      value (rss64). */
7823
7824   assign( tmpSH, binop(Iop_And8, shift_amt, mkU8(mask)) );
7825   assign( tmpSS, binop(Iop_And8,
7826                        binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
7827                        mkU8(mask)));
7828
7829   tmp64 = newTemp(Ity_I64);
7830   res64 = newTemp(Ity_I64);
7831   rss64 = newTemp(Ity_I64);
7832
7833   if (sz == 2 || sz == 4) {
7834
7835      /* G is xtra; E is data */
7836      /* what a freaking nightmare: */
7837      if (sz == 4 && left_shift) {
7838         assign( tmp64, binop(Iop_32HLto64, mkexpr(esrc), mkexpr(gsrc)) );
7839         assign( res64,
7840                 binop(Iop_Shr64,
7841                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
7842                       mkU8(32)) );
7843         assign( rss64,
7844                 binop(Iop_Shr64,
7845                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSS)),
7846                       mkU8(32)) );
7847      }
7848      else
7849      if (sz == 4 && !left_shift) {
7850         assign( tmp64, binop(Iop_32HLto64, mkexpr(gsrc), mkexpr(esrc)) );
7851         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
7852         assign( rss64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSS)) );
7853      }
7854      else
7855      if (sz == 2 && left_shift) {
7856         assign( tmp64,
7857                 binop(Iop_32HLto64,
7858                       binop(Iop_16HLto32, mkexpr(esrc), mkexpr(gsrc)),
7859                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc))
7860         ));
7861         /* result formed by shifting [esrc'gsrc'gsrc'gsrc] */
7862         assign( res64,
7863                 binop(Iop_Shr64,
7864                       binop(Iop_Shl64, mkexpr(tmp64), mkexpr(tmpSH)),
7865                       mkU8(48)) );
7866         /* subshift formed by shifting [esrc'0000'0000'0000] */
7867         assign( rss64,
7868                 binop(Iop_Shr64,
7869                       binop(Iop_Shl64,
7870                             binop(Iop_Shl64, unop(Iop_16Uto64, mkexpr(esrc)),
7871                                              mkU8(48)),
7872                             mkexpr(tmpSS)),
7873                       mkU8(48)) );
7874      }
7875      else
7876      if (sz == 2 && !left_shift) {
7877         assign( tmp64,
7878                 binop(Iop_32HLto64,
7879                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(gsrc)),
7880                       binop(Iop_16HLto32, mkexpr(gsrc), mkexpr(esrc))
7881         ));
7882         /* result formed by shifting [gsrc'gsrc'gsrc'esrc] */
7883         assign( res64, binop(Iop_Shr64, mkexpr(tmp64), mkexpr(tmpSH)) );
7884         /* subshift formed by shifting [0000'0000'0000'esrc] */
7885         assign( rss64, binop(Iop_Shr64,
7886                              unop(Iop_16Uto64, mkexpr(esrc)),
7887                              mkexpr(tmpSS)) );
7888      }
7889
7890   } else {
7891
7892      vassert(sz == 8);
7893      if (left_shift) {
7894         assign( res64, shiftL64_with_extras( esrc, gsrc, tmpSH ));
7895         assign( rss64, shiftL64_with_extras( esrc, gsrc, tmpSS ));
7896      } else {
7897         assign( res64, shiftR64_with_extras( gsrc, esrc, tmpSH ));
7898         assign( rss64, shiftR64_with_extras( gsrc, esrc, tmpSS ));
7899      }
7900
7901   }
7902
7903   resTy = newTemp(ty);
7904   rssTy = newTemp(ty);
7905   assign( resTy, narrowTo(ty, mkexpr(res64)) );
7906   assign( rssTy, narrowTo(ty, mkexpr(rss64)) );
7907
7908   /* Put result back and write the flags thunk. */
7909   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl64 : Iop_Sar64,
7910                              resTy, rssTy, ty, tmpSH );
7911
7912   if (epartIsReg(modrm)) {
7913      putIRegE(sz, pfx, modrm, mkexpr(resTy));
7914   } else {
7915      storeLE( mkexpr(addr), mkexpr(resTy) );
7916   }
7917
7918   if (amt_is_literal) delta++;
7919   return delta;
7920}
7921
7922
7923/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
7924   required. */
7925
7926typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
7927
7928static const HChar* nameBtOp ( BtOp op )
7929{
7930   switch (op) {
7931      case BtOpNone:  return "";
7932      case BtOpSet:   return "s";
7933      case BtOpReset: return "r";
7934      case BtOpComp:  return "c";
7935      default: vpanic("nameBtOp(amd64)");
7936   }
7937}
7938
7939
7940static
7941ULong dis_bt_G_E ( VexAbiInfo* vbi,
7942                   Prefix pfx, Int sz, Long delta, BtOp op,
7943                   /*OUT*/Bool* decode_OK )
7944{
7945   HChar  dis_buf[50];
7946   UChar  modrm;
7947   Int    len;
7948   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
7949          t_addr1, t_rsp, t_mask, t_new;
7950
7951   vassert(sz == 2 || sz == 4 || sz == 8);
7952
7953   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
7954             = t_addr0 = t_addr1 = t_rsp
7955             = t_mask = t_new = IRTemp_INVALID;
7956
7957   t_fetched = newTemp(Ity_I8);
7958   t_new     = newTemp(Ity_I8);
7959   t_bitno0  = newTemp(Ity_I64);
7960   t_bitno1  = newTemp(Ity_I64);
7961   t_bitno2  = newTemp(Ity_I8);
7962   t_addr1   = newTemp(Ity_I64);
7963   modrm     = getUChar(delta);
7964
7965   *decode_OK = True;
7966   if (epartIsReg(modrm)) {
7967      /* F2 and F3 are never acceptable. */
7968      if (haveF2orF3(pfx)) {
7969         *decode_OK = False;
7970         return delta;
7971      }
7972   } else {
7973      /* F2 or F3 (but not both) are allowed, provided LOCK is also
7974         present, and only for the BTC/BTS/BTR cases (not BT). */
7975      if (haveF2orF3(pfx)) {
7976         if (haveF2andF3(pfx) || !haveLOCK(pfx) || op == BtOpNone) {
7977            *decode_OK = False;
7978            return delta;
7979         }
7980      }
7981   }
7982
7983   assign( t_bitno0, widenSto64(getIRegG(sz, pfx, modrm)) );
7984
7985   if (epartIsReg(modrm)) {
7986      delta++;
7987      /* Get it onto the client's stack.  Oh, this is a horrible
7988         kludge.  See https://bugs.kde.org/show_bug.cgi?id=245925.
7989         Because of the ELF ABI stack redzone, there may be live data
7990         up to 128 bytes below %RSP.  So we can't just push it on the
7991         stack, else we may wind up trashing live data, and causing
7992         impossible-to-find simulation errors.  (Yes, this did
         happen.)  So we need to drop RSP by at least 128 before
         pushing it.  That unfortunately means hitting Memcheck's
7995         fast-case painting code.  Ideally we should drop more than
7996         128, to reduce the chances of breaking buggy programs that
7997         have live data below -128(%RSP).  Memcheck fast-cases moves
7998         of 288 bytes due to the need to handle ppc64-linux quickly,
7999         so let's use 288.  Of course the real fix is to get rid of
8000         this kludge entirely.  */
8001      t_rsp = newTemp(Ity_I64);
8002      t_addr0 = newTemp(Ity_I64);
8003
8004      vassert(vbi->guest_stack_redzone_size == 128);
8005      assign( t_rsp, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(288)) );
8006      putIReg64(R_RSP, mkexpr(t_rsp));
8007
8008      storeLE( mkexpr(t_rsp), getIRegE(sz, pfx, modrm) );
8009
8010      /* Make t_addr0 point at it. */
8011      assign( t_addr0, mkexpr(t_rsp) );
8012
      /* Mask out upper bits of the bit number, since the E operand
         is a register. */
8015      assign( t_bitno1, binop(Iop_And64,
8016                              mkexpr(t_bitno0),
8017                              mkU64(sz == 8 ? 63 : sz == 4 ? 31 : 15)) );
8018
8019   } else {
8020      t_addr0 = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
8021      delta += len;
8022      assign( t_bitno1, mkexpr(t_bitno0) );
8023   }
8024
8025   /* At this point: t_addr0 is the address being operated on.  If it
8026      was a reg, we will have pushed it onto the client's stack.
8027      t_bitno1 is the bit number, suitably masked in the case of a
8028      reg.  */
8029
8030   /* Now the main sequence. */
8031   assign( t_addr1,
8032           binop(Iop_Add64,
8033                 mkexpr(t_addr0),
8034                 binop(Iop_Sar64, mkexpr(t_bitno1), mkU8(3))) );
8035
8036   /* t_addr1 now holds effective address */
8037
8038   assign( t_bitno2,
8039           unop(Iop_64to8,
8040                binop(Iop_And64, mkexpr(t_bitno1), mkU64(7))) );
8041
8042   /* t_bitno2 contains offset of bit within byte */
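
   /* Worked example (illustrative): for a bit number of 35, t_addr1
      = t_addr0 + (35 >>s 3) = t_addr0 + 4 and t_bitno2 = 35 & 7 = 3,
      i.e. bit 3 of the byte at offset 4.  The shift is arithmetic
      because, for the memory case, the bit number acts as a signed
      displacement. */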
8043
8044   if (op != BtOpNone) {
8045      t_mask = newTemp(Ity_I8);
8046      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
8047   }
8048
8049   /* t_mask is now a suitable byte mask */
8050
8051   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
8052
8053   if (op != BtOpNone) {
8054      switch (op) {
8055         case BtOpSet:
8056            assign( t_new,
8057                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
8058            break;
8059         case BtOpComp:
8060            assign( t_new,
8061                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
8062            break;
8063         case BtOpReset:
8064            assign( t_new,
8065                    binop(Iop_And8, mkexpr(t_fetched),
8066                                    unop(Iop_Not8, mkexpr(t_mask))) );
8067            break;
8068         default:
8069            vpanic("dis_bt_G_E(amd64)");
8070      }
8071      if ((haveLOCK(pfx)) && !epartIsReg(modrm)) {
8072         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
8073                                 mkexpr(t_new)/*new*/,
8074                                 guest_RIP_curr_instr );
8075      } else {
8076         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
8077      }
8078   }
8079
8080   /* Side effect done; now get selected bit into Carry flag */
8081   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
8082   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
8083   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8084   stmt( IRStmt_Put(
8085            OFFB_CC_DEP1,
8086            binop(Iop_And64,
8087                  binop(Iop_Shr64,
8088                        unop(Iop_8Uto64, mkexpr(t_fetched)),
8089                        mkexpr(t_bitno2)),
8090                  mkU64(1)))
8091       );
8092   /* Set NDEP even though it isn't used.  This makes redundant-PUT
8093      elimination of previous stores to this field work better. */
8094   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8095
8096   /* Move reg operand from stack back to reg */
8097   if (epartIsReg(modrm)) {
8098      /* t_rsp still points at it. */
8099      /* only write the reg if actually modifying it; doing otherwise
8100         zeroes the top half erroneously when doing btl due to
8101         standard zero-extend rule */
8102      if (op != BtOpNone)
8103         putIRegE(sz, pfx, modrm, loadLE(szToITy(sz), mkexpr(t_rsp)) );
8104      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t_rsp), mkU64(288)) );
8105   }
8106
8107   DIP("bt%s%c %s, %s\n",
8108       nameBtOp(op), nameISize(sz), nameIRegG(sz, pfx, modrm),
8109       ( epartIsReg(modrm) ? nameIRegE(sz, pfx, modrm) : dis_buf ) );
8110
8111   return delta;
8112}
8113
8114
8115
8116/* Handle BSF/BSR.  Only v-size seems necessary. */
8117static
8118ULong dis_bs_E_G ( VexAbiInfo* vbi,
8119                   Prefix pfx, Int sz, Long delta, Bool fwds )
8120{
8121   Bool   isReg;
8122   UChar  modrm;
8123   HChar  dis_buf[50];
8124
8125   IRType ty    = szToITy(sz);
8126   IRTemp src   = newTemp(ty);
8127   IRTemp dst   = newTemp(ty);
8128   IRTemp src64 = newTemp(Ity_I64);
8129   IRTemp dst64 = newTemp(Ity_I64);
8130   IRTemp srcB  = newTemp(Ity_I1);
8131
8132   vassert(sz == 8 || sz == 4 || sz == 2);
8133
8134   modrm = getUChar(delta);
8135   isReg = epartIsReg(modrm);
8136   if (isReg) {
8137      delta++;
8138      assign( src, getIRegE(sz, pfx, modrm) );
8139   } else {
8140      Int    len;
8141      IRTemp addr = disAMode( &len, vbi, pfx, delta, dis_buf, 0 );
8142      delta += len;
8143      assign( src, loadLE(ty, mkexpr(addr)) );
8144   }
8145
8146   DIP("bs%c%c %s, %s\n",
8147       fwds ? 'f' : 'r', nameISize(sz),
8148       ( isReg ? nameIRegE(sz, pfx, modrm) : dis_buf ),
8149       nameIRegG(sz, pfx, modrm));
8150
8151   /* First, widen src to 64 bits if it is not already. */
8152   assign( src64, widenUto64(mkexpr(src)) );
8153
8154   /* Generate a bool expression which is zero iff the original is
8155      zero, and nonzero otherwise.  Ask for a CmpNE version which, if
8156      instrumented by Memcheck, is instrumented expensively, since
8157      this may be used on the output of a preceding movmskb insn,
8158      which has been known to be partially defined, and in need of
8159      careful handling. */
8160   assign( srcB, binop(Iop_ExpCmpNE64, mkexpr(src64), mkU64(0)) );
8161
8162   /* Flags: Z is 1 iff source value is zero.  All others
8163      are undefined -- we force them to zero. */
8164   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
8165   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8166   stmt( IRStmt_Put(
8167            OFFB_CC_DEP1,
8168            IRExpr_ITE( mkexpr(srcB),
8169                        /* src!=0 */
8170                        mkU64(0),
8171                        /* src==0 */
8172                        mkU64(AMD64G_CC_MASK_Z)
8173                        )
8174       ));
8175   /* Set NDEP even though it isn't used.  This makes redundant-PUT
8176      elimination of previous stores to this field work better. */
8177   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8178
8179   /* Result: iff source value is zero, we can't use
8180      Iop_Clz64/Iop_Ctz64 as they have no defined result in that case.
8181      But anyway, amd64 semantics say the result is undefined in
8182      such situations.  Hence handle the zero case specially. */
8183
8184   /* Bleh.  What we compute:
8185
8186          bsf64:  if src == 0 then {dst is unchanged}
8187                              else Ctz64(src)
8188
8189          bsr64:  if src == 0 then {dst is unchanged}
8190                              else 63 - Clz64(src)
8191
8192          bsf32:  if src == 0 then {dst is unchanged}
8193                              else Ctz64(32Uto64(src))
8194
8195          bsr32:  if src == 0 then {dst is unchanged}
8196                              else 63 - Clz64(32Uto64(src))
8197
8198          bsf16:  if src == 0 then {dst is unchanged}
8199                              else Ctz64(32Uto64(16Uto32(src)))
8200
8201          bsr16:  if src == 0 then {dst is unchanged}
8202                              else 63 - Clz64(32Uto64(16Uto32(src)))
8203   */
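
   /* Quick sanity example (illustrative): for src64 = 0x28 (binary
      101000), bsf gives Ctz64(0x28) = 3 and bsr gives
      63 - Clz64(0x28) = 63 - 58 = 5, viz. the lowest and highest set
      bit positions respectively. */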
8204
8205   /* The main computation, guarding against zero. */
8206   assign( dst64,
8207           IRExpr_ITE(
8208              mkexpr(srcB),
8209              /* src != 0 */
8210              fwds ? unop(Iop_Ctz64, mkexpr(src64))
8211                   : binop(Iop_Sub64,
8212                           mkU64(63),
8213                           unop(Iop_Clz64, mkexpr(src64))),
8214              /* src == 0 -- leave dst unchanged */
8215              widenUto64( getIRegG( sz, pfx, modrm ) )
8216           )
8217         );
8218
8219   if (sz == 2)
8220      assign( dst, unop(Iop_64to16, mkexpr(dst64)) );
8221   else
8222   if (sz == 4)
8223      assign( dst, unop(Iop_64to32, mkexpr(dst64)) );
8224   else
8225      assign( dst, mkexpr(dst64) );
8226
8227   /* dump result back */
8228   putIRegG( sz, pfx, modrm, mkexpr(dst) );
8229
8230   return delta;
8231}
8232
8233
8234/* swap rAX with the reg specified by reg and REX.B */
8235static
8236void codegen_xchg_rAX_Reg ( Prefix pfx, Int sz, UInt regLo3 )
8237{
8238   IRType ty = szToITy(sz);
8239   IRTemp t1 = newTemp(ty);
8240   IRTemp t2 = newTemp(ty);
8241   vassert(sz == 2 || sz == 4 || sz == 8);
8242   vassert(regLo3 < 8);
8243   if (sz == 8) {
8244      assign( t1, getIReg64(R_RAX) );
8245      assign( t2, getIRegRexB(8, pfx, regLo3) );
8246      putIReg64( R_RAX, mkexpr(t2) );
8247      putIRegRexB(8, pfx, regLo3, mkexpr(t1) );
8248   } else if (sz == 4) {
8249      assign( t1, getIReg32(R_RAX) );
8250      assign( t2, getIRegRexB(4, pfx, regLo3) );
8251      putIReg32( R_RAX, mkexpr(t2) );
8252      putIRegRexB(4, pfx, regLo3, mkexpr(t1) );
8253   } else {
8254      assign( t1, getIReg16(R_RAX) );
8255      assign( t2, getIRegRexB(2, pfx, regLo3) );
8256      putIReg16( R_RAX, mkexpr(t2) );
8257      putIRegRexB(2, pfx, regLo3, mkexpr(t1) );
8258   }
8259   DIP("xchg%c %s, %s\n",
8260       nameISize(sz), nameIRegRAX(sz),
8261                      nameIRegRexB(sz,pfx, regLo3));
8262}
8263
8264
8265static
8266void codegen_SAHF ( void )
8267{
8268   /* Set the flags to:
8269      (amd64g_calculate_flags_all() & AMD64G_CC_MASK_O)
8270                                    -- retain the old O flag
      | (%AH & (AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
                |AMD64G_CC_MASK_P|AMD64G_CC_MASK_C))
8273   */
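   /* For reference, the layout of the relevant flags within %AH is
      SF:ZF:0:AF:0:PF:1:CF (bit 7 down to bit 0), which is also the
      bit layout the AMD64G_CC_MASK_* constants follow. */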
8274   ULong  mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
8275                       |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
8276   IRTemp oldflags   = newTemp(Ity_I64);
8277   assign( oldflags, mk_amd64g_calculate_rflags_all() );
8278   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
8279   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
8280   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
8281   stmt( IRStmt_Put( OFFB_CC_DEP1,
8282         binop(Iop_Or64,
8283               binop(Iop_And64, mkexpr(oldflags), mkU64(AMD64G_CC_MASK_O)),
8284               binop(Iop_And64,
8285                     binop(Iop_Shr64, getIReg64(R_RAX), mkU8(8)),
8286                     mkU64(mask_SZACP))
8287              )
8288   ));
8289}
8290
8291
8292static
8293void codegen_LAHF ( void  )
8294{
8295   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
8296   IRExpr* rax_with_hole;
8297   IRExpr* new_byte;
8298   IRExpr* new_rax;
8299   ULong   mask_SZACP = AMD64G_CC_MASK_S|AMD64G_CC_MASK_Z|AMD64G_CC_MASK_A
8300                        |AMD64G_CC_MASK_C|AMD64G_CC_MASK_P;
8301
8302   IRTemp  flags = newTemp(Ity_I64);
8303   assign( flags, mk_amd64g_calculate_rflags_all() );
8304
8305   rax_with_hole
8306      = binop(Iop_And64, getIReg64(R_RAX), mkU64(~0xFF00ULL));
8307   new_byte
8308      = binop(Iop_Or64, binop(Iop_And64, mkexpr(flags), mkU64(mask_SZACP)),
8309                        mkU64(1<<1));
8310   new_rax
8311      = binop(Iop_Or64, rax_with_hole,
8312                        binop(Iop_Shl64, new_byte, mkU8(8)));
8313   putIReg64(R_RAX, new_rax);
8314}
8315
8316
8317static
8318ULong dis_cmpxchg_G_E ( /*OUT*/Bool* ok,
8319                        VexAbiInfo*  vbi,
8320                        Prefix       pfx,
8321                        Int          size,
8322                        Long         delta0 )
8323{
8324   HChar dis_buf[50];
8325   Int   len;
8326
8327   IRType ty    = szToITy(size);
8328   IRTemp acc   = newTemp(ty);
8329   IRTemp src   = newTemp(ty);
8330   IRTemp dest  = newTemp(ty);
8331   IRTemp dest2 = newTemp(ty);
8332   IRTemp acc2  = newTemp(ty);
8333   IRTemp cond  = newTemp(Ity_I1);
8334   IRTemp addr  = IRTemp_INVALID;
8335   UChar  rm    = getUChar(delta0);
8336
8337   /* There are 3 cases to consider:
8338
8339      reg-reg: ignore any lock prefix, generate sequence based
8340               on ITE
8341
8342      reg-mem, not locked: ignore any lock prefix, generate sequence
8343                           based on ITE
8344
8345      reg-mem, locked: use IRCAS
8346   */
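
   /* In all three cases the semantics implemented are those of
      CMPXCHG (sketch):

         if (rAX == *E) { ZF = 1;  *E  = G;  }
         else           { ZF = 0;  rAX = *E; }

      with the other arithmetic flags set as per CMP rAX, *E. */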
8347
8348   /* Decide whether F2 or F3 are acceptable.  Never for register
8349      case, but for the memory case, one or the other is OK provided
8350      LOCK is also present. */
8351   if (epartIsReg(rm)) {
8352      if (haveF2orF3(pfx)) {
8353         *ok = False;
8354         return delta0;
8355      }
8356   } else {
8357      if (haveF2orF3(pfx)) {
8358         if (haveF2andF3(pfx) || !haveLOCK(pfx)) {
8359            *ok = False;
8360            return delta0;
8361         }
8362      }
8363   }
8364
8365   if (epartIsReg(rm)) {
8366      /* case 1 */
8367      assign( dest, getIRegE(size, pfx, rm) );
8368      delta0++;
8369      assign( src, getIRegG(size, pfx, rm) );
8370      assign( acc, getIRegRAX(size) );
8371      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8372      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8373      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
8374      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8375      putIRegRAX(size, mkexpr(acc2));
8376      putIRegE(size, pfx, rm, mkexpr(dest2));
8377      DIP("cmpxchg%c %s,%s\n", nameISize(size),
8378                               nameIRegG(size,pfx,rm),
8379                               nameIRegE(size,pfx,rm) );
8380   }
8381   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
8382      /* case 2 */
8383      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8384      assign( dest, loadLE(ty, mkexpr(addr)) );
8385      delta0 += len;
8386      assign( src, getIRegG(size, pfx, rm) );
8387      assign( acc, getIRegRAX(size) );
8388      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8389      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8390      assign( dest2, IRExpr_ITE(mkexpr(cond), mkexpr(src), mkexpr(dest)) );
8391      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8392      putIRegRAX(size, mkexpr(acc2));
8393      storeLE( mkexpr(addr), mkexpr(dest2) );
8394      DIP("cmpxchg%c %s,%s\n", nameISize(size),
8395                               nameIRegG(size,pfx,rm), dis_buf);
8396   }
8397   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
8398      /* case 3 */
8399      /* src is new value.  acc is expected value.  dest is old value.
8400         Compute success from the output of the IRCAS, and steer the
8401         new value for RAX accordingly: in case of success, RAX is
8402         unchanged. */
8403      addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8404      delta0 += len;
8405      assign( src, getIRegG(size, pfx, rm) );
8406      assign( acc, getIRegRAX(size) );
8407      stmt( IRStmt_CAS(
8408         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
8409                  NULL, mkexpr(acc), NULL, mkexpr(src) )
8410      ));
8411      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
8412      assign( cond, mk_amd64g_calculate_condition(AMD64CondZ) );
8413      assign( acc2,  IRExpr_ITE(mkexpr(cond), mkexpr(acc), mkexpr(dest)) );
8414      putIRegRAX(size, mkexpr(acc2));
8415      DIP("cmpxchg%c %s,%s\n", nameISize(size),
8416                               nameIRegG(size,pfx,rm), dis_buf);
8417   }
8418   else vassert(0);
8419
8420   *ok = True;
8421   return delta0;
8422}
8423
8424
8425/* Handle conditional move instructions of the form
8426      cmovcc E(reg-or-mem), G(reg)
8427
8428   E(src) is reg-or-mem
8429   G(dst) is reg.
8430
8431   If E is reg, -->    GET %E, tmps
8432                       GET %G, tmpd
8433                       CMOVcc tmps, tmpd
8434                       PUT tmpd, %G
8435
8436   If E is mem  -->    (getAddr E) -> tmpa
8437                       LD (tmpa), tmps
8438                       GET %G, tmpd
8439                       CMOVcc tmps, tmpd
8440                       PUT tmpd, %G
8441*/
8442static
8443ULong dis_cmov_E_G ( VexAbiInfo* vbi,
8444                     Prefix        pfx,
8445                     Int           sz,
8446                     AMD64Condcode cond,
8447                     Long          delta0 )
8448{
8449   UChar rm  = getUChar(delta0);
8450   HChar dis_buf[50];
8451   Int   len;
8452
8453   IRType ty   = szToITy(sz);
8454   IRTemp tmps = newTemp(ty);
8455   IRTemp tmpd = newTemp(ty);
8456
8457   if (epartIsReg(rm)) {
8458      assign( tmps, getIRegE(sz, pfx, rm) );
8459      assign( tmpd, getIRegG(sz, pfx, rm) );
8460
8461      putIRegG( sz, pfx, rm,
8462                IRExpr_ITE( mk_amd64g_calculate_condition(cond),
8463                            mkexpr(tmps),
8464                            mkexpr(tmpd) )
8465              );
8466      DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
8467                            nameIRegE(sz,pfx,rm),
8468                            nameIRegG(sz,pfx,rm));
8469      return 1+delta0;
8470   }
8471
8472   /* E refers to memory */
8473   {
8474      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8475      assign( tmps, loadLE(ty, mkexpr(addr)) );
8476      assign( tmpd, getIRegG(sz, pfx, rm) );
8477
8478      putIRegG( sz, pfx, rm,
8479                IRExpr_ITE( mk_amd64g_calculate_condition(cond),
8480                            mkexpr(tmps),
8481                            mkexpr(tmpd) )
8482              );
8483
8484      DIP("cmov%s %s,%s\n", name_AMD64Condcode(cond),
8485                            dis_buf,
8486                            nameIRegG(sz,pfx,rm));
8487      return len+delta0;
8488   }
8489}
8490
8491
8492static
8493ULong dis_xadd_G_E ( /*OUT*/Bool* decode_ok,
8494                     VexAbiInfo* vbi,
8495                     Prefix pfx, Int sz, Long delta0 )
8496{
8497   Int   len;
8498   UChar rm = getUChar(delta0);
8499   HChar dis_buf[50];
8500
8501   IRType ty    = szToITy(sz);
8502   IRTemp tmpd  = newTemp(ty);
8503   IRTemp tmpt0 = newTemp(ty);
8504   IRTemp tmpt1 = newTemp(ty);
8505
8506   /* There are 3 cases to consider:
8507
8508      reg-reg: ignore any lock prefix,
8509               generate 'naive' (non-atomic) sequence
8510
8511      reg-mem, not locked: ignore any lock prefix, generate 'naive'
8512                           (non-atomic) sequence
8513
8514      reg-mem, locked: use IRCAS
8515   */
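
   /* The XADD semantics common to all three cases (sketch):

         tmp = E;  E = E + G;  G = tmp;

      with the flags set as for the ADD. */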
8516
8517   if (epartIsReg(rm)) {
8518      /* case 1 */
8519      assign( tmpd, getIRegE(sz, pfx, rm) );
8520      assign( tmpt0, getIRegG(sz, pfx, rm) );
8521      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8522                           mkexpr(tmpd), mkexpr(tmpt0)) );
8523      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8524      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8525      putIRegE(sz, pfx, rm, mkexpr(tmpt1));
8526      DIP("xadd%c %s, %s\n",
8527          nameISize(sz), nameIRegG(sz,pfx,rm), nameIRegE(sz,pfx,rm));
8528      *decode_ok = True;
8529      return 1+delta0;
8530   }
8531   else if (!epartIsReg(rm) && !haveLOCK(pfx)) {
8532      /* case 2 */
8533      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8534      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
8535      assign( tmpt0, getIRegG(sz, pfx, rm) );
8536      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8537                           mkexpr(tmpd), mkexpr(tmpt0)) );
8538      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8539      storeLE( mkexpr(addr), mkexpr(tmpt1) );
8540      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8541      DIP("xadd%c %s, %s\n",
8542          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
8543      *decode_ok = True;
8544      return len+delta0;
8545   }
8546   else if (!epartIsReg(rm) && haveLOCK(pfx)) {
8547      /* case 3 */
8548      IRTemp addr = disAMode ( &len, vbi, pfx, delta0, dis_buf, 0 );
8549      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
8550      assign( tmpt0, getIRegG(sz, pfx, rm) );
8551      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
8552                           mkexpr(tmpd), mkexpr(tmpt0)) );
8553      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
8554                           mkexpr(tmpt1)/*newVal*/, guest_RIP_curr_instr );
8555      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
8556      putIRegG(sz, pfx, rm, mkexpr(tmpd));
8557      DIP("xadd%c %s, %s\n",
8558          nameISize(sz), nameIRegG(sz,pfx,rm), dis_buf);
8559      *decode_ok = True;
8560      return len+delta0;
8561   }
8562   /*UNREACHED*/
8563   vassert(0);
8564}
8565
8566//.. /* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
8567//..
8568//.. static
8569//.. UInt dis_mov_Ew_Sw ( UChar sorb, Long delta0 )
8570//.. {
8571//..    Int    len;
8572//..    IRTemp addr;
8573//..    UChar  rm  = getUChar(delta0);
8574//..    HChar  dis_buf[50];
8575//..
8576//..    if (epartIsReg(rm)) {
8577//..       putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
8578//..       DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
8579//..       return 1+delta0;
8580//..    } else {
8581//..       addr = disAMode ( &len, sorb, delta0, dis_buf );
8582//..       putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
8583//..       DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
8584//..       return len+delta0;
8585//..    }
8586//.. }
8587//..
8588//.. /* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
8589//..    dst is ireg and sz==4, zero out top half of it.  */
8590//..
8591//.. static
8592//.. UInt dis_mov_Sw_Ew ( UChar sorb,
8593//..                      Int   sz,
8594//..                      UInt  delta0 )
8595//.. {
8596//..    Int    len;
8597//..    IRTemp addr;
8598//..    UChar  rm  = getUChar(delta0);
8599//..    HChar  dis_buf[50];
8600//..
8601//..    vassert(sz == 2 || sz == 4);
8602//..
8603//..    if (epartIsReg(rm)) {
8604//..       if (sz == 4)
8605//..          putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
8606//..       else
8607//..          putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
8608//..
8609//..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
8610//..       return 1+delta0;
8611//..    } else {
8612//..       addr = disAMode ( &len, sorb, delta0, dis_buf );
8613//..       storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
8614//..       DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
8615//..       return len+delta0;
8616//..    }
8617//.. }
8618//..
8619//..
8620//.. static
8621//.. void dis_push_segreg ( UInt sreg, Int sz )
8622//.. {
8623//..     IRTemp t1 = newTemp(Ity_I16);
8624//..     IRTemp ta = newTemp(Ity_I32);
8625//..     vassert(sz == 2 || sz == 4);
8626//..
8627//..     assign( t1, getSReg(sreg) );
8628//..     assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
8629//..     putIReg(4, R_ESP, mkexpr(ta));
8630//..     storeLE( mkexpr(ta), mkexpr(t1) );
8631//..
8632//..     DIP("pushw %s\n", nameSReg(sreg));
8633//.. }
8634//..
8635//.. static
8636//.. void dis_pop_segreg ( UInt sreg, Int sz )
8637//.. {
8638//..     IRTemp t1 = newTemp(Ity_I16);
8639//..     IRTemp ta = newTemp(Ity_I32);
8640//..     vassert(sz == 2 || sz == 4);
8641//..
8642//..     assign( ta, getIReg(4, R_ESP) );
8643//..     assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
8644//..
8645//..     putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
8646//..     putSReg( sreg, mkexpr(t1) );
8647//..     DIP("pop %s\n", nameSReg(sreg));
8648//.. }
8649
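/* RET, optionally with an immediate (d64): pop the return address,
   discard a further d64 bytes of stack (the imm16 of "ret imm16", or
   zero for a plain ret), and jump to the popped address. */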
8650static
8651void dis_ret ( /*MOD*/DisResult* dres, VexAbiInfo* vbi, ULong d64 )
8652{
8653   IRTemp t1 = newTemp(Ity_I64);
8654   IRTemp t2 = newTemp(Ity_I64);
8655   IRTemp t3 = newTemp(Ity_I64);
8656   assign(t1, getIReg64(R_RSP));
8657   assign(t2, loadLE(Ity_I64,mkexpr(t1)));
8658   assign(t3, binop(Iop_Add64, mkexpr(t1), mkU64(8+d64)));
8659   putIReg64(R_RSP, mkexpr(t3));
8660   make_redzone_AbiHint(vbi, t3, t2/*nia*/, "ret");
8661   jmp_treg(dres, Ijk_Ret, t2);
8662   vassert(dres->whatNext == Dis_StopHere);
8663}
8664
8665
8666/*------------------------------------------------------------*/
8667/*--- SSE/SSE2/SSE3 helpers                                ---*/
8668/*------------------------------------------------------------*/
8669
8670/* Indicates whether the op requires a rounding-mode argument.  Note
8671   that this covers only vector floating point arithmetic ops, and
8672   omits the scalar ones that need rounding modes.  Note also that
8673   inconsistencies here will get picked up later by the IR sanity
8674   checker, so this isn't correctness-critical. */
8675static Bool requiresRMode ( IROp op )
8676{
8677   switch (op) {
8678      /* 128 bit ops */
8679      case Iop_Add32Fx4: case Iop_Sub32Fx4:
8680      case Iop_Mul32Fx4: case Iop_Div32Fx4:
8681      case Iop_Add64Fx2: case Iop_Sub64Fx2:
8682      case Iop_Mul64Fx2: case Iop_Div64Fx2:
8683      /* 256 bit ops */
8684      case Iop_Add32Fx8: case Iop_Sub32Fx8:
8685      case Iop_Mul32Fx8: case Iop_Div32Fx8:
8686      case Iop_Add64Fx4: case Iop_Sub64Fx4:
8687      case Iop_Mul64Fx4: case Iop_Div64Fx4:
8688         return True;
8689      default:
8690         break;
8691   }
8692   return False;
8693}
8694
8695
8696/* Worker function; do not call directly.
8697   Handles full width G = G `op` E   and   G = (not G) `op` E.
8698*/
8699
8700static ULong dis_SSE_E_to_G_all_wrk (
8701                VexAbiInfo* vbi,
8702                Prefix pfx, Long delta,
8703                const HChar* opname, IROp op,
8704                Bool   invertG
8705             )
8706{
8707   HChar   dis_buf[50];
8708   Int     alen;
8709   IRTemp  addr;
8710   UChar   rm = getUChar(delta);
8711   Bool    needsRMode = requiresRMode(op);
8712   IRExpr* gpart
8713      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRexRM(pfx,rm)))
8714                : getXMMReg(gregOfRexRM(pfx,rm));
8715   if (epartIsReg(rm)) {
8716      putXMMReg(
8717         gregOfRexRM(pfx,rm),
8718         needsRMode
8719            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
8720                        gpart,
8721                        getXMMReg(eregOfRexRM(pfx,rm)))
8722            : binop(op, gpart,
8723                        getXMMReg(eregOfRexRM(pfx,rm)))
8724      );
8725      DIP("%s %s,%s\n", opname,
8726                        nameXMMReg(eregOfRexRM(pfx,rm)),
8727                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8728      return delta+1;
8729   } else {
8730      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8731      putXMMReg(
8732         gregOfRexRM(pfx,rm),
8733         needsRMode
8734            ? triop(op, get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
8735                        gpart,
8736                        loadLE(Ity_V128, mkexpr(addr)))
8737            : binop(op, gpart,
8738                        loadLE(Ity_V128, mkexpr(addr)))
8739      );
8740      DIP("%s %s,%s\n", opname,
8741                        dis_buf,
8742                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8743      return delta+alen;
8744   }
8745}
8746
8747
8748/* All lanes SSE binary operation, G = G `op` E. */
8749
8750static
8751ULong dis_SSE_E_to_G_all ( VexAbiInfo* vbi,
8752                           Prefix pfx, Long delta,
8753                           const HChar* opname, IROp op )
8754{
8755   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, False );
8756}
8757
8758/* All lanes SSE binary operation, G = (not G) `op` E. */
8759
8760static
8761ULong dis_SSE_E_to_G_all_invG ( VexAbiInfo* vbi,
8762                                Prefix pfx, Long delta,
8763                                const HChar* opname, IROp op )
8764{
8765   return dis_SSE_E_to_G_all_wrk( vbi, pfx, delta, opname, op, True );
8766}
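
/* An illustrative call site, as it might appear in the main decoder
   (sketch only -- opcode dispatch elided):

      case 0x58:
         // 0F 58 = ADDPS -- add 32Fx4 from E to G
         delta = dis_SSE_E_to_G_all( vbi, pfx, delta,
                                     "addps", Iop_Add32Fx4 );
         break;
*/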
8767
8768
8769/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
8770
8771static ULong dis_SSE_E_to_G_lo32 ( VexAbiInfo* vbi,
8772                                   Prefix pfx, Long delta,
8773                                   const HChar* opname, IROp op )
8774{
8775   HChar   dis_buf[50];
8776   Int     alen;
8777   IRTemp  addr;
8778   UChar   rm = getUChar(delta);
8779   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
8780   if (epartIsReg(rm)) {
8781      putXMMReg( gregOfRexRM(pfx,rm),
8782                 binop(op, gpart,
8783                           getXMMReg(eregOfRexRM(pfx,rm))) );
8784      DIP("%s %s,%s\n", opname,
8785                        nameXMMReg(eregOfRexRM(pfx,rm)),
8786                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8787      return delta+1;
8788   } else {
8789      /* We can only do a 32-bit memory read, so the upper 3/4 of the
8790         E operand needs to be made simply of zeroes. */
8791      IRTemp epart = newTemp(Ity_V128);
8792      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8793      assign( epart, unop( Iop_32UtoV128,
8794                           loadLE(Ity_I32, mkexpr(addr))) );
8795      putXMMReg( gregOfRexRM(pfx,rm),
8796                 binop(op, gpart, mkexpr(epart)) );
8797      DIP("%s %s,%s\n", opname,
8798                        dis_buf,
8799                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8800      return delta+alen;
8801   }
8802}
8803
8804
8805/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
8806
8807static ULong dis_SSE_E_to_G_lo64 ( VexAbiInfo* vbi,
8808                                   Prefix pfx, Long delta,
8809                                   const HChar* opname, IROp op )
8810{
8811   HChar   dis_buf[50];
8812   Int     alen;
8813   IRTemp  addr;
8814   UChar   rm = getUChar(delta);
8815   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
8816   if (epartIsReg(rm)) {
8817      putXMMReg( gregOfRexRM(pfx,rm),
8818                 binop(op, gpart,
8819                           getXMMReg(eregOfRexRM(pfx,rm))) );
8820      DIP("%s %s,%s\n", opname,
8821                        nameXMMReg(eregOfRexRM(pfx,rm)),
8822                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8823      return delta+1;
8824   } else {
8825      /* We can only do a 64-bit memory read, so the upper half of the
8826         E operand needs to be made simply of zeroes. */
8827      IRTemp epart = newTemp(Ity_V128);
8828      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8829      assign( epart, unop( Iop_64UtoV128,
8830                           loadLE(Ity_I64, mkexpr(addr))) );
8831      putXMMReg( gregOfRexRM(pfx,rm),
8832                 binop(op, gpart, mkexpr(epart)) );
8833      DIP("%s %s,%s\n", opname,
8834                        dis_buf,
8835                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8836      return delta+alen;
8837   }
8838}
8839
8840
8841/* All lanes unary SSE operation, G = op(E). */
8842
8843static ULong dis_SSE_E_to_G_unary_all (
8844                VexAbiInfo* vbi,
8845                Prefix pfx, Long delta,
8846                const HChar* opname, IROp op
8847             )
8848{
8849   HChar   dis_buf[50];
8850   Int     alen;
8851   IRTemp  addr;
8852   UChar   rm = getUChar(delta);
8853   if (epartIsReg(rm)) {
8854      putXMMReg( gregOfRexRM(pfx,rm),
8855                 unop(op, getXMMReg(eregOfRexRM(pfx,rm))) );
8856      DIP("%s %s,%s\n", opname,
8857                        nameXMMReg(eregOfRexRM(pfx,rm)),
8858                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8859      return delta+1;
8860   } else {
8861      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8862      putXMMReg( gregOfRexRM(pfx,rm),
8863                 unop(op, loadLE(Ity_V128, mkexpr(addr))) );
8864      DIP("%s %s,%s\n", opname,
8865                        dis_buf,
8866                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8867      return delta+alen;
8868   }
8869}
8870
8871
8872/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
8873
8874static ULong dis_SSE_E_to_G_unary_lo32 (
8875                VexAbiInfo* vbi,
8876                Prefix pfx, Long delta,
8877                const HChar* opname, IROp op
8878             )
8879{
8880   /* First we need to get the old G value and patch the low 32 bits
8881      of the E operand into it.  Then apply op and write back to G. */
8882   HChar   dis_buf[50];
8883   Int     alen;
8884   IRTemp  addr;
8885   UChar   rm = getUChar(delta);
8886   IRTemp  oldG0 = newTemp(Ity_V128);
8887   IRTemp  oldG1 = newTemp(Ity_V128);
8888
8889   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
8890
8891   if (epartIsReg(rm)) {
8892      assign( oldG1,
8893              binop( Iop_SetV128lo32,
8894                     mkexpr(oldG0),
8895                     getXMMRegLane32(eregOfRexRM(pfx,rm), 0)) );
8896      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
8897      DIP("%s %s,%s\n", opname,
8898                        nameXMMReg(eregOfRexRM(pfx,rm)),
8899                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8900      return delta+1;
8901   } else {
8902      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8903      assign( oldG1,
8904              binop( Iop_SetV128lo32,
8905                     mkexpr(oldG0),
8906                     loadLE(Ity_I32, mkexpr(addr)) ));
8907      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
8908      DIP("%s %s,%s\n", opname,
8909                        dis_buf,
8910                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8911      return delta+alen;
8912   }
8913}
8914
8915
8916/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
8917
8918static ULong dis_SSE_E_to_G_unary_lo64 (
8919                VexAbiInfo* vbi,
8920                Prefix pfx, Long delta,
8921                const HChar* opname, IROp op
8922             )
8923{
8924   /* First we need to get the old G value and patch the low 64 bits
8925      of the E operand into it.  Then apply op and write back to G. */
8926   HChar   dis_buf[50];
8927   Int     alen;
8928   IRTemp  addr;
8929   UChar   rm = getUChar(delta);
8930   IRTemp  oldG0 = newTemp(Ity_V128);
8931   IRTemp  oldG1 = newTemp(Ity_V128);
8932
8933   assign( oldG0, getXMMReg(gregOfRexRM(pfx,rm)) );
8934
8935   if (epartIsReg(rm)) {
8936      assign( oldG1,
8937              binop( Iop_SetV128lo64,
8938                     mkexpr(oldG0),
8939                     getXMMRegLane64(eregOfRexRM(pfx,rm), 0)) );
8940      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
8941      DIP("%s %s,%s\n", opname,
8942                        nameXMMReg(eregOfRexRM(pfx,rm)),
8943                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8944      return delta+1;
8945   } else {
8946      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8947      assign( oldG1,
8948              binop( Iop_SetV128lo64,
8949                     mkexpr(oldG0),
8950                     loadLE(Ity_I64, mkexpr(addr)) ));
8951      putXMMReg( gregOfRexRM(pfx,rm), unop(op, mkexpr(oldG1)) );
8952      DIP("%s %s,%s\n", opname,
8953                        dis_buf,
8954                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8955      return delta+alen;
8956   }
8957}
8958
8959
8960/* SSE integer binary operation:
8961      G = G `op` E   (eLeft == False)
8962      G = E `op` G   (eLeft == True)
8963*/
8964static ULong dis_SSEint_E_to_G(
8965                VexAbiInfo* vbi,
8966                Prefix pfx, Long delta,
8967                const HChar* opname, IROp op,
8968                Bool   eLeft
8969             )
8970{
8971   HChar   dis_buf[50];
8972   Int     alen;
8973   IRTemp  addr;
8974   UChar   rm = getUChar(delta);
8975   IRExpr* gpart = getXMMReg(gregOfRexRM(pfx,rm));
8976   IRExpr* epart = NULL;
8977   if (epartIsReg(rm)) {
8978      epart = getXMMReg(eregOfRexRM(pfx,rm));
8979      DIP("%s %s,%s\n", opname,
8980                        nameXMMReg(eregOfRexRM(pfx,rm)),
8981                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8982      delta += 1;
8983   } else {
8984      addr  = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
8985      epart = loadLE(Ity_V128, mkexpr(addr));
8986      DIP("%s %s,%s\n", opname,
8987                        dis_buf,
8988                        nameXMMReg(gregOfRexRM(pfx,rm)) );
8989      delta += alen;
8990   }
8991   putXMMReg( gregOfRexRM(pfx,rm),
8992              eLeft ? binop(op, epart, gpart)
8993                    : binop(op, gpart, epart) );
8994   return delta;
8995}
8996
8997
8998/* Helper for doing SSE FP comparisons.  False return ==> unhandled.
8999   This is all a bit of a kludge in that it ignores the subtleties of
9000   ordered-vs-unordered and signalling-vs-nonsignalling in the Intel
9001   spec. */
9002static Bool findSSECmpOp ( /*OUT*/Bool* preSwapP,
9003                           /*OUT*/IROp* opP,
9004                           /*OUT*/Bool* postNotP,
9005                           UInt imm8, Bool all_lanes, Int sz )
9006{
9007   if (imm8 >= 32) return False;
9008
9009   /* First, compute a (preSwap, op, postNot) triple from
9010      the supplied imm8. */
9011   Bool pre = False;
9012   IROp op  = Iop_INVALID;
9013   Bool not = False;
9014
9015#  define XXX(_pre, _op, _not) { pre = _pre; op = _op; not = _not; }
9016   // If you add a case here, add a corresponding test for both VCMPSD_128
9017   // and VCMPSS_128 in avx-1.c.
9018   switch (imm8) {
9019      // "O" = ordered, "U" = unordered
9020      // "Q" = non-signalling (quiet), "S" = signalling
9021      //
9022      //             swap operands?
9023      //             |
9024      //             |      cmp op          invert after?
9025      //             |      |               |
9026      //             v      v               v
9027      case 0x0:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_OQ
9028      case 0x1:  XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OS
9029      case 0x2:  XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OS
9030      case 0x3:  XXX(False, Iop_CmpUN32Fx4, False); break; // UNORD_Q
9031      case 0x4:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_UQ
9032      case 0x5:  XXX(False, Iop_CmpLT32Fx4, True);  break; // NLT_US
9033      case 0x6:  XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_US
9034      case 0x7:  XXX(False, Iop_CmpUN32Fx4, True);  break; // ORD_Q
9035      case 0x8:  XXX(False, Iop_CmpEQ32Fx4, False); break; // EQ_UQ
9036      case 0x9:  XXX(True,  Iop_CmpLE32Fx4, True);  break; // NGE_US
      /* "Enhanced Comparison Predicate[s] for VEX-Encoded [insns]" */
9038      case 0xA:  XXX(True,  Iop_CmpLT32Fx4, True);  break; // NGT_US
9039      // 0xB  FALSE_OQ
9040      // 0xC: this isn't really right because it returns all-1s when
9041      // either operand is a NaN, and it should return all-0s.
9042      case 0xC:  XXX(False, Iop_CmpEQ32Fx4, True);  break; // NEQ_OQ
9043      case 0xD:  XXX(True,  Iop_CmpLE32Fx4, False); break; // GE_OS
9044      case 0xE:  XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OS
9045      // 0xF  TRUE_UQ
9046      // 0x10  EQ_OS
9047      case 0x11: XXX(False, Iop_CmpLT32Fx4, False); break; // LT_OQ
9048      case 0x12: XXX(False, Iop_CmpLE32Fx4, False); break; // LE_OQ
9049      // 0x13  UNORD_S
9050      // 0x14  NEQ_US
9051      // 0x15  NLT_UQ
9052      case 0x16: XXX(False, Iop_CmpLE32Fx4, True);  break; // NLE_UQ
9053      // 0x17  ORD_S
9054      // 0x18  EQ_US
9055      // 0x19  NGE_UQ
9056      // 0x1A  NGT_UQ
9057      // 0x1B  FALSE_OS
9058      // 0x1C  NEQ_OS
9059      // 0x1D  GE_OQ
9060      case 0x1E: XXX(True,  Iop_CmpLT32Fx4, False); break; // GT_OQ
9061      // 0x1F  TRUE_US
9062      /* Don't forget to add test cases to VCMPSS_128_<imm8> in
9063         avx-1.c if new cases turn up. */
9064      default: break;
9065   }
9066#  undef XXX
9067   if (op == Iop_INVALID) return False;
9068
9069   /* Now convert the op into one with the same arithmetic but that is
9070      correct for the width and laneage requirements. */
9071
9072   /**/ if (sz == 4 && all_lanes) {
9073      switch (op) {
9074         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32Fx4; break;
9075         case Iop_CmpLT32Fx4: op = Iop_CmpLT32Fx4; break;
9076         case Iop_CmpLE32Fx4: op = Iop_CmpLE32Fx4; break;
9077         case Iop_CmpUN32Fx4: op = Iop_CmpUN32Fx4; break;
9078         default: vassert(0);
9079      }
9080   }
9081   else if (sz == 4 && !all_lanes) {
9082      switch (op) {
9083         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ32F0x4; break;
9084         case Iop_CmpLT32Fx4: op = Iop_CmpLT32F0x4; break;
9085         case Iop_CmpLE32Fx4: op = Iop_CmpLE32F0x4; break;
9086         case Iop_CmpUN32Fx4: op = Iop_CmpUN32F0x4; break;
9087         default: vassert(0);
9088      }
9089   }
9090   else if (sz == 8 && all_lanes) {
9091      switch (op) {
9092         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64Fx2; break;
9093         case Iop_CmpLT32Fx4: op = Iop_CmpLT64Fx2; break;
9094         case Iop_CmpLE32Fx4: op = Iop_CmpLE64Fx2; break;
9095         case Iop_CmpUN32Fx4: op = Iop_CmpUN64Fx2; break;
9096         default: vassert(0);
9097      }
9098   }
9099   else if (sz == 8 && !all_lanes) {
9100      switch (op) {
9101         case Iop_CmpEQ32Fx4: op = Iop_CmpEQ64F0x2; break;
9102         case Iop_CmpLT32Fx4: op = Iop_CmpLT64F0x2; break;
9103         case Iop_CmpLE32Fx4: op = Iop_CmpLE64F0x2; break;
9104         case Iop_CmpUN32Fx4: op = Iop_CmpUN64F0x2; break;
9105         default: vassert(0);
9106      }
9107   }
9108   else {
9109      vpanic("findSSECmpOp(amd64,guest)");
9110   }
9111
9112   *preSwapP = pre; *opP = op; *postNotP = not;
9113   return True;
9114}
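
/* Examples of the translation (from the table above): imm8 = 0xD
   (GE_OS) gives (preSwap=True, Iop_CmpLE*, postNot=False), since
   a >= b is evaluated as b <= a; imm8 = 0x4 (NEQ_UQ) gives
   (preSwap=False, Iop_CmpEQ*, postNot=True), i.e. EQ inverted
   afterwards. */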


/* Handles SSE 32F/64F comparisons.  It can fail, in which case it
   returns the original delta to indicate failure. */

static Long dis_SSE_cmp_E_to_G ( VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 const HChar* opname, Bool all_lanes, Int sz )
{
   Long    delta0 = delta;
   HChar   dis_buf[50];
   Int     alen;
   UInt    imm8;
   IRTemp  addr;
   Bool    preSwap = False;
   IROp    op      = Iop_INVALID;
   Bool    postNot = False;
   IRTemp  plain   = newTemp(Ity_V128);
   UChar   rm      = getUChar(delta);
   UShort  mask    = 0;
   vassert(sz == 4 || sz == 8);
   if (epartIsReg(rm)) {
      imm8 = getUChar(delta+1);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain, binop(op, getXMMReg(gregOfRexRM(pfx,rm)),
                               getXMMReg(eregOfRexRM(pfx,rm))) );
      delta += 2;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            nameXMMReg(eregOfRexRM(pfx,rm)),
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = getUChar(delta+alen);
      if (imm8 >= 8) return delta0; /* FAIL */
      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
      if (!ok) return delta0; /* FAIL */
      vassert(!preSwap); /* never needed for imm8 < 8 */
      assign( plain,
              binop(
                 op,
                 getXMMReg(gregOfRexRM(pfx,rm)),
                   all_lanes
                      ? loadLE(Ity_V128, mkexpr(addr))
                   : sz == 8
                      ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
                   : /*sz==4*/
                      unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
              )
      );
      delta += alen+1;
      DIP("%s $%d,%s,%s\n", opname,
                            (Int)imm8,
                            dis_buf,
                            nameXMMReg(gregOfRexRM(pfx,rm)) );
   }

   if (postNot && all_lanes) {
      putXMMReg( gregOfRexRM(pfx,rm),
                 unop(Iop_NotV128, mkexpr(plain)) );
   }
   else
   if (postNot && !all_lanes) {
      mask = toUShort(sz==4 ? 0x000F : 0x00FF);
      putXMMReg( gregOfRexRM(pfx,rm),
                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
   }
   else {
      putXMMReg( gregOfRexRM(pfx,rm), mkexpr(plain) );
   }

   return delta;
}
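
/* Note on the postNot cases above: mkV128 takes a 16-bit mask with
   one bit per byte lane, so 0x000F selects the low 4 bytes and
   0x00FF the low 8.  For the scalar ("low lane only") compares a
   full Iop_NotV128 would corrupt the pass-through upper lanes, so
   only the low 32 or 64 bits are inverted, via XOR with the mask. */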


/* Vector by scalar shift of G by the amount specified at the bottom
   of E. */

static ULong dis_SSE_shiftG_byE ( VexAbiInfo* vbi,
                                  Prefix pfx, Long delta,
                                  const HChar* opname, IROp op )
{
   HChar   dis_buf[50];
   Int     alen, size;
   IRTemp  addr;
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  g0   = newTemp(Ity_V128);
   IRTemp  g1   = newTemp(Ity_V128);
   IRTemp  amt  = newTemp(Ity_I64);
   IRTemp  amt8 = newTemp(Ity_I8);
   if (epartIsReg(rm)) {
      assign( amt, getXMMRegLane64(eregOfRexRM(pfx,rm), 0) );
      DIP("%s %s,%s\n", opname,
                        nameXMMReg(eregOfRexRM(pfx,rm)),
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      delta++;
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
      DIP("%s %s,%s\n", opname,
                        dis_buf,
                        nameXMMReg(gregOfRexRM(pfx,rm)) );
      delta += alen;
   }
   assign( g0,   getXMMReg(gregOfRexRM(pfx,rm)) );
   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );

   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
      case Iop_ShlN32x4: shl = True; size = 32; break;
      case Iop_ShlN64x2: shl = True; size = 64; break;
      case Iop_SarN16x8: sar = True; size = 16; break;
      case Iop_SarN32x4: sar = True; size = 32; break;
      case Iop_ShrN16x8: shr = True; size = 16; break;
      case Iop_ShrN32x4: shr = True; size = 32; break;
      case Iop_ShrN64x2: shr = True; size = 64; break;
      default: vassert(0);
   }

   if (shl || shr) {
     assign(
        g1,
        IRExpr_ITE(
           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
           binop(op, mkexpr(g0), mkexpr(amt8)),
           mkV128(0x0000)
        )
     );
   } else
   if (sar) {
     assign(
        g1,
        IRExpr_ITE(
           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
           binop(op, mkexpr(g0), mkexpr(amt8)),
           binop(op, mkexpr(g0), mkU8(size-1))
        )
     );
   } else {
      vassert(0);
   }

   putXMMReg( gregOfRexRM(pfx,rm), mkexpr(g1) );
   return delta;
}
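
/* Worked example of the saturate-at-size behaviour above: for
   "psrlw %xmm2,%xmm1" with the low 64 bits of %xmm2 holding 17,
   17 >= 16 so every lane becomes zero; for "psraw" with the same
   amount, each lane is instead shifted by size-1 (15), replicating
   its sign bit -- which is how the hardware behaves for oversize
   arithmetic shifts. */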


/* Vector by scalar shift of E by an immediate byte. */

static
ULong dis_SSE_shiftE_imm ( Prefix pfx,
                           Long delta, const HChar* opname, IROp op )
{
   Bool    shl, shr, sar;
   UChar   rm   = getUChar(delta);
   IRTemp  e0   = newTemp(Ity_V128);
   IRTemp  e1   = newTemp(Ity_V128);
   UChar   amt, size;
   vassert(epartIsReg(rm));
   vassert(gregLO3ofRM(rm) == 2
           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
   amt = getUChar(delta+1);
   delta += 2;
   DIP("%s $%d,%s\n", opname,
                      (Int)amt,
                      nameXMMReg(eregOfRexRM(pfx,rm)) );
   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );

   shl = shr = sar = False;
   size = 0;
   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
      case Iop_ShlN32x4: shl = True; size = 32; break;
      case Iop_ShlN64x2: shl = True; size = 64; break;
      case Iop_SarN16x8: sar = True; size = 16; break;
      case Iop_SarN32x4: sar = True; size = 32; break;
      case Iop_ShrN16x8: shr = True; size = 16; break;
      case Iop_ShrN32x4: shr = True; size = 32; break;
      case Iop_ShrN64x2: shr = True; size = 64; break;
      default: vassert(0);
   }

   if (shl || shr) {
     assign( e1, amt >= size
                    ? mkV128(0x0000)
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else
   if (sar) {
     assign( e1, amt >= size
                    ? binop(op, mkexpr(e0), mkU8(size-1))
                    : binop(op, mkexpr(e0), mkU8(amt))
     );
   } else {
      vassert(0);
   }

   putXMMReg( eregOfRexRM(pfx,rm), mkexpr(e1) );
   return delta;
}


/* Get the current SSE rounding mode. */

static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
{
   return
      unop( Iop_64to32,
            binop( Iop_And64,
                   IRExpr_Get( OFFB_SSEROUND, Ity_I64 ),
                   mkU64(3) ));
}
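
/* A note on the encoding assumed by get/put_sse_roundingmode: the
   guest SSEROUND field is taken to hold an IRRoundingMode value in
   its low two bits -- Irrm_NEAREST (0), Irrm_NegINF (1), Irrm_PosINF
   (2), Irrm_ZERO (3) -- hence the AND with 3 above. */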

static void put_sse_roundingmode ( IRExpr* sseround )
{
   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
   stmt( IRStmt_Put( OFFB_SSEROUND,
                     unop(Iop_32Uto64,sseround) ) );
}

/* Break a V128-bit value up into four 32-bit ints. */

static void breakupV128to32s ( IRTemp t128,
                               /*OUTs*/
                               IRTemp* t3, IRTemp* t2,
                               IRTemp* t1, IRTemp* t0 )
{
   IRTemp hi64 = newTemp(Ity_I64);
   IRTemp lo64 = newTemp(Ity_I64);
   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );

   vassert(t0 && *t0 == IRTemp_INVALID);
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);

   *t0 = newTemp(Ity_I32);
   *t1 = newTemp(Ity_I32);
   *t2 = newTemp(Ity_I32);
   *t3 = newTemp(Ity_I32);
   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
}
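
/* For example, given t128 = 0xAAAAAAAA_BBBBBBBB_CCCCCCCC_DDDDDDDD,
   the assignments above produce *t3 = bits 127:96 (0xAAAAAAAA),
   *t2 = 0xBBBBBBBB, *t1 = 0xCCCCCCCC and *t0 = bits 31:0
   (0xDDDDDDDD). */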

/* Construct a V128-bit value from four 32-bit ints. */

static IRExpr* mkV128from32s ( IRTemp t3, IRTemp t2,
                               IRTemp t1, IRTemp t0 )
{
   return
      binop( Iop_64HLtoV128,
             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
   );
}

/* Break a 64-bit value up into four 16-bit ints. */

static void breakup64to16s ( IRTemp t64,
                             /*OUTs*/
                             IRTemp* t3, IRTemp* t2,
                             IRTemp* t1, IRTemp* t0 )
{
   IRTemp hi32 = newTemp(Ity_I32);
   IRTemp lo32 = newTemp(Ity_I32);
   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );

   vassert(t0 && *t0 == IRTemp_INVALID);
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);

   *t0 = newTemp(Ity_I16);
   *t1 = newTemp(Ity_I16);
   *t2 = newTemp(Ity_I16);
   *t3 = newTemp(Ity_I16);
   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
}

/* Construct a 64-bit value from four 16-bit ints. */

static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
                             IRTemp t1, IRTemp t0 )
{
   return
      binop( Iop_32HLto64,
             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
   );
}

/* Break a V256-bit value up into four 64-bit ints. */

static void breakupV256to64s ( IRTemp t256,
                               /*OUTs*/
                               IRTemp* t3, IRTemp* t2,
                               IRTemp* t1, IRTemp* t0 )
{
   vassert(t0 && *t0 == IRTemp_INVALID);
   vassert(t1 && *t1 == IRTemp_INVALID);
   vassert(t2 && *t2 == IRTemp_INVALID);
   vassert(t3 && *t3 == IRTemp_INVALID);
   *t0 = newTemp(Ity_I64);
   *t1 = newTemp(Ity_I64);
   *t2 = newTemp(Ity_I64);
   *t3 = newTemp(Ity_I64);
   assign( *t0, unop(Iop_V256to64_0, mkexpr(t256)) );
   assign( *t1, unop(Iop_V256to64_1, mkexpr(t256)) );
   assign( *t2, unop(Iop_V256to64_2, mkexpr(t256)) );
   assign( *t3, unop(Iop_V256to64_3, mkexpr(t256)) );
}

/* Break a V256-bit value up into two V128s. */

static void breakupV256toV128s ( IRTemp t256,
                                 /*OUTs*/
                                 IRTemp* t1, IRTemp* t0 )
{
   vassert(t0 && *t0 == IRTemp_INVALID);
   vassert(t1 && *t1 == IRTemp_INVALID);
   *t0 = newTemp(Ity_V128);
   *t1 = newTemp(Ity_V128);
   assign(*t1, unop(Iop_V256toV128_1, mkexpr(t256)));
   assign(*t0, unop(Iop_V256toV128_0, mkexpr(t256)));
}

/* Break a V256-bit value up into eight 32-bit ints.  */

static void breakupV256to32s ( IRTemp t256,
                               /*OUTs*/
                               IRTemp* t7, IRTemp* t6,
                               IRTemp* t5, IRTemp* t4,
                               IRTemp* t3, IRTemp* t2,
                               IRTemp* t1, IRTemp* t0 )
{
   IRTemp t128_1 = IRTemp_INVALID;
   IRTemp t128_0 = IRTemp_INVALID;
   breakupV256toV128s( t256, &t128_1, &t128_0 );
   breakupV128to32s( t128_1, t7, t6, t5, t4 );
   breakupV128to32s( t128_0, t3, t2, t1, t0 );
}

/* Break a V128-bit value up into two 64-bit ints. */

static void breakupV128to64s ( IRTemp t128,
                               /*OUTs*/
                               IRTemp* t1, IRTemp* t0 )
{
   vassert(t0 && *t0 == IRTemp_INVALID);
   vassert(t1 && *t1 == IRTemp_INVALID);
   *t0 = newTemp(Ity_I64);
   *t1 = newTemp(Ity_I64);
   assign( *t0, unop(Iop_V128to64,   mkexpr(t128)) );
   assign( *t1, unop(Iop_V128HIto64, mkexpr(t128)) );
}

/* Construct a V256-bit value from eight 32-bit ints. */

static IRExpr* mkV256from32s ( IRTemp t7, IRTemp t6,
                               IRTemp t5, IRTemp t4,
                               IRTemp t3, IRTemp t2,
                               IRTemp t1, IRTemp t0 )
{
   return
      binop( Iop_V128HLtoV256,
             binop( Iop_64HLtoV128,
                    binop(Iop_32HLto64, mkexpr(t7), mkexpr(t6)),
                    binop(Iop_32HLto64, mkexpr(t5), mkexpr(t4)) ),
             binop( Iop_64HLtoV128,
                    binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
                    binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0)) )
   );
}

/* Construct a V256-bit value from four 64-bit ints. */

static IRExpr* mkV256from64s ( IRTemp t3, IRTemp t2,
                               IRTemp t1, IRTemp t0 )
{
   return
      binop( Iop_V128HLtoV256,
             binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)),
             binop(Iop_64HLtoV128, mkexpr(t1), mkexpr(t0))
   );
}

/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
   values (aa,bb), computes, for each of the 4 16-bit lanes:

   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
*/
static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
{
   IRTemp aa      = newTemp(Ity_I64);
   IRTemp bb      = newTemp(Ity_I64);
   IRTemp aahi32s = newTemp(Ity_I64);
   IRTemp aalo32s = newTemp(Ity_I64);
   IRTemp bbhi32s = newTemp(Ity_I64);
   IRTemp bblo32s = newTemp(Ity_I64);
   IRTemp rHi     = newTemp(Ity_I64);
   IRTemp rLo     = newTemp(Ity_I64);
   IRTemp one32x2 = newTemp(Ity_I64);
   assign(aa, aax);
   assign(bb, bbx);
   assign( aahi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( aalo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
                 mkU8(16) ));
   assign( bbhi32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign( bblo32s,
           binop(Iop_SarN32x2,
                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
                 mkU8(16) ));
   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
   assign(
      rHi,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   assign(
      rLo,
      binop(
         Iop_ShrN32x2,
         binop(
            Iop_Add32x2,
            binop(
               Iop_ShrN32x2,
               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
               mkU8(14)
            ),
            mkexpr(one32x2)
         ),
         mkU8(1)
      )
   );
   return
      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
}
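
/* Worked example, in Q15 fixed point: with aa_lane = bb_lane =
   0x4000 (0.5), the product is 0x10000000; >>u 14 gives 0x4000,
   +1 gives 0x4001, and the final >>u 1 gives 0x2000 (0.25), which
   is the correctly rounded 0.5 * 0.5. */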

/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
   values (aa,bb), computes, for each lane:

          if aa_lane < 0 then - bb_lane
     else if aa_lane > 0 then bb_lane
     else 0
*/
static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
{
   IRTemp aa       = newTemp(Ity_I64);
   IRTemp bb       = newTemp(Ity_I64);
   IRTemp zero     = newTemp(Ity_I64);
   IRTemp bbNeg    = newTemp(Ity_I64);
   IRTemp negMask  = newTemp(Ity_I64);
   IRTemp posMask  = newTemp(Ity_I64);
   IROp   opSub    = Iop_INVALID;
   IROp   opCmpGTS = Iop_INVALID;

   switch (laneszB) {
      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
      default: vassert(0);
   }

   assign( aa,      aax );
   assign( bb,      bbx );
   assign( zero,    mkU64(0) );
   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );

   return
      binop(Iop_Or64,
            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );

}
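
/* For example, with 16-bit lanes, aa_lane = -3 and bb_lane = 5:
   negMask is all-1s and posMask all-0s in that lane, so the result
   lane is bbNeg = -5.  An aa_lane of 0 leaves both masks clear and
   yields 0, as required. */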


/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
   value aa, computes, for each lane

   if aa < 0 then -aa else aa

   Note that the result is interpreted as unsigned, so that the
   absolute value of the most negative signed input can be
   represented.
*/
static IRTemp math_PABS_MMX ( IRTemp aa, Int laneszB )
{
   IRTemp res     = newTemp(Ity_I64);
   IRTemp zero    = newTemp(Ity_I64);
   IRTemp aaNeg   = newTemp(Ity_I64);
   IRTemp negMask = newTemp(Ity_I64);
   IRTemp posMask = newTemp(Ity_I64);
   IROp   opSub   = Iop_INVALID;
   IROp   opSarN  = Iop_INVALID;

   switch (laneszB) {
      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
      default: vassert(0);
   }

   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
   assign( zero,    mkU64(0) );
   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
   assign( res,
           binop(Iop_Or64,
                 binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
                 binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) ));
   return res;
}
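
/* The sign-mask trick above: opSarN by (lane-bits - 1) replicates
   each lane's sign bit across the lane, so negMask is all-1s exactly
   in the negative lanes.  E.g. for laneszB == 1, an input lane of
   0x80 (-128) selects aaNeg = 0 - 0x80 = 0x80, i.e. 128 when read
   as unsigned. */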

/* XMM version of math_PABS_MMX. */
static IRTemp math_PABS_XMM ( IRTemp aa, Int laneszB )
{
   IRTemp res  = newTemp(Ity_V128);
   IRTemp aaHi = newTemp(Ity_I64);
   IRTemp aaLo = newTemp(Ity_I64);
   assign(aaHi, unop(Iop_V128HIto64, mkexpr(aa)));
   assign(aaLo, unop(Iop_V128to64, mkexpr(aa)));
   assign(res, binop(Iop_64HLtoV128,
                     mkexpr(math_PABS_MMX(aaHi, laneszB)),
                     mkexpr(math_PABS_MMX(aaLo, laneszB))));
   return res;
}

/* Specialisations of math_PABS_XMM, since there's no easy way to do
   partial applications in C :-( */
static IRTemp math_PABS_XMM_pap4 ( IRTemp aa ) {
   return math_PABS_XMM(aa, 4);
}

static IRTemp math_PABS_XMM_pap2 ( IRTemp aa ) {
   return math_PABS_XMM(aa, 2);
}

static IRTemp math_PABS_XMM_pap1 ( IRTemp aa ) {
   return math_PABS_XMM(aa, 1);
}

/* YMM version of math_PABS_XMM. */
static IRTemp math_PABS_YMM ( IRTemp aa, Int laneszB )
{
   IRTemp res  = newTemp(Ity_V256);
   IRTemp aaHi = IRTemp_INVALID;
   IRTemp aaLo = IRTemp_INVALID;
   breakupV256toV128s(aa, &aaHi, &aaLo);
   assign(res, binop(Iop_V128HLtoV256,
                     mkexpr(math_PABS_XMM(aaHi, laneszB)),
                     mkexpr(math_PABS_XMM(aaLo, laneszB))));
   return res;
}

static IRTemp math_PABS_YMM_pap4 ( IRTemp aa ) {
   return math_PABS_YMM(aa, 4);
}

static IRTemp math_PABS_YMM_pap2 ( IRTemp aa ) {
   return math_PABS_YMM(aa, 2);
}

static IRTemp math_PABS_YMM_pap1 ( IRTemp aa ) {
   return math_PABS_YMM(aa, 1);
}

static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
                                        IRTemp lo64, Long byteShift )
{
   vassert(byteShift >= 1 && byteShift <= 7);
   return
      binop(Iop_Or64,
            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
      );
}
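
/* For instance, byteShift == 3 yields (hi64 << 40) | (lo64 >> 24):
   the result's low five bytes are bytes 7..3 of lo64 and its top
   three bytes are bytes 2..0 of hi64 -- a 64-bit window taken 3
   bytes up from the bottom of the hi64:lo64 pair. */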

static IRTemp math_PALIGNR_XMM ( IRTemp sV, IRTemp dV, UInt imm8 )
{
   IRTemp res = newTemp(Ity_V128);
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   IRTemp rHi = newTemp(Ity_I64);
   IRTemp rLo = newTemp(Ity_I64);

   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );

   if (imm8 == 0) {
      assign( rHi, mkexpr(sHi) );
      assign( rLo, mkexpr(sLo) );
   }
   else if (imm8 >= 1 && imm8 <= 7) {
      assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, imm8) );
      assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, imm8) );
   }
   else if (imm8 == 8) {
      assign( rHi, mkexpr(dLo) );
      assign( rLo, mkexpr(sHi) );
   }
   else if (imm8 >= 9 && imm8 <= 15) {
      assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-8) );
      assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, imm8-8) );
   }
   else if (imm8 == 16) {
      assign( rHi, mkexpr(dHi) );
      assign( rLo, mkexpr(dLo) );
   }
   else if (imm8 >= 17 && imm8 <= 23) {
      assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-16))) );
      assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, imm8-16) );
   }
   else if (imm8 == 24) {
      assign( rHi, mkU64(0) );
      assign( rLo, mkexpr(dHi) );
   }
   else if (imm8 >= 25 && imm8 <= 31) {
      assign( rHi, mkU64(0) );
      assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(imm8-24))) );
   }
   else if (imm8 >= 32 && imm8 <= 255) {
      assign( rHi, mkU64(0) );
      assign( rLo, mkU64(0) );
   }
   else
      vassert(0);

   assign( res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
   return res;
}

/* Generate a SIGSEGV followed by a restart of the current instruction
   if effective_addr is not aligned as required: 'mask' is
   alignment-1, so any address with a nonzero value in (addr & mask)
   faults.  16-alignment is required behaviour for some SSE3
   instructions and all 128-bit SSSE3 instructions; the 32-byte
   variant serves the 256-bit cases.  This assumes that
   guest_RIP_curr_instr is set correctly! */
static
void gen_SEGV_if_not_XX_aligned ( IRTemp effective_addr, ULong mask )
{
   stmt(
      IRStmt_Exit(
         binop(Iop_CmpNE64,
               binop(Iop_And64,mkexpr(effective_addr),mkU64(mask)),
               mkU64(0)),
         Ijk_SigSEGV,
         IRConst_U64(guest_RIP_curr_instr),
         OFFB_RIP
      )
   );
}

static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr ) {
   gen_SEGV_if_not_XX_aligned(effective_addr, 16-1);
}

static void gen_SEGV_if_not_32_aligned ( IRTemp effective_addr ) {
   gen_SEGV_if_not_XX_aligned(effective_addr, 32-1);
}

/* Helper for deciding whether a given insn (starting at the opcode
   byte) may validly be used with a LOCK prefix.  The following insns
   may be used with LOCK when their destination operand is in memory.
   AFAICS this is exactly the same for both 32-bit and 64-bit mode.

   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31

   DEC        FE /1,  FF /1
   INC        FE /0,  FF /0

   NEG        F6 /3,  F7 /3
   NOT        F6 /2,  F7 /2

   XCHG       86, 87

   BTC        0F BB,  0F BA /7
   BTR        0F B3,  0F BA /6
   BTS        0F AB,  0F BA /5

   CMPXCHG    0F B0,  0F B1
   CMPXCHG8B  0F C7 /1

   XADD       0F C0,  0F C1

   ------------------------------

   80 /0  =  addb $imm8,  rm8
   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
   82 /0  =  addb $imm8,  rm8
   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16

   00     =  addb r8,  rm8
   01     =  addl r32, rm32  and  addw r16, rm16

   Same for ADD OR ADC SBB AND SUB XOR

   FE /1  = dec rm8
   FF /1  = dec rm32  and  dec rm16

   FE /0  = inc rm8
   FF /0  = inc rm32  and  inc rm16

   F6 /3  = neg rm8
   F7 /3  = neg rm32  and  neg rm16

   F6 /2  = not rm8
   F7 /2  = not rm32  and  not rm16

   0F BB     = btcw r16, rm16    and  btcl r32, rm32
   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32

   Same for BTS, BTR
*/
static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
{
   switch (opc[0]) {
      case 0x00: case 0x01: case 0x08: case 0x09:
      case 0x10: case 0x11: case 0x18: case 0x19:
      case 0x20: case 0x21: case 0x28: case 0x29:
      case 0x30: case 0x31:
         if (!epartIsReg(opc[1]))
            return True;
         break;

      case 0x80: case 0x81: case 0x82: case 0x83:
         if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 6
             && !epartIsReg(opc[1]))
            return True;
         break;

      case 0xFE: case 0xFF:
         if (gregLO3ofRM(opc[1]) >= 0 && gregLO3ofRM(opc[1]) <= 1
             && !epartIsReg(opc[1]))
            return True;
         break;

      case 0xF6: case 0xF7:
         if (gregLO3ofRM(opc[1]) >= 2 && gregLO3ofRM(opc[1]) <= 3
             && !epartIsReg(opc[1]))
            return True;
         break;

      case 0x86: case 0x87:
         if (!epartIsReg(opc[1]))
            return True;
         break;

      case 0x0F: {
         switch (opc[1]) {
            case 0xBB: case 0xB3: case 0xAB:
               if (!epartIsReg(opc[2]))
                  return True;
               break;
            case 0xBA:
               if (gregLO3ofRM(opc[2]) >= 5 && gregLO3ofRM(opc[2]) <= 7
                   && !epartIsReg(opc[2]))
                  return True;
               break;
            case 0xB0: case 0xB1:
               if (!epartIsReg(opc[2]))
                  return True;
               break;
            case 0xC7:
               if (gregLO3ofRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
                  return True;
               break;
            case 0xC0: case 0xC1:
               if (!epartIsReg(opc[2]))
                  return True;
               break;
            default:
               break;
         } /* switch (opc[1]) */
         break;
      }

      default:
         break;
   } /* switch (opc[0]) */

   return False;
}
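
/* For example, "lock incl (%rax)" (F0 FF 00) is accepted: opc[0] is
   0xFF, gregLO3ofRM(opc[1]) is 0 (INC) and the modrm names a memory
   destination.  "lock inc %eax" is rejected, since epartIsReg holds
   for its modrm byte. */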


/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- Top-level SSE/SSE2: dis_ESC_0F__SSE2                 ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

static Long dis_COMISD ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   vassert(opc == 0x2F/*COMISD*/ || opc == 0x2E/*UCOMISD*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F64);
   IRTemp argR  = newTemp(Ity_F64);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( argR, getXMMRegLane64F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomisd %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane64F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)) ),
                   mkU64(0x45)
       )));
   return delta;
}
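
/* The 0x45 mask: Iop_CmpF64 is assumed to return the IRCmpF64Result
   encoding, which is laid out to match x86 flags -- 0x45 (unordered),
   0x40 (equal), 0x01 (less), 0x00 (greater).  ANDing with 0x45 thus
   leaves exactly the ZF (0x40), PF (0x04) and CF (0x01) bits that
   (U)COMISD defines, with all other flags cleared. */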


static Long dis_COMISS ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx, UChar opc )
{
   vassert(opc == 0x2F/*COMISS*/ || opc == 0x2E/*UCOMISS*/);
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp argL  = newTemp(Ity_F32);
   IRTemp argR  = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( argR, getXMMRegLane32F( eregOfRexRM(pfx,modrm),
                                      0/*lowest lane*/ ) );
      delta += 1;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
      delta += alen;
      DIP("%s%scomiss %s,%s\n", isAvx ? "v" : "",
                                opc==0x2E ? "u" : "",
                                dis_buf,
                                nameXMMReg(gregOfRexRM(pfx,modrm)) );
   }
   assign( argL, getXMMRegLane32F( gregOfRexRM(pfx,modrm),
                                   0/*lowest lane*/ ) );

   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop( Iop_And64,
                   unop( Iop_32Uto64,
                         binop(Iop_CmpF64,
                               unop(Iop_F32toF64,mkexpr(argL)),
                               unop(Iop_F32toF64,mkexpr(argR)))),
                   mkU64(0x45)
       )));
   return delta;
}


static Long dis_PSHUFD_32x4 ( VexAbiInfo* vbi, Prefix pfx,
                              Long delta, Bool writesYmm )
{
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V128);
   UChar  modrm = getUChar(delta);
   const HChar* strV  = writesYmm ? "v" : "";
   IRTemp addr  = IRTemp_INVALID;
   if (epartIsReg(modrm)) {
      assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("%spshufd $%d,%s,%s\n", strV, order,
                                 dis_buf,
                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
   }

   IRTemp s3, s2, s1, s0;
   s3 = s2 = s1 = s0 = IRTemp_INVALID;
   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );

#  define SEL(n)  ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
   IRTemp dV = newTemp(Ity_V128);
   assign(dV,
          mkV128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
                         SEL((order>>2)&3), SEL((order>>0)&3) )
   );
#  undef SEL

   (writesYmm ? putYMMRegLoAndZU : putXMMReg)
      (gregOfRexRM(pfx,modrm), mkexpr(dV));
   return delta;
}
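
/* Example: order == 0x1B selects (order>>6)&3 = 0, (order>>4)&3 = 1,
   (order>>2)&3 = 2, (order>>0)&3 = 3, so dV gets (s0,s1,s2,s3) from
   top lane to bottom -- i.e. "pshufd $0x1b" reverses the four
   lanes. */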


static Long dis_PSHUFD_32x8 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
{
   Int    order;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp sV    = newTemp(Ity_V256);
   UChar  modrm = getUChar(delta);
   IRTemp addr  = IRTemp_INVALID;
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( sV, getYMMReg(rE) );
      order = (Int)getUChar(delta+1);
      delta += 1+1;
      DIP("vpshufd $%d,%s,%s\n", order, nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
                        1/*byte after the amode*/ );
      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
      order = (Int)getUChar(delta+alen);
      delta += alen+1;
      DIP("vpshufd $%d,%s,%s\n", order,  dis_buf, nameYMMReg(rG));
   }

   IRTemp s[8];
   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
   breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
                         &s[3], &s[2], &s[1], &s[0] );

   putYMMReg( rG, mkV256from32s( s[4 + ((order>>6)&3)],
                                 s[4 + ((order>>4)&3)],
                                 s[4 + ((order>>2)&3)],
                                 s[4 + ((order>>0)&3)],
                                 s[0 + ((order>>6)&3)],
                                 s[0 + ((order>>4)&3)],
                                 s[0 + ((order>>2)&3)],
                                 s[0 + ((order>>0)&3)] ) );
   return delta;
}


static IRTemp math_PSRLDQ ( IRTemp sV, Int imm )
{
   IRTemp dV    = newTemp(Ity_V128);
   IRTemp hi64  = newTemp(Ity_I64);
   IRTemp lo64  = newTemp(Ity_I64);
   IRTemp hi64r = newTemp(Ity_I64);
   IRTemp lo64r = newTemp(Ity_I64);

   vassert(imm >= 0 && imm <= 255);
   if (imm >= 16) {
      assign(dV, mkV128(0x0000));
      return dV;
   }

   assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );

   if (imm == 0) {
      assign( lo64r, mkexpr(lo64) );
      assign( hi64r, mkexpr(hi64) );
   }
   else
   if (imm == 8) {
      assign( hi64r, mkU64(0) );
      assign( lo64r, mkexpr(hi64) );
   }
   else
   if (imm > 8) {
      assign( hi64r, mkU64(0) );
      assign( lo64r, binop( Iop_Shr64, mkexpr(hi64), mkU8( 8*(imm-8) ) ));
   } else {
      assign( hi64r, binop( Iop_Shr64, mkexpr(hi64), mkU8(8 * imm) ));
      assign( lo64r,
              binop( Iop_Or64,
                     binop(Iop_Shr64, mkexpr(lo64),
                           mkU8(8 * imm)),
                     binop(Iop_Shl64, mkexpr(hi64),
                           mkU8(8 * (8 - imm)) )
                     )
              );
   }

   assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   return dV;
}
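
/* Worked example: imm == 3 takes the final branch, giving
   hi64r = hi64 >> 24 and lo64r = (lo64 >> 24) | (hi64 << 40) --
   the whole 128-bit value shifted right by 3 bytes with zero fill
   at the top, as PSRLDQ requires. */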


static IRTemp math_PSLLDQ ( IRTemp sV, Int imm )
{
   IRTemp       dV    = newTemp(Ity_V128);
   IRTemp       hi64  = newTemp(Ity_I64);
   IRTemp       lo64  = newTemp(Ity_I64);
   IRTemp       hi64r = newTemp(Ity_I64);
   IRTemp       lo64r = newTemp(Ity_I64);

   vassert(imm >= 0 && imm <= 255);
   if (imm >= 16) {
      assign(dV, mkV128(0x0000));
      return dV;
   }

   assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
   assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );

   if (imm == 0) {
      assign( lo64r, mkexpr(lo64) );
      assign( hi64r, mkexpr(hi64) );
   }
   else
   if (imm == 8) {
      assign( lo64r, mkU64(0) );
      assign( hi64r, mkexpr(lo64) );
   }
   else
   if (imm > 8) {
      assign( lo64r, mkU64(0) );
      assign( hi64r, binop( Iop_Shl64, mkexpr(lo64), mkU8( 8*(imm-8) ) ));
   } else {
      assign( lo64r, binop( Iop_Shl64, mkexpr(lo64), mkU8(8 * imm) ));
      assign( hi64r,
              binop( Iop_Or64,
                     binop(Iop_Shl64, mkexpr(hi64),
                           mkU8(8 * imm)),
                     binop(Iop_Shr64, mkexpr(lo64),
                           mkU8(8 * (8 - imm)) )
                     )
              );
   }

   assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
   return dV;
}


static Long dis_CVTxSD2SI ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSD2SI*/ || opc == 0x2C/*CVTTSD2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f64lo  = newTemp(Ity_F64);
   Bool   r2zero = toBool(opc == 0x2C);

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%ssd2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S, mkexpr(rmode), mkexpr(f64lo)) );
   }

   return delta;
}
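
/* Example of the two rounding behaviours: with f64lo = -1.5,
   cvttsd2si (r2zero, Irrm_ZERO) produces -1, while cvtsd2si under
   the default round-to-nearest-even mode produces -2. */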


static Long dis_CVTxSS2SI ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc, Int sz )
{
   vassert(opc == 0x2D/*CVTSS2SI*/ || opc == 0x2C/*CVTTSS2SI*/);
   HChar  dis_buf[50];
   Int    alen   = 0;
   UChar  modrm  = getUChar(delta);
   IRTemp addr   = IRTemp_INVALID;
   IRTemp rmode  = newTemp(Ity_I32);
   IRTemp f32lo  = newTemp(Ity_F32);
   Bool   r2zero = toBool(opc == 0x2C);

   if (epartIsReg(modrm)) {
      delta += 1;
      assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  nameXMMReg(eregOfRexRM(pfx,modrm)),
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
      delta += alen;
      DIP("%scvt%sss2si %s,%s\n", isAvx ? "v" : "", r2zero ? "t" : "",
                                  dis_buf,
                                  nameIReg(sz, gregOfRexRM(pfx,modrm),
                                           False));
   }

   if (r2zero) {
      assign( rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   if (sz == 4) {
      putIReg32( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI32S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   } else {
      vassert(sz == 8);
      putIReg64( gregOfRexRM(pfx,modrm),
                 binop( Iop_F64toI64S,
                        mkexpr(rmode),
                        unop(Iop_F32toF64, mkexpr(f32lo))) );
   }

   return delta;
}


static Long dis_CVTPS2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32lo = newTemp(Ity_F32);
   IRTemp f32hi = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32lo, getXMMRegLane32F(rE, 0) );
      assign( f32hi, getXMMRegLane32F(rE, 1) );
      delta += 1;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32hi, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      delta += alen;
      DIP("%scvtps2pd %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
   }

   putXMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32hi)) );
   putXMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32lo)) );
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0));
   return delta;
}


static Long dis_CVTPS2PD_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   IRTemp f32_0 = newTemp(Ity_F32);
   IRTemp f32_1 = newTemp(Ity_F32);
   IRTemp f32_2 = newTemp(Ity_F32);
   IRTemp f32_3 = newTemp(Ity_F32);
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( f32_0, getXMMRegLane32F(rE, 0) );
      assign( f32_1, getXMMRegLane32F(rE, 1) );
      assign( f32_2, getXMMRegLane32F(rE, 2) );
      assign( f32_3, getXMMRegLane32F(rE, 3) );
      delta += 1;
      DIP("vcvtps2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( f32_0, loadLE(Ity_F32, mkexpr(addr)) );
      assign( f32_1, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(4))) );
      assign( f32_2, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(8))) );
      assign( f32_3, loadLE(Ity_F32,
                            binop(Iop_Add64,mkexpr(addr),mkU64(12))) );
      delta += alen;
      DIP("vcvtps2pd %s,%s\n", dis_buf, nameYMMReg(rG));
   }

   putYMMRegLane64F( rG, 3, unop(Iop_F32toF64, mkexpr(f32_3)) );
   putYMMRegLane64F( rG, 2, unop(Iop_F32toF64, mkexpr(f32_2)) );
   putYMMRegLane64F( rG, 1, unop(Iop_F32toF64, mkexpr(f32_1)) );
   putYMMRegLane64F( rG, 0, unop(Iop_F32toF64, mkexpr(f32_0)) );
   return delta;
}


static Long dis_CVTPD2PS_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtpd2ps %s,%s\n", isAvx ? "v" : "",
          dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   IRTemp t0 = newTemp(Ity_F64);
   IRTemp t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), mkexpr(_t) )
   putXMMRegLane32(  rG, 3, mkU32(0) );
   putXMMRegLane32(  rG, 2, mkU32(0) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}


static Long dis_CVTxPS2DQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvt%sps2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}


static Long dis_CVTxPS2DQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%sps2dq %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, r2zero ? mkU32((UInt)Irrm_ZERO)
                         : get_sse_roundingmode() );
   t0 = t1 = t2 = t3 = t4 = t5 = t6 = t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );
   /* This is less than ideal.  If it turns out to be a performance
      bottleneck it can be improved. */
#  define CVT(_t)                             \
      binop( Iop_F64toI32S,                   \
             mkexpr(rmode),                   \
             unop( Iop_F32toF64,              \
                   unop( Iop_ReinterpI32asF32, mkexpr(_t))) )

   putYMMRegLane32( rG, 7, CVT(t7) );
   putYMMRegLane32( rG, 6, CVT(t6) );
   putYMMRegLane32( rG, 5, CVT(t5) );
   putYMMRegLane32( rG, 4, CVT(t4) );
   putYMMRegLane32( rG, 3, CVT(t3) );
   putYMMRegLane32( rG, 2, CVT(t2) );
   putYMMRegLane32( rG, 1, CVT(t1) );
   putYMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}


static Long dis_CVTxPD2DQ_128 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool isAvx, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvt%spd2dq %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvt%spd2dqx %s,%s\n",
          isAvx ? "v" : "", r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   t0 = newTemp(Ity_F64);
   t1 = newTemp(Ity_F64);
   assign( t0, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128to64, mkexpr(argV))) );
   assign( t1, unop(Iop_ReinterpI64asF64,
                    unop(Iop_V128HIto64, mkexpr(argV))) );

#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          mkexpr(_t) )

   putXMMRegLane32( rG, 3, mkU32(0) );
   putXMMRegLane32( rG, 2, mkU32(0) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}
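
/* A note on the "x" suffix printed in the memory case above: with a
   memory source the operand width cannot be deduced from register
   names, so AT&T-style disassembly appends "x" (128-bit source) --
   and "y" (256-bit source) in the variant below -- to disambiguate
   vcvt(t)pd2dq. */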


static Long dis_CVTxPD2DQ_256 ( VexAbiInfo* vbi, Prefix pfx,
                                Long delta, Bool r2zero )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V256);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvt%spd2dq %s,%s\n",
          r2zero ? "t" : "", nameYMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvt%spd2dqy %s,%s\n",
          r2zero ? "t" : "", dis_buf, nameXMMReg(rG) );
   }

   if (r2zero) {
      assign(rmode, mkU32((UInt)Irrm_ZERO) );
   } else {
      assign( rmode, get_sse_roundingmode() );
   }

   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );

#  define CVT(_t)  binop( Iop_F64toI32S,                   \
                          mkexpr(rmode),                   \
                          unop( Iop_ReinterpI64asF64,      \
                                mkexpr(_t) ) )

   putXMMRegLane32( rG, 3, CVT(t3) );
   putXMMRegLane32( rG, 2, CVT(t2) );
   putXMMRegLane32( rG, 1, CVT(t1) );
   putXMMRegLane32( rG, 0, CVT(t0) );
#  undef CVT
   putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}


static Long dis_CVTDQ2PS_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   UChar  modrm = getUChar(delta);
   IRTemp argV  = newTemp(Ity_V128);
   IRTemp rmode = newTemp(Ity_I32);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getXMMReg(rE) );
      delta += 1;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
      delta += alen;
      DIP("%scvtdq2ps %s,%s\n",
          isAvx ? "v" : "", dis_buf, nameXMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   breakupV128to32s( argV, &t3, &t2, &t1, &t0 );

#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putXMMRegLane32F( rG, 3, CVT(t3) );
   putXMMRegLane32F( rG, 2, CVT(t2) );
   putXMMRegLane32F( rG, 1, CVT(t1) );
   putXMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT
   if (isAvx)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}

static Long dis_CVTDQ2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta )
{
   IRTemp addr   = IRTemp_INVALID;
   Int    alen   = 0;
   HChar  dis_buf[50];
   UChar  modrm  = getUChar(delta);
   IRTemp argV   = newTemp(Ity_V256);
   IRTemp rmode  = newTemp(Ity_I32);
   UInt   rG     = gregOfRexRM(pfx,modrm);
   IRTemp t0, t1, t2, t3, t4, t5, t6, t7;

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx,modrm);
      assign( argV, getYMMReg(rE) );
      delta += 1;
      DIP("vcvtdq2ps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
   } else {
      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
      delta += alen;
      DIP("vcvtdq2ps %s,%s\n", dis_buf, nameYMMReg(rG) );
   }

   assign( rmode, get_sse_roundingmode() );
   t0 = IRTemp_INVALID;
   t1 = IRTemp_INVALID;
   t2 = IRTemp_INVALID;
   t3 = IRTemp_INVALID;
   t4 = IRTemp_INVALID;
   t5 = IRTemp_INVALID;
   t6 = IRTemp_INVALID;
   t7 = IRTemp_INVALID;
   breakupV256to32s( argV, &t7, &t6, &t5, &t4, &t3, &t2, &t1, &t0 );

#  define CVT(_t)  binop( Iop_F64toF32,                    \
                          mkexpr(rmode),                   \
                          unop(Iop_I32StoF64,mkexpr(_t)))

   putYMMRegLane32F( rG, 7, CVT(t7) );
   putYMMRegLane32F( rG, 6, CVT(t6) );
   putYMMRegLane32F( rG, 5, CVT(t5) );
   putYMMRegLane32F( rG, 4, CVT(t4) );
   putYMMRegLane32F( rG, 3, CVT(t3) );
   putYMMRegLane32F( rG, 2, CVT(t2) );
   putYMMRegLane32F( rG, 1, CVT(t1) );
   putYMMRegLane32F( rG, 0, CVT(t0) );
#  undef CVT

   return delta;
}


static Long dis_PMOVMSKB_128 ( VexAbiInfo* vbi, Prefix pfx,
                               Long delta, Bool isAvx )
{
   UChar modrm = getUChar(delta);
   vassert(epartIsReg(modrm)); /* ensured by caller */
   UInt   rE = eregOfRexRM(pfx,modrm);
   UInt   rG = gregOfRexRM(pfx,modrm);
   IRTemp t0 = newTemp(Ity_V128);
   IRTemp t1 = newTemp(Ity_I32);
   assign(t0, getXMMReg(rE));
   assign(t1, unop(Iop_16Uto32, unop(Iop_GetMSBs8x16, mkexpr(t0))));
   putIReg32(rG, mkexpr(t1));
   DIP("%spmovmskb %s,%s\n", isAvx ? "v" : "", nameXMMReg(rE),
       nameIReg32(rG));
   delta += 1;
   return delta;
}
10767
10768
10769static Long dis_PMOVMSKB_256 ( VexAbiInfo* vbi, Prefix pfx,
10770                               Long delta  )
10771{
10772   UChar modrm = getUChar(delta);
10773   vassert(epartIsReg(modrm)); /* ensured by caller */
10774   UInt   rE = eregOfRexRM(pfx,modrm);
10775   UInt   rG = gregOfRexRM(pfx,modrm);
10776   IRTemp t0 = newTemp(Ity_V128);
10777   IRTemp t1 = newTemp(Ity_V128);
10778   IRTemp t2 = newTemp(Ity_I16);
10779   IRTemp t3 = newTemp(Ity_I16);
10780   assign(t0, getYMMRegLane128(rE, 0));
10781   assign(t1, getYMMRegLane128(rE, 1));
10782   assign(t2, unop(Iop_GetMSBs8x16, mkexpr(t0)));
10783   assign(t3, unop(Iop_GetMSBs8x16, mkexpr(t1)));
10784   putIReg32(rG, binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)));
10785   DIP("vpmovmskb %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
10786   delta += 1;
10787   return delta;
10788}
10789
10790
10791/* FIXME: why not just use InterleaveLO / InterleaveHI?  I think the
10792   relevant ops are "xIsH ? InterleaveHI32x4 : InterleaveLO32x4". */
10793/* Does the maths for 128 bit versions of UNPCKLPS and UNPCKHPS */
10794static IRTemp math_UNPCKxPS_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
10795{
10796   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10797   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10798   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
10799   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10800   IRTemp res = newTemp(Ity_V128);
10801   assign(res,  xIsH ? mkV128from32s( s3, d3, s2, d2 )
10802                     : mkV128from32s( s1, d1, s0, d0 ));
10803   return res;
10804}
10805
10806
10807/* FIXME: why not just use InterleaveLO / InterleaveHI ?? */
10808/* Does the maths for 128 bit versions of UNPCKLPD and UNPCKHPD */
10809static IRTemp math_UNPCKxPD_128 ( IRTemp sV, IRTemp dV, Bool xIsH )
10810{
10811   IRTemp s1 = newTemp(Ity_I64);
10812   IRTemp s0 = newTemp(Ity_I64);
10813   IRTemp d1 = newTemp(Ity_I64);
10814   IRTemp d0 = newTemp(Ity_I64);
10815   assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10816   assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10817   assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10818   assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10819   IRTemp res = newTemp(Ity_V128);
10820   assign(res, xIsH ? binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1))
10821                    : binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)));
10822   return res;
10823}
10824
10825
10826/* Does the maths for 256 bit versions of UNPCKLPD and UNPCKHPD.
10827   Doesn't seem like this fits in either of the Iop_Interleave{LO,HI}
10828   or the Iop_Cat{Odd,Even}Lanes idioms, hence just do it the stupid
10829   way. */
10830static IRTemp math_UNPCKxPD_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
10831{
10832   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10833   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10834   breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
10835   breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
10836   IRTemp res = newTemp(Ity_V256);
10837   assign(res, xIsH
10838               ? IRExpr_Qop(Iop_64x4toV256, mkexpr(s3), mkexpr(d3),
10839                                            mkexpr(s1), mkexpr(d1))
10840               : IRExpr_Qop(Iop_64x4toV256, mkexpr(s2), mkexpr(d2),
10841                                            mkexpr(s0), mkexpr(d0)));
10842   return res;
10843}
10844
10845
10846/* FIXME: this is really bad.  Surely can do something better here?
10847   One observation is that the steering in the upper and lower 128 bit
10848   halves is the same as with math_UNPCKxPS_128, so we simply split
10849   into two halves, and use that.  Consequently any improvement in
10850   math_UNPCKxPS_128 (probably, to use interleave-style primops)
10851   benefits this too. */
10852static IRTemp math_UNPCKxPS_256 ( IRTemp sV, IRTemp dV, Bool xIsH )
10853{
10854   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10855   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10856   breakupV256toV128s( sV, &sVhi, &sVlo );
10857   breakupV256toV128s( dV, &dVhi, &dVlo );
10858   IRTemp rVhi = math_UNPCKxPS_128(sVhi, dVhi, xIsH);
10859   IRTemp rVlo = math_UNPCKxPS_128(sVlo, dVlo, xIsH);
10860   IRTemp rV   = newTemp(Ity_V256);
10861   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10862   return rV;
10863}
10864
10865
10866static IRTemp math_SHUFPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10867{
10868   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10869   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10870   vassert(imm8 < 256);
10871
10872   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
10873   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
10874
10875#  define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
10876#  define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
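   /* The two low result lanes are selected from dV and the two high
      lanes from sV, two imm8 bits per lane.  E.g. imm8 == 0xE4
      (binary 11 10 01 00) yields, from hi lane to lo: s3, s2, d1, d0. */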
10877   IRTemp res = newTemp(Ity_V128);
10878   assign(res,
10879          mkV128from32s( SELS((imm8>>6)&3), SELS((imm8>>4)&3),
10880                         SELD((imm8>>2)&3), SELD((imm8>>0)&3) ) );
10881#  undef SELD
10882#  undef SELS
10883   return res;
10884}
10885
10886
10887/* 256-bit SHUFPS appears to steer each of the 128-bit halves
10888   identically.  Hence do the clueless thing and use math_SHUFPS_128
10889   twice. */
10890static IRTemp math_SHUFPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
10891{
10892   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10893   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10894   breakupV256toV128s( sV, &sVhi, &sVlo );
10895   breakupV256toV128s( dV, &dVhi, &dVlo );
10896   IRTemp rVhi = math_SHUFPS_128(sVhi, dVhi, imm8);
10897   IRTemp rVlo = math_SHUFPS_128(sVlo, dVlo, imm8);
10898   IRTemp rV   = newTemp(Ity_V256);
10899   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10900   return rV;
10901}
10902
10903
10904static IRTemp math_SHUFPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10905{
10906   IRTemp s1 = newTemp(Ity_I64);
10907   IRTemp s0 = newTemp(Ity_I64);
10908   IRTemp d1 = newTemp(Ity_I64);
10909   IRTemp d0 = newTemp(Ity_I64);
10910
10911   assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10912   assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10913   assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10914   assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10915
10916#  define SELD(n) mkexpr((n)==0 ? d0 : d1)
10917#  define SELS(n) mkexpr((n)==0 ? s0 : s1)
10918
10919   IRTemp res = newTemp(Ity_V128);
10920   assign(res, binop( Iop_64HLtoV128,
10921                      SELS((imm8>>1)&1), SELD((imm8>>0)&1) ) );
10922
10923#  undef SELD
10924#  undef SELS
10925   return res;
10926}
10927
10928
10929static IRTemp math_SHUFPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
10930{
10931   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10932   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10933   breakupV256toV128s( sV, &sVhi, &sVlo );
10934   breakupV256toV128s( dV, &dVhi, &dVlo );
10935   IRTemp rVhi = math_SHUFPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
10936   IRTemp rVlo = math_SHUFPD_128(sVlo, dVlo, imm8 & 3);
10937   IRTemp rV   = newTemp(Ity_V256);
10938   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10939   return rV;
10940}
10941
10942
10943static IRTemp math_BLENDPD_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10944{
10945   UShort imm8_mask_16;
10946   IRTemp imm8_mask = newTemp(Ity_V128);
10947
10948   switch( imm8 & 3 ) {
10949      case 0:  imm8_mask_16 = 0x0000; break;
10950      case 1:  imm8_mask_16 = 0x00FF; break;
10951      case 2:  imm8_mask_16 = 0xFF00; break;
10952      case 3:  imm8_mask_16 = 0xFFFF; break;
10953      default: vassert(0);            break;
10954   }
10955   assign( imm8_mask, mkV128( imm8_mask_16 ) );
10956
10957   IRTemp res = newTemp(Ity_V128);
10958   assign ( res, binop( Iop_OrV128,
10959                        binop( Iop_AndV128, mkexpr(sV),
10960                                            mkexpr(imm8_mask) ),
10961                        binop( Iop_AndV128, mkexpr(dV),
10962                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
10963   return res;
10964}
10965
10966
10967static IRTemp math_BLENDPD_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
10968{
10969   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
10970   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
10971   breakupV256toV128s( sV, &sVhi, &sVlo );
10972   breakupV256toV128s( dV, &dVhi, &dVlo );
10973   IRTemp rVhi = math_BLENDPD_128(sVhi, dVhi, (imm8 >> 2) & 3);
10974   IRTemp rVlo = math_BLENDPD_128(sVlo, dVlo, imm8 & 3);
10975   IRTemp rV   = newTemp(Ity_V256);
10976   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
10977   return rV;
10978}
10979
10980
10981static IRTemp math_BLENDPS_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
10982{
10983   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
10984                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
10985                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
10986                             0xFFFF };
10987   IRTemp imm8_mask = newTemp(Ity_V128);
10988   assign( imm8_mask, mkV128( imm8_perms[ (imm8 & 15) ] ) );
10989
10990   IRTemp res = newTemp(Ity_V128);
10991   assign ( res, binop( Iop_OrV128,
10992                        binop( Iop_AndV128, mkexpr(sV),
10993                                            mkexpr(imm8_mask) ),
10994                        binop( Iop_AndV128, mkexpr(dV),
10995                               unop( Iop_NotV128, mkexpr(imm8_mask) ) ) ) );
10996   return res;
10997}
10998
10999
11000static IRTemp math_BLENDPS_256 ( IRTemp sV, IRTemp dV, UInt imm8 )
11001{
11002   IRTemp sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
11003   IRTemp dVhi = IRTemp_INVALID, dVlo = IRTemp_INVALID;
11004   breakupV256toV128s( sV, &sVhi, &sVlo );
11005   breakupV256toV128s( dV, &dVhi, &dVlo );
11006   IRTemp rVhi = math_BLENDPS_128(sVhi, dVhi, (imm8 >> 4) & 15);
11007   IRTemp rVlo = math_BLENDPS_128(sVlo, dVlo, imm8 & 15);
11008   IRTemp rV   = newTemp(Ity_V256);
11009   assign(rV, binop(Iop_V128HLtoV256, mkexpr(rVhi), mkexpr(rVlo)));
11010   return rV;
11011}
11012
11013
11014static IRTemp math_PBLENDW_128 ( IRTemp sV, IRTemp dV, UInt imm8 )
11015{
   /* Make imm16 be a 16-bit mask formed by duplicating each bit of
      imm8: bit i of imm8 becomes bits 2i+1:2i of imm16. */
11018   Int i;
11019   UShort imm16 = 0;
11020   for (i = 0; i < 8; i++) {
11021      if (imm8 & (1 << i))
11022         imm16 |= (3 << (2*i));
11023   }
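   /* E.g. imm8 == 0xA5 (10100101b) gives imm16 == 0xCC33
      (1100110000110011b). */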
11024   IRTemp imm16_mask = newTemp(Ity_V128);
11025   assign( imm16_mask, mkV128( imm16 ));
11026
11027   IRTemp res = newTemp(Ity_V128);
11028   assign ( res, binop( Iop_OrV128,
11029                        binop( Iop_AndV128, mkexpr(sV),
11030                                            mkexpr(imm16_mask) ),
11031                        binop( Iop_AndV128, mkexpr(dV),
11032                               unop( Iop_NotV128, mkexpr(imm16_mask) ) ) ) );
11033   return res;
11034}
11035
11036
11037static IRTemp math_PMULUDQ_128 ( IRTemp sV, IRTemp dV )
11038{
11039   /* This is a really poor translation -- could be improved if
11040      performance critical */
11041   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11042   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11043   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11044   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
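   /* PMULUDQ multiplies the even-numbered 32-bit lanes (0 and 2) as
      unsigned, giving two 64-bit products; lanes 1 and 3 are
      ignored. */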
11045   IRTemp res = newTemp(Ity_V128);
11046   assign(res, binop(Iop_64HLtoV128,
11047                     binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)),
11048                     binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) ));
11049   return res;
11050}
11051
11052
11053static IRTemp math_PMULUDQ_256 ( IRTemp sV, IRTemp dV )
11054{
11055   /* This is a really poor translation -- could be improved if
11056      performance critical */
11057   IRTemp sHi, sLo, dHi, dLo;
11058   sHi = sLo = dHi = dLo = IRTemp_INVALID;
11059   breakupV256toV128s( dV, &dHi, &dLo);
11060   breakupV256toV128s( sV, &sHi, &sLo);
11061   IRTemp res = newTemp(Ity_V256);
11062   assign(res, binop(Iop_V128HLtoV256,
11063                     mkexpr(math_PMULUDQ_128(sHi, dHi)),
11064                     mkexpr(math_PMULUDQ_128(sLo, dLo))));
11065   return res;
11066}
11067
11068
11069static IRTemp math_PMULDQ_128 ( IRTemp dV, IRTemp sV )
11070{
11071   /* This is a really poor translation -- could be improved if
11072      performance critical */
11073   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
11074   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
11075   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
11076   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
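   /* PMULDQ is the signed counterpart of PMULUDQ: again only lanes
      0 and 2 participate. */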
11077   IRTemp res = newTemp(Ity_V128);
11078   assign(res, binop(Iop_64HLtoV128,
11079                     binop( Iop_MullS32, mkexpr(d2), mkexpr(s2)),
11080                     binop( Iop_MullS32, mkexpr(d0), mkexpr(s0)) ));
11081   return res;
11082}
11083
11084
11085static IRTemp math_PMULDQ_256 ( IRTemp sV, IRTemp dV )
11086{
11087   /* This is a really poor translation -- could be improved if
11088      performance critical */
11089   IRTemp sHi, sLo, dHi, dLo;
11090   sHi = sLo = dHi = dLo = IRTemp_INVALID;
11091   breakupV256toV128s( dV, &dHi, &dLo);
11092   breakupV256toV128s( sV, &sHi, &sLo);
11093   IRTemp res = newTemp(Ity_V256);
11094   assign(res, binop(Iop_V128HLtoV256,
11095                     mkexpr(math_PMULDQ_128(sHi, dHi)),
11096                     mkexpr(math_PMULDQ_128(sLo, dLo))));
11097   return res;
11098}
11099
11100
11101static IRTemp math_PMADDWD_128 ( IRTemp dV, IRTemp sV )
11102{
11103   IRTemp sVhi, sVlo, dVhi, dVlo;
11104   IRTemp resHi = newTemp(Ity_I64);
11105   IRTemp resLo = newTemp(Ity_I64);
11106   sVhi = sVlo = dVhi = dVlo = IRTemp_INVALID;
11107   breakupV128to64s( sV, &sVhi, &sVlo );
11108   breakupV128to64s( dV, &dVhi, &dVlo );
11109   assign( resHi, mkIRExprCCall(Ity_I64, 0/*regparms*/,
11110                                "amd64g_calculate_mmx_pmaddwd",
11111                                &amd64g_calculate_mmx_pmaddwd,
11112                                mkIRExprVec_2( mkexpr(sVhi), mkexpr(dVhi))));
11113   assign( resLo, mkIRExprCCall(Ity_I64, 0/*regparms*/,
11114                                "amd64g_calculate_mmx_pmaddwd",
11115                                &amd64g_calculate_mmx_pmaddwd,
11116                                mkIRExprVec_2( mkexpr(sVlo), mkexpr(dVlo))));
11117   IRTemp res = newTemp(Ity_V128);
   assign( res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)) );
11119   return res;
11120}
11121
11122
11123static IRTemp math_PMADDWD_256 ( IRTemp dV, IRTemp sV )
11124{
11125   IRTemp sHi, sLo, dHi, dLo;
11126   sHi = sLo = dHi = dLo = IRTemp_INVALID;
11127   breakupV256toV128s( dV, &dHi, &dLo);
11128   breakupV256toV128s( sV, &sHi, &sLo);
11129   IRTemp res = newTemp(Ity_V256);
11130   assign(res, binop(Iop_V128HLtoV256,
11131                     mkexpr(math_PMADDWD_128(dHi, sHi)),
11132                     mkexpr(math_PMADDWD_128(dLo, sLo))));
11133   return res;
11134}
11135
11136
11137static IRTemp math_ADDSUBPD_128 ( IRTemp dV, IRTemp sV )
11138{
11139   IRTemp addV = newTemp(Ity_V128);
11140   IRTemp subV = newTemp(Ity_V128);
11141   IRTemp a1   = newTemp(Ity_I64);
11142   IRTemp s0   = newTemp(Ity_I64);
11143   IRTemp rm   = newTemp(Ity_I32);
11144
11145   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11146   assign( addV, triop(Iop_Add64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11147   assign( subV, triop(Iop_Sub64Fx2, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11148
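   /* ADDSUBPD computes lane 0 as d0 - s0 and lane 1 as d1 + s1, so
      take the high half of addV and the low half of subV. */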
11149   assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
11150   assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
11151
11152   IRTemp res = newTemp(Ity_V128);
11153   assign( res, binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
11154   return res;
11155}
11156
11157
11158static IRTemp math_ADDSUBPD_256 ( IRTemp dV, IRTemp sV )
11159{
11160   IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11161   IRTemp addV = newTemp(Ity_V256);
11162   IRTemp subV = newTemp(Ity_V256);
11163   IRTemp rm   = newTemp(Ity_I32);
11164   a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11165
11166   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11167   assign( addV, triop(Iop_Add64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11168   assign( subV, triop(Iop_Sub64Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11169
11170   breakupV256to64s( addV, &a3, &a2, &a1, &a0 );
11171   breakupV256to64s( subV, &s3, &s2, &s1, &s0 );
11172
11173   IRTemp res = newTemp(Ity_V256);
11174   assign( res, mkV256from64s( a3, s2, a1, s0 ) );
11175   return res;
11176}
11177
11178
11179static IRTemp math_ADDSUBPS_128 ( IRTemp dV, IRTemp sV )
11180{
11181   IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11182   IRTemp addV = newTemp(Ity_V128);
11183   IRTemp subV = newTemp(Ity_V128);
11184   IRTemp rm   = newTemp(Ity_I32);
11185   a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11186
11187   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11188   assign( addV, triop(Iop_Add32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11189   assign( subV, triop(Iop_Sub32Fx4, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11190
11191   breakupV128to32s( addV, &a3, &a2, &a1, &a0 );
11192   breakupV128to32s( subV, &s3, &s2, &s1, &s0 );
11193
11194   IRTemp res = newTemp(Ity_V128);
11195   assign( res, mkV128from32s( a3, s2, a1, s0 ) );
11196   return res;
11197}
11198
11199
11200static IRTemp math_ADDSUBPS_256 ( IRTemp dV, IRTemp sV )
11201{
11202   IRTemp a7, a6, a5, a4, a3, a2, a1, a0;
11203   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
11204   IRTemp addV = newTemp(Ity_V256);
11205   IRTemp subV = newTemp(Ity_V256);
11206   IRTemp rm   = newTemp(Ity_I32);
11207   a7 = a6 = a5 = a4 = a3 = a2 = a1 = a0 = IRTemp_INVALID;
11208   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11209
11210   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
11211   assign( addV, triop(Iop_Add32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11212   assign( subV, triop(Iop_Sub32Fx8, mkexpr(rm), mkexpr(dV), mkexpr(sV)) );
11213
11214   breakupV256to32s( addV, &a7, &a6, &a5, &a4, &a3, &a2, &a1, &a0 );
11215   breakupV256to32s( subV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
11216
11217   IRTemp res = newTemp(Ity_V256);
11218   assign( res, mkV256from32s( a7, s6, a5, s4, a3, s2, a1, s0 ) );
11219   return res;
11220}
11221
11222
11223/* Handle 128 bit PSHUFLW and PSHUFHW. */
11224static Long dis_PSHUFxW_128 ( VexAbiInfo* vbi, Prefix pfx,
11225                              Long delta, Bool isAvx, Bool xIsH )
11226{
11227   IRTemp addr  = IRTemp_INVALID;
11228   Int    alen  = 0;
11229   HChar  dis_buf[50];
11230   UChar  modrm = getUChar(delta);
11231   UInt   rG = gregOfRexRM(pfx,modrm);
11232   UInt   imm8;
11233   IRTemp sVmut, dVmut, sVcon, sV, dV, s3, s2, s1, s0;
11234   s3 = s2 = s1 = s0 = IRTemp_INVALID;
11235   sV    = newTemp(Ity_V128);
11236   dV    = newTemp(Ity_V128);
11237   sVmut = newTemp(Ity_I64);
11238   dVmut = newTemp(Ity_I64);
11239   sVcon = newTemp(Ity_I64);
11240   if (epartIsReg(modrm)) {
11241      UInt rE = eregOfRexRM(pfx,modrm);
11242      assign( sV, getXMMReg(rE) );
11243      imm8 = (UInt)getUChar(delta+1);
11244      delta += 1+1;
11245      DIP("%spshuf%cw $%u,%s,%s\n",
11246          isAvx ? "v" : "", xIsH ? 'h' : 'l',
11247          imm8, nameXMMReg(rE), nameXMMReg(rG));
11248   } else {
11249      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
11250      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11251      imm8 = (UInt)getUChar(delta+alen);
11252      delta += alen+1;
11253      DIP("%spshuf%cw $%u,%s,%s\n",
11254          isAvx ? "v" : "", xIsH ? 'h' : 'l',
11255          imm8, dis_buf, nameXMMReg(rG));
11256   }
11257
11258   /* Get the to-be-changed (mut) and unchanging (con) bits of the
11259      source. */
11260   assign( sVmut, unop(xIsH ? Iop_V128HIto64 : Iop_V128to64,   mkexpr(sV)) );
11261   assign( sVcon, unop(xIsH ? Iop_V128to64   : Iop_V128HIto64, mkexpr(sV)) );
11262
11263   breakup64to16s( sVmut, &s3, &s2, &s1, &s0 );
11264#  define SEL(n) \
11265             ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11266   assign(dVmut, mk64from16s( SEL((imm8>>6)&3), SEL((imm8>>4)&3),
11267                              SEL((imm8>>2)&3), SEL((imm8>>0)&3) ));
11268#  undef SEL
11269
11270   assign(dV, xIsH ? binop(Iop_64HLtoV128, mkexpr(dVmut), mkexpr(sVcon))
11271                   : binop(Iop_64HLtoV128, mkexpr(sVcon), mkexpr(dVmut)) );
11272
11273   (isAvx ? putYMMRegLoAndZU : putXMMReg)(rG, mkexpr(dV));
11274   return delta;
11275}
11276
11277
11278/* Handle 256 bit PSHUFLW and PSHUFHW. */
11279static Long dis_PSHUFxW_256 ( VexAbiInfo* vbi, Prefix pfx,
11280                              Long delta, Bool xIsH )
11281{
11282   IRTemp addr  = IRTemp_INVALID;
11283   Int    alen  = 0;
11284   HChar  dis_buf[50];
11285   UChar  modrm = getUChar(delta);
11286   UInt   rG = gregOfRexRM(pfx,modrm);
11287   UInt   imm8;
11288   IRTemp sV, s[8], sV64[4], dVhi, dVlo;
11289   sV64[3] = sV64[2] = sV64[1] = sV64[0] = IRTemp_INVALID;
11290   s[7] = s[6] = s[5] = s[4] = s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
11291   sV    = newTemp(Ity_V256);
11292   dVhi  = newTemp(Ity_I64);
11293   dVlo  = newTemp(Ity_I64);
11294   if (epartIsReg(modrm)) {
11295      UInt rE = eregOfRexRM(pfx,modrm);
11296      assign( sV, getYMMReg(rE) );
11297      imm8 = (UInt)getUChar(delta+1);
11298      delta += 1+1;
11299      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
11300          imm8, nameYMMReg(rE), nameYMMReg(rG));
11301   } else {
11302      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
11303      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
11304      imm8 = (UInt)getUChar(delta+alen);
11305      delta += alen+1;
11306      DIP("vpshuf%cw $%u,%s,%s\n", xIsH ? 'h' : 'l',
11307          imm8, dis_buf, nameYMMReg(rG));
11308   }
11309
11310   breakupV256to64s( sV, &sV64[3], &sV64[2], &sV64[1], &sV64[0] );
11311   breakup64to16s( sV64[xIsH ? 3 : 2], &s[7], &s[6], &s[5], &s[4] );
11312   breakup64to16s( sV64[xIsH ? 1 : 0], &s[3], &s[2], &s[1], &s[0] );
11313
11314   assign( dVhi, mk64from16s( s[4 + ((imm8>>6)&3)], s[4 + ((imm8>>4)&3)],
11315                              s[4 + ((imm8>>2)&3)], s[4 + ((imm8>>0)&3)] ) );
11316   assign( dVlo, mk64from16s( s[0 + ((imm8>>6)&3)], s[0 + ((imm8>>4)&3)],
11317                              s[0 + ((imm8>>2)&3)], s[0 + ((imm8>>0)&3)] ) );
11318   putYMMReg( rG, mkV256from64s( xIsH ? dVhi : sV64[3],
11319                                 xIsH ? sV64[2] : dVhi,
11320                                 xIsH ? dVlo : sV64[1],
11321                                 xIsH ? sV64[0] : dVlo ) );
11322   return delta;
11323}
11324
11325
11326static Long dis_PEXTRW_128_EregOnly_toG ( VexAbiInfo* vbi, Prefix pfx,
11327                                          Long delta, Bool isAvx )
11328{
11329   Long   deltaIN = delta;
11330   UChar  modrm   = getUChar(delta);
11331   UInt   rG      = gregOfRexRM(pfx,modrm);
11332   IRTemp sV      = newTemp(Ity_V128);
11333   IRTemp d16     = newTemp(Ity_I16);
11334   UInt   imm8;
11335   IRTemp s0, s1, s2, s3;
11336   if (epartIsReg(modrm)) {
11337      UInt rE = eregOfRexRM(pfx,modrm);
11338      assign(sV, getXMMReg(rE));
11339      imm8 = getUChar(delta+1) & 7;
11340      delta += 1+1;
11341      DIP("%spextrw $%d,%s,%s\n", isAvx ? "v" : "",
11342          (Int)imm8, nameXMMReg(rE), nameIReg32(rG));
11343   } else {
11344      /* The memory case is disallowed, apparently. */
11345      return deltaIN; /* FAIL */
11346   }
11347   s3 = s2 = s1 = s0 = IRTemp_INVALID;
11348   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
11349   switch (imm8) {
11350      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(s0))); break;
11351      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(s0))); break;
11352      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(s1))); break;
11353      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(s1))); break;
11354      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(s2))); break;
11355      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(s2))); break;
11356      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(s3))); break;
11357      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(s3))); break;
11358      default: vassert(0);
11359   }
11360   putIReg32(rG, unop(Iop_16Uto32, mkexpr(d16)));
11361   return delta;
11362}
11363
11364
11365static Long dis_CVTDQ2PD_128 ( VexAbiInfo* vbi, Prefix pfx,
11366                               Long delta, Bool isAvx )
11367{
11368   IRTemp addr  = IRTemp_INVALID;
11369   Int    alen  = 0;
11370   HChar  dis_buf[50];
11371   UChar  modrm = getUChar(delta);
11372   IRTemp arg64 = newTemp(Ity_I64);
11373   UInt   rG    = gregOfRexRM(pfx,modrm);
11374   const HChar* mbV   = isAvx ? "v" : "";
11375   if (epartIsReg(modrm)) {
11376      UInt rE = eregOfRexRM(pfx,modrm);
11377      assign( arg64, getXMMRegLane64(rE, 0) );
11378      delta += 1;
11379      DIP("%scvtdq2pd %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
11380   } else {
11381      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11382      assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
11383      delta += alen;
11384      DIP("%scvtdq2pd %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
11385   }
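   /* No rounding mode is needed here: every I32 value is exactly
      representable as an F64, so Iop_I32StoF64 cannot lose
      accuracy. */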
11386   putXMMRegLane64F(
11387      rG, 0,
11388      unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
11389   );
11390   putXMMRegLane64F(
11391      rG, 1,
11392      unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
11393   );
11394   if (isAvx)
11395      putYMMRegLane128(rG, 1, mkV128(0));
11396   return delta;
11397}
11398
11399
11400static Long dis_STMXCSR ( VexAbiInfo* vbi, Prefix pfx,
11401                          Long delta, Bool isAvx )
11402{
11403   IRTemp addr  = IRTemp_INVALID;
11404   Int    alen  = 0;
11405   HChar  dis_buf[50];
11406   UChar  modrm = getUChar(delta);
11407   vassert(!epartIsReg(modrm)); /* ensured by caller */
11408   vassert(gregOfRexRM(pfx,modrm) == 3); /* ditto */
11409
11410   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11411   delta += alen;
11412
11413   /* Fake up a native SSE mxcsr word.  The only thing it depends on
11414      is SSEROUND[1:0], so call a clean helper to cook it up.
11415   */
   /* ULong amd64g_create_mxcsr ( ULong sseround ) */
11417   DIP("%sstmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
11418   storeLE(
11419      mkexpr(addr),
11420      unop(Iop_64to32,
11421           mkIRExprCCall(
11422              Ity_I64, 0/*regp*/,
11423              "amd64g_create_mxcsr", &amd64g_create_mxcsr,
11424              mkIRExprVec_1( unop(Iop_32Uto64,get_sse_roundingmode()) )
11425           )
11426      )
11427   );
11428   return delta;
11429}
11430
11431
11432static Long dis_LDMXCSR ( VexAbiInfo* vbi, Prefix pfx,
11433                          Long delta, Bool isAvx )
11434{
11435   IRTemp addr  = IRTemp_INVALID;
11436   Int    alen  = 0;
11437   HChar  dis_buf[50];
11438   UChar  modrm = getUChar(delta);
11439   vassert(!epartIsReg(modrm)); /* ensured by caller */
11440   vassert(gregOfRexRM(pfx,modrm) == 2); /* ditto */
11441
11442   IRTemp t64 = newTemp(Ity_I64);
11443   IRTemp ew  = newTemp(Ity_I32);
11444
11445   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11446   delta += alen;
11447   DIP("%sldmxcsr %s\n",  isAvx ? "v" : "", dis_buf);
11448
11449   /* The only thing we observe in %mxcsr is the rounding mode.
11450      Therefore, pass the 32-bit value (SSE native-format control
11451      word) to a clean helper, getting back a 64-bit value, the
11452      lower half of which is the SSEROUND value to store, and the
11453      upper half of which is the emulation-warning token which may
11454      be generated.
11455   */
   /* ULong amd64g_check_ldmxcsr ( ULong ); */
11457   assign( t64, mkIRExprCCall(
11458                   Ity_I64, 0/*regparms*/,
11459                   "amd64g_check_ldmxcsr",
11460                   &amd64g_check_ldmxcsr,
11461                   mkIRExprVec_1(
11462                      unop(Iop_32Uto64,
11463                           loadLE(Ity_I32, mkexpr(addr))
11464                      )
11465                   )
11466                )
11467         );
11468
11469   put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
11470   assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
11471   put_emwarn( mkexpr(ew) );
11472   /* Finally, if an emulation warning was reported, side-exit to
11473      the next insn, reporting the warning, so that Valgrind's
11474      dispatcher sees the warning. */
11475   stmt(
11476      IRStmt_Exit(
11477         binop(Iop_CmpNE64, unop(Iop_32Uto64,mkexpr(ew)), mkU64(0)),
11478         Ijk_EmWarn,
11479         IRConst_U64(guest_RIP_bbstart+delta),
11480         OFFB_RIP
11481      )
11482   );
11483   return delta;
11484}
11485
11486
11487static IRTemp math_PINSRW_128 ( IRTemp v128, IRTemp u16, UInt imm8 )
11488{
   vassert(imm8 <= 7);  /* imm8 is unsigned, so no lower-bound check is needed */
11490
11491   // Create a V128 value which has the selected word in the
11492   // specified lane, and zeroes everywhere else.
11493   IRTemp tmp128    = newTemp(Ity_V128);
11494   IRTemp halfshift = newTemp(Ity_I64);
11495   assign(halfshift, binop(Iop_Shl64,
11496                           unop(Iop_16Uto64, mkexpr(u16)),
11497                           mkU8(16 * (imm8 & 3))));
11498   if (imm8 < 4) {
11499      assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
11500   } else {
11501      assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
11502   }
11503
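   /* Clear the target lane of the original value, then OR in the new
      data.  mkV128 expands each of the 16 mask bits to a whole byte;
      e.g. imm8 == 5 gives mask == 0xF3FF, which zeroes bytes 11:10
      (bits 95:80, i.e. word lane 5). */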
11504   UShort mask = ~(3 << (imm8 * 2));
11505   IRTemp res  = newTemp(Ity_V128);
11506   assign( res, binop(Iop_OrV128,
11507                      mkexpr(tmp128),
11508                      binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
11509   return res;
11510}
11511
11512
11513static IRTemp math_PSADBW_128 ( IRTemp dV, IRTemp sV )
11514{
11515   IRTemp s1, s0, d1, d0;
11516   s1 = s0 = d1 = d0 = IRTemp_INVALID;
11517
11518   breakupV128to64s( sV, &s1, &s0 );
11519   breakupV128to64s( dV, &d1, &d0 );
11520
11521   IRTemp res = newTemp(Ity_V128);
11522   assign( res,
11523           binop(Iop_64HLtoV128,
11524                 mkIRExprCCall(Ity_I64, 0/*regparms*/,
11525                               "amd64g_calculate_mmx_psadbw",
11526                               &amd64g_calculate_mmx_psadbw,
11527                               mkIRExprVec_2( mkexpr(s1), mkexpr(d1))),
11528                 mkIRExprCCall(Ity_I64, 0/*regparms*/,
11529                               "amd64g_calculate_mmx_psadbw",
11530                               &amd64g_calculate_mmx_psadbw,
11531                               mkIRExprVec_2( mkexpr(s0), mkexpr(d0)))) );
11532   return res;
11533}
11534
11535
11536static IRTemp math_PSADBW_256 ( IRTemp dV, IRTemp sV )
11537{
11538   IRTemp sHi, sLo, dHi, dLo;
11539   sHi = sLo = dHi = dLo = IRTemp_INVALID;
11540   breakupV256toV128s( dV, &dHi, &dLo);
11541   breakupV256toV128s( sV, &sHi, &sLo);
11542   IRTemp res = newTemp(Ity_V256);
11543   assign(res, binop(Iop_V128HLtoV256,
11544                     mkexpr(math_PSADBW_128(dHi, sHi)),
11545                     mkexpr(math_PSADBW_128(dLo, sLo))));
11546   return res;
11547}
11548
11549
11550static Long dis_MASKMOVDQU ( VexAbiInfo* vbi, Prefix pfx,
11551                             Long delta, Bool isAvx )
11552{
11553   IRTemp regD    = newTemp(Ity_V128);
11554   IRTemp mask    = newTemp(Ity_V128);
11555   IRTemp olddata = newTemp(Ity_V128);
11556   IRTemp newdata = newTemp(Ity_V128);
11557   IRTemp addr    = newTemp(Ity_I64);
11558   UChar  modrm   = getUChar(delta);
11559   UInt   rG      = gregOfRexRM(pfx,modrm);
11560   UInt   rE      = eregOfRexRM(pfx,modrm);
11561
11562   assign( addr, handleAddrOverrides( vbi, pfx, getIReg64(R_RDI) ));
11563   assign( regD, getXMMReg( rG ));
11564
11565   /* Unfortunately can't do the obvious thing with SarN8x16
11566      here since that can't be re-emitted as SSE2 code - no such
11567      insn. */
11568   assign( mask,
11569           binop(Iop_64HLtoV128,
11570                 binop(Iop_SarN8x8,
                       getXMMRegLane64( rE, 1 ),
11572                       mkU8(7) ),
11573                 binop(Iop_SarN8x8,
                       getXMMRegLane64( rE, 0 ),
11575                       mkU8(7) ) ));
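   /* Each byte of mask is now 0x00 or 0xFF, according to the top bit
      of the corresponding byte of xmm(E).  Bytes whose mask is 0xFF
      take their value from xmm(G); the rest keep the old memory
      contents. */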
11576   assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
11577   assign( newdata, binop(Iop_OrV128,
11578                          binop(Iop_AndV128,
11579                                mkexpr(regD),
11580                                mkexpr(mask) ),
11581                          binop(Iop_AndV128,
11582                                mkexpr(olddata),
11583                                unop(Iop_NotV128, mkexpr(mask)))) );
11584   storeLE( mkexpr(addr), mkexpr(newdata) );
11585
11586   delta += 1;
11587   DIP("%smaskmovdqu %s,%s\n", isAvx ? "v" : "",
11588       nameXMMReg(rE), nameXMMReg(rG) );
11589   return delta;
11590}
11591
11592
11593static Long dis_MOVMSKPS_128 ( VexAbiInfo* vbi, Prefix pfx,
11594                               Long delta, Bool isAvx )
11595{
11596   UChar modrm = getUChar(delta);
11597   UInt   rG   = gregOfRexRM(pfx,modrm);
11598   UInt   rE   = eregOfRexRM(pfx,modrm);
11599   IRTemp t0   = newTemp(Ity_I32);
11600   IRTemp t1   = newTemp(Ity_I32);
11601   IRTemp t2   = newTemp(Ity_I32);
11602   IRTemp t3   = newTemp(Ity_I32);
11603   delta += 1;
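   /* Move the sign bit of lane i into bit i of the result: shift it
      right to position i, then mask off everything else. */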
11604   assign( t0, binop( Iop_And32,
11605                      binop(Iop_Shr32, getXMMRegLane32(rE,0), mkU8(31)),
11606                      mkU32(1) ));
11607   assign( t1, binop( Iop_And32,
11608                      binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(30)),
11609                      mkU32(2) ));
11610   assign( t2, binop( Iop_And32,
11611                      binop(Iop_Shr32, getXMMRegLane32(rE,2), mkU8(29)),
11612                      mkU32(4) ));
11613   assign( t3, binop( Iop_And32,
11614                      binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(28)),
11615                      mkU32(8) ));
11616   putIReg32( rG, binop(Iop_Or32,
11617                        binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
11618                        binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
11619   DIP("%smovmskps %s,%s\n", isAvx ? "v" : "",
11620       nameXMMReg(rE), nameIReg32(rG));
11621   return delta;
11622}
11623
11624
11625static Long dis_MOVMSKPS_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
11626{
11627   UChar modrm = getUChar(delta);
11628   UInt   rG   = gregOfRexRM(pfx,modrm);
11629   UInt   rE   = eregOfRexRM(pfx,modrm);
11630   IRTemp t0   = newTemp(Ity_I32);
11631   IRTemp t1   = newTemp(Ity_I32);
11632   IRTemp t2   = newTemp(Ity_I32);
11633   IRTemp t3   = newTemp(Ity_I32);
11634   IRTemp t4   = newTemp(Ity_I32);
11635   IRTemp t5   = newTemp(Ity_I32);
11636   IRTemp t6   = newTemp(Ity_I32);
11637   IRTemp t7   = newTemp(Ity_I32);
11638   delta += 1;
11639   assign( t0, binop( Iop_And32,
11640                      binop(Iop_Shr32, getYMMRegLane32(rE,0), mkU8(31)),
11641                      mkU32(1) ));
11642   assign( t1, binop( Iop_And32,
11643                      binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(30)),
11644                      mkU32(2) ));
11645   assign( t2, binop( Iop_And32,
11646                      binop(Iop_Shr32, getYMMRegLane32(rE,2), mkU8(29)),
11647                      mkU32(4) ));
11648   assign( t3, binop( Iop_And32,
11649                      binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(28)),
11650                      mkU32(8) ));
11651   assign( t4, binop( Iop_And32,
11652                      binop(Iop_Shr32, getYMMRegLane32(rE,4), mkU8(27)),
11653                      mkU32(16) ));
11654   assign( t5, binop( Iop_And32,
11655                      binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(26)),
11656                      mkU32(32) ));
11657   assign( t6, binop( Iop_And32,
11658                      binop(Iop_Shr32, getYMMRegLane32(rE,6), mkU8(25)),
11659                      mkU32(64) ));
11660   assign( t7, binop( Iop_And32,
11661                      binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(24)),
11662                      mkU32(128) ));
11663   putIReg32( rG, binop(Iop_Or32,
11664                        binop(Iop_Or32,
11665                              binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
11666                              binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ),
11667                        binop(Iop_Or32,
11668                              binop(Iop_Or32, mkexpr(t4), mkexpr(t5)),
11669                              binop(Iop_Or32, mkexpr(t6), mkexpr(t7)) ) ) );
11670   DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
11671   return delta;
11672}
11673
11674
11675static Long dis_MOVMSKPD_128 ( VexAbiInfo* vbi, Prefix pfx,
11676                               Long delta, Bool isAvx )
11677{
11678   UChar modrm = getUChar(delta);
11679   UInt   rG   = gregOfRexRM(pfx,modrm);
11680   UInt   rE   = eregOfRexRM(pfx,modrm);
11681   IRTemp t0   = newTemp(Ity_I32);
11682   IRTemp t1   = newTemp(Ity_I32);
11683   delta += 1;
11684   assign( t0, binop( Iop_And32,
11685                      binop(Iop_Shr32, getXMMRegLane32(rE,1), mkU8(31)),
11686                      mkU32(1) ));
11687   assign( t1, binop( Iop_And32,
11688                      binop(Iop_Shr32, getXMMRegLane32(rE,3), mkU8(30)),
11689                      mkU32(2) ));
11690   putIReg32( rG, binop(Iop_Or32, mkexpr(t0), mkexpr(t1) ) );
11691   DIP("%smovmskpd %s,%s\n", isAvx ? "v" : "",
11692       nameXMMReg(rE), nameIReg32(rG));
11693   return delta;
11694}
11695
11696
11697static Long dis_MOVMSKPD_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
11698{
11699   UChar modrm = getUChar(delta);
11700   UInt   rG   = gregOfRexRM(pfx,modrm);
11701   UInt   rE   = eregOfRexRM(pfx,modrm);
11702   IRTemp t0   = newTemp(Ity_I32);
11703   IRTemp t1   = newTemp(Ity_I32);
11704   IRTemp t2   = newTemp(Ity_I32);
11705   IRTemp t3   = newTemp(Ity_I32);
11706   delta += 1;
11707   assign( t0, binop( Iop_And32,
11708                      binop(Iop_Shr32, getYMMRegLane32(rE,1), mkU8(31)),
11709                      mkU32(1) ));
11710   assign( t1, binop( Iop_And32,
11711                      binop(Iop_Shr32, getYMMRegLane32(rE,3), mkU8(30)),
11712                      mkU32(2) ));
11713   assign( t2, binop( Iop_And32,
11714                      binop(Iop_Shr32, getYMMRegLane32(rE,5), mkU8(29)),
11715                      mkU32(4) ));
11716   assign( t3, binop( Iop_And32,
11717                      binop(Iop_Shr32, getYMMRegLane32(rE,7), mkU8(28)),
11718                      mkU32(8) ));
11719   putIReg32( rG, binop(Iop_Or32,
11720                        binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
11721                        binop(Iop_Or32, mkexpr(t2), mkexpr(t3)) ) );
11722   DIP("vmovmskps %s,%s\n", nameYMMReg(rE), nameIReg32(rG));
11723   return delta;
11724}
11725
11726
11727/* Note, this also handles SSE(1) insns. */
11728__attribute__((noinline))
11729static
11730Long dis_ESC_0F__SSE2 ( Bool* decode_OK,
11731                        VexAbiInfo* vbi,
11732                        Prefix pfx, Int sz, Long deltaIN,
11733                        DisResult* dres )
11734{
11735   IRTemp addr  = IRTemp_INVALID;
11736   IRTemp t0    = IRTemp_INVALID;
11737   IRTemp t1    = IRTemp_INVALID;
11738   IRTemp t2    = IRTemp_INVALID;
11739   IRTemp t3    = IRTemp_INVALID;
11740   IRTemp t4    = IRTemp_INVALID;
11741   IRTemp t5    = IRTemp_INVALID;
11742   IRTemp t6    = IRTemp_INVALID;
11743   UChar  modrm = 0;
11744   Int    alen  = 0;
11745   HChar  dis_buf[50];
11746
11747   *decode_OK = False;
11748
11749   Long   delta = deltaIN;
11750   UChar  opc   = getUChar(delta);
11751   delta++;
11752   switch (opc) {
11753
11754   case 0x10:
11755      if (have66noF2noF3(pfx)
11756          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11757         /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
11758         modrm = getUChar(delta);
11759         if (epartIsReg(modrm)) {
11760            putXMMReg( gregOfRexRM(pfx,modrm),
11761                       getXMMReg( eregOfRexRM(pfx,modrm) ));
11762            DIP("movupd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11763                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11764            delta += 1;
11765         } else {
11766            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11767            putXMMReg( gregOfRexRM(pfx,modrm),
11768                       loadLE(Ity_V128, mkexpr(addr)) );
11769            DIP("movupd %s,%s\n", dis_buf,
11770                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11771            delta += alen;
11772         }
11773         goto decode_success;
11774      }
11775      /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
11776         G (lo half xmm).  If E is mem, upper half of G is zeroed out.
11777         If E is reg, upper half of G is unchanged. */
11778      if (haveF2no66noF3(pfx)
11779          && (sz == 4 || /* ignore redundant REX.W */ sz == 8) ) {
11780         modrm = getUChar(delta);
11781         if (epartIsReg(modrm)) {
11782            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
11783                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
11784            DIP("movsd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11785                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
11786            delta += 1;
11787         } else {
11788            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11789            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
11790            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
11791                             loadLE(Ity_I64, mkexpr(addr)) );
11792            DIP("movsd %s,%s\n", dis_buf,
11793                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
11794            delta += alen;
11795         }
11796         goto decode_success;
11797      }
11798      /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
11799         (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
11800      if (haveF3no66noF2(pfx)
11801          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11802         modrm = getUChar(delta);
11803         if (epartIsReg(modrm)) {
11804            putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
11805                             getXMMRegLane32( eregOfRexRM(pfx,modrm), 0 ));
11806            DIP("movss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11807                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
11808            delta += 1;
11809         } else {
11810            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11811            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
11812            putXMMRegLane32( gregOfRexRM(pfx,modrm), 0,
11813                             loadLE(Ity_I32, mkexpr(addr)) );
11814            DIP("movss %s,%s\n", dis_buf,
11815                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
11816            delta += alen;
11817         }
11818         goto decode_success;
11819      }
11820      /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
11821      if (haveNo66noF2noF3(pfx)
11822          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11823         modrm = getUChar(delta);
11824         if (epartIsReg(modrm)) {
11825            putXMMReg( gregOfRexRM(pfx,modrm),
11826                       getXMMReg( eregOfRexRM(pfx,modrm) ));
11827            DIP("movups %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11828                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
11829            delta += 1;
11830         } else {
11831            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11832            putXMMReg( gregOfRexRM(pfx,modrm),
11833                       loadLE(Ity_V128, mkexpr(addr)) );
11834            DIP("movups %s,%s\n", dis_buf,
11835                                     nameXMMReg(gregOfRexRM(pfx,modrm)));
11836            delta += alen;
11837         }
11838         goto decode_success;
11839      }
11840      break;
11841
11842   case 0x11:
11843      /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
11844         or lo half xmm). */
11845      if (haveF2no66noF3(pfx)
11846          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11847         modrm = getUChar(delta);
11848         if (epartIsReg(modrm)) {
11849            putXMMRegLane64( eregOfRexRM(pfx,modrm), 0,
11850                             getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
11851            DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11852                                 nameXMMReg(eregOfRexRM(pfx,modrm)));
11853            delta += 1;
11854         } else {
11855            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11856            storeLE( mkexpr(addr),
11857                     getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
11858            DIP("movsd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11859                                 dis_buf);
11860            delta += alen;
11861         }
11862         goto decode_success;
11863      }
11864      /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
11865         or lo 1/4 xmm). */
11866      if (haveF3no66noF2(pfx) && sz == 4) {
11867         modrm = getUChar(delta);
11868         if (epartIsReg(modrm)) {
11869            /* fall through, we don't yet have a test case */
11870         } else {
11871            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11872            storeLE( mkexpr(addr),
11873                     getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
11874            DIP("movss %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11875                                 dis_buf);
11876            delta += alen;
11877            goto decode_success;
11878         }
11879      }
11880      /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
11881      if (have66noF2noF3(pfx)
11882          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11883         modrm = getUChar(delta);
11884         if (epartIsReg(modrm)) {
11885            putXMMReg( eregOfRexRM(pfx,modrm),
11886                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
11887            DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11888                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
11889            delta += 1;
11890         } else {
11891            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11892            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
11893            DIP("movupd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11894                                  dis_buf );
11895            delta += alen;
11896         }
11897         goto decode_success;
11898      }
11899      /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
11900      if (haveNo66noF2noF3(pfx)
11901          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11902         modrm = getUChar(delta);
11903         if (epartIsReg(modrm)) {
11904            /* fall through; awaiting test case */
11905         } else {
11906            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11907            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
11908            DIP("movups %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
11909                                  dis_buf );
11910            delta += alen;
11911            goto decode_success;
11912         }
11913      }
11914      break;
11915
11916   case 0x12:
11917      /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
11918      /* Identical to MOVLPS ? */
11919      if (have66noF2noF3(pfx)
11920          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11921         modrm = getUChar(delta);
11922         if (epartIsReg(modrm)) {
11923            /* fall through; apparently reg-reg is not possible */
11924         } else {
11925            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11926            delta += alen;
11927            putXMMRegLane64( gregOfRexRM(pfx,modrm),
11928                             0/*lower lane*/,
11929                             loadLE(Ity_I64, mkexpr(addr)) );
11930            DIP("movlpd %s, %s\n",
11931                dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
11932            goto decode_success;
11933         }
11934      }
11935      /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
      /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
11937      if (haveNo66noF2noF3(pfx)
11938          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11939         modrm = getUChar(delta);
11940         if (epartIsReg(modrm)) {
11941            delta += 1;
11942            putXMMRegLane64( gregOfRexRM(pfx,modrm),
11943                             0/*lower lane*/,
11944                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 1 ));
11945            DIP("movhlps %s, %s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
11946                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
11947         } else {
11948            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11949            delta += alen;
11950            putXMMRegLane64( gregOfRexRM(pfx,modrm),  0/*lower lane*/,
11951                             loadLE(Ity_I64, mkexpr(addr)) );
11952            DIP("movlps %s, %s\n",
11953                dis_buf, nameXMMReg( gregOfRexRM(pfx,modrm) ));
11954         }
11955         goto decode_success;
11956      }
11957      break;
11958
11959   case 0x13:
11960      /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
11961      if (haveNo66noF2noF3(pfx)
11962          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
11963         modrm = getUChar(delta);
11964         if (!epartIsReg(modrm)) {
11965            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11966            delta += alen;
11967            storeLE( mkexpr(addr),
11968                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
11969                                      0/*lower lane*/ ) );
11970            DIP("movlps %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
11971                                   dis_buf);
11972            goto decode_success;
11973         }
11974         /* else fall through */
11975      }
11976      /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
11977      /* Identical to MOVLPS ? */
11978      if (have66noF2noF3(pfx)
11979          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
11980         modrm = getUChar(delta);
11981         if (!epartIsReg(modrm)) {
11982            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
11983            delta += alen;
11984            storeLE( mkexpr(addr),
11985                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
11986                                      0/*lower lane*/ ) );
11987            DIP("movlpd %s, %s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
11988                                   dis_buf);
11989            goto decode_success;
11990         }
11991         /* else fall through */
11992      }
11993      break;
11994
11995   case 0x14:
11996   case 0x15:
11997      /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
11998      /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
11999      /* These just appear to be special cases of SHUFPS */
12000      if (haveNo66noF2noF3(pfx) && sz == 4) {
12001         Bool   hi = toBool(opc == 0x15);
12002         IRTemp sV = newTemp(Ity_V128);
12003         IRTemp dV = newTemp(Ity_V128);
12004         modrm = getUChar(delta);
12005         UInt   rG = gregOfRexRM(pfx,modrm);
12006         assign( dV, getXMMReg(rG) );
12007         if (epartIsReg(modrm)) {
12008            UInt rE = eregOfRexRM(pfx,modrm);
12009            assign( sV, getXMMReg(rE) );
12010            delta += 1;
12011            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12012                nameXMMReg(rE), nameXMMReg(rG));
12013         } else {
12014            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12015            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12016            delta += alen;
12017            DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
12018                dis_buf, nameXMMReg(rG));
12019         }
12020         IRTemp res = math_UNPCKxPS_128( sV, dV, hi );
12021         putXMMReg( rG, mkexpr(res) );
12022         goto decode_success;
12023      }
12024      /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
12025      /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
      /* These just appear to be special cases of SHUFPD */
12027      if (have66noF2noF3(pfx)
12028          && sz == 2 /* could be 8 if rex also present */) {
12029         Bool   hi = toBool(opc == 0x15);
12030         IRTemp sV = newTemp(Ity_V128);
12031         IRTemp dV = newTemp(Ity_V128);
12032         modrm = getUChar(delta);
12033         UInt   rG = gregOfRexRM(pfx,modrm);
12034         assign( dV, getXMMReg(rG) );
12035         if (epartIsReg(modrm)) {
12036            UInt rE = eregOfRexRM(pfx,modrm);
12037            assign( sV, getXMMReg(rE) );
12038            delta += 1;
            DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
12040                nameXMMReg(rE), nameXMMReg(rG));
12041         } else {
12042            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12043            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12044            delta += alen;
            DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
12046                dis_buf, nameXMMReg(rG));
12047         }
12048         IRTemp res = math_UNPCKxPD_128( sV, dV, hi );
12049         putXMMReg( rG, mkexpr(res) );
12050         goto decode_success;
12051      }
12052      break;
12053
12054   case 0x16:
12055      /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
      /* This seems identical to MOVHPS.  This instruction encoding is
         completely crazy. */
12058      if (have66noF2noF3(pfx)
12059          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12060         modrm = getUChar(delta);
12061         if (epartIsReg(modrm)) {
12062            /* fall through; apparently reg-reg is not possible */
12063         } else {
12064            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12065            delta += alen;
12066            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12067                             loadLE(Ity_I64, mkexpr(addr)) );
12068            DIP("movhpd %s,%s\n", dis_buf,
12069                                  nameXMMReg( gregOfRexRM(pfx,modrm) ));
12070            goto decode_success;
12071         }
12072      }
12073      /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
12074      /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
12075      if (haveNo66noF2noF3(pfx)
12076          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12077         modrm = getUChar(delta);
12078         if (epartIsReg(modrm)) {
12079            delta += 1;
12080            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12081                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ) );
12082            DIP("movhps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12083                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12084         } else {
12085            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12086            delta += alen;
12087            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1/*upper lane*/,
12088                             loadLE(Ity_I64, mkexpr(addr)) );
12089            DIP("movhps %s,%s\n", dis_buf,
12090                                  nameXMMReg( gregOfRexRM(pfx,modrm) ));
12091         }
12092         goto decode_success;
12093      }
12094      break;
12095
12096   case 0x17:
12097      /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
12098      if (haveNo66noF2noF3(pfx)
12099          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12100         modrm = getUChar(delta);
12101         if (!epartIsReg(modrm)) {
12102            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12103            delta += alen;
12104            storeLE( mkexpr(addr),
12105                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
12106                                      1/*upper lane*/ ) );
12107            DIP("movhps %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12108                                  dis_buf);
12109            goto decode_success;
12110         }
12111         /* else fall through */
12112      }
12113      /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
12114      /* Again, this seems identical to MOVHPS. */
12115      if (have66noF2noF3(pfx)
12116          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12117         modrm = getUChar(delta);
12118         if (!epartIsReg(modrm)) {
12119            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12120            delta += alen;
12121            storeLE( mkexpr(addr),
12122                     getXMMRegLane64( gregOfRexRM(pfx,modrm),
12123                                      1/*upper lane*/ ) );
12124            DIP("movhpd %s,%s\n", nameXMMReg( gregOfRexRM(pfx,modrm) ),
12125                                  dis_buf);
12126            goto decode_success;
12127         }
12128         /* else fall through */
12129      }
12130      break;
12131
12132   case 0x18:
12133      /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
12134      /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
12135      /* 0F 18 /2 = PREFETCHT1 */
12136      /* 0F 18 /3 = PREFETCHT2 */
12137      if (haveNo66noF2noF3(pfx)
12138          && !epartIsReg(getUChar(delta))
12139          && gregLO3ofRM(getUChar(delta)) >= 0
12140          && gregLO3ofRM(getUChar(delta)) <= 3) {
12141         const HChar* hintstr = "??";
12142
12143         modrm = getUChar(delta);
12144         vassert(!epartIsReg(modrm));
12145
12146         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12147         delta += alen;
12148
12149         switch (gregLO3ofRM(modrm)) {
12150            case 0: hintstr = "nta"; break;
12151            case 1: hintstr = "t0"; break;
12152            case 2: hintstr = "t1"; break;
12153            case 3: hintstr = "t2"; break;
12154            default: vassert(0);
12155         }
12156
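         /* Note that no IR is generated for the hint itself: the
            prefetch is decoded and then ignored, which is safe since
            prefetches only affect performance, never architected
            state. */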
12157         DIP("prefetch%s %s\n", hintstr, dis_buf);
12158         goto decode_success;
12159      }
12160      break;
12161
12162   case 0x28:
12163      /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
12164      if (have66noF2noF3(pfx)
12165          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12166         modrm = getUChar(delta);
12167         if (epartIsReg(modrm)) {
12168            putXMMReg( gregOfRexRM(pfx,modrm),
12169                       getXMMReg( eregOfRexRM(pfx,modrm) ));
12170            DIP("movapd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12171                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12172            delta += 1;
12173         } else {
12174            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12175            gen_SEGV_if_not_16_aligned( addr );
12176            putXMMReg( gregOfRexRM(pfx,modrm),
12177                       loadLE(Ity_V128, mkexpr(addr)) );
12178            DIP("movapd %s,%s\n", dis_buf,
12179                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12180            delta += alen;
12181         }
12182         goto decode_success;
12183      }
12184      /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
12185      if (haveNo66noF2noF3(pfx)
12186          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12187         modrm = getUChar(delta);
12188         if (epartIsReg(modrm)) {
12189            putXMMReg( gregOfRexRM(pfx,modrm),
12190                       getXMMReg( eregOfRexRM(pfx,modrm) ));
12191            DIP("movaps %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12192                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12193            delta += 1;
12194         } else {
12195            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12196            gen_SEGV_if_not_16_aligned( addr );
12197            putXMMReg( gregOfRexRM(pfx,modrm),
12198                       loadLE(Ity_V128, mkexpr(addr)) );
12199            DIP("movaps %s,%s\n", dis_buf,
12200                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
12201            delta += alen;
12202         }
12203         goto decode_success;
12204      }
12205      break;
12206
12207   case 0x29:
12208      /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
12209      if (haveNo66noF2noF3(pfx)
12210          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12211         modrm = getUChar(delta);
12212         if (epartIsReg(modrm)) {
12213            putXMMReg( eregOfRexRM(pfx,modrm),
12214                       getXMMReg( gregOfRexRM(pfx,modrm) ));
12215            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12216                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
12217            delta += 1;
12218         } else {
12219            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12220            gen_SEGV_if_not_16_aligned( addr );
12221            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12222            DIP("movaps %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12223                                  dis_buf );
12224            delta += alen;
12225         }
12226         goto decode_success;
12227      }
12228      /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
12229      if (have66noF2noF3(pfx)
12230          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12231         modrm = getUChar(delta);
12232         if (epartIsReg(modrm)) {
12233            putXMMReg( eregOfRexRM(pfx,modrm),
12234                       getXMMReg( gregOfRexRM(pfx,modrm) ) );
12235            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12236                                  nameXMMReg(eregOfRexRM(pfx,modrm)));
12237            delta += 1;
12238         } else {
12239            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12240            gen_SEGV_if_not_16_aligned( addr );
12241            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12242            DIP("movapd %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
12243                                  dis_buf );
12244            delta += alen;
12245         }
12246         goto decode_success;
12247      }
12248      break;
12249
12250   case 0x2A:
12251      /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
12252         half xmm */
12253      if (haveNo66noF2noF3(pfx) && sz == 4) {
12254         IRTemp arg64 = newTemp(Ity_I64);
12255         IRTemp rmode = newTemp(Ity_I32);
12256
12257         modrm = getUChar(delta);
12258         do_MMX_preamble();
12259         if (epartIsReg(modrm)) {
12260            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
12261            delta += 1;
12262            DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
12263                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12264         } else {
12265            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12266            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
12267            delta += alen;
12268            DIP("cvtpi2ps %s,%s\n", dis_buf,
12269                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
12270         }
12271
12272         assign( rmode, get_sse_roundingmode() );
12273
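         /* Write the two converted values to lanes 0 and 1 only;
            lanes 2 and 3 of the destination are left unchanged, as
            the architecture requires. */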
12274         putXMMRegLane32F(
12275            gregOfRexRM(pfx,modrm), 0,
12276            binop(Iop_F64toF32,
12277                  mkexpr(rmode),
12278                  unop(Iop_I32StoF64,
12279                       unop(Iop_64to32, mkexpr(arg64)) )) );
12280
12281         putXMMRegLane32F(
12282            gregOfRexRM(pfx,modrm), 1,
12283            binop(Iop_F64toF32,
12284                  mkexpr(rmode),
12285                  unop(Iop_I32StoF64,
12286                       unop(Iop_64HIto32, mkexpr(arg64)) )) );
12287
12288         goto decode_success;
12289      }
12290      /* F3 0F 2A = CVTSI2SS
12291         -- sz==4: convert I32 in mem/ireg to F32 in low quarter xmm
12292         -- sz==8: convert I64 in mem/ireg to F32 in low quarter xmm */
12293      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
12294         IRTemp rmode = newTemp(Ity_I32);
12295         assign( rmode, get_sse_roundingmode() );
12296         modrm = getUChar(delta);
12297         if (sz == 4) {
12298            IRTemp arg32 = newTemp(Ity_I32);
12299            if (epartIsReg(modrm)) {
12300               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
12301               delta += 1;
12302               DIP("cvtsi2ss %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
12303                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
12304            } else {
12305               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12306               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
12307               delta += alen;
12308               DIP("cvtsi2ss %s,%s\n", dis_buf,
12309                                       nameXMMReg(gregOfRexRM(pfx,modrm)) );
12310            }
12311            putXMMRegLane32F(
12312               gregOfRexRM(pfx,modrm), 0,
12313               binop(Iop_F64toF32,
12314                     mkexpr(rmode),
12315                     unop(Iop_I32StoF64, mkexpr(arg32)) ) );
12316         } else {
12317            /* sz == 8 */
12318            IRTemp arg64 = newTemp(Ity_I64);
12319            if (epartIsReg(modrm)) {
12320               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
12321               delta += 1;
12322               DIP("cvtsi2ssq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
12323                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
12324            } else {
12325               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12326               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
12327               delta += alen;
12328               DIP("cvtsi2ssq %s,%s\n", dis_buf,
12329                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
12330            }
12331            putXMMRegLane32F(
12332               gregOfRexRM(pfx,modrm), 0,
12333               binop(Iop_F64toF32,
12334                     mkexpr(rmode),
12335                     binop(Iop_I64StoF64, mkexpr(rmode), mkexpr(arg64)) ) );
12336         }
12337         goto decode_success;
12338      }
12339      /* F2 0F 2A = CVTSI2SD
12340         when sz==4 -- convert I32 in mem/ireg to F64 in low half xmm
12341         when sz==8 -- convert I64 in mem/ireg to F64 in low half xmm
12342      */
12343      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
12344         modrm = getUChar(delta);
12345         if (sz == 4) {
12346            IRTemp arg32 = newTemp(Ity_I32);
12347            if (epartIsReg(modrm)) {
12348               assign( arg32, getIReg32(eregOfRexRM(pfx,modrm)) );
12349               delta += 1;
12350               DIP("cvtsi2sdl %s,%s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
12351                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
12352            } else {
12353               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12354               assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
12355               delta += alen;
12356               DIP("cvtsi2sdl %s,%s\n", dis_buf,
12357                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
12358            }
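            /* No rounding mode is needed here: every I32 value is
               exactly representable as an F64, so the conversion is
               exact. */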
12359            putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
12360                              unop(Iop_I32StoF64, mkexpr(arg32))
12361            );
12362         } else {
12363            /* sz == 8 */
12364            IRTemp arg64 = newTemp(Ity_I64);
12365            if (epartIsReg(modrm)) {
12366               assign( arg64, getIReg64(eregOfRexRM(pfx,modrm)) );
12367               delta += 1;
12368               DIP("cvtsi2sdq %s,%s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
12369                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
12370            } else {
12371               addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12372               assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
12373               delta += alen;
12374               DIP("cvtsi2sdq %s,%s\n", dis_buf,
12375                                        nameXMMReg(gregOfRexRM(pfx,modrm)) );
12376            }
12377            putXMMRegLane64F(
12378               gregOfRexRM(pfx,modrm),
12379               0,
12380               binop( Iop_I64StoF64,
12381                      get_sse_roundingmode(),
12382                      mkexpr(arg64)
12383               )
12384            );
12385         }
12386         goto decode_success;
12387      }
12388      /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
12389         xmm(G) */
12390      if (have66noF2noF3(pfx) && sz == 2) {
12391         IRTemp arg64 = newTemp(Ity_I64);
12392
12393         modrm = getUChar(delta);
12394         if (epartIsReg(modrm)) {
12395            /* Only switch to MMX mode if the source is an MMX register.
12396               This is inconsistent with all other instructions which
12397               convert between XMM and (M64 or MMX), which always switch to
12398               MMX mode even if the 64-bit operand is M64 and not MMX.  At
12399               least, that's what the Intel docs seem to me to say.
12400               Fixes #210264. */
12401            do_MMX_preamble();
12402            assign( arg64, getMMXReg(eregLO3ofRM(modrm)) );
12403            delta += 1;
12404            DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
12405                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12406         } else {
12407            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12408            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
12409            delta += alen;
12410            DIP("cvtpi2pd %s,%s\n", dis_buf,
12411                                    nameXMMReg(gregOfRexRM(pfx,modrm)) );
12412         }
12413
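         /* As with CVTSI2SD above, I32 -> F64 conversion is always
            exact, hence no rounding mode is needed for either lane. */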
12414         putXMMRegLane64F(
12415            gregOfRexRM(pfx,modrm), 0,
12416            unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
12417         );
12418
12419         putXMMRegLane64F(
12420            gregOfRexRM(pfx,modrm), 1,
12421            unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
12422         );
12423
12424         goto decode_success;
12425      }
12426      break;
12427
12428   case 0x2B:
12429      /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
12430      /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
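      /* The non-temporal (cache-bypass) hint is simply dropped here;
         it only affects performance, so a normal aligned store is
         semantically equivalent. */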
12431      if ( (haveNo66noF2noF3(pfx) && sz == 4)
12432           || (have66noF2noF3(pfx) && sz == 2) ) {
12433         modrm = getUChar(delta);
12434         if (!epartIsReg(modrm)) {
12435            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12436            gen_SEGV_if_not_16_aligned( addr );
12437            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
12438            DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
12439                                    dis_buf,
12440                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12441            delta += alen;
12442            goto decode_success;
12443         }
12444         /* else fall through */
12445      }
12446      break;
12447
12448   case 0x2C:
12449   case 0x2D:
12450      /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
12451         I32 in mmx, according to prevailing SSE rounding mode */
12452      /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
12453         I32 in mmx, rounding towards zero */
12454      if (haveNo66noF2noF3(pfx) && sz == 4) {
12455         IRTemp dst64  = newTemp(Ity_I64);
12456         IRTemp rmode  = newTemp(Ity_I32);
12457         IRTemp f32lo  = newTemp(Ity_F32);
12458         IRTemp f32hi  = newTemp(Ity_F32);
12459         Bool   r2zero = toBool(opc == 0x2C);
12460
12461         do_MMX_preamble();
12462         modrm = getUChar(delta);
12463
12464         if (epartIsReg(modrm)) {
12465            delta += 1;
12466            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
12467            assign(f32hi, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 1));
12468            DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
12469                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
12470                                      nameMMXReg(gregLO3ofRM(modrm)));
12471         } else {
12472            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12473            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
12474            assign(f32hi, loadLE(Ity_F32, binop( Iop_Add64,
12475                                                 mkexpr(addr),
12476                                                 mkU64(4) )));
12477            delta += alen;
12478            DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
12479                                      dis_buf,
12480                                      nameMMXReg(gregLO3ofRM(modrm)));
12481         }
12482
12483         if (r2zero) {
12484            assign(rmode, mkU32((UInt)Irrm_ZERO) );
12485         } else {
12486            assign( rmode, get_sse_roundingmode() );
12487         }
12488
12489         assign(
12490            dst64,
12491            binop( Iop_32HLto64,
12492                   binop( Iop_F64toI32S,
12493                          mkexpr(rmode),
12494                          unop( Iop_F32toF64, mkexpr(f32hi) ) ),
12495                   binop( Iop_F64toI32S,
12496                          mkexpr(rmode),
12497                          unop( Iop_F32toF64, mkexpr(f32lo) ) )
12498                 )
12499         );
12500
12501         putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
12502         goto decode_success;
12503      }
12504      /* F3 0F 2D = CVTSS2SI
12505         when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
12506                       according to prevailing SSE rounding mode
12507         when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
12508                       according to prevailing SSE rounding mode
12509      */
12510      /* F3 0F 2C = CVTTSS2SI
12511         when sz==4 -- convert F32 in mem/low quarter xmm to I32 in ireg,
12512                       truncating towards zero
12513         when sz==8 -- convert F32 in mem/low quarter xmm to I64 in ireg,
12514                       truncating towards zero
12515      */
12516      if (haveF3no66noF2(pfx) && (sz == 4 || sz == 8)) {
12517         delta = dis_CVTxSS2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
12518         goto decode_success;
12519      }
12520      /* F2 0F 2D = CVTSD2SI
12521         when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
12522                       according to prevailing SSE rounding mode
12523         when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
12524                       according to prevailing SSE rounding mode
12525      */
12526      /* F2 0F 2C = CVTTSD2SI
12527         when sz==4 -- convert F64 in mem/low half xmm to I32 in ireg,
12528                       truncating towards zero
12529         when sz==8 -- convert F64 in mem/low half xmm to I64 in ireg,
12530                       truncating towards zero
12531      */
12532      if (haveF2no66noF3(pfx) && (sz == 4 || sz == 8)) {
12533         delta = dis_CVTxSD2SI( vbi, pfx, delta, False/*!isAvx*/, opc, sz);
12534         goto decode_success;
12535      }
12536      /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
12537         I32 in mmx, according to prevailing SSE rounding mode */
12538      /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
12539         I32 in mmx, rounding towards zero */
12540      if (have66noF2noF3(pfx) && sz == 2) {
12541         IRTemp dst64  = newTemp(Ity_I64);
12542         IRTemp rmode  = newTemp(Ity_I32);
12543         IRTemp f64lo  = newTemp(Ity_F64);
12544         IRTemp f64hi  = newTemp(Ity_F64);
12545         Bool   r2zero = toBool(opc == 0x2C);
12546
12547         do_MMX_preamble();
12548         modrm = getUChar(delta);
12549
12550         if (epartIsReg(modrm)) {
12551            delta += 1;
12552            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
12553            assign(f64hi, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 1));
12554            DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
12555                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
12556                                      nameMMXReg(gregLO3ofRM(modrm)));
12557         } else {
12558            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12559            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
12560            assign(f64hi, loadLE(Ity_F64, binop( Iop_Add64,
12561                                                 mkexpr(addr),
12562                                                 mkU64(8) )));
12563            delta += alen;
12564            DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
12565                                      dis_buf,
12566                                      nameMMXReg(gregLO3ofRM(modrm)));
12567         }
12568
12569         if (r2zero) {
12570            assign(rmode, mkU32((UInt)Irrm_ZERO) );
12571         } else {
12572            assign( rmode, get_sse_roundingmode() );
12573         }
12574
12575         assign(
12576            dst64,
12577            binop( Iop_32HLto64,
12578                   binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
12579                   binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
12580                 )
12581         );
12582
12583         putMMXReg(gregLO3ofRM(modrm), mkexpr(dst64));
12584         goto decode_success;
12585      }
12586      break;
12587
12588   case 0x2E:
12589   case 0x2F:
12590      /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
12591      /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
12592      if (have66noF2noF3(pfx) && sz == 2) {
12593         delta = dis_COMISD( vbi, pfx, delta, False/*!isAvx*/, opc );
12594         goto decode_success;
12595      }
12596      /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
12597      /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
12598      if (haveNo66noF2noF3(pfx) && sz == 4) {
12599         delta = dis_COMISS( vbi, pfx, delta, False/*!isAvx*/, opc );
12600         goto decode_success;
12601      }
12602      break;
12603
12604   case 0x50:
12605      /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
12606         to 4 lowest bits of ireg(G) */
12607      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
12608          && epartIsReg(getUChar(delta))) {
12609         /* sz == 8 is a kludge to handle insns with REX.W redundantly
12610            set to 1, which has been known to happen:
12611
12612            4c 0f 50 d9             rex64X movmskps %xmm1,%r11d
12613
12614            20071106: Intel docs say that REX.W isn't redundant: when
12615            present, a 64-bit register is written; when not present, only
12616            the 32-bit half is written.  However, testing on a Core2
12617            machine suggests the entire 64 bit register is written
12618            irrespective of the status of REX.W.  That could be because
12619            of the default rule that says "if the lower 32 bits of a 64-bit
12620            register are written, the upper half is zeroed".  By using
12621            putIReg32 here we inadvertently produce the same behaviour as
12622            the Core2, for the same reason -- putIReg32 implements said
12623            rule.
12624
12625            AMD docs give no indication that REX.W is even valid for this
12626            insn. */
12627         delta = dis_MOVMSKPS_128( vbi, pfx, delta, False/*!isAvx*/ );
12628         goto decode_success;
12629      }
12630      /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
12631         2 lowest bits of ireg(G) */
12632      if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
12633         /* sz == 8 is a kludge to handle insns with REX.W redundantly
12634            set to 1, which has been known to happen:
12635            66 4c 0f 50 d9          rex64X movmskpd %xmm1,%r11d
12636            20071106: see further comments on MOVMSKPS implementation above.
12637         */
12638         delta = dis_MOVMSKPD_128( vbi, pfx, delta, False/*!isAvx*/ );
12639         goto decode_success;
12640      }
12641      break;
12642
12643   case 0x51:
12644      /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
12645      if (haveF3no66noF2(pfx) && sz == 4) {
12646         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
12647                                            "sqrtss", Iop_Sqrt32F0x4 );
12648         goto decode_success;
12649      }
12650      /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
12651      if (haveNo66noF2noF3(pfx) && sz == 4) {
12652         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
12653                                           "sqrtps", Iop_Sqrt32Fx4 );
12654         goto decode_success;
12655      }
12656      /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
12657      if (haveF2no66noF3(pfx) && sz == 4) {
12658         delta = dis_SSE_E_to_G_unary_lo64( vbi, pfx, delta,
12659                                            "sqrtsd", Iop_Sqrt64F0x2 );
12660         goto decode_success;
12661      }
12662      /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
12663      if (have66noF2noF3(pfx) && sz == 2) {
12664         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
12665                                           "sqrtpd", Iop_Sqrt64Fx2 );
12666         goto decode_success;
12667      }
12668      break;
12669
12670   case 0x52:
12671      /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
12672      if (haveF3no66noF2(pfx) && sz == 4) {
12673         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
12674                                            "rsqrtss", Iop_RSqrt32F0x4 );
12675         goto decode_success;
12676      }
12677      /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
12678      if (haveNo66noF2noF3(pfx) && sz == 4) {
12679         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
12680                                           "rsqrtps", Iop_RSqrt32Fx4 );
12681         goto decode_success;
12682      }
12683      break;
12684
12685   case 0x53:
12686      /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
12687      if (haveF3no66noF2(pfx) && sz == 4) {
12688         delta = dis_SSE_E_to_G_unary_lo32( vbi, pfx, delta,
12689                                            "rcpss", Iop_Recip32F0x4 );
12690         goto decode_success;
12691      }
12692      /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
12693      if (haveNo66noF2noF3(pfx) && sz == 4) {
12694         delta = dis_SSE_E_to_G_unary_all( vbi, pfx, delta,
12695                                           "rcpps", Iop_Recip32Fx4 );
12696         goto decode_success;
12697      }
12698      break;
12699
12700   case 0x54:
12701      /* 0F 54 = ANDPS -- G = G and E */
12702      if (haveNo66noF2noF3(pfx) && sz == 4) {
12703         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andps", Iop_AndV128 );
12704         goto decode_success;
12705      }
12706      /* 66 0F 54 = ANDPD -- G = G and E */
12707      if (have66noF2noF3(pfx) && sz == 2) {
12708         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "andpd", Iop_AndV128 );
12709         goto decode_success;
12710      }
12711      break;
12712
12713   case 0x55:
12714      /* 0F 55 = ANDNPS -- G = (not G) and E */
12715      if (haveNo66noF2noF3(pfx) && sz == 4) {
12716         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnps",
12717                                                           Iop_AndV128 );
12718         goto decode_success;
12719      }
12720      /* 66 0F 55 = ANDNPD -- G = (not G) and E */
12721      if (have66noF2noF3(pfx) && sz == 2) {
12722         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "andnpd",
12723                                                           Iop_AndV128 );
12724         goto decode_success;
12725      }
12726      break;
12727
12728   case 0x56:
12729      /* 0F 56 = ORPS -- G = G or E */
12730      if (haveNo66noF2noF3(pfx) && sz == 4) {
12731         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orps", Iop_OrV128 );
12732         goto decode_success;
12733      }
12734      /* 66 0F 56 = ORPD -- G = G or E */
12735      if (have66noF2noF3(pfx) && sz == 2) {
12736         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "orpd", Iop_OrV128 );
12737         goto decode_success;
12738      }
12739      break;
12740
12741   case 0x57:
12742      /* 66 0F 57 = XORPD -- G = G xor E */
12743      if (have66noF2noF3(pfx) && sz == 2) {
12744         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorpd", Iop_XorV128 );
12745         goto decode_success;
12746      }
12747      /* 0F 57 = XORPS -- G = G xor E */
12748      if (haveNo66noF2noF3(pfx) && sz == 4) {
12749         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "xorps", Iop_XorV128 );
12750         goto decode_success;
12751      }
12752      break;
12753
12754   case 0x58:
12755      /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
12756      if (haveNo66noF2noF3(pfx) && sz == 4) {
12757         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addps", Iop_Add32Fx4 );
12758         goto decode_success;
12759      }
12760      /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
12761      if (haveF3no66noF2(pfx) && sz == 4) {
12762         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "addss", Iop_Add32F0x4 );
12763         goto decode_success;
12764      }
12765      /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
12766      if (haveF2no66noF3(pfx)
12767          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12768         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "addsd", Iop_Add64F0x2 );
12769         goto decode_success;
12770      }
12771      /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
12772      if (have66noF2noF3(pfx)
12773          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12774         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "addpd", Iop_Add64Fx2 );
12775         goto decode_success;
12776      }
12777      break;
12778
12779   case 0x59:
12780      /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
12781      if (haveF2no66noF3(pfx)
12782          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12783         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "mulsd", Iop_Mul64F0x2 );
12784         goto decode_success;
12785      }
12786      /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
12787      if (haveF3no66noF2(pfx) && sz == 4) {
12788         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "mulss", Iop_Mul32F0x4 );
12789         goto decode_success;
12790      }
12791      /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
12792      if (haveNo66noF2noF3(pfx) && sz == 4) {
12793         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulps", Iop_Mul32Fx4 );
12794         goto decode_success;
12795      }
12796      /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
12797      if (have66noF2noF3(pfx)
12798          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
12799         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "mulpd", Iop_Mul64Fx2 );
12800         goto decode_success;
12801      }
12802      break;
12803
12804   case 0x5A:
12805      /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
12806         F64 in xmm(G). */
12807      if (haveNo66noF2noF3(pfx) && sz == 4) {
12808         delta = dis_CVTPS2PD_128( vbi, pfx, delta, False/*!isAvx*/ );
12809         goto decode_success;
12810      }
12811      /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
12812         low half xmm(G) */
12813      if (haveF3no66noF2(pfx) && sz == 4) {
12814         IRTemp f32lo = newTemp(Ity_F32);
12815
12816         modrm = getUChar(delta);
12817         if (epartIsReg(modrm)) {
12818            delta += 1;
12819            assign(f32lo, getXMMRegLane32F(eregOfRexRM(pfx,modrm), 0));
12820            DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12821                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12822         } else {
12823            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12824            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
12825            delta += alen;
12826            DIP("cvtss2sd %s,%s\n", dis_buf,
12827                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12828         }
12829
12830         putXMMRegLane64F( gregOfRexRM(pfx,modrm), 0,
12831                           unop( Iop_F32toF64, mkexpr(f32lo) ) );
12832
12833         goto decode_success;
12834      }
12835      /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
12836         low 1/4 xmm(G), according to prevailing SSE rounding mode */
12837      if (haveF2no66noF3(pfx) && sz == 4) {
12838         IRTemp rmode = newTemp(Ity_I32);
12839         IRTemp f64lo = newTemp(Ity_F64);
12840
12841         modrm = getUChar(delta);
12842         if (epartIsReg(modrm)) {
12843            delta += 1;
12844            assign(f64lo, getXMMRegLane64F(eregOfRexRM(pfx,modrm), 0));
12845            DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
12846                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12847         } else {
12848            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
12849            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
12850            delta += alen;
12851            DIP("cvtsd2ss %s,%s\n", dis_buf,
12852                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
12853         }
12854
12855         assign( rmode, get_sse_roundingmode() );
12856         putXMMRegLane32F(
12857            gregOfRexRM(pfx,modrm), 0,
12858            binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
12859         );
12860
12861         goto decode_success;
12862      }
12863      /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
12864         lo half xmm(G), rounding according to prevailing SSE rounding
12865         mode, and zero upper half */
12866      /* Note, this is practically identical to CVTPD2DQ.  It would be
12867         nice to merge them together. */
12868      if (have66noF2noF3(pfx) && sz == 2) {
12869         delta = dis_CVTPD2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
12870         goto decode_success;
12871      }
12872      break;
12873
12874   case 0x5B:
12875      /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
12876         xmm(G), rounding towards zero */
12877      /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
12878         xmm(G), as per the prevailing rounding mode */
12879      if ( (have66noF2noF3(pfx) && sz == 2)
12880           || (haveF3no66noF2(pfx) && sz == 4) ) {
12881         Bool r2zero = toBool(sz == 4); /* sz==4 iff F3, ie CVTTPS2DQ;
                                                the guard above makes this
                                                test reliable */
12882         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta, False/*!isAvx*/, r2zero );
12883         goto decode_success;
12884      }
12885      /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
12886         xmm(G) */
12887      if (haveNo66noF2noF3(pfx) && sz == 4) {
12888         delta = dis_CVTDQ2PS_128( vbi, pfx, delta, False/*!isAvx*/ );
12889         goto decode_success;
12890      }
12891      break;
12892
12893   case 0x5C:
12894      /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
12895      if (haveF3no66noF2(pfx) && sz == 4) {
12896         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "subss", Iop_Sub32F0x4 );
12897         goto decode_success;
12898      }
12899      /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
12900      if (haveF2no66noF3(pfx)
12901          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
12902         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "subsd", Iop_Sub64F0x2 );
12903         goto decode_success;
12904      }
12905      /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
12906      if (haveNo66noF2noF3(pfx) && sz == 4) {
12907         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subps", Iop_Sub32Fx4 );
12908         goto decode_success;
12909      }
12910      /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
12911      if (have66noF2noF3(pfx) && sz == 2) {
12912         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "subpd", Iop_Sub64Fx2 );
12913         goto decode_success;
12914      }
12915      break;
12916
12917   case 0x5D:
12918      /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
12919      if (haveNo66noF2noF3(pfx) && sz == 4) {
12920         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minps", Iop_Min32Fx4 );
12921         goto decode_success;
12922      }
12923      /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
12924      if (haveF3no66noF2(pfx) && sz == 4) {
12925         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "minss", Iop_Min32F0x4 );
12926         goto decode_success;
12927      }
12928      /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
12929      if (haveF2no66noF3(pfx) && sz == 4) {
12930         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "minsd", Iop_Min64F0x2 );
12931         goto decode_success;
12932      }
12933      /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
12934      if (have66noF2noF3(pfx) && sz == 2) {
12935         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "minpd", Iop_Min64Fx2 );
12936         goto decode_success;
12937      }
12938      break;
12939
12940   case 0x5E:
12941      /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
12942      if (haveF2no66noF3(pfx) && sz == 4) {
12943         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "divsd", Iop_Div64F0x2 );
12944         goto decode_success;
12945      }
12946      /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
12947      if (haveNo66noF2noF3(pfx) && sz == 4) {
12948         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divps", Iop_Div32Fx4 );
12949         goto decode_success;
12950      }
12951      /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
12952      if (haveF3no66noF2(pfx) && sz == 4) {
12953         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "divss", Iop_Div32F0x4 );
12954         goto decode_success;
12955      }
12956      /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
12957      if (have66noF2noF3(pfx) && sz == 2) {
12958         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "divpd", Iop_Div64Fx2 );
12959         goto decode_success;
12960      }
12961      break;
12962
12963   case 0x5F:
12964      /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
12965      if (haveNo66noF2noF3(pfx) && sz == 4) {
12966         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxps", Iop_Max32Fx4 );
12967         goto decode_success;
12968      }
12969      /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
12970      if (haveF3no66noF2(pfx) && sz == 4) {
12971         delta = dis_SSE_E_to_G_lo32( vbi, pfx, delta, "maxss", Iop_Max32F0x4 );
12972         goto decode_success;
12973      }
12974      /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
12975      if (haveF2no66noF3(pfx) && sz == 4) {
12976         delta = dis_SSE_E_to_G_lo64( vbi, pfx, delta, "maxsd", Iop_Max64F0x2 );
12977         goto decode_success;
12978      }
12979      /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
12980      if (have66noF2noF3(pfx) && sz == 2) {
12981         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "maxpd", Iop_Max64Fx2 );
12982         goto decode_success;
12983      }
12984      break;
12985
12986   case 0x60:
12987      /* 66 0F 60 = PUNPCKLBW */
12988      if (have66noF2noF3(pfx) && sz == 2) {
12989         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
12990                                    "punpcklbw",
12991                                    Iop_InterleaveLO8x16, True );
12992         goto decode_success;
12993      }
12994      break;
12995
12996   case 0x61:
12997      /* 66 0F 61 = PUNPCKLWD */
12998      if (have66noF2noF3(pfx) && sz == 2) {
12999         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13000                                    "punpcklwd",
13001                                    Iop_InterleaveLO16x8, True );
13002         goto decode_success;
13003      }
13004      break;
13005
13006   case 0x62:
13007      /* 66 0F 62 = PUNPCKLDQ */
13008      if (have66noF2noF3(pfx) && sz == 2) {
13009         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13010                                    "punpckldq",
13011                                    Iop_InterleaveLO32x4, True );
13012         goto decode_success;
13013      }
13014      break;
13015
13016   case 0x63:
13017      /* 66 0F 63 = PACKSSWB */
13018      if (have66noF2noF3(pfx) && sz == 2) {
13019         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13020                                    "packsswb",
13021                                    Iop_QNarrowBin16Sto8Sx16, True );
13022         goto decode_success;
13023      }
13024      break;
13025
13026   case 0x64:
13027      /* 66 0F 64 = PCMPGTB */
13028      if (have66noF2noF3(pfx) && sz == 2) {
13029         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13030                                    "pcmpgtb", Iop_CmpGT8Sx16, False );
13031         goto decode_success;
13032      }
13033      break;
13034
13035   case 0x65:
13036      /* 66 0F 65 = PCMPGTW */
13037      if (have66noF2noF3(pfx) && sz == 2) {
13038         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13039                                    "pcmpgtw", Iop_CmpGT16Sx8, False );
13040         goto decode_success;
13041      }
13042      break;
13043
13044   case 0x66:
13045      /* 66 0F 66 = PCMPGTD */
13046      if (have66noF2noF3(pfx) && sz == 2) {
13047         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13048                                    "pcmpgtd", Iop_CmpGT32Sx4, False );
13049         goto decode_success;
13050      }
13051      break;
13052
13053   case 0x67:
13054      /* 66 0F 67 = PACKUSWB */
13055      if (have66noF2noF3(pfx) && sz == 2) {
13056         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13057                                    "packuswb",
13058                                    Iop_QNarrowBin16Sto8Ux16, True );
13059         goto decode_success;
13060      }
13061      break;
13062
13063   case 0x68:
13064      /* 66 0F 68 = PUNPCKHBW */
13065      if (have66noF2noF3(pfx) && sz == 2) {
13066         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13067                                    "punpckhbw",
13068                                    Iop_InterleaveHI8x16, True );
13069         goto decode_success;
13070      }
13071      break;
13072
13073   case 0x69:
13074      /* 66 0F 69 = PUNPCKHWD */
13075      if (have66noF2noF3(pfx) && sz == 2) {
13076         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13077                                    "punpckhwd",
13078                                    Iop_InterleaveHI16x8, True );
13079         goto decode_success;
13080      }
13081      break;
13082
13083   case 0x6A:
13084      /* 66 0F 6A = PUNPCKHDQ */
13085      if (have66noF2noF3(pfx) && sz == 2) {
13086         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13087                                    "punpckhdq",
13088                                    Iop_InterleaveHI32x4, True );
13089         goto decode_success;
13090      }
13091      break;
13092
13093   case 0x6B:
13094      /* 66 0F 6B = PACKSSDW */
13095      if (have66noF2noF3(pfx) && sz == 2) {
13096         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13097                                    "packssdw",
13098                                    Iop_QNarrowBin32Sto16Sx8, True );
13099         goto decode_success;
13100      }
13101      break;
13102
13103   case 0x6C:
13104      /* 66 0F 6C = PUNPCKLQDQ */
13105      if (have66noF2noF3(pfx) && sz == 2) {
13106         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13107                                    "punpcklqdq",
13108                                    Iop_InterleaveLO64x2, True );
13109         goto decode_success;
13110      }
13111      break;
13112
13113   case 0x6D:
13114      /* 66 0F 6D = PUNPCKHQDQ */
13115      if (have66noF2noF3(pfx) && sz == 2) {
13116         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13117                                    "punpckhqdq",
13118                                    Iop_InterleaveHI64x2, True );
13119         goto decode_success;
13120      }
13121      break;
13122
13123   case 0x6E:
13124      /* 66 0F 6E = MOVD from ireg32/m32 to xmm lo 1/4,
13125                    zeroing high 3/4 of xmm. */
13126      /*              or from ireg64/m64 to xmm lo 1/2,
13127                    zeroing high 1/2 of xmm. */
13128      if (have66noF2noF3(pfx)) {
13129         vassert(sz == 2 || sz == 8);
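         /* sz arrives as 2 because of the 66 prefix, but the operand
            really is 32 bits wide; sz == 8 (REX.W) selects the 64-bit
            MOVQ form. */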
13130         if (sz == 2) sz = 4;
13131         modrm = getUChar(delta);
13132         if (epartIsReg(modrm)) {
13133            delta += 1;
13134            if (sz == 4) {
13135               putXMMReg(
13136                  gregOfRexRM(pfx,modrm),
13137                  unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
13138               );
13139               DIP("movd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
13140                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13141            } else {
13142               putXMMReg(
13143                  gregOfRexRM(pfx,modrm),
13144                  unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
13145               );
13146               DIP("movq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
13147                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
13148            }
13149         } else {
13150            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
13151            delta += alen;
13152            putXMMReg(
13153               gregOfRexRM(pfx,modrm),
13154               sz == 4
13155                  ?  unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
13156                  :  unop( Iop_64UtoV128,loadLE(Ity_I64, mkexpr(addr)) )
13157            );
13158            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q', dis_buf,
13159                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13160         }
13161         goto decode_success;
13162      }
13163      break;
13164
13165   case 0x6F:
13166      if (have66noF2noF3(pfx)
13167          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13168         /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
13169         modrm = getUChar(delta);
13170         if (epartIsReg(modrm)) {
13171            putXMMReg( gregOfRexRM(pfx,modrm),
13172                       getXMMReg( eregOfRexRM(pfx,modrm) ));
13173            DIP("movdqa %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13174                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13175            delta += 1;
13176         } else {
13177            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13178            gen_SEGV_if_not_16_aligned( addr );
13179            putXMMReg( gregOfRexRM(pfx,modrm),
13180                       loadLE(Ity_V128, mkexpr(addr)) );
13181            DIP("movdqa %s,%s\n", dis_buf,
13182                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13183            delta += alen;
13184         }
13185         goto decode_success;
13186      }
13187      if (haveF3no66noF2(pfx) && sz == 4) {
13188         /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
13189         modrm = getUChar(delta);
13190         if (epartIsReg(modrm)) {
13191            putXMMReg( gregOfRexRM(pfx,modrm),
13192                       getXMMReg( eregOfRexRM(pfx,modrm) ));
13193            DIP("movdqu %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
13194                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13195            delta += 1;
13196         } else {
13197            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
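            /* Unlike MOVDQA, MOVDQU imposes no 16-alignment check. */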
13198            putXMMReg( gregOfRexRM(pfx,modrm),
13199                       loadLE(Ity_V128, mkexpr(addr)) );
13200            DIP("movdqu %s,%s\n", dis_buf,
13201                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
13202            delta += alen;
13203         }
13204         goto decode_success;
13205      }
13206      break;
13207
13208   case 0x70:
13209      /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
13210      if (have66noF2noF3(pfx) && sz == 2) {
13211         delta = dis_PSHUFD_32x4( vbi, pfx, delta, False/*!writesYmm*/);
13212         goto decode_success;
13213      }
13214      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13215      /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
13216      if (haveNo66noF2noF3(pfx) && sz == 4) {
13217         Int order;
13218         IRTemp sV, dV, s3, s2, s1, s0;
13219         s3 = s2 = s1 = s0 = IRTemp_INVALID;
13220         sV = newTemp(Ity_I64);
13221         dV = newTemp(Ity_I64);
13222         do_MMX_preamble();
13223         modrm = getUChar(delta);
13224         if (epartIsReg(modrm)) {
13225            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
13226            order = (Int)getUChar(delta+1);
13227            delta += 1+1;
13228            DIP("pshufw $%d,%s,%s\n", order,
13229                                      nameMMXReg(eregLO3ofRM(modrm)),
13230                                      nameMMXReg(gregLO3ofRM(modrm)));
13231         } else {
13232            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
13233                              1/*extra byte after amode*/ );
13234            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
13235            order = (Int)getUChar(delta+alen);
13236            delta += 1+alen;
13237            DIP("pshufw $%d,%s,%s\n", order,
13238                                      dis_buf,
13239                                      nameMMXReg(gregLO3ofRM(modrm)));
13240         }
13241         breakup64to16s( sV, &s3, &s2, &s1, &s0 );
13242#        define SEL(n) \
13243                   ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
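         /* Each 2-bit field of the immediate picks one source word:
            dst.word[i] = src.word[(order >> (2*i)) & 3].  For example,
            order == 0x1B reverses the four 16-bit lanes. */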
13244         assign(dV,
13245                mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
13246                             SEL((order>>2)&3), SEL((order>>0)&3) )
13247         );
13248         putMMXReg(gregLO3ofRM(modrm), mkexpr(dV));
13249#        undef SEL
13250         goto decode_success;
13251      }
13252      /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
13253         mem) to G(xmm), and copy upper half */
13254      if (haveF2no66noF3(pfx) && sz == 4) {
13255         delta = dis_PSHUFxW_128( vbi, pfx, delta,
13256                                  False/*!isAvx*/, False/*!xIsH*/ );
13257         goto decode_success;
13258      }
13259      /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
13260         mem) to G(xmm), and copy lower half */
13261      if (haveF3no66noF2(pfx) && sz == 4) {
13262         delta = dis_PSHUFxW_128( vbi, pfx, delta,
13263                                  False/*!isAvx*/, True/*xIsH*/ );
13264         goto decode_success;
13265      }
13266      break;
13267
13268   case 0x71:
13269      /* 66 0F 71 /2 ib = PSRLW by immediate */
13270      if (have66noF2noF3(pfx) && sz == 2
13271          && epartIsReg(getUChar(delta))
13272          && gregLO3ofRM(getUChar(delta)) == 2) {
13273         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlw", Iop_ShrN16x8 );
13274         goto decode_success;
13275      }
13276      /* 66 0F 71 /4 ib = PSRAW by immediate */
13277      if (have66noF2noF3(pfx) && sz == 2
13278          && epartIsReg(getUChar(delta))
13279          && gregLO3ofRM(getUChar(delta)) == 4) {
13280         delta = dis_SSE_shiftE_imm( pfx, delta, "psraw", Iop_SarN16x8 );
13281         goto decode_success;
13282      }
13283      /* 66 0F 71 /6 ib = PSLLW by immediate */
13284      if (have66noF2noF3(pfx) && sz == 2
13285          && epartIsReg(getUChar(delta))
13286          && gregLO3ofRM(getUChar(delta)) == 6) {
13287         delta = dis_SSE_shiftE_imm( pfx, delta, "psllw", Iop_ShlN16x8 );
13288         goto decode_success;
13289      }
13290      break;
13291
13292   case 0x72:
13293      /* 66 0F 72 /2 ib = PSRLD by immediate */
13294      if (have66noF2noF3(pfx) && sz == 2
13295          && epartIsReg(getUChar(delta))
13296          && gregLO3ofRM(getUChar(delta)) == 2) {
13297         delta = dis_SSE_shiftE_imm( pfx, delta, "psrld", Iop_ShrN32x4 );
13298         goto decode_success;
13299      }
13300      /* 66 0F 72 /4 ib = PSRAD by immediate */
13301      if (have66noF2noF3(pfx) && sz == 2
13302          && epartIsReg(getUChar(delta))
13303          && gregLO3ofRM(getUChar(delta)) == 4) {
13304         delta = dis_SSE_shiftE_imm( pfx, delta, "psrad", Iop_SarN32x4 );
13305         goto decode_success;
13306      }
13307      /* 66 0F 72 /6 ib = PSLLD by immediate */
13308      if (have66noF2noF3(pfx) && sz == 2
13309          && epartIsReg(getUChar(delta))
13310          && gregLO3ofRM(getUChar(delta)) == 6) {
13311         delta = dis_SSE_shiftE_imm( pfx, delta, "pslld", Iop_ShlN32x4 );
13312         goto decode_success;
13313      }
13314      break;
13315
13316   case 0x73:
13317      /* 66 0F 73 /3 ib = PSRLDQ by immediate */
13318      /* note, if mem case ever filled in, 1 byte after amode */
13319      if (have66noF2noF3(pfx) && sz == 2
13320          && epartIsReg(getUChar(delta))
13321          && gregLO3ofRM(getUChar(delta)) == 3) {
13322         Int imm = (Int)getUChar(delta+1);
13323         Int reg = eregOfRexRM(pfx,getUChar(delta));
13324         DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
13325         delta += 2;
13326         IRTemp sV = newTemp(Ity_V128);
13327         assign( sV, getXMMReg(reg) );
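         /* The shift amount is in bytes, not bits; shifts of 16 or
            more produce zero. */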
13328         putXMMReg(reg, mkexpr(math_PSRLDQ( sV, imm )));
13329         goto decode_success;
13330      }
13331      /* 66 0F 73 /7 ib = PSLLDQ by immediate */
13332      /* note, if mem case ever filled in, 1 byte after amode */
13333      if (have66noF2noF3(pfx) && sz == 2
13334          && epartIsReg(getUChar(delta))
13335          && gregLO3ofRM(getUChar(delta)) == 7) {
13336         Int imm = (Int)getUChar(delta+1);
13337         Int reg = eregOfRexRM(pfx,getUChar(delta));
13338         DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
13339         vassert(imm >= 0 && imm <= 255);
13340         delta += 2;
13341         IRTemp sV = newTemp(Ity_V128);
13342         assign( sV, getXMMReg(reg) );
13343         putXMMReg(reg, mkexpr(math_PSLLDQ( sV, imm )));
13344         goto decode_success;
13345      }
13346      /* 66 0F 73 /2 ib = PSRLQ by immediate */
13347      if (have66noF2noF3(pfx) && sz == 2
13348          && epartIsReg(getUChar(delta))
13349          && gregLO3ofRM(getUChar(delta)) == 2) {
13350         delta = dis_SSE_shiftE_imm( pfx, delta, "psrlq", Iop_ShrN64x2 );
13351         goto decode_success;
13352      }
13353      /* 66 0F 73 /6 ib = PSLLQ by immediate */
13354      if (have66noF2noF3(pfx) && sz == 2
13355          && epartIsReg(getUChar(delta))
13356          && gregLO3ofRM(getUChar(delta)) == 6) {
13357         delta = dis_SSE_shiftE_imm( pfx, delta, "psllq", Iop_ShlN64x2 );
13358         goto decode_success;
13359      }
13360      break;
13361
13362   case 0x74:
13363      /* 66 0F 74 = PCMPEQB */
13364      if (have66noF2noF3(pfx) && sz == 2) {
13365         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13366                                    "pcmpeqb", Iop_CmpEQ8x16, False );
13367         goto decode_success;
13368      }
13369      break;
13370
13371   case 0x75:
13372      /* 66 0F 75 = PCMPEQW */
13373      if (have66noF2noF3(pfx) && sz == 2) {
13374         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13375                                    "pcmpeqw", Iop_CmpEQ16x8, False );
13376         goto decode_success;
13377      }
13378      break;
13379
13380   case 0x76:
13381      /* 66 0F 76 = PCMPEQD */
13382      if (have66noF2noF3(pfx) && sz == 2) {
13383         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13384                                    "pcmpeqd", Iop_CmpEQ32x4, False );
13385         goto decode_success;
13386      }
13387      break;
13388
13389   case 0x7E:
13390      /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
13391         G (lo half xmm).  Upper half of G is zeroed out. */
13392      if (haveF3no66noF2(pfx)
13393          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13394         modrm = getUChar(delta);
13395         if (epartIsReg(modrm)) {
13396            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
13397                             getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
            /* zero bits 127:64 */
            putXMMRegLane64( gregOfRexRM(pfx,modrm), 1, mkU64(0) );
            DIP("movq %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
                                nameXMMReg(gregOfRexRM(pfx,modrm)));
13402            delta += 1;
13403         } else {
13404            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13405            putXMMReg( gregOfRexRM(pfx,modrm), mkV128(0) );
13406            putXMMRegLane64( gregOfRexRM(pfx,modrm), 0,
13407                             loadLE(Ity_I64, mkexpr(addr)) );
13408            DIP("movsd %s,%s\n", dis_buf,
13409                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
13410            delta += alen;
13411         }
13412         goto decode_success;
13413      }
13414      /* 66 0F 7E = MOVD from xmm low 1/4 to ireg32 or m32. */
13415      /*              or from xmm low 1/2 to ireg64 or m64. */
13416         if (have66noF2noF3(pfx) && (sz == 2 || sz == 8)) {
13417         if (sz == 2) sz = 4;
13418         modrm = getUChar(delta);
13419         if (epartIsReg(modrm)) {
13420            delta += 1;
13421            if (sz == 4) {
13422               putIReg32( eregOfRexRM(pfx,modrm),
13423                          getXMMRegLane32(gregOfRexRM(pfx,modrm), 0) );
13424               DIP("movd %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
13425                                    nameIReg32(eregOfRexRM(pfx,modrm)));
13426            } else {
13427               putIReg64( eregOfRexRM(pfx,modrm),
13428                          getXMMRegLane64(gregOfRexRM(pfx,modrm), 0) );
13429               DIP("movq %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
13430                                    nameIReg64(eregOfRexRM(pfx,modrm)));
13431            }
13432         } else {
13433            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
13434            delta += alen;
13435            storeLE( mkexpr(addr),
13436                     sz == 4
13437                        ? getXMMRegLane32(gregOfRexRM(pfx,modrm),0)
13438                        : getXMMRegLane64(gregOfRexRM(pfx,modrm),0) );
13439            DIP("mov%c %s, %s\n", sz == 4 ? 'd' : 'q',
13440                                  nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
13441         }
13442         goto decode_success;
13443      }
13444      break;
13445
13446   case 0x7F:
13447      /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
13448      if (haveF3no66noF2(pfx) && sz == 4) {
13449         modrm = getUChar(delta);
13450         if (epartIsReg(modrm)) {
13451            goto decode_failure; /* awaiting test case */
13452            delta += 1;
13453            putXMMReg( eregOfRexRM(pfx,modrm),
13454                       getXMMReg(gregOfRexRM(pfx,modrm)) );
13455            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
13456                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
13457         } else {
13458            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
13459            delta += alen;
13460            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
13461            DIP("movdqu %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
13462         }
13463         goto decode_success;
13464      }
13465      /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
13466      if (have66noF2noF3(pfx) && sz == 2) {
13467         modrm = getUChar(delta);
13468         if (epartIsReg(modrm)) {
13469            delta += 1;
13470            putXMMReg( eregOfRexRM(pfx,modrm),
13471                       getXMMReg(gregOfRexRM(pfx,modrm)) );
13472            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)),
13473                                   nameXMMReg(eregOfRexRM(pfx,modrm)));
13474         } else {
13475            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
13476            gen_SEGV_if_not_16_aligned( addr );
13477            delta += alen;
13478            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
13479            DIP("movdqa %s, %s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf);
13480         }
13481         goto decode_success;
13482      }
13483      break;
13484
13485   case 0xAE:
13486      /* 0F AE /7 = SFENCE -- flush pending operations to memory */
13487      if (haveNo66noF2noF3(pfx)
13488          && epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
13489          && sz == 4) {
13490         delta += 1;
13491         /* Insert a memory fence.  It's sometimes important that these
13492            are carried through to the generated code. */
13493         stmt( IRStmt_MBE(Imbe_Fence) );
13494         DIP("sfence\n");
13495         goto decode_success;
13496      }
      /* LFENCE and MFENCE are handled by near-duplicate code below;
         all three fences are implemented as the same full memory
         fence (Imbe_Fence), which is conservatively correct. */
13498      /* 0F AE /5 = LFENCE -- flush pending operations to memory */
13499      /* 0F AE /6 = MFENCE -- flush pending operations to memory */
13500      if (haveNo66noF2noF3(pfx)
13501          && epartIsReg(getUChar(delta))
13502          && (gregLO3ofRM(getUChar(delta)) == 5
13503              || gregLO3ofRM(getUChar(delta)) == 6)
13504          && sz == 4) {
13505         delta += 1;
13506         /* Insert a memory fence.  It's sometimes important that these
13507            are carried through to the generated code. */
13508         stmt( IRStmt_MBE(Imbe_Fence) );
13509         DIP("%sfence\n", gregLO3ofRM(getUChar(delta-1))==5 ? "l" : "m");
13510         goto decode_success;
13511      }
13512
13513      /* 0F AE /7 = CLFLUSH -- flush cache line */
13514      if (haveNo66noF2noF3(pfx)
13515          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 7
13516          && sz == 4) {
13517
13518         /* This is something of a hack.  We need to know the size of
13519            the cache line containing addr.  Since we don't (easily),
13520            assume 256 on the basis that no real cache would have a
13521            line that big.  It's safe to invalidate more stuff than we
13522            need, just inefficient. */
13523         ULong lineszB = 256ULL;
13524
13525         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13526         delta += alen;
13527
13528         /* Round addr down to the start of the containing block. */
13529         stmt( IRStmt_Put(
13530                  OFFB_CMSTART,
13531                  binop( Iop_And64,
13532                         mkexpr(addr),
13533                         mkU64( ~(lineszB-1) ))) );
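         /* Example: with lineszB == 256, an addr of 0x1234 is rounded
            down to 0x1200 and the 256 bytes from there are marked for
            invalidation. */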
13534
13535         stmt( IRStmt_Put(OFFB_CMLEN, mkU64(lineszB) ) );
13536
13537         jmp_lit(dres, Ijk_InvalICache, (Addr64)(guest_RIP_bbstart+delta));
13538
13539         DIP("clflush %s\n", dis_buf);
13540         goto decode_success;
13541      }
13542
13543      /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
13544      if (haveNo66noF2noF3(pfx)
13545          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
13546          && sz == 4) {
13547         delta = dis_STMXCSR(vbi, pfx, delta, False/*!isAvx*/);
13548         goto decode_success;
13549      }
13550      /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
13551      if (haveNo66noF2noF3(pfx)
13552          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
13553          && sz == 4) {
13554         delta = dis_LDMXCSR(vbi, pfx, delta, False/*!isAvx*/);
13555         goto decode_success;
13556      }
13557      /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory.
13558         Note that the presence or absence of REX.W slightly affects the
13559         written format: whether the saved FPU IP and DP pointers are 64
13560         or 32 bits.  But the helper function we call simply writes zero
13561         bits in the relevant fields (which are 64 bits regardless of
13562         what REX.W is) and so it's good enough (iow, equally broken) in
13563         both cases. */
13564      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
13565          && !epartIsReg(getUChar(delta))
13566          && gregOfRexRM(pfx,getUChar(delta)) == 0) {
         IRDirty* d;
13568         modrm = getUChar(delta);
13569         vassert(!epartIsReg(modrm));
13570
13571         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13572         delta += alen;
13573         gen_SEGV_if_not_16_aligned(addr);
13574
13575         DIP("%sfxsave %s\n", sz==8 ? "rex64/" : "", dis_buf);
13576
         /* Uses dirty helper:
               void amd64g_dirtyhelper_FXSAVE ( VexGuestAMD64State*, ULong ) */
13579         d = unsafeIRDirty_0_N (
13580                0/*regparms*/,
13581                "amd64g_dirtyhelper_FXSAVE",
13582                &amd64g_dirtyhelper_FXSAVE,
13583                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
13584             );
13585
13586         /* declare we're writing memory */
13587         d->mFx   = Ifx_Write;
13588         d->mAddr = mkexpr(addr);
13589         d->mSize = 464; /* according to recent Intel docs */
13590
13591         /* declare we're reading guest state */
13592         d->nFxState = 7;
13593         vex_bzero(&d->fxState, sizeof(d->fxState));
13594
13595         d->fxState[0].fx     = Ifx_Read;
13596         d->fxState[0].offset = OFFB_FTOP;
13597         d->fxState[0].size   = sizeof(UInt);
13598
13599         d->fxState[1].fx     = Ifx_Read;
13600         d->fxState[1].offset = OFFB_FPREGS;
13601         d->fxState[1].size   = 8 * sizeof(ULong);
13602
13603         d->fxState[2].fx     = Ifx_Read;
13604         d->fxState[2].offset = OFFB_FPTAGS;
13605         d->fxState[2].size   = 8 * sizeof(UChar);
13606
13607         d->fxState[3].fx     = Ifx_Read;
13608         d->fxState[3].offset = OFFB_FPROUND;
13609         d->fxState[3].size   = sizeof(ULong);
13610
13611         d->fxState[4].fx     = Ifx_Read;
13612         d->fxState[4].offset = OFFB_FC3210;
13613         d->fxState[4].size   = sizeof(ULong);
13614
13615         d->fxState[5].fx     = Ifx_Read;
13616         d->fxState[5].offset = OFFB_YMM0;
13617         d->fxState[5].size   = sizeof(U128);
13618         /* plus 15 more of the above, spaced out in YMM sized steps */
13619         d->fxState[5].nRepeats  = 15;
13620         d->fxState[5].repeatLen = sizeof(U256);
13621
13622         d->fxState[6].fx     = Ifx_Read;
13623         d->fxState[6].offset = OFFB_SSEROUND;
13624         d->fxState[6].size   = sizeof(ULong);
13625
13626         /* Be paranoid ... this assertion tries to ensure the 16 %ymm
13627            images are packed back-to-back.  If not, the settings for
13628            d->fxState[5] are wrong. */
13629         vassert(32 == sizeof(U256));
13630         vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));
13631
13632         stmt( IRStmt_Dirty(d) );
13633
13634         goto decode_success;
13635      }
13636      /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory.
13637         As with FXSAVE above we ignore the value of REX.W since we're
13638         not bothering with the FPU DP and IP fields. */
13639      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)
13640          && !epartIsReg(getUChar(delta))
13641          && gregOfRexRM(pfx,getUChar(delta)) == 1) {
13642         IRDirty* d;
13643         modrm = getUChar(delta);
13644         vassert(!epartIsReg(modrm));
13645
13646         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13647         delta += alen;
13648         gen_SEGV_if_not_16_aligned(addr);
13649
13650         DIP("%sfxrstor %s\n", sz==8 ? "rex64/" : "", dis_buf);
13651
         /* Uses dirty helper:
               VexEmNote amd64g_dirtyhelper_FXRSTOR ( VexGuestAMD64State*, ULong )
13654            NOTE:
13655               the VexEmNote value is simply ignored
13656         */
13657         d = unsafeIRDirty_0_N (
13658                0/*regparms*/,
13659                "amd64g_dirtyhelper_FXRSTOR",
13660                &amd64g_dirtyhelper_FXRSTOR,
13661                mkIRExprVec_2( IRExpr_BBPTR(), mkexpr(addr) )
13662             );
13663
13664         /* declare we're reading memory */
13665         d->mFx   = Ifx_Read;
13666         d->mAddr = mkexpr(addr);
13667         d->mSize = 464; /* according to recent Intel docs */
13668
13669         /* declare we're writing guest state */
13670         d->nFxState = 7;
13671         vex_bzero(&d->fxState, sizeof(d->fxState));
13672
13673         d->fxState[0].fx     = Ifx_Write;
13674         d->fxState[0].offset = OFFB_FTOP;
13675         d->fxState[0].size   = sizeof(UInt);
13676
13677         d->fxState[1].fx     = Ifx_Write;
13678         d->fxState[1].offset = OFFB_FPREGS;
13679         d->fxState[1].size   = 8 * sizeof(ULong);
13680
13681         d->fxState[2].fx     = Ifx_Write;
13682         d->fxState[2].offset = OFFB_FPTAGS;
13683         d->fxState[2].size   = 8 * sizeof(UChar);
13684
13685         d->fxState[3].fx     = Ifx_Write;
13686         d->fxState[3].offset = OFFB_FPROUND;
13687         d->fxState[3].size   = sizeof(ULong);
13688
13689         d->fxState[4].fx     = Ifx_Write;
13690         d->fxState[4].offset = OFFB_FC3210;
13691         d->fxState[4].size   = sizeof(ULong);
13692
13693         d->fxState[5].fx     = Ifx_Write;
13694         d->fxState[5].offset = OFFB_YMM0;
13695         d->fxState[5].size   = sizeof(U128);
13696         /* plus 15 more of the above, spaced out in YMM sized steps */
13697         d->fxState[5].nRepeats  = 15;
13698         d->fxState[5].repeatLen = sizeof(U256);
13699
13700         d->fxState[6].fx     = Ifx_Write;
13701         d->fxState[6].offset = OFFB_SSEROUND;
13702         d->fxState[6].size   = sizeof(ULong);
13703
13704         /* Be paranoid ... this assertion tries to ensure the 16 %ymm
13705            images are packed back-to-back.  If not, the settings for
13706            d->fxState[5] are wrong. */
13707         vassert(32 == sizeof(U256));
13708         vassert(OFFB_YMM15 == (OFFB_YMM0 + 15 * 32));
13709
13710         stmt( IRStmt_Dirty(d) );
13711
13712         goto decode_success;
13713      }
13714      break;
13715
13716   case 0xC2:
13717      /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
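      /* The trailing imm8 selects the comparison predicate
         (0=EQ, 1=LT, 2=LE, 3=UNORD, 4=NEQ, 5=NLT, 6=NLE, 7=ORD).
         dis_SSE_cmp_E_to_G can refuse to decode, in which case it
         returns delta unchanged; hence the delta > delta0 checks. */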
13718      if (haveNo66noF2noF3(pfx) && sz == 4) {
13719         Long delta0 = delta;
13720         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpps", True, 4 );
13721         if (delta > delta0) goto decode_success;
13722      }
13723      /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
13724      if (haveF3no66noF2(pfx) && sz == 4) {
13725         Long delta0 = delta;
13726         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpss", False, 4 );
13727         if (delta > delta0) goto decode_success;
13728      }
13729      /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
13730      if (haveF2no66noF3(pfx) && sz == 4) {
13731         Long delta0 = delta;
13732         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmpsd", False, 8 );
13733         if (delta > delta0) goto decode_success;
13734      }
13735      /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
13736      if (have66noF2noF3(pfx) && sz == 2) {
13737         Long delta0 = delta;
13738         delta = dis_SSE_cmp_E_to_G( vbi, pfx, delta, "cmppd", True, 8 );
13739         if (delta > delta0) goto decode_success;
13740      }
13741      break;
13742
13743   case 0xC3:
13744      /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
13745      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
13746         modrm = getUChar(delta);
13747         if (!epartIsReg(modrm)) {
13748            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
13749            storeLE( mkexpr(addr), getIRegG(sz, pfx, modrm) );
13750            DIP("movnti %s,%s\n", dis_buf,
13751                                  nameIRegG(sz, pfx, modrm));
13752            delta += alen;
13753            goto decode_success;
13754         }
13755         /* else fall through */
13756      }
13757      break;
13758
13759   case 0xC4:
13760      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13761      /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
13762         put it into the specified lane of mmx(G). */
13763      if (haveNo66noF2noF3(pfx)
13764          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
13765         /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
13766            mmx reg.  t4 is the new lane value.  t5 is the original
13767            mmx value. t6 is the new mmx value. */
13768         Int lane;
13769         t4 = newTemp(Ity_I16);
13770         t5 = newTemp(Ity_I64);
13771         t6 = newTemp(Ity_I64);
13772         modrm = getUChar(delta);
13773         do_MMX_preamble();
13774
13775         assign(t5, getMMXReg(gregLO3ofRM(modrm)));
13776         breakup64to16s( t5, &t3, &t2, &t1, &t0 );
13777
13778         if (epartIsReg(modrm)) {
13779            assign(t4, getIReg16(eregOfRexRM(pfx,modrm)));
13780            delta += 1+1;
13781            lane = getUChar(delta-1);
13782            DIP("pinsrw $%d,%s,%s\n", (Int)lane,
13783                                      nameIReg16(eregOfRexRM(pfx,modrm)),
13784                                      nameMMXReg(gregLO3ofRM(modrm)));
13785         } else {
13786            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
13787            delta += 1+alen;
13788            lane = getUChar(delta-1);
13789            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
13790            DIP("pinsrw $%d,%s,%s\n", (Int)lane,
13791                                      dis_buf,
13792                                      nameMMXReg(gregLO3ofRM(modrm)));
13793         }
13794
13795         switch (lane & 3) {
13796            case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
13797            case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
13798            case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
13799            case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
13800            default: vassert(0);
13801         }
13802         putMMXReg(gregLO3ofRM(modrm), mkexpr(t6));
13803         goto decode_success;
13804      }
13805      /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
13806         put it into the specified lane of xmm(G). */
13807      if (have66noF2noF3(pfx)
13808          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13809         Int lane;
13810         t4 = newTemp(Ity_I16);
13811         modrm = getUChar(delta);
13812         UInt rG = gregOfRexRM(pfx,modrm);
13813         if (epartIsReg(modrm)) {
13814            UInt rE = eregOfRexRM(pfx,modrm);
13815            assign(t4, getIReg16(rE));
13816            delta += 1+1;
13817            lane = getUChar(delta-1);
13818            DIP("pinsrw $%d,%s,%s\n",
13819                (Int)lane, nameIReg16(rE), nameXMMReg(rG));
13820         } else {
13821            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
13822                              1/*byte after the amode*/ );
13823            delta += 1+alen;
13824            lane = getUChar(delta-1);
13825            assign(t4, loadLE(Ity_I16, mkexpr(addr)));
13826            DIP("pinsrw $%d,%s,%s\n",
13827                (Int)lane, dis_buf, nameXMMReg(rG));
13828         }
13829         IRTemp src_vec = newTemp(Ity_V128);
13830         assign(src_vec, getXMMReg(rG));
13831         IRTemp res_vec = math_PINSRW_128( src_vec, t4, lane & 7);
13832         putXMMReg(rG, mkexpr(res_vec));
13833         goto decode_success;
13834      }
13835      break;
13836
13837   case 0xC5:
13838      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
13839      /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
13840         zero-extend of it in ireg(G). */
13841      if (haveNo66noF2noF3(pfx) && (sz == 4 || sz == 8)) {
13842         modrm = getUChar(delta);
13843         if (epartIsReg(modrm)) {
13844            IRTemp sV = newTemp(Ity_I64);
13845            t5 = newTemp(Ity_I16);
13846            do_MMX_preamble();
13847            assign(sV, getMMXReg(eregLO3ofRM(modrm)));
13848            breakup64to16s( sV, &t3, &t2, &t1, &t0 );
13849            switch (getUChar(delta+1) & 3) {
13850               case 0:  assign(t5, mkexpr(t0)); break;
13851               case 1:  assign(t5, mkexpr(t1)); break;
13852               case 2:  assign(t5, mkexpr(t2)); break;
13853               case 3:  assign(t5, mkexpr(t3)); break;
13854               default: vassert(0);
13855            }
13856            if (sz == 8)
13857               putIReg64(gregOfRexRM(pfx,modrm), unop(Iop_16Uto64, mkexpr(t5)));
13858            else
13859               putIReg32(gregOfRexRM(pfx,modrm), unop(Iop_16Uto32, mkexpr(t5)));
13860            DIP("pextrw $%d,%s,%s\n",
13861                (Int)getUChar(delta+1),
13862                nameMMXReg(eregLO3ofRM(modrm)),
13863                sz==8 ? nameIReg64(gregOfRexRM(pfx,modrm))
13864                      : nameIReg32(gregOfRexRM(pfx,modrm))
13865            );
13866            delta += 2;
13867            goto decode_success;
13868         }
13869         /* else fall through */
13870         /* note, for anyone filling in the mem case: this insn has one
13871            byte after the amode and therefore you must pass 1 as the
13872            last arg to disAMode */
13873      }
13874      /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
13875         zero-extend of it in ireg(G). */
13876      if (have66noF2noF3(pfx)
13877          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
13878         Long delta0 = delta;
13879         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
13880                                              False/*!isAvx*/ );
13881         if (delta > delta0) goto decode_success;
13882         /* else fall through -- decoding has failed */
13883      }
13884      break;
13885
13886   case 0xC6:
13887      /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
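      /* imm8 picks the result lanes: bits 1:0 and 3:2 select result
         lanes 0 and 1 from dst(G); bits 5:4 and 7:6 select result
         lanes 2 and 3 from src(E). */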
13888      if (haveNo66noF2noF3(pfx) && sz == 4) {
13889         Int    imm8 = 0;
13890         IRTemp sV   = newTemp(Ity_V128);
13891         IRTemp dV   = newTemp(Ity_V128);
13892         modrm = getUChar(delta);
13893         UInt rG = gregOfRexRM(pfx,modrm);
13894         assign( dV, getXMMReg(rG) );
13895         if (epartIsReg(modrm)) {
13896            UInt rE = eregOfRexRM(pfx,modrm);
13897            assign( sV, getXMMReg(rE) );
13898            imm8 = (Int)getUChar(delta+1);
13899            delta += 1+1;
13900            DIP("shufps $%d,%s,%s\n", imm8, nameXMMReg(rE), nameXMMReg(rG));
13901         } else {
13902            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
13903            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13904            imm8 = (Int)getUChar(delta+alen);
13905            delta += 1+alen;
13906            DIP("shufps $%d,%s,%s\n", imm8, dis_buf, nameXMMReg(rG));
13907         }
13908         IRTemp res = math_SHUFPS_128( sV, dV, imm8 );
13909         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
13910         goto decode_success;
13911      }
13912      /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
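      /* imm8 bit 0 selects result lane 0 from dst(G); bit 1 selects
         result lane 1 from src(E); the remaining bits are ignored. */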
13913      if (have66noF2noF3(pfx) && sz == 2) {
13914         Int    select;
13915         IRTemp sV = newTemp(Ity_V128);
13916         IRTemp dV = newTemp(Ity_V128);
13917
13918         modrm = getUChar(delta);
13919         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
13920
13921         if (epartIsReg(modrm)) {
13922            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
13923            select = (Int)getUChar(delta+1);
13924            delta += 1+1;
13925            DIP("shufpd $%d,%s,%s\n", select,
13926                                      nameXMMReg(eregOfRexRM(pfx,modrm)),
13927                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
13928         } else {
13929            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
13930            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
13931            select = getUChar(delta+alen);
13932            delta += 1+alen;
13933            DIP("shufpd $%d,%s,%s\n", select,
13934                                      dis_buf,
13935                                      nameXMMReg(gregOfRexRM(pfx,modrm)));
13936         }
13937
13938         IRTemp res = math_SHUFPD_128( sV, dV, select );
13939         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
13940         goto decode_success;
13941      }
13942      break;
13943
13944   case 0xD1:
13945      /* 66 0F D1 = PSRLW by E */
13946      if (have66noF2noF3(pfx) && sz == 2) {
13947         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlw", Iop_ShrN16x8 );
13948         goto decode_success;
13949      }
13950      break;
13951
13952   case 0xD2:
13953      /* 66 0F D2 = PSRLD by E */
13954      if (have66noF2noF3(pfx) && sz == 2) {
13955         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrld", Iop_ShrN32x4 );
13956         goto decode_success;
13957      }
13958      break;
13959
13960   case 0xD3:
13961      /* 66 0F D3 = PSRLQ by E */
13962      if (have66noF2noF3(pfx) && sz == 2) {
13963         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrlq", Iop_ShrN64x2 );
13964         goto decode_success;
13965      }
13966      break;
13967
13968   case 0xD4:
13969      /* 66 0F D4 = PADDQ */
13970      if (have66noF2noF3(pfx) && sz == 2) {
13971         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13972                                    "paddq", Iop_Add64x2, False );
13973         goto decode_success;
13974      }
13975      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
13976      /* 0F D4 = PADDQ -- add 64x1 */
13977      if (haveNo66noF2noF3(pfx) && sz == 4) {
13978         do_MMX_preamble();
13979         delta = dis_MMXop_regmem_to_reg (
13980                   vbi, pfx, delta, opc, "paddq", False );
13981         goto decode_success;
13982      }
13983      break;
13984
13985   case 0xD5:
13986      /* 66 0F D5 = PMULLW -- 16x8 multiply */
13987      if (have66noF2noF3(pfx) && sz == 2) {
13988         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
13989                                    "pmullw", Iop_Mul16x8, False );
13990         goto decode_success;
13991      }
13992      break;
13993
13994   case 0xD6:
13995      /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
13996         hi half). */
13997      if (haveF3no66noF2(pfx) && sz == 4) {
13998         modrm = getUChar(delta);
13999         if (epartIsReg(modrm)) {
14000            do_MMX_preamble();
14001            putXMMReg( gregOfRexRM(pfx,modrm),
14002                       unop(Iop_64UtoV128, getMMXReg( eregLO3ofRM(modrm) )) );
14003            DIP("movq2dq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14004                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
14005            delta += 1;
14006            goto decode_success;
14007         }
14008         /* apparently no mem case for this insn */
14009      }
14010      /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
14011         or lo half xmm).  */
14012      if (have66noF2noF3(pfx)
14013          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
14014         modrm = getUChar(delta);
14015         if (epartIsReg(modrm)) {
14016            /* fall through, awaiting test case */
14017            /* dst: lo half copied, hi half zeroed */
14018         } else {
14019            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14020            storeLE( mkexpr(addr),
14021                     getXMMRegLane64( gregOfRexRM(pfx,modrm), 0 ));
14022            DIP("movq %s,%s\n", nameXMMReg(gregOfRexRM(pfx,modrm)), dis_buf );
14023            delta += alen;
14024            goto decode_success;
14025         }
14026      }
14027      /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
14028      if (haveF2no66noF3(pfx) && sz == 4) {
14029         modrm = getUChar(delta);
14030         if (epartIsReg(modrm)) {
14031            do_MMX_preamble();
14032            putMMXReg( gregLO3ofRM(modrm),
14033                       getXMMRegLane64( eregOfRexRM(pfx,modrm), 0 ));
14034            DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
14035                                   nameMMXReg(gregLO3ofRM(modrm)));
14036            delta += 1;
14037            goto decode_success;
14038         }
14039         /* apparently no mem case for this insn */
14040      }
14041      break;
14042
14043   case 0xD7:
      /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16
         lanes in xmm(E), assemble them into a 16-bit value, and put
         the zero-extend of it in ireg(G).  Doing this directly is
         just too cumbersome; give up therefore and call a helper. */
14048      if (have66noF2noF3(pfx)
14049          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)
14050          && epartIsReg(getUChar(delta))) { /* no memory case, it seems */
14051         delta = dis_PMOVMSKB_128( vbi, pfx, delta, False/*!isAvx*/ );
14052         goto decode_success;
14053      }
14054      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14055      /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
14056         mmx(E), turn them into a byte, and put zero-extend of it in
14057         ireg(G). */
14058      if (haveNo66noF2noF3(pfx)
14059          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14060         modrm = getUChar(delta);
14061         if (epartIsReg(modrm)) {
14062            do_MMX_preamble();
14063            t0 = newTemp(Ity_I64);
14064            t1 = newTemp(Ity_I32);
14065            assign(t0, getMMXReg(eregLO3ofRM(modrm)));
14066            assign(t1, unop(Iop_8Uto32, unop(Iop_GetMSBs8x8, mkexpr(t0))));
14067            putIReg32(gregOfRexRM(pfx,modrm), mkexpr(t1));
14068            DIP("pmovmskb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14069                                    nameIReg32(gregOfRexRM(pfx,modrm)));
14070            delta += 1;
14071            goto decode_success;
14072         }
14073         /* else fall through */
14074      }
14075      break;
14076
14077   case 0xD8:
14078      /* 66 0F D8 = PSUBUSB */
14079      if (have66noF2noF3(pfx) && sz == 2) {
14080         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14081                                    "psubusb", Iop_QSub8Ux16, False );
14082         goto decode_success;
14083      }
14084      break;
14085
14086   case 0xD9:
14087      /* 66 0F D9 = PSUBUSW */
14088      if (have66noF2noF3(pfx) && sz == 2) {
14089         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14090                                    "psubusw", Iop_QSub16Ux8, False );
14091         goto decode_success;
14092      }
14093      break;
14094
14095   case 0xDA:
14096      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14097      /* 0F DA = PMINUB -- 8x8 unsigned min */
14098      if (haveNo66noF2noF3(pfx) && sz == 4) {
14099         do_MMX_preamble();
14100         delta = dis_MMXop_regmem_to_reg (
14101                    vbi, pfx, delta, opc, "pminub", False );
14102         goto decode_success;
14103      }
14104      /* 66 0F DA = PMINUB -- 8x16 unsigned min */
14105      if (have66noF2noF3(pfx) && sz == 2) {
14106         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14107                                    "pminub", Iop_Min8Ux16, False );
14108         goto decode_success;
14109      }
14110      break;
14111
14112   case 0xDB:
14113      /* 66 0F DB = PAND */
14114      if (have66noF2noF3(pfx) && sz == 2) {
14115         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pand", Iop_AndV128 );
14116         goto decode_success;
14117      }
14118      break;
14119
14120   case 0xDC:
14121      /* 66 0F DC = PADDUSB */
14122      if (have66noF2noF3(pfx) && sz == 2) {
14123         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14124                                    "paddusb", Iop_QAdd8Ux16, False );
14125         goto decode_success;
14126      }
14127      break;
14128
14129   case 0xDD:
14130      /* 66 0F DD = PADDUSW */
14131      if (have66noF2noF3(pfx) && sz == 2) {
14132         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14133                                    "paddusw", Iop_QAdd16Ux8, False );
14134         goto decode_success;
14135      }
14136      break;
14137
14138   case 0xDE:
14139      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14140      /* 0F DE = PMAXUB -- 8x8 unsigned max */
14141      if (haveNo66noF2noF3(pfx) && sz == 4) {
14142         do_MMX_preamble();
14143         delta = dis_MMXop_regmem_to_reg (
14144                    vbi, pfx, delta, opc, "pmaxub", False );
14145         goto decode_success;
14146      }
14147      /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
14148      if (have66noF2noF3(pfx) && sz == 2) {
14149         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14150                                    "pmaxub", Iop_Max8Ux16, False );
14151         goto decode_success;
14152      }
14153      break;
14154
14155   case 0xDF:
14156      /* 66 0F DF = PANDN */
14157      if (have66noF2noF3(pfx) && sz == 2) {
14158         delta = dis_SSE_E_to_G_all_invG( vbi, pfx, delta, "pandn", Iop_AndV128 );
14159         goto decode_success;
14160      }
14161      break;
14162
14163   case 0xE0:
14164      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14165      /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
14166      if (haveNo66noF2noF3(pfx) && sz == 4) {
14167         do_MMX_preamble();
14168         delta = dis_MMXop_regmem_to_reg (
14169                    vbi, pfx, delta, opc, "pavgb", False );
14170         goto decode_success;
14171      }
14172      /* 66 0F E0 = PAVGB */
14173      if (have66noF2noF3(pfx) && sz == 2) {
14174         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14175                                    "pavgb", Iop_Avg8Ux16, False );
14176         goto decode_success;
14177      }
14178      break;
14179
14180   case 0xE1:
14181      /* 66 0F E1 = PSRAW by E */
14182      if (have66noF2noF3(pfx) && sz == 2) {
14183         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psraw", Iop_SarN16x8 );
14184         goto decode_success;
14185      }
14186      break;
14187
14188   case 0xE2:
14189      /* 66 0F E2 = PSRAD by E */
14190      if (have66noF2noF3(pfx) && sz == 2) {
14191         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psrad", Iop_SarN32x4 );
14192         goto decode_success;
14193      }
14194      break;
14195
14196   case 0xE3:
14197      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14198      /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
14199      if (haveNo66noF2noF3(pfx) && sz == 4) {
14200         do_MMX_preamble();
14201         delta = dis_MMXop_regmem_to_reg (
14202                    vbi, pfx, delta, opc, "pavgw", False );
14203         goto decode_success;
14204      }
14205      /* 66 0F E3 = PAVGW */
14206      if (have66noF2noF3(pfx) && sz == 2) {
14207         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14208                                    "pavgw", Iop_Avg16Ux8, False );
14209         goto decode_success;
14210      }
14211      break;
14212
14213   case 0xE4:
14214      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
      /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
14216      if (haveNo66noF2noF3(pfx) && sz == 4) {
14217         do_MMX_preamble();
14218         delta = dis_MMXop_regmem_to_reg (
14219                    vbi, pfx, delta, opc, "pmuluh", False );
14220         goto decode_success;
14221      }
14222      /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
14223      if (have66noF2noF3(pfx) && sz == 2) {
14224         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14225                                    "pmulhuw", Iop_MulHi16Ux8, False );
14226         goto decode_success;
14227      }
14228      break;
14229
14230   case 0xE5:
14231      /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
14232      if (have66noF2noF3(pfx) && sz == 2) {
14233         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14234                                    "pmulhw", Iop_MulHi16Sx8, False );
14235         goto decode_success;
14236      }
14237      break;
14238
14239   case 0xE6:
14240      /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
14241         lo half xmm(G), and zero upper half, rounding towards zero */
14242      /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
14243         lo half xmm(G), according to prevailing rounding mode, and zero
14244         upper half */
14245      if ( (haveF2no66noF3(pfx) && sz == 4)
14246           || (have66noF2noF3(pfx) && sz == 2) ) {
14247         delta = dis_CVTxPD2DQ_128( vbi, pfx, delta, False/*!isAvx*/,
14248                                    toBool(sz == 2)/*r2zero*/);
14249         goto decode_success;
14250      }
14251      /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
14252         F64 in xmm(G) */
14253      if (haveF3no66noF2(pfx) && sz == 4) {
14254         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, False/*!isAvx*/);
14255         goto decode_success;
14256      }
14257      break;
14258
14259   case 0xE7:
14260      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14261      /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
14262         Intel manual does not say anything about the usual business of
14263         the FP reg tags getting trashed whenever an MMX insn happens.
14264         So we just leave them alone.
14265      */
14266      if (haveNo66noF2noF3(pfx) && sz == 4) {
14267         modrm = getUChar(delta);
14268         if (!epartIsReg(modrm)) {
14269            /* do_MMX_preamble(); Intel docs don't specify this */
14270            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14271            storeLE( mkexpr(addr), getMMXReg(gregLO3ofRM(modrm)) );
14272            DIP("movntq %s,%s\n", dis_buf,
14273                                  nameMMXReg(gregLO3ofRM(modrm)));
14274            delta += alen;
14275            goto decode_success;
14276         }
14277         /* else fall through */
14278      }
14279      /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
14280      if (have66noF2noF3(pfx) && sz == 2) {
14281         modrm = getUChar(delta);
14282         if (!epartIsReg(modrm)) {
14283            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14284            gen_SEGV_if_not_16_aligned( addr );
14285            storeLE( mkexpr(addr), getXMMReg(gregOfRexRM(pfx,modrm)) );
14286            DIP("movntdq %s,%s\n", dis_buf,
14287                                   nameXMMReg(gregOfRexRM(pfx,modrm)));
14288            delta += alen;
14289            goto decode_success;
14290         }
14291         /* else fall through */
14292      }
14293      break;
14294
14295   case 0xE8:
14296      /* 66 0F E8 = PSUBSB */
14297      if (have66noF2noF3(pfx) && sz == 2) {
14298         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14299                                    "psubsb", Iop_QSub8Sx16, False );
14300         goto decode_success;
14301      }
14302      break;
14303
14304   case 0xE9:
14305      /* 66 0F E9 = PSUBSW */
14306      if (have66noF2noF3(pfx) && sz == 2) {
14307         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14308                                    "psubsw", Iop_QSub16Sx8, False );
14309         goto decode_success;
14310      }
14311      break;
14312
14313   case 0xEA:
14314      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14315      /* 0F EA = PMINSW -- 16x4 signed min */
14316      if (haveNo66noF2noF3(pfx) && sz == 4) {
14317         do_MMX_preamble();
14318         delta = dis_MMXop_regmem_to_reg (
14319                    vbi, pfx, delta, opc, "pminsw", False );
14320         goto decode_success;
14321      }
14322      /* 66 0F EA = PMINSW -- 16x8 signed min */
14323      if (have66noF2noF3(pfx) && sz == 2) {
14324         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14325                                    "pminsw", Iop_Min16Sx8, False );
14326         goto decode_success;
14327      }
14328      break;
14329
14330   case 0xEB:
14331      /* 66 0F EB = POR */
14332      if (have66noF2noF3(pfx) && sz == 2) {
14333         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "por", Iop_OrV128 );
14334         goto decode_success;
14335      }
14336      break;
14337
14338   case 0xEC:
14339      /* 66 0F EC = PADDSB */
14340      if (have66noF2noF3(pfx) && sz == 2) {
14341         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14342                                    "paddsb", Iop_QAdd8Sx16, False );
14343         goto decode_success;
14344      }
14345      break;
14346
14347   case 0xED:
14348      /* 66 0F ED = PADDSW */
14349      if (have66noF2noF3(pfx) && sz == 2) {
14350         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14351                                    "paddsw", Iop_QAdd16Sx8, False );
14352         goto decode_success;
14353      }
14354      break;
14355
14356   case 0xEE:
14357      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14358      /* 0F EE = PMAXSW -- 16x4 signed max */
14359      if (haveNo66noF2noF3(pfx) && sz == 4) {
14360         do_MMX_preamble();
14361         delta = dis_MMXop_regmem_to_reg (
14362                    vbi, pfx, delta, opc, "pmaxsw", False );
14363         goto decode_success;
14364      }
14365      /* 66 0F EE = PMAXSW -- 16x8 signed max */
14366      if (have66noF2noF3(pfx) && sz == 2) {
14367         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14368                                    "pmaxsw", Iop_Max16Sx8, False );
14369         goto decode_success;
14370      }
14371      break;
14372
14373   case 0xEF:
14374      /* 66 0F EF = PXOR */
14375      if (have66noF2noF3(pfx) && sz == 2) {
14376         delta = dis_SSE_E_to_G_all( vbi, pfx, delta, "pxor", Iop_XorV128 );
14377         goto decode_success;
14378      }
14379      break;
14380
14381   case 0xF1:
14382      /* 66 0F F1 = PSLLW by E */
14383      if (have66noF2noF3(pfx) && sz == 2) {
14384         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllw", Iop_ShlN16x8 );
14385         goto decode_success;
14386      }
14387      break;
14388
14389   case 0xF2:
14390      /* 66 0F F2 = PSLLD by E */
14391      if (have66noF2noF3(pfx) && sz == 2) {
14392         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "pslld", Iop_ShlN32x4 );
14393         goto decode_success;
14394      }
14395      break;
14396
14397   case 0xF3:
14398      /* 66 0F F3 = PSLLQ by E */
14399      if (have66noF2noF3(pfx) && sz == 2) {
14400         delta = dis_SSE_shiftG_byE( vbi, pfx, delta, "psllq", Iop_ShlN64x2 );
14401         goto decode_success;
14402      }
14403      break;
14404
14405   case 0xF4:
      /* 66 0F F4 = PMULUDQ -- unsigned widening multiply: lane 0 of
         src and of dst (32 x 32 -> 64) forms the lower 64-bit half of
         the result, and lane 2 of each forms the upper half */
14409      if (have66noF2noF3(pfx) && sz == 2) {
14410         IRTemp sV = newTemp(Ity_V128);
14411         IRTemp dV = newTemp(Ity_V128);
14412         modrm = getUChar(delta);
14413         UInt rG = gregOfRexRM(pfx,modrm);
14414         assign( dV, getXMMReg(rG) );
14415         if (epartIsReg(modrm)) {
14416            UInt rE = eregOfRexRM(pfx,modrm);
14417            assign( sV, getXMMReg(rE) );
14418            delta += 1;
14419            DIP("pmuludq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
14420         } else {
14421            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14422            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14423            delta += alen;
14424            DIP("pmuludq %s,%s\n", dis_buf, nameXMMReg(rG));
14425         }
14426         putXMMReg( rG, mkexpr(math_PMULUDQ_128( sV, dV )) );
14427         goto decode_success;
14428      }
14429      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
      /* 0F F4 = PMULUDQ -- unsigned widening multiply: lane 0 of src
         times lane 0 of dst (32 x 32 -> 64) gives the 64-bit result */
14432      if (haveNo66noF2noF3(pfx) && sz == 4) {
14433         IRTemp sV = newTemp(Ity_I64);
14434         IRTemp dV = newTemp(Ity_I64);
14435         t1 = newTemp(Ity_I32);
14436         t0 = newTemp(Ity_I32);
14437         modrm = getUChar(delta);
14438
14439         do_MMX_preamble();
14440         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
14441
14442         if (epartIsReg(modrm)) {
14443            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
14444            delta += 1;
14445            DIP("pmuludq %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
14446                                   nameMMXReg(gregLO3ofRM(modrm)));
14447         } else {
14448            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14449            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
14450            delta += alen;
14451            DIP("pmuludq %s,%s\n", dis_buf,
14452                                   nameMMXReg(gregLO3ofRM(modrm)));
14453         }
14454
14455         assign( t0, unop(Iop_64to32, mkexpr(dV)) );
14456         assign( t1, unop(Iop_64to32, mkexpr(sV)) );
14457         putMMXReg( gregLO3ofRM(modrm),
14458                    binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
14459         goto decode_success;
14460      }
14461      break;
14462
14463   case 0xF5:
14464      /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
14465         E(xmm or mem) to G(xmm) */
14466      if (have66noF2noF3(pfx) && sz == 2) {
14467         IRTemp sV = newTemp(Ity_V128);
14468         IRTemp dV = newTemp(Ity_V128);
14469         modrm     = getUChar(delta);
14470         UInt   rG = gregOfRexRM(pfx,modrm);
14471         if (epartIsReg(modrm)) {
14472            UInt rE = eregOfRexRM(pfx,modrm);
14473            assign( sV, getXMMReg(rE) );
14474            delta += 1;
14475            DIP("pmaddwd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
14476         } else {
14477            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14478            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14479            delta += alen;
14480            DIP("pmaddwd %s,%s\n", dis_buf, nameXMMReg(rG));
14481         }
14482         assign( dV, getXMMReg(rG) );
14483         putXMMReg( rG, mkexpr(math_PMADDWD_128(dV, sV)) );
14484         goto decode_success;
14485      }
14486      break;
14487
14488   case 0xF6:
14489      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14490      /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
14491      if (haveNo66noF2noF3(pfx) && sz == 4) {
14492         do_MMX_preamble();
14493         delta = dis_MMXop_regmem_to_reg (
14494                    vbi, pfx, delta, opc, "psadbw", False );
14495         goto decode_success;
14496      }
14497      /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
14498         from E(xmm or mem) to G(xmm) */
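      /* Each 64-bit half independently sums its 8 absolute byte
         differences into a u16 in the low 16 bits of that half,
         zeroing the upper 48 bits. */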
14499      if (have66noF2noF3(pfx) && sz == 2) {
14500         IRTemp sV  = newTemp(Ity_V128);
14501         IRTemp dV  = newTemp(Ity_V128);
14502         modrm = getUChar(delta);
14503         UInt   rG   = gregOfRexRM(pfx,modrm);
14504         if (epartIsReg(modrm)) {
14505            UInt rE = eregOfRexRM(pfx,modrm);
14506            assign( sV, getXMMReg(rE) );
14507            delta += 1;
14508            DIP("psadbw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
14509         } else {
14510            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14511            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14512            delta += alen;
14513            DIP("psadbw %s,%s\n", dis_buf, nameXMMReg(rG));
14514         }
14515         assign( dV, getXMMReg(rG) );
14516         putXMMReg( rG, mkexpr( math_PSADBW_128 ( dV, sV ) ) );
14517
14518         goto decode_success;
14519      }
14520      break;
14521
14522   case 0xF7:
14523      /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
14524      /* 0F F7 = MASKMOVQ -- 8x8 masked store */
14525      if (haveNo66noF2noF3(pfx) && sz == 4) {
14526         Bool ok = False;
14527         delta = dis_MMX( &ok, vbi, pfx, sz, delta-1 );
14528         if (ok) goto decode_success;
14529      }
14530      /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
14531      if (have66noF2noF3(pfx) && sz == 2 && epartIsReg(getUChar(delta))) {
14532         delta = dis_MASKMOVDQU( vbi, pfx, delta, False/*!isAvx*/ );
14533         goto decode_success;
14534      }
14535      break;
14536
14537   case 0xF8:
14538      /* 66 0F F8 = PSUBB */
14539      if (have66noF2noF3(pfx) && sz == 2) {
14540         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14541                                    "psubb", Iop_Sub8x16, False );
14542         goto decode_success;
14543      }
14544      break;
14545
14546   case 0xF9:
14547      /* 66 0F F9 = PSUBW */
14548      if (have66noF2noF3(pfx) && sz == 2) {
14549         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14550                                    "psubw", Iop_Sub16x8, False );
14551         goto decode_success;
14552      }
14553      break;
14554
14555   case 0xFA:
14556      /* 66 0F FA = PSUBD */
14557      if (have66noF2noF3(pfx) && sz == 2) {
14558         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14559                                    "psubd", Iop_Sub32x4, False );
14560         goto decode_success;
14561      }
14562      break;
14563
14564   case 0xFB:
14565      /* 66 0F FB = PSUBQ */
14566      if (have66noF2noF3(pfx) && sz == 2) {
14567         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14568                                    "psubq", Iop_Sub64x2, False );
14569         goto decode_success;
14570      }
14571      /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
14572      /* 0F FB = PSUBQ -- sub 64x1 */
14573      if (haveNo66noF2noF3(pfx) && sz == 4) {
14574         do_MMX_preamble();
14575         delta = dis_MMXop_regmem_to_reg (
14576                   vbi, pfx, delta, opc, "psubq", False );
14577         goto decode_success;
14578      }
14579      break;
14580
14581   case 0xFC:
14582      /* 66 0F FC = PADDB */
14583      if (have66noF2noF3(pfx) && sz == 2) {
14584         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14585                                    "paddb", Iop_Add8x16, False );
14586         goto decode_success;
14587      }
14588      break;
14589
14590   case 0xFD:
14591      /* 66 0F FD = PADDW */
14592      if (have66noF2noF3(pfx) && sz == 2) {
14593         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14594                                    "paddw", Iop_Add16x8, False );
14595         goto decode_success;
14596      }
14597      break;
14598
14599   case 0xFE:
14600      /* 66 0F FE = PADDD */
14601      if (have66noF2noF3(pfx) && sz == 2) {
14602         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
14603                                    "paddd", Iop_Add32x4, False );
14604         goto decode_success;
14605      }
14606      break;
14607
14608   default:
14609      goto decode_failure;
14610
14611   }
14612
14613  decode_failure:
14614   *decode_OK = False;
14615   return deltaIN;
14616
14617  decode_success:
14618   *decode_OK = True;
14619   return delta;
14620}
14621
14622
14623/*------------------------------------------------------------*/
14624/*---                                                      ---*/
14625/*--- Top-level SSE3 (not SupSSE3): dis_ESC_0F__SSE3       ---*/
14626/*---                                                      ---*/
14627/*------------------------------------------------------------*/
14628
14629static Long dis_MOVDDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
14630                              Long delta, Bool isAvx )
14631{
14632   IRTemp addr   = IRTemp_INVALID;
14633   Int    alen   = 0;
14634   HChar  dis_buf[50];
14635   IRTemp sV    = newTemp(Ity_V128);
14636   IRTemp d0    = newTemp(Ity_I64);
14637   UChar  modrm = getUChar(delta);
14638   UInt   rG    = gregOfRexRM(pfx,modrm);
14639   if (epartIsReg(modrm)) {
14640      UInt rE = eregOfRexRM(pfx,modrm);
14641      assign( sV, getXMMReg(rE) );
14642      DIP("%smovddup %s,%s\n",
14643          isAvx ? "v" : "", nameXMMReg(rE), nameXMMReg(rG));
14644      delta += 1;
14645      assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
14646   } else {
14647      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14648      assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
14649      DIP("%smovddup %s,%s\n",
14650          isAvx ? "v" : "", dis_buf, nameXMMReg(rG));
14651      delta += alen;
14652   }
14653   (isAvx ? putYMMRegLoAndZU : putXMMReg)
14654      ( rG, binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
14655   return delta;
14656}
14657
14658
14659static Long dis_MOVDDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
14660                              Long delta )
14661{
14662   IRTemp addr   = IRTemp_INVALID;
14663   Int    alen   = 0;
14664   HChar  dis_buf[50];
14665   IRTemp d0    = newTemp(Ity_I64);
14666   IRTemp d1    = newTemp(Ity_I64);
14667   UChar  modrm = getUChar(delta);
14668   UInt   rG    = gregOfRexRM(pfx,modrm);
14669   if (epartIsReg(modrm)) {
14670      UInt rE = eregOfRexRM(pfx,modrm);
14671      DIP("vmovddup %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
14672      delta += 1;
14673      assign ( d0, getYMMRegLane64(rE, 0) );
14674      assign ( d1, getYMMRegLane64(rE, 2) );
14675   } else {
14676      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14677      assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
14678      assign( d1, loadLE(Ity_I64, binop(Iop_Add64,
14679                                        mkexpr(addr), mkU64(16))) );
14680      DIP("vmovddup %s,%s\n", dis_buf, nameYMMReg(rG));
14681      delta += alen;
14682   }
14683   putYMMRegLane64( rG, 0, mkexpr(d0) );
14684   putYMMRegLane64( rG, 1, mkexpr(d0) );
14685   putYMMRegLane64( rG, 2, mkexpr(d1) );
14686   putYMMRegLane64( rG, 3, mkexpr(d1) );
14687   return delta;
14688}
14689
14690
14691static Long dis_MOVSxDUP_128 ( VexAbiInfo* vbi, Prefix pfx,
14692                               Long delta, Bool isAvx, Bool isL )
14693{
14694   IRTemp addr  = IRTemp_INVALID;
14695   Int    alen  = 0;
14696   HChar  dis_buf[50];
14697   IRTemp sV    = newTemp(Ity_V128);
14698   UChar  modrm = getUChar(delta);
14699   UInt   rG    = gregOfRexRM(pfx,modrm);
14700   IRTemp s3, s2, s1, s0;
14701   s3 = s2 = s1 = s0 = IRTemp_INVALID;
14702   if (epartIsReg(modrm)) {
14703      UInt rE = eregOfRexRM(pfx,modrm);
14704      assign( sV, getXMMReg(rE) );
14705      DIP("%smovs%cdup %s,%s\n",
14706          isAvx ? "v" : "", isL ? 'l' : 'h', nameXMMReg(rE), nameXMMReg(rG));
14707      delta += 1;
14708   } else {
14709      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14710      if (!isAvx)
14711         gen_SEGV_if_not_16_aligned( addr );
14712      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
14713      DIP("%smovs%cdup %s,%s\n",
14714          isAvx ? "v" : "", isL ? 'l' : 'h', dis_buf, nameXMMReg(rG));
14715      delta += alen;
14716   }
14717   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
14718   (isAvx ? putYMMRegLoAndZU : putXMMReg)
14719      ( rG, isL ? mkV128from32s( s2, s2, s0, s0 )
14720                : mkV128from32s( s3, s3, s1, s1 ) );
14721   return delta;
14722}
14723
14724
14725static Long dis_MOVSxDUP_256 ( VexAbiInfo* vbi, Prefix pfx,
14726                               Long delta, Bool isL )
14727{
14728   IRTemp addr  = IRTemp_INVALID;
14729   Int    alen  = 0;
14730   HChar  dis_buf[50];
14731   IRTemp sV    = newTemp(Ity_V256);
14732   UChar  modrm = getUChar(delta);
14733   UInt   rG    = gregOfRexRM(pfx,modrm);
14734   IRTemp s7, s6, s5, s4, s3, s2, s1, s0;
14735   s7 = s6 = s5 = s4 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
14736   if (epartIsReg(modrm)) {
14737      UInt rE = eregOfRexRM(pfx,modrm);
14738      assign( sV, getYMMReg(rE) );
14739      DIP("vmovs%cdup %s,%s\n",
14740          isL ? 'l' : 'h', nameYMMReg(rE), nameYMMReg(rG));
14741      delta += 1;
14742   } else {
14743      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14744      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
14745      DIP("vmovs%cdup %s,%s\n",
14746          isL ? 'l' : 'h', dis_buf, nameYMMReg(rG));
14747      delta += alen;
14748   }
14749   breakupV256to32s( sV, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
14750   putYMMRegLane128( rG, 1, isL ? mkV128from32s( s6, s6, s4, s4 )
14751                                : mkV128from32s( s7, s7, s5, s5 ) );
14752   putYMMRegLane128( rG, 0, isL ? mkV128from32s( s2, s2, s0, s0 )
14753                                : mkV128from32s( s3, s3, s1, s1 ) );
14754   return delta;
14755}
14756
14757
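/* Horizontal add/sub of 32-bit float lanes.  Written high to low,
   HADDPS produces s3+s2 : s1+s0 : d3+d2 : d1+d0 and HSUBPS produces
   s2-s3 : s0-s1 : d2-d3 : d0-d1.  leftV/rightV gather the even- and
   odd-numbered lanes respectively, so a single vector op computes all
   four results at once. */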
14758static IRTemp math_HADDPS_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
14759{
14760   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
14761   IRTemp leftV  = newTemp(Ity_V128);
14762   IRTemp rightV = newTemp(Ity_V128);
14763   IRTemp rm     = newTemp(Ity_I32);
14764   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
14765
14766   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
14767   breakupV128to32s( dV, &d3, &d2, &d1, &d0 );
14768
14769   assign( leftV,  mkV128from32s( s2, s0, d2, d0 ) );
14770   assign( rightV, mkV128from32s( s3, s1, d3, d1 ) );
14771
14772   IRTemp res = newTemp(Ity_V128);
14773   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
14774   assign( res, triop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
14775                      mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
14776   return res;
14777}
14778
14779
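/* The 64-bit analogue: high to low, HADDPD produces s1+s0 : d1+d0
   and HSUBPD produces s0-s1 : d0-d1. */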
14780static IRTemp math_HADDPD_128 ( IRTemp dV, IRTemp sV, Bool isAdd )
14781{
14782   IRTemp s1, s0, d1, d0;
14783   IRTemp leftV  = newTemp(Ity_V128);
14784   IRTemp rightV = newTemp(Ity_V128);
14785   IRTemp rm     = newTemp(Ity_I32);
14786   s1 = s0 = d1 = d0 = IRTemp_INVALID;
14787
14788   breakupV128to64s( sV, &s1, &s0 );
14789   breakupV128to64s( dV, &d1, &d0 );
14790
14791   assign( leftV,  binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
14792   assign( rightV, binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
14793
14794   IRTemp res = newTemp(Ity_V128);
14795   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
14796   assign( res, triop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
14797                      mkexpr(rm), mkexpr(leftV), mkexpr(rightV) ) );
14798   return res;
14799}
14800
14801
14802__attribute__((noinline))
14803static
14804Long dis_ESC_0F__SSE3 ( Bool* decode_OK,
14805                        VexAbiInfo* vbi,
14806                        Prefix pfx, Int sz, Long deltaIN )
14807{
14808   IRTemp addr  = IRTemp_INVALID;
14809   UChar  modrm = 0;
14810   Int    alen  = 0;
14811   HChar  dis_buf[50];
14812
14813   *decode_OK = False;
14814
14815   Long   delta = deltaIN;
14816   UChar  opc   = getUChar(delta);
14817   delta++;
14818   switch (opc) {
14819
14820   case 0x12:
14821      /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
14822         duplicating some lanes (2:2:0:0). */
14823      if (haveF3no66noF2(pfx) && sz == 4) {
14824         delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
14825                                   True/*isL*/ );
14826         goto decode_success;
14827      }
14828      /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
14829         duplicating some lanes (1:0:1:0). */
14830      if (haveF2no66noF3(pfx)
14831          && (sz == 4 || /* ignore redundant REX.W */ sz == 8)) {
14832         delta = dis_MOVDDUP_128( vbi, pfx, delta, False/*!isAvx*/ );
14833         goto decode_success;
14834      }
14835      break;
14836
14837   case 0x16:
14838      /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
14839         duplicating some lanes (3:3:1:1). */
14840      if (haveF3no66noF2(pfx) && sz == 4) {
14841         delta = dis_MOVSxDUP_128( vbi, pfx, delta, False/*!isAvx*/,
14842                                   False/*!isL*/ );
14843         goto decode_success;
14844      }
14845      break;
14846
14847   case 0x7C:
14848   case 0x7D:
14849      /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
14850      /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
14851      if (haveF2no66noF3(pfx) && sz == 4) {
14852         IRTemp eV     = newTemp(Ity_V128);
14853         IRTemp gV     = newTemp(Ity_V128);
14854         Bool   isAdd  = opc == 0x7C;
14855         const HChar* str = isAdd ? "add" : "sub";
14856         modrm         = getUChar(delta);
14857         UInt   rG     = gregOfRexRM(pfx,modrm);
14858         if (epartIsReg(modrm)) {
14859            UInt rE = eregOfRexRM(pfx,modrm);
14860            assign( eV, getXMMReg(rE) );
14861            DIP("h%sps %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
14862            delta += 1;
14863         } else {
14864            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14865            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
14866            DIP("h%sps %s,%s\n", str, dis_buf, nameXMMReg(rG));
14867            delta += alen;
14868         }
14869
14870         assign( gV, getXMMReg(rG) );
14871         putXMMReg( rG, mkexpr( math_HADDPS_128 ( gV, eV, isAdd ) ) );
14872         goto decode_success;
14873      }
14874      /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
14875      /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
14876      if (have66noF2noF3(pfx) && sz == 2) {
14877         IRTemp eV     = newTemp(Ity_V128);
14878         IRTemp gV     = newTemp(Ity_V128);
14879         Bool   isAdd  = opc == 0x7C;
14880         const HChar* str = isAdd ? "add" : "sub";
14881         modrm         = getUChar(delta);
14882         UInt   rG     = gregOfRexRM(pfx,modrm);
14883         if (epartIsReg(modrm)) {
14884            UInt rE = eregOfRexRM(pfx,modrm);
14885            assign( eV, getXMMReg(rE) );
14886            DIP("h%spd %s,%s\n", str, nameXMMReg(rE), nameXMMReg(rG));
14887            delta += 1;
14888         } else {
14889            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14890            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
14891            DIP("h%spd %s,%s\n", str, dis_buf, nameXMMReg(rG));
14892            delta += alen;
14893         }
14894
14895         assign( gV, getXMMReg(rG) );
14896         putXMMReg( rG, mkexpr( math_HADDPD_128 ( gV, eV, isAdd ) ) );
14897         goto decode_success;
14898      }
14899      break;
14900
14901   case 0xD0:
14902      /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
14903      if (have66noF2noF3(pfx) && sz == 2) {
14904         IRTemp eV   = newTemp(Ity_V128);
14905         IRTemp gV   = newTemp(Ity_V128);
14906         modrm       = getUChar(delta);
14907         UInt   rG   = gregOfRexRM(pfx,modrm);
14908         if (epartIsReg(modrm)) {
14909            UInt rE = eregOfRexRM(pfx,modrm);
14910            assign( eV, getXMMReg(rE) );
14911            DIP("addsubpd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
14912            delta += 1;
14913         } else {
14914            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14915            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
14916            DIP("addsubpd %s,%s\n", dis_buf, nameXMMReg(rG));
14917            delta += alen;
14918         }
14919
14920         assign( gV, getXMMReg(rG) );
14921         putXMMReg( rG, mkexpr( math_ADDSUBPD_128 ( gV, eV ) ) );
14922         goto decode_success;
14923      }
14924      /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
14925      if (haveF2no66noF3(pfx) && sz == 4) {
14926         IRTemp eV   = newTemp(Ity_V128);
14927         IRTemp gV   = newTemp(Ity_V128);
14928         modrm       = getUChar(delta);
14929         UInt   rG   = gregOfRexRM(pfx,modrm);
14932         if (epartIsReg(modrm)) {
14933            UInt rE = eregOfRexRM(pfx,modrm);
14934            assign( eV, getXMMReg(rE) );
14935            DIP("addsubps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
14936            delta += 1;
14937         } else {
14938            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14939            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
14940            DIP("addsubps %s,%s\n", dis_buf, nameXMMReg(rG));
14941            delta += alen;
14942         }
14943
14944         assign( gV, getXMMReg(rG) );
14945         putXMMReg( rG, mkexpr( math_ADDSUBPS_128 ( gV, eV ) ) );
14946         goto decode_success;
14947      }
14948      break;
14949
14950   case 0xF0:
14951      /* F2 0F F0 = LDDQU -- move from E (mem only) to G (xmm). */
14952      if (haveF2no66noF3(pfx) && sz == 4) {
14953         modrm = getUChar(delta);
14954         if (epartIsReg(modrm)) {
14955            goto decode_failure;
14956         } else {
14957            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
14958            putXMMReg( gregOfRexRM(pfx,modrm),
14959                       loadLE(Ity_V128, mkexpr(addr)) );
14960            DIP("lddqu %s,%s\n", dis_buf,
14961                                 nameXMMReg(gregOfRexRM(pfx,modrm)));
14962            delta += alen;
14963         }
14964         goto decode_success;
14965      }
14966      break;
14967
14968   default:
14969      goto decode_failure;
14970
14971   }
14972
14973  decode_failure:
14974   *decode_OK = False;
14975   return deltaIN;
14976
14977  decode_success:
14978   *decode_OK = True;
14979   return delta;
14980}
14981
14982
14983/*------------------------------------------------------------*/
14984/*---                                                      ---*/
14985/*--- Top-level SSSE3: dis_ESC_0F38__SupSSE3               ---*/
14986/*---                                                      ---*/
14987/*------------------------------------------------------------*/
14988
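/* PSHUFB on 128 bits: for each byte lane i of the result,
      res[i] = (sV[i] & 0x80) ? 0 : dV[ sV[i] & 15 ]
   so an index byte of 0x83 yields 0x00, while 0x0A selects byte 10 of
   dV.  Since the IR only has a 64-bit Perm8x8, each result half is
   built from two permutes (one against dHi, one against dLo), chosen
   by bit 3 of the index and then zeroed according to bit 7. */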
14989static
14990IRTemp math_PSHUFB_XMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
14991{
14992   IRTemp sHi        = newTemp(Ity_I64);
14993   IRTemp sLo        = newTemp(Ity_I64);
14994   IRTemp dHi        = newTemp(Ity_I64);
14995   IRTemp dLo        = newTemp(Ity_I64);
14996   IRTemp rHi        = newTemp(Ity_I64);
14997   IRTemp rLo        = newTemp(Ity_I64);
14998   IRTemp sevens     = newTemp(Ity_I64);
14999   IRTemp mask0x80hi = newTemp(Ity_I64);
15000   IRTemp mask0x80lo = newTemp(Ity_I64);
15001   IRTemp maskBit3hi = newTemp(Ity_I64);
15002   IRTemp maskBit3lo = newTemp(Ity_I64);
15003   IRTemp sAnd7hi    = newTemp(Ity_I64);
15004   IRTemp sAnd7lo    = newTemp(Ity_I64);
15005   IRTemp permdHi    = newTemp(Ity_I64);
15006   IRTemp permdLo    = newTemp(Ity_I64);
15007   IRTemp res        = newTemp(Ity_V128);
15008
15009   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
15010   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
15011   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
15012   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
15013
15014   assign( sevens, mkU64(0x0707070707070707ULL) );
15015
15016   /* mask0x80hi = Not(SarN8x8(sHi,7))
15017      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
15018      sAnd7hi    = And(sHi,sevens)
15019      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
15020                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
15021      rHi        = And(permdHi,mask0x80hi)
15022   */
15023   assign(
15024      mask0x80hi,
15025      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
15026
15027   assign(
15028      maskBit3hi,
15029      binop(Iop_SarN8x8,
15030            binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
15031            mkU8(7)));
15032
15033   assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
15034
15035   assign(
15036      permdHi,
15037      binop(
15038         Iop_Or64,
15039         binop(Iop_And64,
15040               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
15041               mkexpr(maskBit3hi)),
15042         binop(Iop_And64,
15043               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
15044               unop(Iop_Not64,mkexpr(maskBit3hi))) ));
15045
15046   assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
15047
15048   /* And the same for the lower half of the result.  What fun. */
15049
15050   assign(
15051      mask0x80lo,
15052      unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
15053
15054   assign(
15055      maskBit3lo,
15056      binop(Iop_SarN8x8,
15057            binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
15058            mkU8(7)));
15059
15060   assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
15061
15062   assign(
15063      permdLo,
15064      binop(
15065         Iop_Or64,
15066         binop(Iop_And64,
15067               binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
15068               mkexpr(maskBit3lo)),
15069         binop(Iop_And64,
15070               binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
15071               unop(Iop_Not64,mkexpr(maskBit3lo))) ));
15072
15073   assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
15074
15075   assign(res, binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo)));
15076   return res;
15077}
15078
15079
15080static
15081IRTemp math_PSHUFB_YMM ( IRTemp dV/*data to perm*/, IRTemp sV/*perm*/ )
15082{
15083   IRTemp sHi, sLo, dHi, dLo;
15084   sHi = sLo = dHi = dLo = IRTemp_INVALID;
15085   breakupV256toV128s( dV, &dHi, &dLo);
15086   breakupV256toV128s( sV, &sHi, &sLo);
15087   IRTemp res = newTemp(Ity_V256);
15088   assign(res, binop(Iop_V128HLtoV256,
15089                     mkexpr(math_PSHUFB_XMM(dHi, sHi)),
15090                     mkexpr(math_PSHUFB_XMM(dLo, sLo))));
15091   return res;
15092}
15093
15094
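/* For the 16-bit horizontal ops, CatEvenLanes16x4(hi,lo) collects the
   even-numbered lanes of the pair hi:lo and CatOddLanes16x4 the
   odd-numbered ones, so opV64(evens,odds) adds (or subtracts) each
   pair of adjacent lanes; InterleaveLO/HI32x2 play the same role for
   the 32-bit variants. */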
15095static Long dis_PHADD_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
15096                            Bool isAvx, UChar opc )
15097{
15098   IRTemp addr   = IRTemp_INVALID;
15099   Int    alen   = 0;
15100   HChar  dis_buf[50];
15101   const HChar* str = "???";
15102   IROp   opV64  = Iop_INVALID;
15103   IROp   opCatO = Iop_CatOddLanes16x4;
15104   IROp   opCatE = Iop_CatEvenLanes16x4;
15105   IRTemp sV     = newTemp(Ity_V128);
15106   IRTemp dV     = newTemp(Ity_V128);
15107   IRTemp sHi    = newTemp(Ity_I64);
15108   IRTemp sLo    = newTemp(Ity_I64);
15109   IRTemp dHi    = newTemp(Ity_I64);
15110   IRTemp dLo    = newTemp(Ity_I64);
15111   UChar  modrm  = getUChar(delta);
15112   UInt   rG     = gregOfRexRM(pfx,modrm);
15113   UInt   rV     = isAvx ? getVexNvvvv(pfx) : rG;
15114
15115   switch (opc) {
15116      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
15117      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
15118      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
15119      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
15120      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
15121      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
15122      default: vassert(0);
15123   }
15124   if (opc == 0x02 || opc == 0x06) {
15125      opCatO = Iop_InterleaveHI32x2;
15126      opCatE = Iop_InterleaveLO32x2;
15127   }
15128
15129   assign( dV, getXMMReg(rV) );
15130
15131   if (epartIsReg(modrm)) {
15132      UInt rE = eregOfRexRM(pfx,modrm);
15133      assign( sV, getXMMReg(rE) );
15134      DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
15135          nameXMMReg(rE), nameXMMReg(rG));
15136      delta += 1;
15137   } else {
15138      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15139      if (!isAvx)
15140         gen_SEGV_if_not_16_aligned( addr );
15141      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15142      DIP("%sph%s %s,%s\n", isAvx ? "v" : "", str,
15143          dis_buf, nameXMMReg(rG));
15144      delta += alen;
15145   }
15146
15147   assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
15148   assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
15149   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
15150   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
15151
15152   /* This isn't a particularly efficient way to compute the
15153      result, but at least it avoids a proliferation of IROps,
15154      hence avoids complicating all the backends. */
15155
15156   (isAvx ? putYMMRegLoAndZU : putXMMReg)
15157      ( rG,
15158        binop(Iop_64HLtoV128,
15159              binop(opV64,
15160                    binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
15161                    binop(opCatO,mkexpr(sHi),mkexpr(sLo)) ),
15162              binop(opV64,
15163                    binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
15164                    binop(opCatO,mkexpr(dHi),mkexpr(dLo)) ) ) );
15165   return delta;
15166}
15167
15168
15169static Long dis_PHADD_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
15170{
15171   IRTemp addr   = IRTemp_INVALID;
15172   Int    alen   = 0;
15173   HChar  dis_buf[50];
15174   const HChar* str = "???";
15175   IROp   opV64  = Iop_INVALID;
15176   IROp   opCatO = Iop_CatOddLanes16x4;
15177   IROp   opCatE = Iop_CatEvenLanes16x4;
15178   IRTemp sV     = newTemp(Ity_V256);
15179   IRTemp dV     = newTemp(Ity_V256);
15180   IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
15181   s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
15182   UChar  modrm  = getUChar(delta);
15183   UInt   rG     = gregOfRexRM(pfx,modrm);
15184   UInt   rV     = getVexNvvvv(pfx);
15185
15186   switch (opc) {
15187      case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
15188      case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
15189      case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
15190      case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
15191      case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
15192      case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
15193      default: vassert(0);
15194   }
15195   if (opc == 0x02 || opc == 0x06) {
15196      opCatO = Iop_InterleaveHI32x2;
15197      opCatE = Iop_InterleaveLO32x2;
15198   }
15199
15200   assign( dV, getYMMReg(rV) );
15201
15202   if (epartIsReg(modrm)) {
15203      UInt rE = eregOfRexRM(pfx,modrm);
15204      assign( sV, getYMMReg(rE) );
15205      DIP("vph%s %s,%s\n", str, nameYMMReg(rE), nameYMMReg(rG));
15206      delta += 1;
15207   } else {
15208      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15209      assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
15210      DIP("vph%s %s,%s\n", str, dis_buf, nameYMMReg(rG));
15211      delta += alen;
15212   }
15213
15214   breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
15215   breakupV256to64s( sV, &s3, &s2, &s1, &s0 );
15216
15217   /* This isn't a particularly efficient way to compute the
15218      result, but at least it avoids a proliferation of IROps,
15219      hence avoids complicating all the backends. */
15220
15221   putYMMReg( rG,
15222              binop(Iop_V128HLtoV256,
15223                    binop(Iop_64HLtoV128,
15224                          binop(opV64,
15225                                binop(opCatE,mkexpr(s3),mkexpr(s2)),
15226                                binop(opCatO,mkexpr(s3),mkexpr(s2)) ),
15227                          binop(opV64,
15228                                binop(opCatE,mkexpr(d3),mkexpr(d2)),
15229                                binop(opCatO,mkexpr(d3),mkexpr(d2)) ) ),
15230                    binop(Iop_64HLtoV128,
15231                          binop(opV64,
15232                                binop(opCatE,mkexpr(s1),mkexpr(s0)),
15233                                binop(opCatO,mkexpr(s1),mkexpr(s0)) ),
15234                          binop(opV64,
15235                                binop(opCatE,mkexpr(d1),mkexpr(d0)),
15236                                binop(opCatO,mkexpr(d1),mkexpr(d0)) ) ) ) );
15237   return delta;
15238}
15239
15240
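/* PMADDUBSW multiplies unsigned bytes of dV by signed bytes of sV and
   adds adjacent products with signed saturation:
      res16[i] = SatS16( d[2i]*s[2i] + d[2i+1]*s[2i+1] )
   The shifts widen bytes to 16 bits in place: SarN16x8 by 8 sign-
   extends the odd-numbered sV bytes, Shl-then-Sar sign-extends the
   even-numbered ones, and the ShrN16x8 variants zero-extend dV. */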
15241static IRTemp math_PMADDUBSW_128 ( IRTemp dV, IRTemp sV )
15242{
15243   IRTemp sVoddsSX  = newTemp(Ity_V128);
15244   IRTemp sVevensSX = newTemp(Ity_V128);
15245   IRTemp dVoddsZX  = newTemp(Ity_V128);
15246   IRTemp dVevensZX = newTemp(Ity_V128);
15247   /* compute dV unsigned x sV signed */
15248   assign( sVoddsSX, binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
15249   assign( sVevensSX, binop(Iop_SarN16x8,
15250                            binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
15251                            mkU8(8)) );
15252   assign( dVoddsZX, binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
15253   assign( dVevensZX, binop(Iop_ShrN16x8,
15254                            binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
15255                            mkU8(8)) );
15256
15257   IRTemp res = newTemp(Ity_V128);
15258   assign( res, binop(Iop_QAdd16Sx8,
15259                      binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
15260                      binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
15261                     )
15262         );
15263   return res;
15264}
15265
15266
15267static
15268IRTemp math_PMADDUBSW_256 ( IRTemp dV, IRTemp sV )
15269{
15270   IRTemp sHi, sLo, dHi, dLo;
15271   sHi = sLo = dHi = dLo = IRTemp_INVALID;
15272   breakupV256toV128s( dV, &dHi, &dLo);
15273   breakupV256toV128s( sV, &sHi, &sLo);
15274   IRTemp res = newTemp(Ity_V256);
15275   assign(res, binop(Iop_V128HLtoV256,
15276                     mkexpr(math_PMADDUBSW_128(dHi, sHi)),
15277                     mkexpr(math_PMADDUBSW_128(dLo, sLo))));
15278   return res;
15279}
15280
15281
15282__attribute__((noinline))
15283static
15284Long dis_ESC_0F38__SupSSE3 ( Bool* decode_OK,
15285                             VexAbiInfo* vbi,
15286                             Prefix pfx, Int sz, Long deltaIN )
15287{
15288   IRTemp addr  = IRTemp_INVALID;
15289   UChar  modrm = 0;
15290   Int    alen  = 0;
15291   HChar  dis_buf[50];
15292
15293   *decode_OK = False;
15294
15295   Long   delta = deltaIN;
15296   UChar  opc   = getUChar(delta);
15297   delta++;
15298   switch (opc) {
15299
15300   case 0x00:
15301      /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
15302      if (have66noF2noF3(pfx)
15303          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15304         IRTemp sV = newTemp(Ity_V128);
15305         IRTemp dV = newTemp(Ity_V128);
15306
15307         modrm = getUChar(delta);
15308         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
15309
15310         if (epartIsReg(modrm)) {
15311            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15312            delta += 1;
15313            DIP("pshufb %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
15314                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
15315         } else {
15316            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15317            gen_SEGV_if_not_16_aligned( addr );
15318            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15319            delta += alen;
15320            DIP("pshufb %s,%s\n", dis_buf,
15321                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
15322         }
15323
15324         IRTemp res = math_PSHUFB_XMM( dV, sV );
15325         putXMMReg(gregOfRexRM(pfx,modrm), mkexpr(res));
15326         goto decode_success;
15327      }
15328      /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
15329      if (haveNo66noF2noF3(pfx) && sz == 4) {
15330         IRTemp sV      = newTemp(Ity_I64);
15331         IRTemp dV      = newTemp(Ity_I64);
15332
15333         modrm = getUChar(delta);
15334         do_MMX_preamble();
15335         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15336
15337         if (epartIsReg(modrm)) {
15338            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15339            delta += 1;
15340            DIP("pshufb %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
15341                                  nameMMXReg(gregLO3ofRM(modrm)));
15342         } else {
15343            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15344            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15345            delta += alen;
15346            DIP("pshufb %s,%s\n", dis_buf,
15347                                  nameMMXReg(gregLO3ofRM(modrm)));
15348         }
15349
15350         putMMXReg(
15351            gregLO3ofRM(modrm),
15352            binop(
15353               Iop_And64,
15354               /* permute the lanes */
15355               binop(
15356                  Iop_Perm8x8,
15357                  mkexpr(dV),
15358                  binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
15359               ),
15360               /* mask off lanes which have (index & 0x80) == 0x80 */
15361               unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
15362            )
15363         );
15364         goto decode_success;
15365      }
15366      break;
15367
15368   case 0x01:
15369   case 0x02:
15370   case 0x03:
15371   case 0x05:
15372   case 0x06:
15373   case 0x07:
15374      /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
15375         G to G (xmm). */
15376      /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
15377         G to G (xmm). */
15378      /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
15379         xmm) and G to G (xmm). */
15380      /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
15381         G to G (xmm). */
15382      /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
15383         G to G (xmm). */
15384      /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
15385         xmm) and G to G (xmm). */
15386      if (have66noF2noF3(pfx)
15387          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15388         delta = dis_PHADD_128( vbi, pfx, delta, False/*isAvx*/, opc );
15389         goto decode_success;
15390      }
15391      /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
15392      /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
15393         to G (mmx). */
15394      /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
15395         to G (mmx). */
15396      /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
15397         mmx) and G to G (mmx). */
15398      /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
15399         to G (mmx). */
15400      /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
15401         to G (mmx). */
15402      /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
15403         mmx) and G to G (mmx). */
15404      if (haveNo66noF2noF3(pfx) && sz == 4) {
15405         const HChar* str = "???";
15406         IROp   opV64  = Iop_INVALID;
15407         IROp   opCatO = Iop_CatOddLanes16x4;
15408         IROp   opCatE = Iop_CatEvenLanes16x4;
15409         IRTemp sV     = newTemp(Ity_I64);
15410         IRTemp dV     = newTemp(Ity_I64);
15411
15412         modrm = getUChar(delta);
15413
15414         switch (opc) {
15415            case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
15416            case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
15417            case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
15418            case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
15419            case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
15420            case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
15421            default: vassert(0);
15422         }
15423         if (opc == 0x02 || opc == 0x06) {
15424            opCatO = Iop_InterleaveHI32x2;
15425            opCatE = Iop_InterleaveLO32x2;
15426         }
15427
15428         do_MMX_preamble();
15429         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15430
15431         if (epartIsReg(modrm)) {
15432            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15433            delta += 1;
15434            DIP("ph%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
15435                                     nameMMXReg(gregLO3ofRM(modrm)));
15436         } else {
15437            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15438            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15439            delta += alen;
15440            DIP("ph%s %s,%s\n", str, dis_buf,
15441                                     nameMMXReg(gregLO3ofRM(modrm)));
15442         }
15443
15444         putMMXReg(
15445            gregLO3ofRM(modrm),
15446            binop(opV64,
15447                  binop(opCatE,mkexpr(sV),mkexpr(dV)),
15448                  binop(opCatO,mkexpr(sV),mkexpr(dV))
15449            )
15450         );
15451         goto decode_success;
15452      }
15453      break;
15454
15455   case 0x04:
15456      /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
15457         Unsigned Bytes (XMM) */
15458      if (have66noF2noF3(pfx)
15459          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15460         IRTemp sV = newTemp(Ity_V128);
15461         IRTemp dV = newTemp(Ity_V128);
15462         modrm     = getUChar(delta);
15463         UInt   rG = gregOfRexRM(pfx,modrm);
15464
15465         assign( dV, getXMMReg(rG) );
15466
15467         if (epartIsReg(modrm)) {
15468            UInt rE = eregOfRexRM(pfx,modrm);
15469            assign( sV, getXMMReg(rE) );
15470            delta += 1;
15471            DIP("pmaddubsw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
15472         } else {
15473            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15474            gen_SEGV_if_not_16_aligned( addr );
15475            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15476            delta += alen;
15477            DIP("pmaddubsw %s,%s\n", dis_buf, nameXMMReg(rG));
15478         }
15479
15480         putXMMReg( rG, mkexpr( math_PMADDUBSW_128( dV, sV ) ) );
15481         goto decode_success;
15482      }
15483      /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
15484         Unsigned Bytes (MMX) */
15485      if (haveNo66noF2noF3(pfx) && sz == 4) {
15486         IRTemp sV        = newTemp(Ity_I64);
15487         IRTemp dV        = newTemp(Ity_I64);
15488         IRTemp sVoddsSX  = newTemp(Ity_I64);
15489         IRTemp sVevensSX = newTemp(Ity_I64);
15490         IRTemp dVoddsZX  = newTemp(Ity_I64);
15491         IRTemp dVevensZX = newTemp(Ity_I64);
15492
15493         modrm = getUChar(delta);
15494         do_MMX_preamble();
15495         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15496
15497         if (epartIsReg(modrm)) {
15498            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15499            delta += 1;
15500            DIP("pmaddubsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
15501                                     nameMMXReg(gregLO3ofRM(modrm)));
15502         } else {
15503            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15504            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15505            delta += alen;
15506            DIP("pmaddubsw %s,%s\n", dis_buf,
15507                                     nameMMXReg(gregLO3ofRM(modrm)));
15508         }
15509
15510         /* compute dV unsigned x sV signed */
15511         assign( sVoddsSX,
15512                 binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
15513         assign( sVevensSX,
15514                 binop(Iop_SarN16x4,
15515                       binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
15516                       mkU8(8)) );
15517         assign( dVoddsZX,
15518                 binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
15519         assign( dVevensZX,
15520                 binop(Iop_ShrN16x4,
15521                       binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
15522                       mkU8(8)) );
15523
15524         putMMXReg(
15525            gregLO3ofRM(modrm),
15526            binop(Iop_QAdd16Sx4,
15527                  binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
15528                  binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
15529            )
15530         );
15531         goto decode_success;
15532      }
15533      break;
15534
15535   case 0x08:
15536   case 0x09:
15537   case 0x0A:
15538      /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
15539      /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
15540      /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
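      /* Per lane: res = s < 0 ? -d : (s == 0 ? 0 : d); the lane-wise
         work is done by dis_PSIGN_helper. */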
15541      if (have66noF2noF3(pfx)
15542          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15543         IRTemp sV      = newTemp(Ity_V128);
15544         IRTemp dV      = newTemp(Ity_V128);
15545         IRTemp sHi     = newTemp(Ity_I64);
15546         IRTemp sLo     = newTemp(Ity_I64);
15547         IRTemp dHi     = newTemp(Ity_I64);
15548         IRTemp dLo     = newTemp(Ity_I64);
15549         const HChar* str = "???";
15550         Int    laneszB = 0;
15551
15552         switch (opc) {
15553            case 0x08: laneszB = 1; str = "b"; break;
15554            case 0x09: laneszB = 2; str = "w"; break;
15555            case 0x0A: laneszB = 4; str = "d"; break;
15556            default: vassert(0);
15557         }
15558
15559         modrm = getUChar(delta);
15560         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
15561
15562         if (epartIsReg(modrm)) {
15563            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15564            delta += 1;
15565            DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
15566                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
15567         } else {
15568            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15569            gen_SEGV_if_not_16_aligned( addr );
15570            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15571            delta += alen;
15572            DIP("psign%s %s,%s\n", str, dis_buf,
15573                                        nameXMMReg(gregOfRexRM(pfx,modrm)));
15574         }
15575
15576         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
15577         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
15578         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
15579         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
15580
15581         putXMMReg(
15582            gregOfRexRM(pfx,modrm),
15583            binop(Iop_64HLtoV128,
15584                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
15585                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
15586            )
15587         );
15588         goto decode_success;
15589      }
15590      /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
15591      /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
15592      /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
15593      if (haveNo66noF2noF3(pfx) && sz == 4) {
15594         IRTemp sV      = newTemp(Ity_I64);
15595         IRTemp dV      = newTemp(Ity_I64);
15596         const HChar* str = "???";
15597         Int    laneszB = 0;
15598
15599         switch (opc) {
15600            case 0x08: laneszB = 1; str = "b"; break;
15601            case 0x09: laneszB = 2; str = "w"; break;
15602            case 0x0A: laneszB = 4; str = "d"; break;
15603            default: vassert(0);
15604         }
15605
15606         modrm = getUChar(delta);
15607         do_MMX_preamble();
15608         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15609
15610         if (epartIsReg(modrm)) {
15611            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15612            delta += 1;
15613            DIP("psign%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
15614                                        nameMMXReg(gregLO3ofRM(modrm)));
15615         } else {
15616            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15617            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15618            delta += alen;
15619            DIP("psign%s %s,%s\n", str, dis_buf,
15620                                        nameMMXReg(gregLO3ofRM(modrm)));
15621         }
15622
15623         putMMXReg(
15624            gregLO3ofRM(modrm),
15625            dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
15626         );
15627         goto decode_success;
15628      }
15629      break;
15630
15631   case 0x0B:
15632      /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
15633         Scale (XMM) */
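      /* Per 16-bit lane: res = (Int16)( ((Int32)s * (Int32)d
         + 0x4000) >> 15 ), i.e. the rounded high half of a signed
         multiply; the lane-wise work is done by dis_PMULHRSW_helper. */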
15634      if (have66noF2noF3(pfx)
15635          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15636         IRTemp sV  = newTemp(Ity_V128);
15637         IRTemp dV  = newTemp(Ity_V128);
15638         IRTemp sHi = newTemp(Ity_I64);
15639         IRTemp sLo = newTemp(Ity_I64);
15640         IRTemp dHi = newTemp(Ity_I64);
15641         IRTemp dLo = newTemp(Ity_I64);
15642
15643         modrm = getUChar(delta);
15644         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
15645
15646         if (epartIsReg(modrm)) {
15647            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15648            delta += 1;
15649            DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRexRM(pfx,modrm)),
15650                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
15651         } else {
15652            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15653            gen_SEGV_if_not_16_aligned( addr );
15654            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15655            delta += alen;
15656            DIP("pmulhrsw %s,%s\n", dis_buf,
15657                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
15658         }
15659
15660         assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
15661         assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
15662         assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
15663         assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
15664
15665         putXMMReg(
15666            gregOfRexRM(pfx,modrm),
15667            binop(Iop_64HLtoV128,
15668                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
15669                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
15670            )
15671         );
15672         goto decode_success;
15673      }
15674      /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
15675         (MMX) */
15676      if (haveNo66noF2noF3(pfx) && sz == 4) {
15677         IRTemp sV = newTemp(Ity_I64);
15678         IRTemp dV = newTemp(Ity_I64);
15679
15680         modrm = getUChar(delta);
15681         do_MMX_preamble();
15682         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15683
15684         if (epartIsReg(modrm)) {
15685            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15686            delta += 1;
15687            DIP("pmulhrsw %s,%s\n", nameMMXReg(eregLO3ofRM(modrm)),
15688                                    nameMMXReg(gregLO3ofRM(modrm)));
15689         } else {
15690            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15691            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15692            delta += alen;
15693            DIP("pmulhrsw %s,%s\n", dis_buf,
15694                                    nameMMXReg(gregLO3ofRM(modrm)));
15695         }
15696
15697         putMMXReg(
15698            gregLO3ofRM(modrm),
15699            dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
15700         );
15701         goto decode_success;
15702      }
15703      break;
15704
15705   case 0x1C:
15706   case 0x1D:
15707   case 0x1E:
15708      /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
15709      /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
15710      /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
15711      if (have66noF2noF3(pfx)
15712          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15713         IRTemp sV  = newTemp(Ity_V128);
15714         const HChar* str = "???";
15715         Int    laneszB = 0;
15716
15717         switch (opc) {
15718            case 0x1C: laneszB = 1; str = "b"; break;
15719            case 0x1D: laneszB = 2; str = "w"; break;
15720            case 0x1E: laneszB = 4; str = "d"; break;
15721            default: vassert(0);
15722         }
15723
15724         modrm = getUChar(delta);
15725         if (epartIsReg(modrm)) {
15726            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15727            delta += 1;
15728            DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRexRM(pfx,modrm)),
15729                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
15730         } else {
15731            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15732            gen_SEGV_if_not_16_aligned( addr );
15733            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15734            delta += alen;
15735            DIP("pabs%s %s,%s\n", str, dis_buf,
15736                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
15737         }
15738
15739         putXMMReg( gregOfRexRM(pfx,modrm),
15740                    mkexpr(math_PABS_XMM(sV, laneszB)) );
15741         goto decode_success;
15742      }
15743      /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
15744      /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
15745      /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
15746      if (haveNo66noF2noF3(pfx) && sz == 4) {
15747         IRTemp sV      = newTemp(Ity_I64);
15748         const HChar* str = "???";
15749         Int    laneszB = 0;
15750
15751         switch (opc) {
15752            case 0x1C: laneszB = 1; str = "b"; break;
15753            case 0x1D: laneszB = 2; str = "w"; break;
15754            case 0x1E: laneszB = 4; str = "d"; break;
15755            default: vassert(0);
15756         }
15757
15758         modrm = getUChar(delta);
15759         do_MMX_preamble();
15760
15761         if (epartIsReg(modrm)) {
15762            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15763            delta += 1;
15764            DIP("pabs%s %s,%s\n", str, nameMMXReg(eregLO3ofRM(modrm)),
15765                                       nameMMXReg(gregLO3ofRM(modrm)));
15766         } else {
15767            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
15768            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15769            delta += alen;
15770            DIP("pabs%s %s,%s\n", str, dis_buf,
15771                                       nameMMXReg(gregLO3ofRM(modrm)));
15772         }
15773
15774         putMMXReg( gregLO3ofRM(modrm),
15775                    mkexpr(math_PABS_MMX( sV, laneszB )) );
15776         goto decode_success;
15777      }
15778      break;
15779
15780   default:
15781      break;
15782
15783   }
15784
15785  //decode_failure:
15786   *decode_OK = False;
15787   return deltaIN;
15788
15789  decode_success:
15790   *decode_OK = True;
15791   return delta;
15792}
15793
15794
15795/*------------------------------------------------------------*/
15796/*---                                                      ---*/
15797/*--- Top-level SSSE3: dis_ESC_0F3A__SupSSE3               ---*/
15798/*---                                                      ---*/
15799/*------------------------------------------------------------*/
15800
15801__attribute__((noinline))
15802static
15803Long dis_ESC_0F3A__SupSSE3 ( Bool* decode_OK,
15804                             VexAbiInfo* vbi,
15805                             Prefix pfx, Int sz, Long deltaIN )
15806{
15807   Long   d64   = 0;
15808   IRTemp addr  = IRTemp_INVALID;
15809   UChar  modrm = 0;
15810   Int    alen  = 0;
15811   HChar  dis_buf[50];
15812
15813   *decode_OK = False;
15814
15815   Long   delta = deltaIN;
15816   UChar  opc   = getUChar(delta);
15817   delta++;
15818   switch (opc) {
15819
15820   case 0x0F:
15821      /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
15822      if (have66noF2noF3(pfx)
15823          && (sz == 2 || /*redundant REX.W*/ sz == 8)) {
15824         IRTemp sV  = newTemp(Ity_V128);
15825         IRTemp dV  = newTemp(Ity_V128);
15826
15827         modrm = getUChar(delta);
15828         assign( dV, getXMMReg(gregOfRexRM(pfx,modrm)) );
15829
15830         if (epartIsReg(modrm)) {
15831            assign( sV, getXMMReg(eregOfRexRM(pfx,modrm)) );
15832            d64 = (Long)getUChar(delta+1);
15833            delta += 1+1;
15834            DIP("palignr $%d,%s,%s\n", (Int)d64,
15835                                       nameXMMReg(eregOfRexRM(pfx,modrm)),
15836                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
15837         } else {
15838            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
15839            gen_SEGV_if_not_16_aligned( addr );
15840            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
15841            d64 = (Long)getUChar(delta+alen);
15842            delta += alen+1;
15843            DIP("palignr $%d,%s,%s\n", (Int)d64,
15844                                       dis_buf,
15845                                       nameXMMReg(gregOfRexRM(pfx,modrm)));
15846         }
15847
15848         IRTemp res = math_PALIGNR_XMM( sV, dV, d64 );
15849         putXMMReg( gregOfRexRM(pfx,modrm), mkexpr(res) );
15850         goto decode_success;
15851      }
15852      /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
15853      if (haveNo66noF2noF3(pfx) && sz == 4) {
15854         IRTemp sV  = newTemp(Ity_I64);
15855         IRTemp dV  = newTemp(Ity_I64);
15856         IRTemp res = newTemp(Ity_I64);
15857
15858         modrm = getUChar(delta);
15859         do_MMX_preamble();
15860         assign( dV, getMMXReg(gregLO3ofRM(modrm)) );
15861
15862         if (epartIsReg(modrm)) {
15863            assign( sV, getMMXReg(eregLO3ofRM(modrm)) );
15864            d64 = (Long)getUChar(delta+1);
15865            delta += 1+1;
15866            DIP("palignr $%d,%s,%s\n",  (Int)d64,
15867                                        nameMMXReg(eregLO3ofRM(modrm)),
15868                                        nameMMXReg(gregLO3ofRM(modrm)));
15869         } else {
15870            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
15871            assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
15872            d64 = (Long)getUChar(delta+alen);
15873            delta += alen+1;
15874            DIP("palignr $%d,%s,%s\n", (Int)d64,
15875                                      dis_buf,
15876                                      nameMMXReg(gregLO3ofRM(modrm)));
15877         }
15878
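         /* The result is the 16-byte concatenation dV:sV shifted right
            by d64 bytes, of which the low 8 bytes are kept; e.g. for
            d64 == 3, res = (sV >> 24) | (dV << 40). */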
15879         if (d64 == 0) {
15880            assign( res, mkexpr(sV) );
15881         }
15882         else if (d64 >= 1 && d64 <= 7) {
15883            assign(res,
15884                   binop(Iop_Or64,
15885                         binop(Iop_Shr64, mkexpr(sV), mkU8(8*d64)),
15886                         binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d64)))));
15888         }
15889         else if (d64 == 8) {
15890           assign( res, mkexpr(dV) );
15891         }
15892         else if (d64 >= 9 && d64 <= 15) {
15893            assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d64-8))) );
15894         }
15895         else if (d64 >= 16 && d64 <= 255) {
15896            assign( res, mkU64(0) );
15897         }
15898         else
15899            vassert(0);
15900
15901         putMMXReg( gregLO3ofRM(modrm), mkexpr(res) );
15902         goto decode_success;
15903      }
15904      break;
15905
15906   default:
15907      break;
15908
15909   }
15910
15911  //decode_failure:
15912   *decode_OK = False;
15913   return deltaIN;
15914
15915  decode_success:
15916   *decode_OK = True;
15917   return delta;
15918}
15919
15920
15921/*------------------------------------------------------------*/
15922/*---                                                      ---*/
15923/*--- Top-level SSE4: dis_ESC_0F__SSE4                     ---*/
15924/*---                                                      ---*/
15925/*------------------------------------------------------------*/
15926
15927__attribute__((noinline))
15928static
15929Long dis_ESC_0F__SSE4 ( Bool* decode_OK,
15930                        VexArchInfo* archinfo,
15931                        VexAbiInfo* vbi,
15932                        Prefix pfx, Int sz, Long deltaIN )
15933{
15934   IRTemp addr  = IRTemp_INVALID;
15935   IRType ty    = Ity_INVALID;
15936   UChar  modrm = 0;
15937   Int    alen  = 0;
15938   HChar  dis_buf[50];
15939
15940   *decode_OK = False;
15941
15942   Long   delta = deltaIN;
15943   UChar  opc   = getUChar(delta);
15944   delta++;
15945   switch (opc) {
15946
15947   case 0xB8:
15948      /* F3 0F B8  = POPCNT{W,L,Q}
15949         Count the number of 1 bits in a register
15950      */
15951      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
15952          && (sz == 2 || sz == 4 || sz == 8)) {
15953         /*IRType*/ ty  = szToITy(sz);
15954         IRTemp     src = newTemp(ty);
15955         modrm = getUChar(delta);
15956         if (epartIsReg(modrm)) {
15957            assign(src, getIRegE(sz, pfx, modrm));
15958            delta += 1;
15959            DIP("popcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
15960                nameIRegG(sz, pfx, modrm));
15961         } else {
15962            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
15963            assign(src, loadLE(ty, mkexpr(addr)));
15964            delta += alen;
15965            DIP("popcnt%c %s, %s\n", nameISize(sz), dis_buf,
15966                nameIRegG(sz, pfx, modrm));
15967         }
15968
15969         IRTemp result = gen_POPCOUNT(ty, src);
15970         putIRegG(sz, pfx, modrm, mkexpr(result));
15971
15972         // Update flags.  This is pretty lame .. perhaps can do better
15973         // if this turns out to be performance critical.
15974         // O S A C P are cleared.  Z is set if SRC == 0.
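         // With AMD64G_CC_OP_COPY, CC_DEP1 is interpreted as the
         // literal OSZACP flag vector, so only the Z position needs
         // computing here.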
15975         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
15976         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
15977         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
15978         stmt( IRStmt_Put( OFFB_CC_DEP1,
15979               binop(Iop_Shl64,
15980                     unop(Iop_1Uto64,
15981                          binop(Iop_CmpEQ64,
15982                                widenUto64(mkexpr(src)),
15983                                mkU64(0))),
15984                     mkU8(AMD64G_CC_SHIFT_Z))));
15985
15986         goto decode_success;
15987      }
15988      break;
15989
15990   case 0xBC:
15991      /* F3 0F BC -- TZCNT (count trailing zeroes).  A BMI1 extension,
15992         which we can only decode if we're sure this is a BMI1 capable
15993         cpu that supports TZCNT, since otherwise it's BSF, which behaves
15994         differently on a zero source. */
15995      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
15996          && (sz == 2 || sz == 4 || sz == 8)
15997          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI)) {
15998         /*IRType*/ ty  = szToITy(sz);
15999         IRTemp     src = newTemp(ty);
16000         modrm = getUChar(delta);
16001         if (epartIsReg(modrm)) {
16002            assign(src, getIRegE(sz, pfx, modrm));
16003            delta += 1;
16004            DIP("tzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
16005                nameIRegG(sz, pfx, modrm));
16006         } else {
16007            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
16008            assign(src, loadLE(ty, mkexpr(addr)));
16009            delta += alen;
16010            DIP("tzcnt%c %s, %s\n", nameISize(sz), dis_buf,
16011                nameIRegG(sz, pfx, modrm));
16012         }
16013
16014         IRTemp res = gen_TZCNT(ty, src);
16015         putIRegG(sz, pfx, modrm, mkexpr(res));
16016
16017         // Update flags.  This is pretty lame .. perhaps can do better
16018         // if this turns out to be performance critical.
16019         // O S A P are cleared.  Z is set if RESULT == 0.
16020         // C is set if SRC is zero.
16021         IRTemp src64 = newTemp(Ity_I64);
16022         IRTemp res64 = newTemp(Ity_I64);
16023         assign(src64, widenUto64(mkexpr(src)));
16024         assign(res64, widenUto64(mkexpr(res)));
16025
16026         IRTemp oszacp = newTemp(Ity_I64);
16027         assign(
16028            oszacp,
16029            binop(Iop_Or64,
16030                  binop(Iop_Shl64,
16031                        unop(Iop_1Uto64,
16032                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
16033                        mkU8(AMD64G_CC_SHIFT_Z)),
16034                  binop(Iop_Shl64,
16035                        unop(Iop_1Uto64,
16036                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
16037                        mkU8(AMD64G_CC_SHIFT_C))
16038            )
16039         );
16040
16041         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
16042         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16043         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16044         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
16045
16046         goto decode_success;
16047      }
16048      break;
16049
16050   case 0xBD:
16051      /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
16052         which we can only decode if we're sure this is an AMD cpu
16053         that supports LZCNT, since otherwise it's BSR, which behaves
16054         differently.  Bizarrely, my Sandy Bridge also accepts these
16055         instructions but produces different results. */
16056      if (haveF3noF2(pfx) /* so both 66 and REX.W are possibilities */
16057          && (sz == 2 || sz == 4 || sz == 8)
16058          && 0 != (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT)) {
16059         /*IRType*/ ty  = szToITy(sz);
16060         IRTemp     src = newTemp(ty);
16061         modrm = getUChar(delta);
16062         if (epartIsReg(modrm)) {
16063            assign(src, getIRegE(sz, pfx, modrm));
16064            delta += 1;
16065            DIP("lzcnt%c %s, %s\n", nameISize(sz), nameIRegE(sz, pfx, modrm),
16066                nameIRegG(sz, pfx, modrm));
16067         } else {
16068            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0);
16069            assign(src, loadLE(ty, mkexpr(addr)));
16070            delta += alen;
16071            DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
16072                nameIRegG(sz, pfx, modrm));
16073         }
16074
16075         IRTemp res = gen_LZCNT(ty, src);
16076         putIRegG(sz, pfx, modrm, mkexpr(res));
16077
16078         // Update flags.  This is pretty lame .. perhaps can do better
16079         // if this turns out to be performance critical.
16080         // O S A P are cleared.  Z is set if RESULT == 0.
16081         // C is set if SRC is zero.
16082         IRTemp src64 = newTemp(Ity_I64);
16083         IRTemp res64 = newTemp(Ity_I64);
16084         assign(src64, widenUto64(mkexpr(src)));
16085         assign(res64, widenUto64(mkexpr(res)));
16086
16087         IRTemp oszacp = newTemp(Ity_I64);
16088         assign(
16089            oszacp,
16090            binop(Iop_Or64,
16091                  binop(Iop_Shl64,
16092                        unop(Iop_1Uto64,
16093                             binop(Iop_CmpEQ64, mkexpr(res64), mkU64(0))),
16094                        mkU8(AMD64G_CC_SHIFT_Z)),
16095                  binop(Iop_Shl64,
16096                        unop(Iop_1Uto64,
16097                             binop(Iop_CmpEQ64, mkexpr(src64), mkU64(0))),
16098                        mkU8(AMD64G_CC_SHIFT_C))
16099            )
16100         );
16101
16102         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
16103         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16104         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16105         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
16106
16107         goto decode_success;
16108      }
16109      break;
16110
16111   default:
16112      break;
16113
16114   }
16115
16116  //decode_failure:
16117   *decode_OK = False;
16118   return deltaIN;
16119
16120  decode_success:
16121   *decode_OK = True;
16122   return delta;
16123}
16124
16125
16126/*------------------------------------------------------------*/
16127/*---                                                      ---*/
16128/*--- Top-level SSE4: dis_ESC_0F38__SSE4                   ---*/
16129/*---                                                      ---*/
16130/*------------------------------------------------------------*/
16131
16132static IRTemp math_PBLENDVB_128 ( IRTemp vecE, IRTemp vecG,
16133                                  IRTemp vec0/*controlling mask*/,
16134                                  UInt gran, IROp opSAR )
16135{
16136   /* The tricky bit is to convert vec0 into a suitable mask, by
16137      copying the most significant bit of each lane into all positions
16138      in the lane. */
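   /* E.g. for gran == 4 (blendvps) the shift amount is 31: a control
      lane of 0x80000000 becomes 0xFFFFFFFF (take the E lane), while
      0x7FFFFFFF becomes 0x00000000 (take the G lane). */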
16139   IRTemp sh = newTemp(Ity_I8);
16140   assign(sh, mkU8(8 * gran - 1));
16141
16142   IRTemp mask = newTemp(Ity_V128);
16143   assign(mask, binop(opSAR, mkexpr(vec0), mkexpr(sh)));
16144
16145   IRTemp notmask = newTemp(Ity_V128);
16146   assign(notmask, unop(Iop_NotV128, mkexpr(mask)));
16147
16148   IRTemp res = newTemp(Ity_V128);
16149   assign(res,  binop(Iop_OrV128,
16150                      binop(Iop_AndV128, mkexpr(vecE), mkexpr(mask)),
16151                      binop(Iop_AndV128, mkexpr(vecG), mkexpr(notmask))));
16152   return res;
16153}
16154
16155static IRTemp math_PBLENDVB_256 ( IRTemp vecE, IRTemp vecG,
16156                                  IRTemp vec0/*controlling mask*/,
16157                                  UInt gran, IROp opSAR128 )
16158{
16159   /* The tricky bit is to convert vec0 into a suitable mask, by
16160      copying the most significant bit of each lane into all positions
16161      in the lane. */
16162   IRTemp sh = newTemp(Ity_I8);
16163   assign(sh, mkU8(8 * gran - 1));
16164
16165   IRTemp vec0Hi = IRTemp_INVALID;
16166   IRTemp vec0Lo = IRTemp_INVALID;
16167   breakupV256toV128s( vec0, &vec0Hi, &vec0Lo );
16168
16169   IRTemp mask = newTemp(Ity_V256);
16170   assign(mask, binop(Iop_V128HLtoV256,
16171                      binop(opSAR128, mkexpr(vec0Hi), mkexpr(sh)),
16172                      binop(opSAR128, mkexpr(vec0Lo), mkexpr(sh))));
16173
16174   IRTemp notmask = newTemp(Ity_V256);
16175   assign(notmask, unop(Iop_NotV256, mkexpr(mask)));
16176
16177   IRTemp res = newTemp(Ity_V256);
16178   assign(res,  binop(Iop_OrV256,
16179                      binop(Iop_AndV256, mkexpr(vecE), mkexpr(mask)),
16180                      binop(Iop_AndV256, mkexpr(vecG), mkexpr(notmask))));
16181   return res;
16182}
16183
16184static Long dis_VBLENDV_128 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
16185                              const HChar *name, UInt gran, IROp opSAR )
16186{
16187   IRTemp addr   = IRTemp_INVALID;
16188   Int    alen   = 0;
16189   HChar  dis_buf[50];
16190   UChar  modrm  = getUChar(delta);
16191   UInt   rG     = gregOfRexRM(pfx, modrm);
16192   UInt   rV     = getVexNvvvv(pfx);
16193   UInt   rIS4   = 0xFF; /* invalid */
16194   IRTemp vecE   = newTemp(Ity_V128);
16195   IRTemp vecV   = newTemp(Ity_V128);
16196   IRTemp vecIS4 = newTemp(Ity_V128);
16197   if (epartIsReg(modrm)) {
16198      delta++;
16199      UInt rE = eregOfRexRM(pfx, modrm);
16200      assign(vecE, getXMMReg(rE));
16201      UChar ib = getUChar(delta);
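      /* Bits 7:4 of the imm8 select the fourth ("is4") register. */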
16202      rIS4 = (ib >> 4) & 0xF;
16203      DIP("%s %s,%s,%s,%s\n",
16204          name, nameXMMReg(rIS4), nameXMMReg(rE),
16205          nameXMMReg(rV), nameXMMReg(rG));
16206   } else {
16207      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16208      delta += alen;
16209      assign(vecE, loadLE(Ity_V128, mkexpr(addr)));
16210      UChar ib = getUChar(delta);
16211      rIS4 = (ib >> 4) & 0xF;
16212      DIP("%s %s,%s,%s,%s\n",
16213          name, nameXMMReg(rIS4), dis_buf, nameXMMReg(rV), nameXMMReg(rG));
16214   }
16215   delta++;
16216   assign(vecV,   getXMMReg(rV));
16217   assign(vecIS4, getXMMReg(rIS4));
16218   IRTemp res = math_PBLENDVB_128( vecE, vecV, vecIS4, gran, opSAR );
16219   putYMMRegLoAndZU( rG, mkexpr(res) );
16220   return delta;
16221}
16222
16223static Long dis_VBLENDV_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta,
16224                              const HChar *name, UInt gran, IROp opSAR128 )
16225{
16226   IRTemp addr   = IRTemp_INVALID;
16227   Int    alen   = 0;
16228   HChar  dis_buf[50];
16229   UChar  modrm  = getUChar(delta);
16230   UInt   rG     = gregOfRexRM(pfx, modrm);
16231   UInt   rV     = getVexNvvvv(pfx);
16232   UInt   rIS4   = 0xFF; /* invalid */
16233   IRTemp vecE   = newTemp(Ity_V256);
16234   IRTemp vecV   = newTemp(Ity_V256);
16235   IRTemp vecIS4 = newTemp(Ity_V256);
16236   if (epartIsReg(modrm)) {
16237      delta++;
16238      UInt rE = eregOfRexRM(pfx, modrm);
16239      assign(vecE, getYMMReg(rE));
16240      UChar ib = getUChar(delta);
16241      rIS4 = (ib >> 4) & 0xF;
16242      DIP("%s %s,%s,%s,%s\n",
16243          name, nameYMMReg(rIS4), nameYMMReg(rE),
16244          nameYMMReg(rV), nameYMMReg(rG));
16245   } else {
16246      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
16247      delta += alen;
16248      assign(vecE, loadLE(Ity_V256, mkexpr(addr)));
16249      UChar ib = getUChar(delta);
16250      rIS4 = (ib >> 4) & 0xF;
16251      DIP("%s %s,%s,%s,%s\n",
16252          name, nameYMMReg(rIS4), dis_buf, nameYMMReg(rV), nameYMMReg(rG));
16253   }
16254   delta++;
16255   assign(vecV,   getYMMReg(rV));
16256   assign(vecIS4, getYMMReg(rIS4));
16257   IRTemp res = math_PBLENDVB_256( vecE, vecV, vecIS4, gran, opSAR128 );
16258   putYMMReg( rG, mkexpr(res) );
16259   return delta;
16260}
16261
16262static void finish_xTESTy ( IRTemp andV, IRTemp andnV, Int sign )
16263{
16264   /* Set Z=1 iff (vecE & vecG) == 0
16265      Set C=1 iff (vecE & not vecG) == 0
16266   */
16267
16268   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
16269
   /* Reduce andV resp. andnV to 64-bit values by OR-ing the top and
      bottom 64 bits together.  This relies on the following trick:
16272
16273      InterleaveLO64x2([a,b],[c,d]) == [b,d]    hence
16274
16275      InterleaveLO64x2([a,b],[a,b]) == [b,b]    and similarly
16276      InterleaveHI64x2([a,b],[a,b]) == [a,a]
16277
16278      and so the OR of the above 2 exprs produces
16279      [a OR b, a OR b], from which we simply take the lower half.
16280   */
16281   IRTemp and64  = newTemp(Ity_I64);
16282   IRTemp andn64 = newTemp(Ity_I64);
16283
16284   assign(and64,
16285          unop(Iop_V128to64,
16286               binop(Iop_OrV128,
16287                     binop(Iop_InterleaveLO64x2,
16288                           mkexpr(andV), mkexpr(andV)),
16289                     binop(Iop_InterleaveHI64x2,
16290                           mkexpr(andV), mkexpr(andV)))));
16291
16292   assign(andn64,
16293          unop(Iop_V128to64,
16294               binop(Iop_OrV128,
16295                     binop(Iop_InterleaveLO64x2,
16296                           mkexpr(andnV), mkexpr(andnV)),
16297                     binop(Iop_InterleaveHI64x2,
16298                           mkexpr(andnV), mkexpr(andnV)))));
16299
16300   IRTemp z64 = newTemp(Ity_I64);
16301   IRTemp c64 = newTemp(Ity_I64);
16302   if (sign == 64) {
      /* When only interested in the most significant bit, just shift
         arithmetically right and invert.  */
16305      assign(z64,
16306             unop(Iop_Not64,
16307                  binop(Iop_Sar64, mkexpr(and64), mkU8(63))));
16308
16309      assign(c64,
16310             unop(Iop_Not64,
16311                  binop(Iop_Sar64, mkexpr(andn64), mkU8(63))));
16312   } else {
16313      if (sign == 32) {
16314         /* When interested in bit 31 and bit 63, mask those bits and
16315            fallthrough into the PTEST handling.  */
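         /* (For VTESTPS, bits 31 and 63 of the reduced 64-bit values
            are the ORs of the sign bits of the four 32-bit lanes, so
            only those two bit positions matter.) */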
16316         IRTemp t0 = newTemp(Ity_I64);
16317         IRTemp t1 = newTemp(Ity_I64);
16318         IRTemp t2 = newTemp(Ity_I64);
16319         assign(t0, mkU64(0x8000000080000000ULL));
16320         assign(t1, binop(Iop_And64, mkexpr(and64), mkexpr(t0)));
16321         assign(t2, binop(Iop_And64, mkexpr(andn64), mkexpr(t0)));
16322         and64 = t1;
16323         andn64 = t2;
16324      }
16325      /* Now convert and64, andn64 to all-zeroes or all-1s, so we can
16326         slice out the Z and C bits conveniently.  We use the standard
16327         trick all-zeroes -> all-zeroes, anything-else -> all-ones
16328         done by "(x | -x) >>s (word-size - 1)".
16329      */
      assign(z64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(and64)),
                              mkexpr(and64)),
                        mkU8(63))));
16336
      assign(c64,
             unop(Iop_Not64,
                  binop(Iop_Sar64,
                        binop(Iop_Or64,
                              binop(Iop_Sub64, mkU64(0), mkexpr(andn64)),
                              mkexpr(andn64)),
                        mkU8(63))));
16343   }
16344
16345   /* And finally, slice out the Z and C flags and set the flags
16346      thunk to COPY for them.  OSAP are set to zero. */
16347   IRTemp newOSZACP = newTemp(Ity_I64);
16348   assign(newOSZACP,
16349          binop(Iop_Or64,
16350                binop(Iop_And64, mkexpr(z64), mkU64(AMD64G_CC_MASK_Z)),
16351                binop(Iop_And64, mkexpr(c64), mkU64(AMD64G_CC_MASK_C))));
16352
16353   stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(newOSZACP)));
16354   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
16355   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
16356   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
16357}
16358
16359
16360/* Handles 128 bit versions of PTEST, VTESTPS or VTESTPD.
16361   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
16362static Long dis_xTESTy_128 ( VexAbiInfo* vbi, Prefix pfx,
16363                             Long delta, Bool isAvx, Int sign )
16364{
16365   IRTemp addr   = IRTemp_INVALID;
16366   Int    alen   = 0;
16367   HChar  dis_buf[50];
16368   UChar  modrm  = getUChar(delta);
16369   UInt   rG     = gregOfRexRM(pfx, modrm);
16370   IRTemp vecE = newTemp(Ity_V128);
16371   IRTemp vecG = newTemp(Ity_V128);
16372
16373   if ( epartIsReg(modrm) ) {
16374      UInt rE = eregOfRexRM(pfx, modrm);
16375      assign(vecE, getXMMReg(rE));
16376      delta += 1;
16377      DIP( "%s%stest%s %s,%s\n",
16378           isAvx ? "v" : "", sign == 0 ? "p" : "",
16379           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16380           nameXMMReg(rE), nameXMMReg(rG) );
16381   } else {
16382      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16383      if (!isAvx)
16384         gen_SEGV_if_not_16_aligned( addr );
16385      assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
16386      delta += alen;
16387      DIP( "%s%stest%s %s,%s\n",
16388           isAvx ? "v" : "", sign == 0 ? "p" : "",
16389           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16390           dis_buf, nameXMMReg(rG) );
16391   }
16392
16393   assign(vecG, getXMMReg(rG));
16394
16395   /* Set Z=1 iff (vecE & vecG) == 0
16396      Set C=1 iff (vecE & not vecG) == 0
16397   */
16398
16399   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
16400   IRTemp andV  = newTemp(Ity_V128);
16401   IRTemp andnV = newTemp(Ity_V128);
16402   assign(andV,  binop(Iop_AndV128, mkexpr(vecE), mkexpr(vecG)));
16403   assign(andnV, binop(Iop_AndV128,
16404                       mkexpr(vecE),
16405                       binop(Iop_XorV128, mkexpr(vecG),
16406                                          mkV128(0xFFFF))));
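   /* (mkV128(0xFFFF) is the all-ones V128 -- each bit of the 16-bit
      immediate expands to a whole byte -- so the Xor here is just
      NotV128.) */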
16407
16408   finish_xTESTy ( andV, andnV, sign );
16409   return delta;
16410}
16411
16412
16413/* Handles 256 bit versions of PTEST, VTESTPS or VTESTPD.
16414   sign is 0 for PTEST insn, 32 for VTESTPS and 64 for VTESTPD. */
16415static Long dis_xTESTy_256 ( VexAbiInfo* vbi, Prefix pfx,
16416                             Long delta, Int sign )
16417{
16418   IRTemp addr   = IRTemp_INVALID;
16419   Int    alen   = 0;
16420   HChar  dis_buf[50];
16421   UChar  modrm  = getUChar(delta);
16422   UInt   rG     = gregOfRexRM(pfx, modrm);
16423   IRTemp vecE   = newTemp(Ity_V256);
16424   IRTemp vecG   = newTemp(Ity_V256);
16425
16426   if ( epartIsReg(modrm) ) {
16427      UInt rE = eregOfRexRM(pfx, modrm);
16428      assign(vecE, getYMMReg(rE));
16429      delta += 1;
16430      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
16431           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16432           nameYMMReg(rE), nameYMMReg(rG) );
16433   } else {
16434      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16435      assign(vecE, loadLE( Ity_V256, mkexpr(addr) ));
16436      delta += alen;
16437      DIP( "v%stest%s %s,%s\n", sign == 0 ? "p" : "",
16438           sign == 0 ? "" : sign == 32 ? "ps" : "pd",
16439           dis_buf, nameYMMReg(rG) );
16440   }
16441
16442   assign(vecG, getYMMReg(rG));
16443
16444   /* Set Z=1 iff (vecE & vecG) == 0
16445      Set C=1 iff (vecE & not vecG) == 0
16446   */
16447
16448   /* andV, andnV:  vecE & vecG,  vecE and not(vecG) */
16449   IRTemp andV  = newTemp(Ity_V256);
16450   IRTemp andnV = newTemp(Ity_V256);
16451   assign(andV,  binop(Iop_AndV256, mkexpr(vecE), mkexpr(vecG)));
16452   assign(andnV, binop(Iop_AndV256,
16453                       mkexpr(vecE), unop(Iop_NotV256, mkexpr(vecG))));
16454
16455   IRTemp andVhi  = IRTemp_INVALID;
16456   IRTemp andVlo  = IRTemp_INVALID;
16457   IRTemp andnVhi = IRTemp_INVALID;
16458   IRTemp andnVlo = IRTemp_INVALID;
16459   breakupV256toV128s( andV, &andVhi, &andVlo );
16460   breakupV256toV128s( andnV, &andnVhi, &andnVlo );
16461
16462   IRTemp andV128  = newTemp(Ity_V128);
16463   IRTemp andnV128 = newTemp(Ity_V128);
16464   assign( andV128, binop( Iop_OrV128, mkexpr(andVhi), mkexpr(andVlo) ) );
16465   assign( andnV128, binop( Iop_OrV128, mkexpr(andnVhi), mkexpr(andnVlo) ) );
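   /* Since corresponding lanes line up, OR-ing the two 128-bit halves
      preserves both overall is-zero-ness and the per-lane sign bits
      that finish_xTESTy inspects. */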
16466
16467   finish_xTESTy ( andV128, andnV128, sign );
16468   return delta;
16469}
16470
16471
16472/* Handles 128 bit versions of PMOVZXBW and PMOVSXBW. */
16473static Long dis_PMOVxXBW_128 ( VexAbiInfo* vbi, Prefix pfx,
16474                               Long delta, Bool isAvx, Bool xIsZ )
16475{
16476   IRTemp addr   = IRTemp_INVALID;
16477   Int    alen   = 0;
16478   HChar  dis_buf[50];
16479   IRTemp srcVec = newTemp(Ity_V128);
16480   UChar  modrm  = getUChar(delta);
16481   const HChar* mbV    = isAvx ? "v" : "";
16482   const HChar  how    = xIsZ ? 'z' : 's';
16483   UInt   rG     = gregOfRexRM(pfx, modrm);
16484   if ( epartIsReg(modrm) ) {
16485      UInt rE = eregOfRexRM(pfx, modrm);
16486      assign( srcVec, getXMMReg(rE) );
16487      delta += 1;
16488      DIP( "%spmov%cxbw %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
16489   } else {
16490      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16491      assign( srcVec,
16492              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
16493      delta += alen;
16494      DIP( "%spmov%cxbw %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
16495   }
16496
16497   IRExpr* res
16498      = xIsZ /* do math for either zero or sign extend */
16499        ? binop( Iop_InterleaveLO8x16,
16500                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
16501        : binop( Iop_SarN16x8,
16502                 binop( Iop_ShlN16x8,
16503                        binop( Iop_InterleaveLO8x16,
16504                               IRExpr_Const( IRConst_V128(0) ),
16505                               mkexpr(srcVec) ),
16506                        mkU8(8) ),
16507                 mkU8(8) );
16508
16509   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
16510
16511   return delta;
16512}
16513
16514
16515/* Handles 256 bit versions of PMOVZXBW and PMOVSXBW. */
16516static Long dis_PMOVxXBW_256 ( VexAbiInfo* vbi, Prefix pfx,
16517                               Long delta, Bool xIsZ )
16518{
16519   IRTemp addr   = IRTemp_INVALID;
16520   Int    alen   = 0;
16521   HChar  dis_buf[50];
16522   IRTemp srcVec = newTemp(Ity_V128);
16523   UChar  modrm  = getUChar(delta);
16524   UChar  how    = xIsZ ? 'z' : 's';
16525   UInt   rG     = gregOfRexRM(pfx, modrm);
16526   if ( epartIsReg(modrm) ) {
16527      UInt rE = eregOfRexRM(pfx, modrm);
16528      assign( srcVec, getXMMReg(rE) );
16529      delta += 1;
16530      DIP( "vpmov%cxbw %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
16531   } else {
16532      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16533      assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
16534      delta += alen;
16535      DIP( "vpmov%cxbw %s,%s\n", how, dis_buf, nameYMMReg(rG) );
16536   }
16537
16538   /* First do zero extend.  */
16539   IRExpr* res
16540      = binop( Iop_V128HLtoV256,
16541               binop( Iop_InterleaveHI8x16,
16542                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
16543               binop( Iop_InterleaveLO8x16,
16544                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
   /* Then, if needed, sign extend as well.  */
16546   if (!xIsZ)
16547      res = binop( Iop_SarN16x16,
16548                   binop( Iop_ShlN16x16, res, mkU8(8) ), mkU8(8) );
16549
16550   putYMMReg ( rG, res );
16551
16552   return delta;
16553}
16554
16555
16556static Long dis_PMOVxXWD_128 ( VexAbiInfo* vbi, Prefix pfx,
16557                               Long delta, Bool isAvx, Bool xIsZ )
16558{
16559   IRTemp addr   = IRTemp_INVALID;
16560   Int    alen   = 0;
16561   HChar  dis_buf[50];
16562   IRTemp srcVec = newTemp(Ity_V128);
16563   UChar  modrm  = getUChar(delta);
16564   const HChar* mbV    = isAvx ? "v" : "";
16565   const HChar  how    = xIsZ ? 'z' : 's';
16566   UInt   rG     = gregOfRexRM(pfx, modrm);
16567
16568   if ( epartIsReg(modrm) ) {
16569      UInt rE = eregOfRexRM(pfx, modrm);
16570      assign( srcVec, getXMMReg(rE) );
16571      delta += 1;
16572      DIP( "%spmov%cxwd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
16573   } else {
16574      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16575      assign( srcVec,
16576              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
16577      delta += alen;
16578      DIP( "%spmov%cxwd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
16579   }
16580
16581   IRExpr* res
16582      = binop( Iop_InterleaveLO16x8,
16583               IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) );
16584   if (!xIsZ)
16585      res = binop(Iop_SarN32x4,
16586                  binop(Iop_ShlN32x4, res, mkU8(16)), mkU8(16));
16587
   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
16590
16591   return delta;
16592}
16593
16594
16595static Long dis_PMOVxXWD_256 ( VexAbiInfo* vbi, Prefix pfx,
16596                               Long delta, Bool xIsZ )
16597{
16598   IRTemp addr   = IRTemp_INVALID;
16599   Int    alen   = 0;
16600   HChar  dis_buf[50];
16601   IRTemp srcVec = newTemp(Ity_V128);
16602   UChar  modrm  = getUChar(delta);
16603   UChar  how    = xIsZ ? 'z' : 's';
16604   UInt   rG     = gregOfRexRM(pfx, modrm);
16605
16606   if ( epartIsReg(modrm) ) {
16607      UInt rE = eregOfRexRM(pfx, modrm);
16608      assign( srcVec, getXMMReg(rE) );
16609      delta += 1;
16610      DIP( "vpmov%cxwd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
16611   } else {
16612      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16613      assign( srcVec, loadLE( Ity_V128, mkexpr(addr) ) );
16614      delta += alen;
16615      DIP( "vpmov%cxwd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
16616   }
16617
16618   IRExpr* res
16619      = binop( Iop_V128HLtoV256,
16620               binop( Iop_InterleaveHI16x8,
16621                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
16622               binop( Iop_InterleaveLO16x8,
16623                      IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
16624   if (!xIsZ)
16625      res = binop(Iop_SarN32x8,
16626                  binop(Iop_ShlN32x8, res, mkU8(16)), mkU8(16));
16627
16628   putYMMReg ( rG, res );
16629
16630   return delta;
16631}
16632
16633
16634static Long dis_PMOVSXWQ_128 ( VexAbiInfo* vbi, Prefix pfx,
16635                               Long delta, Bool isAvx )
16636{
16637   IRTemp addr     = IRTemp_INVALID;
16638   Int    alen     = 0;
16639   HChar  dis_buf[50];
16640   IRTemp srcBytes = newTemp(Ity_I32);
16641   UChar  modrm    = getUChar(delta);
16642   const HChar* mbV = isAvx ? "v" : "";
16643   UInt   rG       = gregOfRexRM(pfx, modrm);
16644
16645   if ( epartIsReg( modrm ) ) {
16646      UInt rE = eregOfRexRM(pfx, modrm);
16647      assign( srcBytes, getXMMRegLane32( rE, 0 ) );
16648      delta += 1;
16649      DIP( "%spmovsxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
16650   } else {
16651      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16652      assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
16653      delta += alen;
16654      DIP( "%spmovsxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
16655   }
16656
16657   (isAvx ? putYMMRegLoAndZU : putXMMReg)
16658      ( rG, binop( Iop_64HLtoV128,
16659                   unop( Iop_16Sto64,
16660                         unop( Iop_32HIto16, mkexpr(srcBytes) ) ),
16661                   unop( Iop_16Sto64,
16662                         unop( Iop_32to16, mkexpr(srcBytes) ) ) ) );
16663   return delta;
16664}
16665
16666
16667static Long dis_PMOVSXWQ_256 ( VexAbiInfo* vbi, Prefix pfx, Long delta )
16668{
16669   IRTemp addr     = IRTemp_INVALID;
16670   Int    alen     = 0;
16671   HChar  dis_buf[50];
16672   IRTemp srcBytes = newTemp(Ity_I64);
16673   UChar  modrm    = getUChar(delta);
16674   UInt   rG       = gregOfRexRM(pfx, modrm);
16675   IRTemp s3, s2, s1, s0;
16676   s3 = s2 = s1 = s0 = IRTemp_INVALID;
16677
16678   if ( epartIsReg( modrm ) ) {
16679      UInt rE = eregOfRexRM(pfx, modrm);
16680      assign( srcBytes, getXMMRegLane64( rE, 0 ) );
16681      delta += 1;
16682      DIP( "vpmovsxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
16683   } else {
16684      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16685      assign( srcBytes, loadLE( Ity_I64, mkexpr(addr) ) );
16686      delta += alen;
16687      DIP( "vpmovsxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
16688   }
16689
16690   breakup64to16s( srcBytes, &s3, &s2, &s1, &s0 );
16691   putYMMReg( rG, binop( Iop_V128HLtoV256,
16692                         binop( Iop_64HLtoV128,
16693                                unop( Iop_16Sto64, mkexpr(s3) ),
16694                                unop( Iop_16Sto64, mkexpr(s2) ) ),
16695                         binop( Iop_64HLtoV128,
16696                                unop( Iop_16Sto64, mkexpr(s1) ),
16697                                unop( Iop_16Sto64, mkexpr(s0) ) ) ) );
16698   return delta;
16699}
16700
16701
16702static Long dis_PMOVZXWQ_128 ( VexAbiInfo* vbi, Prefix pfx,
16703                               Long delta, Bool isAvx )
16704{
16705   IRTemp addr     = IRTemp_INVALID;
16706   Int    alen     = 0;
16707   HChar  dis_buf[50];
16708   IRTemp srcVec = newTemp(Ity_V128);
16709   UChar  modrm    = getUChar(delta);
16710   const HChar* mbV = isAvx ? "v" : "";
16711   UInt   rG       = gregOfRexRM(pfx, modrm);
16712
16713   if ( epartIsReg( modrm ) ) {
16714      UInt rE = eregOfRexRM(pfx, modrm);
16715      assign( srcVec, getXMMReg(rE) );
16716      delta += 1;
16717      DIP( "%spmovzxwq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
16718   } else {
16719      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16720      assign( srcVec,
16721              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
16722      delta += alen;
16723      DIP( "%spmovzxwq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
16724   }
16725
16726   IRTemp zeroVec = newTemp( Ity_V128 );
16727   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
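   /* Two rounds of interleaving against zero widen each 16-bit lane
      first to 32 and then to 64 bits; only the low two words of the
      source reach the result. */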
16728
16729   (isAvx ? putYMMRegLoAndZU : putXMMReg)
16730      ( rG, binop( Iop_InterleaveLO16x8,
16731                   mkexpr(zeroVec),
16732                   binop( Iop_InterleaveLO16x8,
16733                          mkexpr(zeroVec), mkexpr(srcVec) ) ) );
16734   return delta;
16735}
16736
16737
16738static Long dis_PMOVZXWQ_256 ( VexAbiInfo* vbi, Prefix pfx,
16739                               Long delta )
16740{
16741   IRTemp addr     = IRTemp_INVALID;
16742   Int    alen     = 0;
16743   HChar  dis_buf[50];
16744   IRTemp srcVec = newTemp(Ity_V128);
16745   UChar  modrm    = getUChar(delta);
16746   UInt   rG       = gregOfRexRM(pfx, modrm);
16747
16748   if ( epartIsReg( modrm ) ) {
16749      UInt rE = eregOfRexRM(pfx, modrm);
16750      assign( srcVec, getXMMReg(rE) );
16751      delta += 1;
16752      DIP( "vpmovzxwq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
16753   } else {
16754      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16755      assign( srcVec,
16756              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
16757      delta += alen;
16758      DIP( "vpmovzxwq %s,%s\n", dis_buf, nameYMMReg(rG) );
16759   }
16760
16761   IRTemp zeroVec = newTemp( Ity_V128 );
16762   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
16763
16764   putYMMReg( rG, binop( Iop_V128HLtoV256,
16765                         binop( Iop_InterleaveHI16x8,
16766                                mkexpr(zeroVec),
16767                                binop( Iop_InterleaveLO16x8,
16768                                       mkexpr(zeroVec), mkexpr(srcVec) ) ),
16769                         binop( Iop_InterleaveLO16x8,
16770                                mkexpr(zeroVec),
16771                                binop( Iop_InterleaveLO16x8,
16772                                       mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
16773   return delta;
16774}
16775
16776
16777/* Handles 128 bit versions of PMOVZXDQ and PMOVSXDQ. */
16778static Long dis_PMOVxXDQ_128 ( VexAbiInfo* vbi, Prefix pfx,
16779                               Long delta, Bool isAvx, Bool xIsZ )
16780{
16781   IRTemp addr   = IRTemp_INVALID;
16782   Int    alen   = 0;
16783   HChar  dis_buf[50];
16784   IRTemp srcI64 = newTemp(Ity_I64);
16785   IRTemp srcVec = newTemp(Ity_V128);
16786   UChar  modrm  = getUChar(delta);
16787   const HChar* mbV = isAvx ? "v" : "";
16788   const HChar  how = xIsZ ? 'z' : 's';
16789   UInt   rG     = gregOfRexRM(pfx, modrm);
16790   /* Compute both srcI64 -- the value to expand -- and srcVec -- same
16791      thing in a V128, with arbitrary junk in the top 64 bits.  Use
16792      one or both of them and let iropt clean up afterwards (as
16793      usual). */
16794   if ( epartIsReg(modrm) ) {
16795      UInt rE = eregOfRexRM(pfx, modrm);
16796      assign( srcVec, getXMMReg(rE) );
16797      assign( srcI64, unop(Iop_V128to64, mkexpr(srcVec)) );
16798      delta += 1;
16799      DIP( "%spmov%cxdq %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
16800   } else {
16801      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16802      assign( srcI64, loadLE(Ity_I64, mkexpr(addr)) );
16803      assign( srcVec, unop( Iop_64UtoV128, mkexpr(srcI64)) );
16804      delta += alen;
16805      DIP( "%spmov%cxdq %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
16806   }
16807
16808   IRExpr* res
16809      = xIsZ /* do math for either zero or sign extend */
16810        ? binop( Iop_InterleaveLO32x4,
16811                 IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) )
16812        : binop( Iop_64HLtoV128,
16813                 unop( Iop_32Sto64,
16814                       unop( Iop_64HIto32, mkexpr(srcI64) ) ),
16815                 unop( Iop_32Sto64,
16816                       unop( Iop_64to32, mkexpr(srcI64) ) ) );
16817
16818   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
16819
16820   return delta;
16821}
16822
16823
16824/* Handles 256 bit versions of PMOVZXDQ and PMOVSXDQ. */
16825static Long dis_PMOVxXDQ_256 ( VexAbiInfo* vbi, Prefix pfx,
16826                               Long delta, Bool xIsZ )
16827{
16828   IRTemp addr   = IRTemp_INVALID;
16829   Int    alen   = 0;
16830   HChar  dis_buf[50];
16831   IRTemp srcVec = newTemp(Ity_V128);
16832   UChar  modrm  = getUChar(delta);
16833   UChar  how    = xIsZ ? 'z' : 's';
16834   UInt   rG     = gregOfRexRM(pfx, modrm);
   /* Compute srcVec, the value to expand, as a V128.  Unlike the
      128-bit case, no separate I64 view of the source is needed,
      since all four 32-bit source lanes are used. */
16839   if ( epartIsReg(modrm) ) {
16840      UInt rE = eregOfRexRM(pfx, modrm);
16841      assign( srcVec, getXMMReg(rE) );
16842      delta += 1;
16843      DIP( "vpmov%cxdq %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
16844   } else {
16845      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16846      assign( srcVec, loadLE(Ity_V128, mkexpr(addr)) );
16847      delta += alen;
16848      DIP( "vpmov%cxdq %s,%s\n", how, dis_buf, nameYMMReg(rG) );
16849   }
16850
16851   IRExpr* res;
16852   if (xIsZ)
16853      res = binop( Iop_V128HLtoV256,
16854                   binop( Iop_InterleaveHI32x4,
16855                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ),
16856                   binop( Iop_InterleaveLO32x4,
16857                          IRExpr_Const( IRConst_V128(0) ), mkexpr(srcVec) ) );
16858   else {
16859      IRTemp s3, s2, s1, s0;
16860      s3 = s2 = s1 = s0 = IRTemp_INVALID;
16861      breakupV128to32s( srcVec, &s3, &s2, &s1, &s0 );
16862      res = binop( Iop_V128HLtoV256,
16863                   binop( Iop_64HLtoV128,
16864                          unop( Iop_32Sto64, mkexpr(s3) ),
16865                          unop( Iop_32Sto64, mkexpr(s2) ) ),
16866                   binop( Iop_64HLtoV128,
16867                          unop( Iop_32Sto64, mkexpr(s1) ),
16868                          unop( Iop_32Sto64, mkexpr(s0) ) ) );
16869   }
16870
16871   putYMMReg ( rG, res );
16872
16873   return delta;
16874}
16875
16876
16877/* Handles 128 bit versions of PMOVZXBD and PMOVSXBD. */
16878static Long dis_PMOVxXBD_128 ( VexAbiInfo* vbi, Prefix pfx,
16879                               Long delta, Bool isAvx, Bool xIsZ )
16880{
16881   IRTemp addr   = IRTemp_INVALID;
16882   Int    alen   = 0;
16883   HChar  dis_buf[50];
16884   IRTemp srcVec = newTemp(Ity_V128);
16885   UChar  modrm  = getUChar(delta);
16886   const HChar* mbV = isAvx ? "v" : "";
16887   const HChar  how = xIsZ ? 'z' : 's';
16888   UInt   rG     = gregOfRexRM(pfx, modrm);
16889   if ( epartIsReg(modrm) ) {
16890      UInt rE = eregOfRexRM(pfx, modrm);
16891      assign( srcVec, getXMMReg(rE) );
16892      delta += 1;
16893      DIP( "%spmov%cxbd %s,%s\n", mbV, how, nameXMMReg(rE), nameXMMReg(rG) );
16894   } else {
16895      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16896      assign( srcVec,
16897              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) ) ) );
16898      delta += alen;
16899      DIP( "%spmov%cxbd %s,%s\n", mbV, how, dis_buf, nameXMMReg(rG) );
16900   }
16901
16902   IRTemp zeroVec = newTemp(Ity_V128);
16903   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
16904
16905   IRExpr* res
16906      = binop(Iop_InterleaveLO8x16,
16907              mkexpr(zeroVec),
16908              binop(Iop_InterleaveLO8x16,
16909                    mkexpr(zeroVec), mkexpr(srcVec)));
16910   if (!xIsZ)
16911      res = binop(Iop_SarN32x4,
16912                  binop(Iop_ShlN32x4, res, mkU8(24)), mkU8(24));
16913
16914   (isAvx ? putYMMRegLoAndZU : putXMMReg) ( rG, res );
16915
16916   return delta;
16917}
16918
16919
16920/* Handles 256 bit versions of PMOVZXBD and PMOVSXBD. */
16921static Long dis_PMOVxXBD_256 ( VexAbiInfo* vbi, Prefix pfx,
16922                               Long delta, Bool xIsZ )
16923{
16924   IRTemp addr   = IRTemp_INVALID;
16925   Int    alen   = 0;
16926   HChar  dis_buf[50];
16927   IRTemp srcVec = newTemp(Ity_V128);
16928   UChar  modrm  = getUChar(delta);
16929   UChar  how    = xIsZ ? 'z' : 's';
16930   UInt   rG     = gregOfRexRM(pfx, modrm);
16931   if ( epartIsReg(modrm) ) {
16932      UInt rE = eregOfRexRM(pfx, modrm);
16933      assign( srcVec, getXMMReg(rE) );
16934      delta += 1;
16935      DIP( "vpmov%cxbd %s,%s\n", how, nameXMMReg(rE), nameYMMReg(rG) );
16936   } else {
16937      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16938      assign( srcVec,
16939              unop( Iop_64UtoV128, loadLE( Ity_I64, mkexpr(addr) ) ) );
16940      delta += alen;
16941      DIP( "vpmov%cxbd %s,%s\n", how, dis_buf, nameYMMReg(rG) );
16942   }
16943
16944   IRTemp zeroVec = newTemp(Ity_V128);
16945   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
16946
16947   IRExpr* res
16948      = binop( Iop_V128HLtoV256,
16949               binop(Iop_InterleaveHI8x16,
16950                     mkexpr(zeroVec),
16951                     binop(Iop_InterleaveLO8x16,
16952                           mkexpr(zeroVec), mkexpr(srcVec)) ),
16953               binop(Iop_InterleaveLO8x16,
16954                     mkexpr(zeroVec),
16955                     binop(Iop_InterleaveLO8x16,
16956                           mkexpr(zeroVec), mkexpr(srcVec)) ) );
16957   if (!xIsZ)
16958      res = binop(Iop_SarN32x8,
16959                  binop(Iop_ShlN32x8, res, mkU8(24)), mkU8(24));
16960
16961   putYMMReg ( rG, res );
16962
16963   return delta;
16964}
16965
16966
16967/* Handles 128 bit versions of PMOVSXBQ. */
16968static Long dis_PMOVSXBQ_128 ( VexAbiInfo* vbi, Prefix pfx,
16969                               Long delta, Bool isAvx )
16970{
16971   IRTemp addr     = IRTemp_INVALID;
16972   Int    alen     = 0;
16973   HChar  dis_buf[50];
16974   IRTemp srcBytes = newTemp(Ity_I16);
16975   UChar  modrm    = getUChar(delta);
16976   const HChar* mbV = isAvx ? "v" : "";
16977   UInt   rG       = gregOfRexRM(pfx, modrm);
16978   if ( epartIsReg(modrm) ) {
16979      UInt rE = eregOfRexRM(pfx, modrm);
16980      assign( srcBytes, getXMMRegLane16( rE, 0 ) );
16981      delta += 1;
16982      DIP( "%spmovsxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
16983   } else {
16984      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
16985      assign( srcBytes, loadLE( Ity_I16, mkexpr(addr) ) );
16986      delta += alen;
16987      DIP( "%spmovsxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
16988   }
16989
16990   (isAvx ? putYMMRegLoAndZU : putXMMReg)
16991      ( rG, binop( Iop_64HLtoV128,
16992                   unop( Iop_8Sto64,
16993                         unop( Iop_16HIto8, mkexpr(srcBytes) ) ),
16994                   unop( Iop_8Sto64,
16995                         unop( Iop_16to8, mkexpr(srcBytes) ) ) ) );
16996   return delta;
16997}
16998
16999
17000/* Handles 256 bit versions of PMOVSXBQ. */
17001static Long dis_PMOVSXBQ_256 ( VexAbiInfo* vbi, Prefix pfx,
17002                               Long delta )
17003{
17004   IRTemp addr     = IRTemp_INVALID;
17005   Int    alen     = 0;
17006   HChar  dis_buf[50];
17007   IRTemp srcBytes = newTemp(Ity_I32);
17008   UChar  modrm    = getUChar(delta);
17009   UInt   rG       = gregOfRexRM(pfx, modrm);
17010   if ( epartIsReg(modrm) ) {
17011      UInt rE = eregOfRexRM(pfx, modrm);
17012      assign( srcBytes, getXMMRegLane32( rE, 0 ) );
17013      delta += 1;
17014      DIP( "vpmovsxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
17015   } else {
17016      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17017      assign( srcBytes, loadLE( Ity_I32, mkexpr(addr) ) );
17018      delta += alen;
17019      DIP( "vpmovsxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
17020   }
17021
17022   putYMMReg
17023      ( rG, binop( Iop_V128HLtoV256,
17024                   binop( Iop_64HLtoV128,
17025                          unop( Iop_8Sto64,
17026                                unop( Iop_16HIto8,
17027                                      unop( Iop_32HIto16,
17028                                            mkexpr(srcBytes) ) ) ),
17029                          unop( Iop_8Sto64,
17030                                unop( Iop_16to8,
17031                                      unop( Iop_32HIto16,
17032                                            mkexpr(srcBytes) ) ) ) ),
17033                   binop( Iop_64HLtoV128,
17034                          unop( Iop_8Sto64,
17035                                unop( Iop_16HIto8,
17036                                      unop( Iop_32to16,
17037                                            mkexpr(srcBytes) ) ) ),
17038                          unop( Iop_8Sto64,
17039                                unop( Iop_16to8,
17040                                      unop( Iop_32to16,
17041                                            mkexpr(srcBytes) ) ) ) ) ) );
17042   return delta;
17043}
17044
17045
17046/* Handles 128 bit versions of PMOVZXBQ. */
17047static Long dis_PMOVZXBQ_128 ( VexAbiInfo* vbi, Prefix pfx,
17048                               Long delta, Bool isAvx )
17049{
17050   IRTemp addr     = IRTemp_INVALID;
17051   Int    alen     = 0;
17052   HChar  dis_buf[50];
17053   IRTemp srcVec   = newTemp(Ity_V128);
17054   UChar  modrm    = getUChar(delta);
17055   const HChar* mbV = isAvx ? "v" : "";
17056   UInt   rG       = gregOfRexRM(pfx, modrm);
17057   if ( epartIsReg(modrm) ) {
17058      UInt rE = eregOfRexRM(pfx, modrm);
17059      assign( srcVec, getXMMReg(rE) );
17060      delta += 1;
17061      DIP( "%spmovzxbq %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG) );
17062   } else {
17063      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17064      assign( srcVec,
17065              unop( Iop_32UtoV128,
17066                    unop( Iop_16Uto32, loadLE( Ity_I16, mkexpr(addr) ))));
17067      delta += alen;
17068      DIP( "%spmovzxbq %s,%s\n", mbV, dis_buf, nameXMMReg(rG) );
17069   }
17070
17071   IRTemp zeroVec = newTemp(Ity_V128);
17072   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
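   /* Three rounds of interleaving against zero widen each byte
      8 -> 16 -> 32 -> 64 bits; only the low two source bytes survive
      into the 128-bit result. */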
17073
17074   (isAvx ? putYMMRegLoAndZU : putXMMReg)
17075      ( rG, binop( Iop_InterleaveLO8x16,
17076                   mkexpr(zeroVec),
17077                   binop( Iop_InterleaveLO8x16,
17078                          mkexpr(zeroVec),
17079                          binop( Iop_InterleaveLO8x16,
17080                                 mkexpr(zeroVec), mkexpr(srcVec) ) ) ) );
17081   return delta;
17082}
17083
17084
17085/* Handles 256 bit versions of PMOVZXBQ. */
17086static Long dis_PMOVZXBQ_256 ( VexAbiInfo* vbi, Prefix pfx,
17087                               Long delta )
17088{
17089   IRTemp addr     = IRTemp_INVALID;
17090   Int    alen     = 0;
17091   HChar  dis_buf[50];
17092   IRTemp srcVec   = newTemp(Ity_V128);
17093   UChar  modrm    = getUChar(delta);
17094   UInt   rG       = gregOfRexRM(pfx, modrm);
17095   if ( epartIsReg(modrm) ) {
17096      UInt rE = eregOfRexRM(pfx, modrm);
17097      assign( srcVec, getXMMReg(rE) );
17098      delta += 1;
17099      DIP( "vpmovzxbq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG) );
17100   } else {
17101      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17102      assign( srcVec,
17103              unop( Iop_32UtoV128, loadLE( Ity_I32, mkexpr(addr) )));
17104      delta += alen;
17105      DIP( "vpmovzxbq %s,%s\n", dis_buf, nameYMMReg(rG) );
17106   }
17107
17108   IRTemp zeroVec = newTemp(Ity_V128);
17109   assign( zeroVec, IRExpr_Const( IRConst_V128(0) ) );
17110
17111   putYMMReg
17112      ( rG, binop( Iop_V128HLtoV256,
17113                   binop( Iop_InterleaveHI8x16,
17114                          mkexpr(zeroVec),
17115                          binop( Iop_InterleaveLO8x16,
17116                                 mkexpr(zeroVec),
17117                                 binop( Iop_InterleaveLO8x16,
17118                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) ),
17119                   binop( Iop_InterleaveLO8x16,
17120                          mkexpr(zeroVec),
17121                          binop( Iop_InterleaveLO8x16,
17122                                 mkexpr(zeroVec),
17123                                 binop( Iop_InterleaveLO8x16,
17124                                        mkexpr(zeroVec), mkexpr(srcVec) ) ) )
17125                 ) );
17126   return delta;
17127}
17128
17129
17130static Long dis_PHMINPOSUW_128 ( VexAbiInfo* vbi, Prefix pfx,
17131                                 Long delta, Bool isAvx )
17132{
17133   IRTemp addr   = IRTemp_INVALID;
17134   Int    alen   = 0;
17135   HChar  dis_buf[50];
17136   UChar  modrm  = getUChar(delta);
17137   const HChar* mbV = isAvx ? "v" : "";
17138   IRTemp sV     = newTemp(Ity_V128);
17139   IRTemp sHi    = newTemp(Ity_I64);
17140   IRTemp sLo    = newTemp(Ity_I64);
17141   IRTemp dLo    = newTemp(Ity_I64);
17142   UInt   rG     = gregOfRexRM(pfx,modrm);
17143   if (epartIsReg(modrm)) {
17144      UInt rE = eregOfRexRM(pfx,modrm);
17145      assign( sV, getXMMReg(rE) );
17146      delta += 1;
17147      DIP("%sphminposuw %s,%s\n", mbV, nameXMMReg(rE), nameXMMReg(rG));
17148   } else {
17149      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
17150      if (!isAvx)
17151         gen_SEGV_if_not_16_aligned(addr);
17152      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
17153      delta += alen;
17154      DIP("%sphminposuw %s,%s\n", mbV, dis_buf, nameXMMReg(rG));
17155   }
17156   assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
17157   assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
17158   assign( dLo, mkIRExprCCall(
17159                   Ity_I64, 0/*regparms*/,
17160                   "amd64g_calculate_sse_phminposuw",
17161                   &amd64g_calculate_sse_phminposuw,
17162                   mkIRExprVec_2( mkexpr(sLo), mkexpr(sHi) )
17163         ));
17164   (isAvx ? putYMMRegLoAndZU : putXMMReg)
17165      (rG, unop(Iop_64UtoV128, mkexpr(dLo)));
17166   return delta;
17167}
17168
17169
17170static Long dis_AESx ( VexAbiInfo* vbi, Prefix pfx,
17171                       Long delta, Bool isAvx, UChar opc )
17172{
17173   IRTemp addr   = IRTemp_INVALID;
17174   Int    alen   = 0;
17175   HChar  dis_buf[50];
17176   UChar  modrm  = getUChar(delta);
17177   UInt   rG     = gregOfRexRM(pfx, modrm);
17178   UInt   regNoL = 0;
17179   UInt   regNoR = (isAvx && opc != 0xDB) ? getVexNvvvv(pfx) : rG;
17180
17181   /* This is a nasty kludge.  We need to pass 2 x V128 to the
17182      helper.  Since we can't do that, use a dirty
17183      helper to compute the results directly from the XMM regs in
17184      the guest state.  That means for the memory case, we need to
17185      move the left operand into a pseudo-register (XMM16, let's
17186      call it). */
17187   if (epartIsReg(modrm)) {
17188      regNoL = eregOfRexRM(pfx, modrm);
17189      delta += 1;
17190   } else {
17191      regNoL = 16; /* use XMM16 as an intermediary */
17192      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17193      /* alignment check needed ???? */
17194      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
17195      delta += alen;
17196   }
17197
17198   void*  fn = &amd64g_dirtyhelper_AES;
17199   const HChar* nm = "amd64g_dirtyhelper_AES";
17200
17201   /* Round up the arguments.  Note that this is a kludge -- the
17202      use of mkU64 rather than mkIRExpr_HWord implies the
17203      assumption that the host's word size is 64-bit. */
17204   UInt gstOffD = ymmGuestRegOffset(rG);
17205   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
17206   UInt gstOffR = ymmGuestRegOffset(regNoR);
17207   IRExpr*  opc4         = mkU64(opc);
17208   IRExpr*  gstOffDe     = mkU64(gstOffD);
17209   IRExpr*  gstOffLe     = mkU64(gstOffL);
17210   IRExpr*  gstOffRe     = mkU64(gstOffR);
17211   IRExpr** args
17212      = mkIRExprVec_5( IRExpr_BBPTR(), opc4, gstOffDe, gstOffLe, gstOffRe );
17213
17214   IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
17215   /* It's not really a dirty call, but we can't use the clean helper
17216      mechanism here for the very lame reason that we can't pass 2 x
17217      V128s by value to a helper.  Hence this roundabout scheme. */
17218   d->nFxState = 2;
17219   vex_bzero(&d->fxState, sizeof(d->fxState));
   /* AES{ENC,ENCLAST,DEC,DECLAST} read both registers, and write
      the second for !isAvx or the third for isAvx.
      AESIMC (0xDB) reads the first register, and writes the second. */
17223   d->fxState[0].fx     = Ifx_Read;
17224   d->fxState[0].offset = gstOffL;
17225   d->fxState[0].size   = sizeof(U128);
17226   d->fxState[1].offset = gstOffR;
17227   d->fxState[1].size   = sizeof(U128);
17228   if (opc == 0xDB)
17229      d->fxState[1].fx   = Ifx_Write;
17230   else if (!isAvx || rG == regNoR)
17231      d->fxState[1].fx   = Ifx_Modify;
17232   else {
17233      d->fxState[1].fx     = Ifx_Read;
17234      d->nFxState++;
17235      d->fxState[2].fx     = Ifx_Write;
17236      d->fxState[2].offset = gstOffD;
17237      d->fxState[2].size   = sizeof(U128);
17238   }
17239
17240   stmt( IRStmt_Dirty(d) );
17241   {
17242      const HChar* opsuf;
17243      switch (opc) {
17244         case 0xDC: opsuf = "enc"; break;
         case 0xDD: opsuf = "enclast"; break;
17246         case 0xDE: opsuf = "dec"; break;
17247         case 0xDF: opsuf = "declast"; break;
17248         case 0xDB: opsuf = "imc"; break;
17249         default: vassert(0);
17250      }
17251      DIP("%saes%s %s,%s%s%s\n", isAvx ? "v" : "", opsuf,
17252          (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
17253          nameXMMReg(regNoR),
17254          (isAvx && opc != 0xDB) ? "," : "",
17255          (isAvx && opc != 0xDB) ? nameXMMReg(rG) : "");
17256   }
17257   if (isAvx)
17258      putYMMRegLane128( rG, 1, mkV128(0) );
17259   return delta;
17260}
17261
17262static Long dis_AESKEYGENASSIST ( VexAbiInfo* vbi, Prefix pfx,
17263                                  Long delta, Bool isAvx )
17264{
17265   IRTemp addr   = IRTemp_INVALID;
17266   Int    alen   = 0;
17267   HChar  dis_buf[50];
17268   UChar  modrm  = getUChar(delta);
17269   UInt   regNoL = 0;
17270   UInt   regNoR = gregOfRexRM(pfx, modrm);
17271   UChar  imm    = 0;
17272
17273   /* This is a nasty kludge.  See AESENC et al. instructions. */
17274   modrm = getUChar(delta);
17275   if (epartIsReg(modrm)) {
17276      regNoL = eregOfRexRM(pfx, modrm);
17277      imm = getUChar(delta+1);
17278      delta += 1+1;
17279   } else {
17280      regNoL = 16; /* use XMM16 as an intermediary */
17281      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* alignment check needed ???? */
17283      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
17284      imm = getUChar(delta+alen);
17285      delta += alen+1;
17286   }
17287
17288   /* Who ya gonna call?  Presumably not Ghostbusters. */
17289   void*  fn = &amd64g_dirtyhelper_AESKEYGENASSIST;
17290   const HChar* nm = "amd64g_dirtyhelper_AESKEYGENASSIST";
17291
17292   /* Round up the arguments.  Note that this is a kludge -- the
17293      use of mkU64 rather than mkIRExpr_HWord implies the
17294      assumption that the host's word size is 64-bit. */
17295   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
17296   UInt gstOffR = ymmGuestRegOffset(regNoR);
17297
   IRExpr*  imme         = mkU64(imm & 0xFF);
17299   IRExpr*  gstOffLe     = mkU64(gstOffL);
17300   IRExpr*  gstOffRe     = mkU64(gstOffR);
17301   IRExpr** args
17302      = mkIRExprVec_4( IRExpr_BBPTR(), imme, gstOffLe, gstOffRe );
17303
17304   IRDirty* d    = unsafeIRDirty_0_N( 0/*regparms*/, nm, fn, args );
17305   /* It's not really a dirty call, but we can't use the clean helper
17306      mechanism here for the very lame reason that we can't pass 2 x
17307      V128s by value to a helper.  Hence this roundabout scheme. */
17308   d->nFxState = 2;
17309   vex_bzero(&d->fxState, sizeof(d->fxState));
17310   d->fxState[0].fx     = Ifx_Read;
17311   d->fxState[0].offset = gstOffL;
17312   d->fxState[0].size   = sizeof(U128);
17313   d->fxState[1].fx     = Ifx_Write;
17314   d->fxState[1].offset = gstOffR;
17315   d->fxState[1].size   = sizeof(U128);
17316   stmt( IRStmt_Dirty(d) );
17317
17318   DIP("%saeskeygenassist $%x,%s,%s\n", isAvx ? "v" : "", (UInt)imm,
17319       (regNoL == 16 ? dis_buf : nameXMMReg(regNoL)),
17320       nameXMMReg(regNoR));
17321   if (isAvx)
17322      putYMMRegLane128( regNoR, 1, mkV128(0) );
17323   return delta;
17324}
17325
17326
17327__attribute__((noinline))
17328static
17329Long dis_ESC_0F38__SSE4 ( Bool* decode_OK,
17330                          VexAbiInfo* vbi,
17331                          Prefix pfx, Int sz, Long deltaIN )
17332{
17333   IRTemp addr  = IRTemp_INVALID;
17334   UChar  modrm = 0;
17335   Int    alen  = 0;
17336   HChar  dis_buf[50];
17337
17338   *decode_OK = False;
17339
17340   Long   delta = deltaIN;
17341   UChar  opc   = getUChar(delta);
17342   delta++;
17343   switch (opc) {
17344
17345   case 0x10:
17346   case 0x14:
17347   case 0x15:
17348      /* 66 0F 38 10 /r = PBLENDVB xmm1, xmm2/m128  (byte gran)
17349         66 0F 38 14 /r = BLENDVPS xmm1, xmm2/m128  (float gran)
17350         66 0F 38 15 /r = BLENDVPD xmm1, xmm2/m128  (double gran)
17351         Blend at various granularities, with XMM0 (implicit operand)
17352         providing the controlling mask.
17353      */
17354      if (have66noF2noF3(pfx) && sz == 2) {
17355         modrm = getUChar(delta);
17356
17357         const HChar* nm    = NULL;
17358         UInt   gran  = 0;
17359         IROp   opSAR = Iop_INVALID;
17360         switch (opc) {
17361            case 0x10:
17362               nm = "pblendvb"; gran = 1; opSAR = Iop_SarN8x16;
17363               break;
17364            case 0x14:
17365               nm = "blendvps"; gran = 4; opSAR = Iop_SarN32x4;
17366               break;
17367            case 0x15:
17368               nm = "blendvpd"; gran = 8; opSAR = Iop_SarN64x2;
17369               break;
17370         }
17371         vassert(nm);
17372
17373         IRTemp vecE = newTemp(Ity_V128);
17374         IRTemp vecG = newTemp(Ity_V128);
17375         IRTemp vec0 = newTemp(Ity_V128);
17376
17377         if ( epartIsReg(modrm) ) {
17378            assign(vecE, getXMMReg(eregOfRexRM(pfx, modrm)));
17379            delta += 1;
17380            DIP( "%s %s,%s\n", nm,
17381                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
17382                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17383         } else {
17384            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17385            gen_SEGV_if_not_16_aligned( addr );
17386            assign(vecE, loadLE( Ity_V128, mkexpr(addr) ));
17387            delta += alen;
17388            DIP( "%s %s,%s\n", nm,
17389                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17390         }
17391
17392         assign(vecG, getXMMReg(gregOfRexRM(pfx, modrm)));
17393         assign(vec0, getXMMReg(0));
17394
17395         IRTemp res = math_PBLENDVB_128( vecE, vecG, vec0, gran, opSAR );
17396         putXMMReg(gregOfRexRM(pfx, modrm), mkexpr(res));
17397
17398         goto decode_success;
17399      }
17400      break;
17401
17402   case 0x17:
17403      /* 66 0F 38 17 /r = PTEST xmm1, xmm2/m128
17404         Logical compare (set ZF and CF from AND/ANDN of the operands) */
17405      if (have66noF2noF3(pfx)
17406          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
17407         delta = dis_xTESTy_128( vbi, pfx, delta, False/*!isAvx*/, 0 );
17408         goto decode_success;
17409      }
17410      break;
17411
17412   case 0x20:
17413      /* 66 0F 38 20 /r = PMOVSXBW xmm1, xmm2/m64
17414         Packed Move with Sign Extend from Byte to Word (XMM) */
17415      if (have66noF2noF3(pfx) && sz == 2) {
17416         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
17417                                   False/*!isAvx*/, False/*!xIsZ*/ );
17418         goto decode_success;
17419      }
17420      break;
17421
17422   case 0x21:
17423      /* 66 0F 38 21 /r = PMOVSXBD xmm1, xmm2/m32
17424         Packed Move with Sign Extend from Byte to DWord (XMM) */
17425      if (have66noF2noF3(pfx) && sz == 2) {
17426         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
17427                                   False/*!isAvx*/, False/*!xIsZ*/ );
17428         goto decode_success;
17429      }
17430      break;
17431
17432   case 0x22:
17433      /* 66 0F 38 22 /r = PMOVSXBQ xmm1, xmm2/m16
17434         Packed Move with Sign Extend from Byte to QWord (XMM) */
17435      if (have66noF2noF3(pfx) && sz == 2) {
17436         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
17437         goto decode_success;
17438      }
17439      break;
17440
17441   case 0x23:
17442      /* 66 0F 38 23 /r = PMOVSXWD xmm1, xmm2/m64
17443         Packed Move with Sign Extend from Word to DWord (XMM) */
17444      if (have66noF2noF3(pfx) && sz == 2) {
17445         delta = dis_PMOVxXWD_128(vbi, pfx, delta,
17446                                  False/*!isAvx*/, False/*!xIsZ*/);
17447         goto decode_success;
17448      }
17449      break;
17450
17451   case 0x24:
17452      /* 66 0F 38 24 /r = PMOVSXWQ xmm1, xmm2/m32
17453         Packed Move with Sign Extend from Word to QWord (XMM) */
17454      if (have66noF2noF3(pfx) && sz == 2) {
17455         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
17456         goto decode_success;
17457      }
17458      break;
17459
17460   case 0x25:
17461      /* 66 0F 38 25 /r = PMOVSXDQ xmm1, xmm2/m64
17462         Packed Move with Sign Extend from Double Word to Quad Word (XMM) */
17463      if (have66noF2noF3(pfx) && sz == 2) {
17464         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
17465                                   False/*!isAvx*/, False/*!xIsZ*/ );
17466         goto decode_success;
17467      }
17468      break;
17469
17470   case 0x28:
17471      /* 66 0F 38 28 = PMULDQ -- signed widening multiply of 32-lanes
17472         0 x 0 to form lower 64-bit half and lanes 2 x 2 to form upper
17473         64-bit half */
17474      /* This is a really poor translation -- could be improved if
17475         performance critical.  It's a copy-paste of PMULUDQ, too. */
17476      if (have66noF2noF3(pfx) && sz == 2) {
17477         IRTemp sV = newTemp(Ity_V128);
17478         IRTemp dV = newTemp(Ity_V128);
17479         modrm = getUChar(delta);
17480         UInt rG = gregOfRexRM(pfx,modrm);
17481         assign( dV, getXMMReg(rG) );
17482         if (epartIsReg(modrm)) {
17483            UInt rE = eregOfRexRM(pfx,modrm);
17484            assign( sV, getXMMReg(rE) );
17485            delta += 1;
17486            DIP("pmuldq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
17487         } else {
17488            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
17489            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
17490            delta += alen;
17491            DIP("pmuldq %s,%s\n", dis_buf, nameXMMReg(rG));
17492         }
17493
17494         putXMMReg( rG, mkexpr(math_PMULDQ_128( dV, sV )) );
17495         goto decode_success;
17496      }
17497      break;
17498
17499   case 0x29:
17500      /* 66 0F 38 29 = PCMPEQQ
17501         64x2 equality comparison */
17502      if (have66noF2noF3(pfx) && sz == 2) {
17503         /* FIXME: this needs an alignment check */
17504         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
17505                                    "pcmpeqq", Iop_CmpEQ64x2, False );
17506         goto decode_success;
17507      }
17508      break;
17509
17510   case 0x2A:
17511      /* 66 0F 38 2A = MOVNTDQA
17512         "non-temporal" "streaming" load
17513         Handle like MOVDQA but only memory operand is allowed */
17514      if (have66noF2noF3(pfx) && sz == 2) {
17515         modrm = getUChar(delta);
17516         if (!epartIsReg(modrm)) {
17517            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
17518            gen_SEGV_if_not_16_aligned( addr );
17519            putXMMReg( gregOfRexRM(pfx,modrm),
17520                       loadLE(Ity_V128, mkexpr(addr)) );
17521            DIP("movntdqa %s,%s\n", dis_buf,
17522                                    nameXMMReg(gregOfRexRM(pfx,modrm)));
17523            delta += alen;
17524            goto decode_success;
17525         }
17526      }
17527      break;
17528
17529   case 0x2B:
      /* 66 0F 38 2B /r = PACKUSDW xmm1, xmm2/m128
17531         2x 32x4 S->U saturating narrow from xmm2/m128 to xmm1 */
17532      if (have66noF2noF3(pfx) && sz == 2) {
17533
17534         modrm = getUChar(delta);
17535
17536         IRTemp argL = newTemp(Ity_V128);
17537         IRTemp argR = newTemp(Ity_V128);
17538
17539         if ( epartIsReg(modrm) ) {
17540            assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
17541            delta += 1;
17542            DIP( "packusdw %s,%s\n",
17543                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
17544                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17545         } else {
17546            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
17547            gen_SEGV_if_not_16_aligned( addr );
17548            assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
17549            delta += alen;
17550            DIP( "packusdw %s,%s\n",
17551                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
17552         }
17553
17554         assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));
17555
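         /* Operand order matters here: assuming the usual convention
            for the VEX QNarrowBin ops, the first argument supplies
            the high 64 bits of the result, so putting the E-part
            (argL) first yields [ narrow(E) : narrow(G) ], which is
            what PACKUSDW requires. */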
         putXMMReg( gregOfRexRM(pfx, modrm),
                    binop( Iop_QNarrowBin32Sto16Ux8,
                           mkexpr(argL), mkexpr(argR)) );

         goto decode_success;
      }
      break;

   case 0x30:
      /* 66 0F 38 30 /r = PMOVZXBW xmm1, xmm2/m64
         Packed Move with Zero Extend from Byte to Word (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x31:
      /* 66 0F 38 31 /r = PMOVZXBD xmm1, xmm2/m32
         Packed Move with Zero Extend from Byte to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x32:
      /* 66 0F 38 32 /r = PMOVZXBQ xmm1, xmm2/m16
         Packed Move with Zero Extend from Byte to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x33:
      /* 66 0F 38 33 /r = PMOVZXWD xmm1, xmm2/m64
         Packed Move with Zero Extend from Word to DWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x34:
      /* 66 0F 38 34 /r = PMOVZXWQ xmm1, xmm2/m32
         Packed Move with Zero Extend from Word to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0x35:
      /* 66 0F 38 35 /r = PMOVZXDQ xmm1, xmm2/m64
         Packed Move with Zero Extend from DWord to QWord (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   False/*!isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x37:
      /* 66 0F 38 37 = PCMPGTQ
         64x2 comparison (signed, presumably; the Intel docs don't say :-)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         delta = dis_SSEint_E_to_G( vbi, pfx, delta,
                                    "pcmpgtq", Iop_CmpGT64Sx2, False );
         goto decode_success;
      }
      break;

   case 0x38:
   case 0x3C:
      /* 66 0F 38 38 /r = PMINSB xmm1, xmm2/m128    8Sx16 (signed) min
         66 0F 38 3C /r = PMAXSB xmm1, xmm2/m128    8Sx16 (signed) max
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3C;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxsb" : "pminsb",
                    isMAX ? Iop_Max8Sx16 : Iop_Min8Sx16,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x39:
   case 0x3D:
      /* 66 0F 38 39 /r = PMINSD xmm1, xmm2/m128
         Minimum of Packed Signed Double Word Integers (XMM)
         66 0F 38 3D /r = PMAXSD xmm1, xmm2/m128
         Maximum of Packed Signed Double Word Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3D;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxsd" : "pminsd",
                    isMAX ? Iop_Max32Sx4 : Iop_Min32Sx4,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x3A:
   case 0x3E:
      /* 66 0F 38 3A /r = PMINUW xmm1, xmm2/m128
         Minimum of Packed Unsigned Word Integers (XMM)
         66 0F 38 3E /r = PMAXUW xmm1, xmm2/m128
         Maximum of Packed Unsigned Word Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3E;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxuw" : "pminuw",
                    isMAX ? Iop_Max16Ux8 : Iop_Min16Ux8,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x3B:
   case 0x3F:
      /* 66 0F 38 3B /r = PMINUD xmm1, xmm2/m128
         Minimum of Packed Unsigned Doubleword Integers (XMM)
         66 0F 38 3F /r = PMAXUD xmm1, xmm2/m128
         Maximum of Packed Unsigned Doubleword Integers (XMM)
      */
      if (have66noF2noF3(pfx) && sz == 2) {
         /* FIXME: this needs an alignment check */
         Bool isMAX = opc == 0x3F;
         delta = dis_SSEint_E_to_G(
                    vbi, pfx, delta,
                    isMAX ? "pmaxud" : "pminud",
                    isMAX ? Iop_Max32Ux4 : Iop_Min32Ux4,
                    False
                 );
         goto decode_success;
      }
      break;

   case 0x40:
      /* 66 0F 38 40 /r = PMULLD xmm1, xmm2/m128
         32x4 integer multiply from xmm2/m128 to xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {

         modrm = getUChar(delta);

         IRTemp argL = newTemp(Ity_V128);
         IRTemp argR = newTemp(Ity_V128);

         if ( epartIsReg(modrm) ) {
            assign( argL, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1;
            DIP( "pmulld %s,%s\n",
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            assign( argL, loadLE( Ity_V128, mkexpr(addr) ));
            delta += alen;
            DIP( "pmulld %s,%s\n",
                 dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         assign(argR, getXMMReg( gregOfRexRM(pfx, modrm) ));

         putXMMReg( gregOfRexRM(pfx, modrm),
                    binop( Iop_Mul32x4, mkexpr(argL), mkexpr(argR)) );

         goto decode_success;
      }
      break;

   case 0x41:
      /* 66 0F 38 41 /r = PHMINPOSUW xmm1, xmm2/m128
         Packed Horizontal Word Minimum from xmm2/m128 to xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, False/*!isAvx*/ );
         goto decode_success;
      }
      break;

   case 0xDC:
   case 0xDD:
   case 0xDE:
   case 0xDF:
   case 0xDB:
      /* 66 0F 38 DC /r = AESENC xmm1, xmm2/m128
                  DD /r = AESENCLAST xmm1, xmm2/m128
                  DE /r = AESDEC xmm1, xmm2/m128
                  DF /r = AESDECLAST xmm1, xmm2/m128

                  DB /r = AESIMC xmm1, xmm2/m128 */
      if (have66noF2noF3(pfx) && sz == 2) {
         delta = dis_AESx( vbi, pfx, delta, False/*!isAvx*/, opc );
         goto decode_success;
      }
      break;

   case 0xF0:
   case 0xF1:
      /* F2 0F 38 F0 /r = CRC32 r/m8, r32 (REX.W ok, 66 not ok)
         F2 0F 38 F1 /r = CRC32 r/m{16,32,64}, r32
         The decoding on this is a bit unusual.
      */
      if (haveF2noF3(pfx)
          && (opc == 0xF1 || (opc == 0xF0 && !have66(pfx)))) {
         modrm = getUChar(delta);

         if (opc == 0xF0)
            sz = 1;
         else
            vassert(sz == 2 || sz == 4 || sz == 8);

         IRType tyE = szToITy(sz);
         IRTemp valE = newTemp(tyE);

         if (epartIsReg(modrm)) {
            assign(valE, getIRegE(sz, pfx, modrm));
            delta += 1;
            DIP("crc32%c %s,%s\n",
                sz == 1 ? 'b' : sz == 2 ? 'w' : sz == 4 ? 'l' : 'q',
                nameIRegE(sz, pfx, modrm),
                nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(valE, loadLE(tyE, mkexpr(addr)));
            delta += alen;
            DIP("crc32%c %s,%s\n",
                sz == 1 ? 'b' : sz == 2 ? 'w' : sz == 4 ? 'l' : 'q',
                dis_buf,
                nameIRegG(1==getRexW(pfx) ? 8 : 4, pfx, modrm));
         }

         /* Somewhat funny getting/putting of the crc32 value, in order
            to ensure that it turns into 64-bit gets and puts.  However,
            mask off the upper 32 bits so as to not get memcheck false
            +ves around the helper call. */
         IRTemp valG0 = newTemp(Ity_I64);
         assign(valG0, binop(Iop_And64, getIRegG(8, pfx, modrm),
                             mkU64(0xFFFFFFFF)));

         const HChar* nm = NULL;
         void*  fn = NULL;
         switch (sz) {
            case 1: nm = "amd64g_calc_crc32b";
                    fn = &amd64g_calc_crc32b; break;
            case 2: nm = "amd64g_calc_crc32w";
                    fn = &amd64g_calc_crc32w; break;
            case 4: nm = "amd64g_calc_crc32l";
                    fn = &amd64g_calc_crc32l; break;
            case 8: nm = "amd64g_calc_crc32q";
                    fn = &amd64g_calc_crc32q; break;
         }
         vassert(nm && fn);
         IRTemp valG1 = newTemp(Ity_I64);
         assign(valG1,
                mkIRExprCCall(Ity_I64, 0/*regparm*/, nm, fn,
                              mkIRExprVec_2(mkexpr(valG0),
                                            widenUto64(mkexpr(valE)))));

         putIRegG(4, pfx, modrm, unop(Iop_64to32, mkexpr(valG1)));
         goto decode_success;
      }
      break;
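   /* Note for readers: these helpers compute CRC-32C (Castagnoli,
      polynomial 0x11EDC6F41), which is what SSE4.2 CRC32 specifies --
      not the zlib/Ethernet CRC-32 polynomial.  So for example
      "crc32q %rsi, %rax" folds the 8 bytes of %rsi into the running
      checksum held in %eax. */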

   default:
      break;

   }

  //decode_failure:
   *decode_OK = False;
   return deltaIN;

  decode_success:
   *decode_OK = True;
   return delta;
}


/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- Top-level SSE4: dis_ESC_0F3A__SSE4                   ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

static Long dis_PEXTRW ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   IRTemp t0    = IRTemp_INVALID;
   IRTemp t1    = IRTemp_INVALID;
   IRTemp t2    = IRTemp_INVALID;
   IRTemp t3    = IRTemp_INVALID;
   UChar  modrm = getUChar(delta);
   Int    alen  = 0;
   HChar  dis_buf[50];
   UInt   rG    = gregOfRexRM(pfx,modrm);
   Int    imm8_20;
   IRTemp xmm_vec = newTemp(Ity_V128);
   IRTemp d16   = newTemp(Ity_I16);
   const HChar* mbV = isAvx ? "v" : "";

   vassert(0==getRexW(pfx)); /* ensured by caller */
   assign( xmm_vec, getXMMReg(rG) );
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   if ( epartIsReg( modrm ) ) {
      imm8_20 = (Int)(getUChar(delta+1) & 7);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8_20 = (Int)(getUChar(delta+alen) & 7);
   }

   switch (imm8_20) {
      case 0:  assign(d16, unop(Iop_32to16,   mkexpr(t0))); break;
      case 1:  assign(d16, unop(Iop_32HIto16, mkexpr(t0))); break;
      case 2:  assign(d16, unop(Iop_32to16,   mkexpr(t1))); break;
      case 3:  assign(d16, unop(Iop_32HIto16, mkexpr(t1))); break;
      case 4:  assign(d16, unop(Iop_32to16,   mkexpr(t2))); break;
      case 5:  assign(d16, unop(Iop_32HIto16, mkexpr(t2))); break;
      case 6:  assign(d16, unop(Iop_32to16,   mkexpr(t3))); break;
      case 7:  assign(d16, unop(Iop_32HIto16, mkexpr(t3))); break;
      default: vassert(0);
   }

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx,modrm);
      putIReg32( rE, unop(Iop_16Uto32, mkexpr(d16)) );
      delta += 1+1;
      DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20,
           nameXMMReg( rG ), nameIReg32( rE ) );
   } else {
      storeLE( mkexpr(addr), mkexpr(d16) );
      delta += alen+1;
      DIP( "%spextrw $%d, %s,%s\n", mbV, imm8_20, nameXMMReg( rG ), dis_buf );
   }
   return delta;
}
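
/* Example (illustrative only): "pextrw $5, %xmm1, %eax" selects
   imm8_20 == 5, i.e. the high 16 bits of dword 2, so bits 95:80 of
   %xmm1 end up zero-extended in %eax. */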


static Long dis_PEXTRD ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   IRTemp t0    = IRTemp_INVALID;
   IRTemp t1    = IRTemp_INVALID;
   IRTemp t2    = IRTemp_INVALID;
   IRTemp t3    = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   Int    imm8_10;
   IRTemp xmm_vec   = newTemp(Ity_V128);
   IRTemp src_dword = newTemp(Ity_I32);
   const HChar* mbV = isAvx ? "v" : "";

   vassert(0==getRexW(pfx)); /* ensured by caller */
   modrm = getUChar(delta);
   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   if ( epartIsReg( modrm ) ) {
      imm8_10 = (Int)(getUChar(delta+1) & 3);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8_10 = (Int)(getUChar(delta+alen) & 3);
   }

   switch ( imm8_10 ) {
      case 0:  assign( src_dword, mkexpr(t0) ); break;
      case 1:  assign( src_dword, mkexpr(t1) ); break;
      case 2:  assign( src_dword, mkexpr(t2) ); break;
      case 3:  assign( src_dword, mkexpr(t3) ); break;
      default: vassert(0);
   }

   if ( epartIsReg( modrm ) ) {
      putIReg32( eregOfRexRM(pfx,modrm), mkexpr(src_dword) );
      delta += 1+1;
      DIP( "%spextrd $%d, %s,%s\n", mbV, imm8_10,
           nameXMMReg( gregOfRexRM(pfx, modrm) ),
           nameIReg32( eregOfRexRM(pfx, modrm) ) );
   } else {
      storeLE( mkexpr(addr), mkexpr(src_dword) );
      delta += alen+1;
      DIP( "%spextrd $%d, %s,%s\n", mbV,
           imm8_10, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   }
   return delta;
}


static Long dis_PEXTRQ ( VexAbiInfo* vbi, Prefix pfx,
                         Long delta, Bool isAvx )
{
   IRTemp addr  = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   Int imm8_0;
   IRTemp xmm_vec   = newTemp(Ity_V128);
   IRTemp src_qword = newTemp(Ity_I64);
   const HChar* mbV = isAvx ? "v" : "";

   vassert(1==getRexW(pfx)); /* ensured by caller */
   modrm = getUChar(delta);
   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );

   if ( epartIsReg( modrm ) ) {
      imm8_0 = (Int)(getUChar(delta+1) & 1);
   } else {
      addr   = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8_0 = (Int)(getUChar(delta+alen) & 1);
   }

   switch ( imm8_0 ) {
      case 0:  assign( src_qword, unop(Iop_V128to64,   mkexpr(xmm_vec)) );
               break;
      case 1:  assign( src_qword, unop(Iop_V128HIto64, mkexpr(xmm_vec)) );
               break;
      default: vassert(0);
   }

   if ( epartIsReg( modrm ) ) {
      putIReg64( eregOfRexRM(pfx,modrm), mkexpr(src_qword) );
      delta += 1+1;
      DIP( "%spextrq $%d, %s,%s\n", mbV, imm8_0,
           nameXMMReg( gregOfRexRM(pfx, modrm) ),
           nameIReg64( eregOfRexRM(pfx, modrm) ) );
   } else {
      storeLE( mkexpr(addr), mkexpr(src_qword) );
      delta += alen+1;
      DIP( "%spextrq $%d, %s,%s\n", mbV,
           imm8_0, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   }
   return delta;
}

static IRExpr* math_CTZ32(IRExpr *exp)
{
   /* Iop_Ctz32 isn't implemented by the amd64 back end, so use Iop_Ctz64. */
   return unop(Iop_64to32, unop(Iop_Ctz64, unop(Iop_32Uto64, exp)));
}
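
/* For instance, math_CTZ32 applied to 0x10 yields 4.  A zero input is
   not given a defined value here; the callers below either guard the
   zero case with an ITE or first OR in a known-set bit. */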

static Long dis_PCMPISTRI_3A ( UChar modrm, UInt regNoL, UInt regNoR,
                               Long delta, UChar opc, UChar imm,
                               HChar dis_buf[])
{
   /* We only handle PCMPISTRI for now */
   vassert((opc & 0x03) == 0x03);
   /* And only an immediate byte of 0x38 or 0x3A */
   vassert((imm & ~0x02) == 0x38);

   /* FIXME: Is this correct when regNoL == 16 ? */
   IRTemp argL = newTemp(Ity_V128);
   assign(argL, getXMMReg(regNoL));
   IRTemp argR = newTemp(Ity_V128);
   assign(argR, getXMMReg(regNoR));

   IRTemp zmaskL = newTemp(Ity_I32);
   assign(zmaskL, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argL), mkV128(0)))));
   IRTemp zmaskR = newTemp(Ity_I32);
   assign(zmaskR, unop(Iop_16Uto32,
                       unop(Iop_GetMSBs8x16,
                            binop(Iop_CmpEQ8x16, mkexpr(argR), mkV128(0)))));

   /* We want validL = ~(zmaskL | -zmaskL)

      But this formulation kills memcheck's validity tracking when any
      bits above the first "1" are invalid.  So reformulate as:

      validL = (zmaskL ? (1 << ctz(zmaskL)) : 0) - 1
   */
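   /* Worked example (illustrative): if zmaskL == 0x0008 (first zero
      byte at index 3), ctz gives 3 and validL == (1 << 3) - 1 == 0x7,
      marking bytes 0..2 as valid.  If zmaskL == 0 (no terminator in
      the block), validL == 0 - 1 == 0xFFFFFFFF: every byte valid. */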

   IRExpr *ctzL = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskL)));

   /* Generate a bool expression which is zero iff the original is
      zero.  Do this carefully so memcheck can propagate validity bits
      correctly.
    */
   IRTemp zmaskL_zero = newTemp(Ity_I1);
   assign(zmaskL_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskL), mkU32(0)));

   IRTemp validL = newTemp(Ity_I32);
   assign(validL, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskL_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzL),
                                   mkU32(0)),
                        mkU32(1)));

   /* And similarly for validR. */
   IRExpr *ctzR = unop(Iop_32to8, math_CTZ32(mkexpr(zmaskR)));
   IRTemp zmaskR_zero = newTemp(Ity_I1);
   assign(zmaskR_zero, binop(Iop_ExpCmpNE32, mkexpr(zmaskR), mkU32(0)));
   IRTemp validR = newTemp(Ity_I32);
   assign(validR, binop(Iop_Sub32,
                        IRExpr_ITE(mkexpr(zmaskR_zero),
                                   binop(Iop_Shl32, mkU32(1), ctzR),
                                   mkU32(0)),
                        mkU32(1)));

   /* Do the actual comparison. */
   IRExpr *boolResII = unop(Iop_16Uto32,
                            unop(Iop_GetMSBs8x16,
                                 binop(Iop_CmpEQ8x16, mkexpr(argL),
                                                      mkexpr(argR))));

   /* Compute boolResII & validL & validR (i.e., if both valid, use
      the comparison result) */
   IRExpr *intRes1_a = binop(Iop_And32, boolResII,
                             binop(Iop_And32,
                                   mkexpr(validL), mkexpr(validR)));

   /* Compute ~(validL | validR); i.e., if both invalid, force 1. */
   IRExpr *intRes1_b = unop(Iop_Not32, binop(Iop_Or32,
                                             mkexpr(validL), mkexpr(validR)));
   /* Otherwise, zero. */
   IRExpr *intRes1 = binop(Iop_And32, mkU32(0xFFFF),
                           binop(Iop_Or32, intRes1_a, intRes1_b));

   /* The "0x30" in imm=0x3A denotes polarity=3, which means: XOR
      validL into the result. */
   IRTemp intRes2 = newTemp(Ity_I32);
   assign(intRes2, binop(Iop_And32, mkU32(0xFFFF),
                         binop(Iop_Xor32, intRes1, mkexpr(validL))));

   /* If the 0x40 bit were set in imm=0x3A, we would return the index
      of the msb.  Since it is clear, we return the index of the
      lsb. */
   IRExpr *newECX = math_CTZ32(binop(Iop_Or32,
                                     mkexpr(intRes2), mkU32(0x10000)));

   /* And that's our rcx. */
   putIReg32(R_RCX, newECX);

   /* Now for the condition codes... */

   /* C == 0 iff intRes2 == 0 */
   IRExpr *c_bit = IRExpr_ITE( binop(Iop_ExpCmpNE32, mkexpr(intRes2),
                                     mkU32(0)),
                               mkU32(1 << AMD64G_CC_SHIFT_C),
                               mkU32(0));
   /* Z == 1 iff any in argL is 0 */
   IRExpr *z_bit = IRExpr_ITE( mkexpr(zmaskL_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_Z),
                               mkU32(0));
   /* S == 1 iff any in argR is 0 */
   IRExpr *s_bit = IRExpr_ITE( mkexpr(zmaskR_zero),
                               mkU32(1 << AMD64G_CC_SHIFT_S),
                               mkU32(0));
   /* O == IntRes2[0] */
   IRExpr *o_bit = binop(Iop_Shl32, binop(Iop_And32, mkexpr(intRes2),
                                          mkU32(0x01)),
                         mkU8(AMD64G_CC_SHIFT_O));

   /* Put them all together */
   IRTemp cc = newTemp(Ity_I64);
   assign(cc, widenUto64(binop(Iop_Or32,
                               binop(Iop_Or32, c_bit, z_bit),
                               binop(Iop_Or32, s_bit, o_bit))));
   stmt(IRStmt_Put(OFFB_CC_OP, mkU64(AMD64G_CC_OP_COPY)));
   stmt(IRStmt_Put(OFFB_CC_DEP1, mkexpr(cc)));
   stmt(IRStmt_Put(OFFB_CC_DEP2, mkU64(0)));
   stmt(IRStmt_Put(OFFB_CC_NDEP, mkU64(0)));

   return delta;
}
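
/* Net effect of the above, for the only immediate handled (0x3A):
   ECX becomes the index of the first byte position at which the two
   strings differ or at which either string has already ended, or 16
   if no such position exists in the block -- the classic SSE4.2
   strcmp-style idiom. */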

/* This can fail, in which case it returns the original (unchanged)
   delta. */
static Long dis_PCMPxSTRx ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx, UChar opc )
{
   Long   delta0  = delta;
   UInt   isISTRx = opc & 2;
   UInt   isxSTRM = (opc & 1) ^ 1;
   UInt   regNoL  = 0;
   UInt   regNoR  = 0;
   UChar  imm     = 0;
   IRTemp addr    = IRTemp_INVALID;
   Int    alen    = 0;
   HChar  dis_buf[50];

   /* This is a nasty kludge.  We need to pass 2 x V128 to the helper
      (which is clean).  Since we can't do that, use a dirty helper to
      compute the results directly from the XMM regs in the guest
      state.  That means for the memory case, we need to move the left
      operand into a pseudo-register (XMM16, let's call it). */
   UChar modrm = getUChar(delta);
   if (epartIsReg(modrm)) {
      regNoL = eregOfRexRM(pfx, modrm);
      regNoR = gregOfRexRM(pfx, modrm);
      imm = getUChar(delta+1);
      delta += 1+1;
   } else {
      regNoL = 16; /* use XMM16 as an intermediary */
      regNoR = gregOfRexRM(pfx, modrm);
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      /* No alignment check; I guess that makes sense, given that
         these insns are for dealing with C style strings. */
      stmt( IRStmt_Put( OFFB_YMM16, loadLE(Ity_V128, mkexpr(addr)) ));
      imm = getUChar(delta+alen);
      delta += alen+1;
   }

   /* Print the insn here, since dis_PCMPISTRI_3A doesn't do so
      itself. */
   if (regNoL == 16) {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, dis_buf, nameXMMReg(regNoR));
   } else {
      DIP("%spcmp%cstr%c $%x,%s,%s\n",
          isAvx ? "v" : "", isISTRx ? 'i' : 'e', isxSTRM ? 'm' : 'i',
          (UInt)imm, nameXMMReg(regNoL), nameXMMReg(regNoR));
   }

   /* Handle special case(s). */
   if (imm == 0x3A && isISTRx && !isxSTRM) {
      return dis_PCMPISTRI_3A ( modrm, regNoL, regNoR, delta,
                                opc, imm, dis_buf);
   }

   /* Now we know the XMM reg numbers for the operands, and the
      immediate byte.  Is it one we can actually handle? Throw out any
      cases for which the helper function has not been verified. */
   switch (imm) {
      case 0x00: case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x0E:
      case 0x12: case 0x14: case 0x1A:
      case 0x30: case 0x34: case 0x38: case 0x3A:
      case 0x40: case 0x44: case 0x46: case 0x4A:
         break;
      // the 16-bit character versions of the above
      case 0x01: case 0x03: case 0x09: case 0x0B: case 0x0D:
      case 0x13:            case 0x1B:
                            case 0x39: case 0x3B:
                 case 0x45:            case 0x4B:
         break;
      default:
         return delta0; /*FAIL*/
   }

   /* Who ya gonna call?  Presumably not Ghostbusters. */
   void*  fn = &amd64g_dirtyhelper_PCMPxSTRx;
   const HChar* nm = "amd64g_dirtyhelper_PCMPxSTRx";

   /* Round up the arguments.  Note that this is a kludge -- the use
      of mkU64 rather than mkIRExpr_HWord implies the assumption that
      the host's word size is 64-bit. */
   UInt gstOffL = regNoL == 16 ? OFFB_YMM16 : ymmGuestRegOffset(regNoL);
   UInt gstOffR = ymmGuestRegOffset(regNoR);

   IRExpr*  opc4_and_imm = mkU64((opc << 8) | (imm & 0xFF));
   IRExpr*  gstOffLe     = mkU64(gstOffL);
   IRExpr*  gstOffRe     = mkU64(gstOffR);
   IRExpr*  edxIN        = isISTRx ? mkU64(0) : getIRegRDX(8);
   IRExpr*  eaxIN        = isISTRx ? mkU64(0) : getIRegRAX(8);
   IRExpr** args
      = mkIRExprVec_6( IRExpr_BBPTR(),
                       opc4_and_imm, gstOffLe, gstOffRe, edxIN, eaxIN );

   IRTemp   resT = newTemp(Ity_I64);
   IRDirty* d    = unsafeIRDirty_1_N( resT, 0/*regparms*/, nm, fn, args );
   /* It's not really a dirty call, but we can't use the clean helper
      mechanism here for the very lame reason that we can't pass 2 x
      V128s by value to a helper.  Hence this roundabout scheme. */
   d->nFxState = 2;
   vex_bzero(&d->fxState, sizeof(d->fxState));
   d->fxState[0].fx     = Ifx_Read;
   d->fxState[0].offset = gstOffL;
   d->fxState[0].size   = sizeof(U128);
   d->fxState[1].fx     = Ifx_Read;
   d->fxState[1].offset = gstOffR;
   d->fxState[1].size   = sizeof(U128);
   if (isxSTRM) {
      /* Declare that the helper writes XMM0. */
      d->nFxState = 3;
      d->fxState[2].fx     = Ifx_Write;
      d->fxState[2].offset = ymmGuestRegOffset(0);
      d->fxState[2].size   = sizeof(U128);
   }

   stmt( IRStmt_Dirty(d) );

   /* Now resT[15:0] holds the new OSZACP values, so the condition
      codes must be updated.  And for an xSTRI case, resT[31:16] holds
      the new ECX value, so stash that too. */
   if (!isxSTRM) {
      putIReg64(R_RCX, binop(Iop_And64,
                             binop(Iop_Shr64, mkexpr(resT), mkU8(16)),
                             mkU64(0xFFFF)));
   }

   /* Zap the upper half of the dest reg as per AVX conventions. */
   if (isxSTRM && isAvx)
      putYMMRegLane128(/*YMM*/0, 1, mkV128(0));

   stmt( IRStmt_Put(
            OFFB_CC_DEP1,
            binop(Iop_And64, mkexpr(resT), mkU64(0xFFFF))
   ));
   stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));

   return delta;
}


static IRTemp math_PINSRB_128 ( IRTemp v128, IRTemp u8, UInt imm8 )
{
   vassert(imm8 <= 15); /* imm8 is a UInt, so a >= 0 check is vacuous */

   // Create a V128 value which has the selected byte in the
   // specified lane, and zeroes everywhere else.
   IRTemp tmp128    = newTemp(Ity_V128);
   IRTemp halfshift = newTemp(Ity_I64);
   assign(halfshift, binop(Iop_Shl64,
                           unop(Iop_8Uto64, mkexpr(u8)),
                           mkU8(8 * (imm8 & 7))));
   if (imm8 < 8) {
      assign(tmp128, binop(Iop_64HLtoV128, mkU64(0), mkexpr(halfshift)));
   } else {
      assign(tmp128, binop(Iop_64HLtoV128, mkexpr(halfshift), mkU64(0)));
   }

   UShort mask = ~(1 << imm8);
   IRTemp res  = newTemp(Ity_V128);
   assign( res, binop(Iop_OrV128,
                      mkexpr(tmp128),
                      binop(Iop_AndV128, mkexpr(v128), mkV128(mask))) );
   return res;
}
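
/* Example (illustrative): with imm8 == 9, halfshift is u8 << 8 placed
   in the upper 64-bit half, so the byte lands in lane 9, and the mask
   ~(1 << 9) preserves every other byte lane of v128. */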


static IRTemp math_PINSRD_128 ( IRTemp v128, IRTemp u32, UInt imm8 )
{
   IRTemp z32 = newTemp(Ity_I32);
   assign(z32, mkU32(0));

   /* Surround u32 with zeroes as per imm, giving us something we can
      OR into a suitably masked-out v128. */
   IRTemp withZs = newTemp(Ity_V128);
   UShort mask = 0;
   switch (imm8) {
      case 3:  mask = 0x0FFF;
               assign(withZs, mkV128from32s(u32, z32, z32, z32));
               break;
      case 2:  mask = 0xF0FF;
               assign(withZs, mkV128from32s(z32, u32, z32, z32));
               break;
      case 1:  mask = 0xFF0F;
               assign(withZs, mkV128from32s(z32, z32, u32, z32));
               break;
      case 0:  mask = 0xFFF0;
               assign(withZs, mkV128from32s(z32, z32, z32, u32));
               break;
      default: vassert(0);
   }

   IRTemp res = newTemp(Ity_V128);
   assign(res, binop( Iop_OrV128,
                      mkexpr(withZs),
                      binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   return res;
}


static IRTemp math_PINSRQ_128 ( IRTemp v128, IRTemp u64, UInt imm8 )
{
   /* Surround u64 with zeroes as per imm, giving us something we can
      OR into a suitably masked-out v128. */
   IRTemp withZs = newTemp(Ity_V128);
   UShort mask = 0;
   if (imm8 == 0) {
      mask = 0xFF00;
      assign(withZs, binop(Iop_64HLtoV128, mkU64(0), mkexpr(u64)));
   } else {
      vassert(imm8 == 1);
      mask = 0x00FF;
      assign( withZs, binop(Iop_64HLtoV128, mkexpr(u64), mkU64(0)));
   }

   IRTemp res = newTemp(Ity_V128);
   assign( res, binop( Iop_OrV128,
                       mkexpr(withZs),
                       binop( Iop_AndV128, mkexpr(v128), mkV128(mask) ) ) );
   return res;
}


static IRTemp math_INSERTPS ( IRTemp dstV, IRTemp toInsertD, UInt imm8 )
{
   const IRTemp inval = IRTemp_INVALID;
   IRTemp dstDs[4] = { inval, inval, inval, inval };
   breakupV128to32s( dstV, &dstDs[3], &dstDs[2], &dstDs[1], &dstDs[0] );

   vassert(imm8 <= 255);
   dstDs[(imm8 >> 4) & 3] = toInsertD; /* "imm8_count_d" */

   UInt imm8_zmask = (imm8 & 15);
   IRTemp zero_32 = newTemp(Ity_I32);
   assign( zero_32, mkU32(0) );
   IRTemp resV = newTemp(Ity_V128);
   assign( resV, mkV128from32s(
                    ((imm8_zmask & 8) == 8) ? zero_32 : dstDs[3],
                    ((imm8_zmask & 4) == 4) ? zero_32 : dstDs[2],
                    ((imm8_zmask & 2) == 2) ? zero_32 : dstDs[1],
                    ((imm8_zmask & 1) == 1) ? zero_32 : dstDs[0]) );
   return resV;
}
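
/* Recap of the imm8 decoding above: imm8[5:4] is the destination lane
   ("count_d") and imm8[3:0] the zero mask; imm8[7:6], the source-lane
   select used by the register form, is handled by the caller.  E.g.
   imm8 == 0x1D inserts into lane 1 and then zeroes lanes 0, 2 and 3. */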


static Long dis_PEXTRB_128_GtoE ( VexAbiInfo* vbi, Prefix pfx,
                                  Long delta, Bool isAvx )
{
   IRTemp addr     = IRTemp_INVALID;
   Int    alen     = 0;
   HChar  dis_buf[50];
   IRTemp xmm_vec  = newTemp(Ity_V128);
   IRTemp sel_lane = newTemp(Ity_I32);
   IRTemp shr_lane = newTemp(Ity_I32);
   const HChar* mbV = isAvx ? "v" : "";
   UChar  modrm    = getUChar(delta);
   IRTemp t3, t2, t1, t0;
   Int    imm8;
   assign( xmm_vec, getXMMReg( gregOfRexRM(pfx,modrm) ) );
   t3 = t2 = t1 = t0 = IRTemp_INVALID;
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   if ( epartIsReg( modrm ) ) {
      imm8 = (Int)getUChar(delta+1);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8 = (Int)getUChar(delta+alen);
   }
   switch ( (imm8 >> 2) & 3 ) {
      case 0:  assign( sel_lane, mkexpr(t0) ); break;
      case 1:  assign( sel_lane, mkexpr(t1) ); break;
      case 2:  assign( sel_lane, mkexpr(t2) ); break;
      case 3:  assign( sel_lane, mkexpr(t3) ); break;
      default: vassert(0);
   }
   assign( shr_lane,
           binop( Iop_Shr32, mkexpr(sel_lane), mkU8(((imm8 & 3)*8)) ) );

   if ( epartIsReg( modrm ) ) {
      putIReg64( eregOfRexRM(pfx,modrm),
                 unop( Iop_32Uto64,
                       binop(Iop_And32, mkexpr(shr_lane), mkU32(255)) ) );
      delta += 1+1;
      DIP( "%spextrb $%d, %s,%s\n", mbV, imm8,
           nameXMMReg( gregOfRexRM(pfx, modrm) ),
           nameIReg64( eregOfRexRM(pfx, modrm) ) );
   } else {
      storeLE( mkexpr(addr), unop(Iop_32to8, mkexpr(shr_lane) ) );
      delta += alen+1;
      DIP( "%spextrb $%d,%s,%s\n", mbV,
           imm8, nameXMMReg( gregOfRexRM(pfx, modrm) ), dis_buf );
   }

   return delta;
}


static IRTemp math_DPPD_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
{
   vassert(imm8 < 256);
   UShort imm8_perms[4] = { 0x0000, 0x00FF, 0xFF00, 0xFFFF };
   IRTemp and_vec = newTemp(Ity_V128);
   IRTemp sum_vec = newTemp(Ity_V128);
   IRTemp rm      = newTemp(Ity_I32);
   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   assign( and_vec, binop( Iop_AndV128,
                           triop( Iop_Mul64Fx2,
                                  mkexpr(rm),
                                  mkexpr(dst_vec), mkexpr(src_vec) ),
                           mkV128( imm8_perms[ ((imm8 >> 4) & 3) ] ) ) );

   assign( sum_vec, binop( Iop_Add64F0x2,
                           binop( Iop_InterleaveHI64x2,
                                  mkexpr(and_vec), mkexpr(and_vec) ),
                           binop( Iop_InterleaveLO64x2,
                                  mkexpr(and_vec), mkexpr(and_vec) ) ) );
   IRTemp res = newTemp(Ity_V128);
   assign(res, binop( Iop_AndV128,
                      binop( Iop_InterleaveLO64x2,
                             mkexpr(sum_vec), mkexpr(sum_vec) ),
                      mkV128( imm8_perms[ (imm8 & 3) ] ) ) );
   return res;
}
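
/* Worked example (illustrative): imm8 == 0x31 selects a multiply mask
   of 3 (keep both products) and a result mask of 1, so both lanes are
   multiplied, the products summed, and the sum written to the low
   qword only, the high qword being zeroed. */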


static IRTemp math_DPPS_128 ( IRTemp src_vec, IRTemp dst_vec, UInt imm8 )
{
   vassert(imm8 < 256);
   IRTemp tmp_prod_vec = newTemp(Ity_V128);
   IRTemp prod_vec     = newTemp(Ity_V128);
   IRTemp sum_vec      = newTemp(Ity_V128);
   IRTemp rm           = newTemp(Ity_I32);
   IRTemp v3, v2, v1, v0;
   v3 = v2 = v1 = v0   = IRTemp_INVALID;
   UShort imm8_perms[16] = { 0x0000, 0x000F, 0x00F0, 0x00FF, 0x0F00,
                             0x0F0F, 0x0FF0, 0x0FFF, 0xF000, 0xF00F,
                             0xF0F0, 0xF0FF, 0xFF00, 0xFF0F, 0xFFF0,
                             0xFFFF };

   assign( rm, get_FAKE_roundingmode() ); /* XXXROUNDINGFIXME */
   assign( tmp_prod_vec,
           binop( Iop_AndV128,
                  triop( Iop_Mul32Fx4,
                         mkexpr(rm), mkexpr(dst_vec), mkexpr(src_vec) ),
                  mkV128( imm8_perms[((imm8 >> 4) & 15)] ) ) );
   breakupV128to32s( tmp_prod_vec, &v3, &v2, &v1, &v0 );
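   /* The v1/v2 swap below is deliberate: it makes the interleave/add
      tree pair lanes (3,2) and (1,0) in the first stage, so that after
      the second stage every lane of the result holds v3+v2+v1+v0. */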
   assign( prod_vec, mkV128from32s( v3, v1, v2, v0 ) );

   assign( sum_vec, triop( Iop_Add32Fx4,
                           mkexpr(rm),
                           binop( Iop_InterleaveHI32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ),
                           binop( Iop_InterleaveLO32x4,
                                  mkexpr(prod_vec), mkexpr(prod_vec) ) ) );

   IRTemp res = newTemp(Ity_V128);
   assign( res, binop( Iop_AndV128,
                       triop( Iop_Add32Fx4,
                              mkexpr(rm),
                              binop( Iop_InterleaveHI32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ),
                              binop( Iop_InterleaveLO32x4,
                                     mkexpr(sum_vec), mkexpr(sum_vec) ) ),
                       mkV128( imm8_perms[ (imm8 & 15) ] ) ) );
   return res;
}


static IRTemp math_MPSADBW_128 ( IRTemp dst_vec, IRTemp src_vec, UInt imm8 )
{
   /* Mask out bits of the operands we don't need.  This isn't
      strictly necessary, but it does ensure Memcheck doesn't
      give us any false uninitialised value errors as a
      result. */
   UShort src_mask[4] = { 0x000F, 0x00F0, 0x0F00, 0xF000 };
   UShort dst_mask[2] = { 0x07FF, 0x7FF0 };

   IRTemp src_maskV = newTemp(Ity_V128);
   IRTemp dst_maskV = newTemp(Ity_V128);
   assign(src_maskV, mkV128( src_mask[ imm8 & 3 ] ));
   assign(dst_maskV, mkV128( dst_mask[ (imm8 >> 2) & 1 ] ));

   IRTemp src_masked = newTemp(Ity_V128);
   IRTemp dst_masked = newTemp(Ity_V128);
   assign(src_masked, binop(Iop_AndV128, mkexpr(src_vec), mkexpr(src_maskV)));
   assign(dst_masked, binop(Iop_AndV128, mkexpr(dst_vec), mkexpr(dst_maskV)));

   /* Generate four 64-bit values that we can hand to a clean helper */
   IRTemp sHi = newTemp(Ity_I64);
   IRTemp sLo = newTemp(Ity_I64);
   assign( sHi, unop(Iop_V128HIto64, mkexpr(src_masked)) );
   assign( sLo, unop(Iop_V128to64,   mkexpr(src_masked)) );

   IRTemp dHi = newTemp(Ity_I64);
   IRTemp dLo = newTemp(Ity_I64);
   assign( dHi, unop(Iop_V128HIto64, mkexpr(dst_masked)) );
   assign( dLo, unop(Iop_V128to64,   mkexpr(dst_masked)) );

   /* Compute halves of the result separately */
   IRTemp resHi = newTemp(Ity_I64);
   IRTemp resLo = newTemp(Ity_I64);

   IRExpr** argsHi
      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
                       mkU64( 0x80 | (imm8 & 7) ));
   IRExpr** argsLo
      = mkIRExprVec_5( mkexpr(sHi), mkexpr(sLo), mkexpr(dHi), mkexpr(dLo),
                       mkU64( 0x00 | (imm8 & 7) ));

   assign(resHi, mkIRExprCCall( Ity_I64, 0/*regparm*/,
                                "amd64g_calc_mpsadbw",
                                &amd64g_calc_mpsadbw, argsHi ));
   assign(resLo, mkIRExprCCall( Ity_I64, 0/*regparm*/,
                                "amd64g_calc_mpsadbw",
                                &amd64g_calc_mpsadbw, argsLo ));

   IRTemp res = newTemp(Ity_V128);
   assign(res, binop(Iop_64HLtoV128, mkexpr(resHi), mkexpr(resLo)));
   return res;
}
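
/* Summarising the architected semantics: result word i (i = 0..7) is
   the sum of absolute differences between the four source bytes at
   offset 4*imm8[1:0] and the four destination bytes starting at
   offset i + 4*imm8[2]. */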

static Long dis_EXTRACTPS ( VexAbiInfo* vbi, Prefix pfx,
                            Long delta, Bool isAvx )
{
   IRTemp addr       = IRTemp_INVALID;
   Int    alen       = 0;
   HChar  dis_buf[50];
   UChar  modrm      = getUChar(delta);
   Int imm8_10;
   IRTemp xmm_vec    = newTemp(Ity_V128);
   IRTemp src_dword  = newTemp(Ity_I32);
   UInt   rG         = gregOfRexRM(pfx,modrm);
   IRTemp t3, t2, t1, t0;
   t3 = t2 = t1 = t0 = IRTemp_INVALID;

   assign( xmm_vec, getXMMReg( rG ) );
   breakupV128to32s( xmm_vec, &t3, &t2, &t1, &t0 );

   if ( epartIsReg( modrm ) ) {
      imm8_10 = (Int)(getUChar(delta+1) & 3);
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
      imm8_10 = (Int)(getUChar(delta+alen) & 3);
   }

   switch ( imm8_10 ) {
      case 0:  assign( src_dword, mkexpr(t0) ); break;
      case 1:  assign( src_dword, mkexpr(t1) ); break;
      case 2:  assign( src_dword, mkexpr(t2) ); break;
      case 3:  assign( src_dword, mkexpr(t3) ); break;
      default: vassert(0);
   }

   if ( epartIsReg( modrm ) ) {
      UInt rE = eregOfRexRM(pfx,modrm);
      putIReg32( rE, mkexpr(src_dword) );
      delta += 1+1;
      DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
           nameXMMReg( rG ), nameIReg32( rE ) );
   } else {
      storeLE( mkexpr(addr), mkexpr(src_dword) );
      delta += alen+1;
      DIP( "%sextractps $%d, %s,%s\n", isAvx ? "v" : "", imm8_10,
           nameXMMReg( rG ), dis_buf );
   }

   return delta;
}


static IRTemp math_PCLMULQDQ( IRTemp dV, IRTemp sV, UInt imm8 )
{
   IRTemp t0 = newTemp(Ity_I64);
   IRTemp t1 = newTemp(Ity_I64);
   assign(t0, unop((imm8&1)? Iop_V128HIto64 : Iop_V128to64,
              mkexpr(dV)));
   assign(t1, unop((imm8&16) ? Iop_V128HIto64 : Iop_V128to64,
              mkexpr(sV)));

   IRTemp t2 = newTemp(Ity_I64);
   IRTemp t3 = newTemp(Ity_I64);

   IRExpr** args;

   args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(0));
   assign(t2, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
                            &amd64g_calculate_pclmul, args));
   args = mkIRExprVec_3(mkexpr(t0), mkexpr(t1), mkU64(1));
   assign(t3, mkIRExprCCall(Ity_I64,0, "amd64g_calculate_pclmul",
                            &amd64g_calculate_pclmul, args));

   IRTemp res     = newTemp(Ity_V128);
   assign(res, binop(Iop_64HLtoV128, mkexpr(t3), mkexpr(t2)));
   return res;
}
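
/* Carry-less multiplication treats its operands as polynomials over
   GF(2), so e.g. 0b11 * 0b101 = 0b1111, with no carries between bit
   positions.  As decoded above, imm8 bit 0 selects which half of dV
   and bit 4 which half of sV take part; the helper is called twice,
   once for each 64-bit half of the (at most 127-bit wide) product. */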


__attribute__((noinline))
static
Long dis_ESC_0F3A__SSE4 ( Bool* decode_OK,
                          VexAbiInfo* vbi,
                          Prefix pfx, Int sz, Long deltaIN )
{
   IRTemp addr  = IRTemp_INVALID;
   UChar  modrm = 0;
   Int    alen  = 0;
   HChar  dis_buf[50];

   *decode_OK = False;

   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   switch (opc) {

   case 0x08:
      /* 66 0F 3A 08 /r ib = ROUNDPS imm8, xmm2/m128, xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {

         IRTemp src0 = newTemp(Ity_F32);
         IRTemp src1 = newTemp(Ity_F32);
         IRTemp src2 = newTemp(Ity_F32);
         IRTemp src3 = newTemp(Ity_F32);
         IRTemp res0 = newTemp(Ity_F32);
         IRTemp res1 = newTemp(Ity_F32);
         IRTemp res2 = newTemp(Ity_F32);
         IRTemp res3 = newTemp(Ity_F32);
         IRTemp rm   = newTemp(Ity_I32);
         Int    imm  = 0;

         modrm = getUChar(delta);

         if (epartIsReg(modrm)) {
            assign( src0,
                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
            assign( src1,
                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 1 ) );
            assign( src2,
                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 2 ) );
            assign( src3,
                    getXMMRegLane32F( eregOfRexRM(pfx, modrm), 3 ) );
            imm = getUChar(delta+1);
            if (imm & ~15) goto decode_failure;
            delta += 1+1;
            DIP( "roundps $%d,%s,%s\n",
                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            gen_SEGV_if_not_16_aligned(addr);
            assign( src0, loadLE(Ity_F32,
                                 binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
            assign( src1, loadLE(Ity_F32,
                                 binop(Iop_Add64, mkexpr(addr), mkU64(4) )));
            assign( src2, loadLE(Ity_F32,
                                 binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
            assign( src3, loadLE(Ity_F32,
                                 binop(Iop_Add64, mkexpr(addr), mkU64(12) )));
            imm = getUChar(delta+alen);
            if (imm & ~15) goto decode_failure;
            delta += alen+1;
            DIP( "roundps $%d,%s,%s\n",
                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode. */
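         /* Concretely, for the Intel encoding: 0 = to nearest (even),
            1 = towards -infinity, 2 = towards +infinity, 3 = towards
            zero; and if bit 2 of imm is set, the mode is taken from
            MXCSR.RC instead, as the next line implements. */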
         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));

         assign(res0, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src0)) );
         assign(res1, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src1)) );
         assign(res2, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src2)) );
         assign(res3, binop(Iop_RoundF32toInt, mkexpr(rm), mkexpr(src3)) );

         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );
         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 2, mkexpr(res2) );
         putXMMRegLane32F( gregOfRexRM(pfx, modrm), 3, mkexpr(res3) );

         goto decode_success;
      }
      break;

   case 0x09:
      /* 66 0F 3A 09 /r ib = ROUNDPD imm8, xmm2/m128, xmm1 */
      if (have66noF2noF3(pfx) && sz == 2) {

         IRTemp src0 = newTemp(Ity_F64);
         IRTemp src1 = newTemp(Ity_F64);
         IRTemp res0 = newTemp(Ity_F64);
         IRTemp res1 = newTemp(Ity_F64);
         IRTemp rm   = newTemp(Ity_I32);
         Int    imm  = 0;

         modrm = getUChar(delta);

         if (epartIsReg(modrm)) {
            assign( src0,
                    getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 ) );
            assign( src1,
                    getXMMRegLane64F( eregOfRexRM(pfx, modrm), 1 ) );
            imm = getUChar(delta+1);
            if (imm & ~15) goto decode_failure;
            delta += 1+1;
            DIP( "roundpd $%d,%s,%s\n",
                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            gen_SEGV_if_not_16_aligned(addr);
            assign( src0, loadLE(Ity_F64,
                                 binop(Iop_Add64, mkexpr(addr), mkU64(0) )));
            assign( src1, loadLE(Ity_F64,
                                 binop(Iop_Add64, mkexpr(addr), mkU64(8) )));
            imm = getUChar(delta+alen);
            if (imm & ~15) goto decode_failure;
            delta += alen+1;
            DIP( "roundpd $%d,%s,%s\n",
                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode. */
         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));

         assign(res0, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src0)) );
         assign(res1, binop(Iop_RoundF64toInt, mkexpr(rm), mkexpr(src1)) );

         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res0) );
         putXMMRegLane64F( gregOfRexRM(pfx, modrm), 1, mkexpr(res1) );

         goto decode_success;
      }
      break;

   case 0x0A:
   case 0x0B:
      /* 66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
         66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
      */
      if (have66noF2noF3(pfx) && sz == 2) {

         Bool   isD = opc == 0x0B;
         IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
         IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
         Int    imm = 0;

         modrm = getUChar(delta);

         if (epartIsReg(modrm)) {
            assign( src,
                    isD ? getXMMRegLane64F( eregOfRexRM(pfx, modrm), 0 )
                        : getXMMRegLane32F( eregOfRexRM(pfx, modrm), 0 ) );
            imm = getUChar(delta+1);
            if (imm & ~15) goto decode_failure;
            delta += 1+1;
            DIP( "rounds%c $%d,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, nameXMMReg( eregOfRexRM(pfx, modrm) ),
                      nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
            imm = getUChar(delta+alen);
            if (imm & ~15) goto decode_failure;
            delta += alen+1;
            DIP( "rounds%c $%d,%s,%s\n",
                 isD ? 'd' : 's',
                 imm, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
            that encoding is the same as the encoding for IRRoundingMode,
            we can use that value directly in the IR as a rounding
            mode. */
         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
                           (imm & 4) ? get_sse_roundingmode()
                                     : mkU32(imm & 3),
                           mkexpr(src)) );

         if (isD)
            putXMMRegLane64F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );
         else
            putXMMRegLane32F( gregOfRexRM(pfx, modrm), 0, mkexpr(res) );

         goto decode_success;
      }
      break;

   case 0x0C:
      /* 66 0F 3A 0C /r ib = BLENDPS xmm1, xmm2/m128, imm8
         Blend Packed Single Precision Floating-Point Values (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {

         Int imm8;
         IRTemp dst_vec = newTemp(Ity_V128);
         IRTemp src_vec = newTemp(Ity_V128);

         modrm = getUChar(delta);

         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

         if ( epartIsReg( modrm ) ) {
            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1+1;
            DIP( "blendps $%d, %s,%s\n", imm8,
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            gen_SEGV_if_not_16_aligned( addr );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "blendps $%d, %s,%s\n",
                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         putXMMReg( gregOfRexRM(pfx, modrm),
                    mkexpr( math_BLENDPS_128( src_vec, dst_vec, imm8) ) );
         goto decode_success;
      }
      break;

   case 0x0D:
      /* 66 0F 3A 0D /r ib = BLENDPD xmm1, xmm2/m128, imm8
         Blend Packed Double Precision Floating-Point Values (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {

         Int imm8;
         IRTemp dst_vec = newTemp(Ity_V128);
         IRTemp src_vec = newTemp(Ity_V128);

         modrm = getUChar(delta);
         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

         if ( epartIsReg( modrm ) ) {
            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1+1;
            DIP( "blendpd $%d, %s,%s\n", imm8,
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            gen_SEGV_if_not_16_aligned( addr );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "blendpd $%d, %s,%s\n",
                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         putXMMReg( gregOfRexRM(pfx, modrm),
                    mkexpr( math_BLENDPD_128( src_vec, dst_vec, imm8) ) );
         goto decode_success;
      }
      break;

   case 0x0E:
      /* 66 0F 3A 0E /r ib = PBLENDW xmm1, xmm2/m128, imm8
         Blend Packed Words (XMM) */
      if (have66noF2noF3(pfx) && sz == 2) {

         Int imm8;
         IRTemp dst_vec = newTemp(Ity_V128);
         IRTemp src_vec = newTemp(Ity_V128);

         modrm = getUChar(delta);

         assign( dst_vec, getXMMReg( gregOfRexRM(pfx, modrm) ) );

         if ( epartIsReg( modrm ) ) {
            imm8 = (Int)getUChar(delta+1);
            assign( src_vec, getXMMReg( eregOfRexRM(pfx, modrm) ) );
            delta += 1+1;
            DIP( "pblendw $%d, %s,%s\n", imm8,
                 nameXMMReg( eregOfRexRM(pfx, modrm) ),
                 nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
                             1/* imm8 is 1 byte after the amode */ );
            gen_SEGV_if_not_16_aligned( addr );
            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
            imm8 = (Int)getUChar(delta+alen);
            delta += alen+1;
            DIP( "pblendw $%d, %s,%s\n",
                 imm8, dis_buf, nameXMMReg( gregOfRexRM(pfx, modrm) ) );
         }

         putXMMReg( gregOfRexRM(pfx, modrm),
                    mkexpr( math_PBLENDW_128( src_vec, dst_vec, imm8) ) );
         goto decode_success;
      }
      break;

   case 0x14:
      /* 66 0F 3A 14 /r ib = PEXTRB r/m8, xmm, imm8
18948         Extract Byte from xmm, store in mem or zero-extend + store in gen.reg.
18949         (XMM) */
18950      if (have66noF2noF3(pfx) && sz == 2) {
18951         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
18952         goto decode_success;
18953      }
18954      break;
18955
18956   case 0x15:
18957      /* 66 0F 3A 15 /r ib = PEXTRW r/m16, xmm, imm8
18958         Extract Word from xmm, store in mem or zero-extend + store in gen.reg.
18959         (XMM) */
18960      if (have66noF2noF3(pfx) && sz == 2) {
18961         delta = dis_PEXTRW( vbi, pfx, delta, False/*!isAvx*/ );
18962         goto decode_success;
18963      }
18964      break;
18965
18966   case 0x16:
18967      /* 66 no-REX.W 0F 3A 16 /r ib = PEXTRD reg/mem32, xmm2, imm8
18968         Extract Doubleword int from xmm reg and store in gen.reg or mem. (XMM)
18969         Note that this insn has the same opcodes as PEXTRQ, but
18970         here the REX.W bit is _not_ present */
18971      if (have66noF2noF3(pfx)
18972          && sz == 2 /* REX.W is _not_ present */) {
18973         delta = dis_PEXTRD( vbi, pfx, delta, False/*!isAvx*/ );
18974         goto decode_success;
18975      }
18976      /* 66 REX.W 0F 3A 16 /r ib = PEXTRQ reg/mem64, xmm2, imm8
18977         Extract Quadword int from xmm reg and store in gen.reg or mem. (XMM)
18978         Note that this insn has the same opcodes as PEXTRD, but
18979         here the REX.W bit is present */
18980      if (have66noF2noF3(pfx)
18981          && sz == 8 /* REX.W is present */) {
18982         delta = dis_PEXTRQ( vbi, pfx, delta, False/*!isAvx*/);
18983         goto decode_success;
18984      }
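      /* Illustrative encodings: 66 0F 3A 16 C0 01 (no REX.W) is
         pextrd $1,%xmm0,%eax, while 66 48 0F 3A 16 C0 01 (REX.W set)
         is pextrq $1,%xmm0,%rax; sz (2 vs 8) carries that REX.W
         distinction into the two tests above. */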
18985      break;
18986
18987   case 0x17:
18988      /* 66 0F 3A 17 /r ib = EXTRACTPS reg/mem32, xmm2, imm8 Extract
18989         float from xmm reg and store in gen.reg or mem.  This is
18990         identical to PEXTRD, except that REX.W appears to be ignored.
18991      */
18992      if (have66noF2noF3(pfx)
18993          && (sz == 2 || /* ignore redundant REX.W */ sz == 8)) {
18994         delta = dis_EXTRACTPS( vbi, pfx, delta, False/*!isAvx*/ );
18995         goto decode_success;
18996      }
18997      break;
18998
18999   case 0x20:
19000      /* 66 0F 3A 20 /r ib = PINSRB xmm1, r32/m8, imm8
19001         Extract byte from r32/m8 and insert into xmm1 */
19002      if (have66noF2noF3(pfx) && sz == 2) {
19003         Int    imm8;
19004         IRTemp new8 = newTemp(Ity_I8);
19005         modrm = getUChar(delta);
19006         UInt rG = gregOfRexRM(pfx, modrm);
19007         if ( epartIsReg( modrm ) ) {
19008            UInt rE = eregOfRexRM(pfx,modrm);
19009            imm8 = (Int)(getUChar(delta+1) & 0xF);
19010            assign( new8, unop(Iop_32to8, getIReg32(rE)) );
19011            delta += 1+1;
19012            DIP( "pinsrb $%d,%s,%s\n", imm8,
19013                 nameIReg32(rE), nameXMMReg(rG) );
19014         } else {
19015            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19016            imm8 = (Int)(getUChar(delta+alen) & 0xF);
19017            assign( new8, loadLE( Ity_I8, mkexpr(addr) ) );
19018            delta += alen+1;
19019            DIP( "pinsrb $%d,%s,%s\n",
19020                 imm8, dis_buf, nameXMMReg(rG) );
19021         }
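         /* Note: imm8 was masked to 4 bits above, selecting which of
            the 16 byte lanes of the destination receives new8. */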
19022         IRTemp src_vec = newTemp(Ity_V128);
19023         assign(src_vec, getXMMReg( gregOfRexRM(pfx, modrm) ));
19024         IRTemp res = math_PINSRB_128( src_vec, new8, imm8 );
19025         putXMMReg( rG, mkexpr(res) );
19026         goto decode_success;
19027      }
19028      break;
19029
19030   case 0x21:
19031      /* 66 0F 3A 21 /r ib = INSERTPS imm8, xmm2/m32, xmm1
19032         Insert Packed Single Precision Floating-Point Value (XMM) */
19033      if (have66noF2noF3(pfx) && sz == 2) {
19034         UInt   imm8;
19035         IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
19036         const IRTemp inval = IRTemp_INVALID;
19037
19038         modrm = getUChar(delta);
19039         UInt rG = gregOfRexRM(pfx, modrm);
19040
19041         if ( epartIsReg( modrm ) ) {
19042            UInt   rE = eregOfRexRM(pfx, modrm);
19043            IRTemp vE = newTemp(Ity_V128);
19044            assign( vE, getXMMReg(rE) );
19045            IRTemp dsE[4] = { inval, inval, inval, inval };
19046            breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
19047            imm8 = getUChar(delta+1);
19048            d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
19049            delta += 1+1;
19050            DIP( "insertps $%u, %s,%s\n",
19051                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19052         } else {
19053            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19054            assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
19055            imm8 = getUChar(delta+alen);
19056            delta += alen+1;
19057            DIP( "insertps $%u, %s,%s\n",
19058                 imm8, dis_buf, nameXMMReg(rG) );
19059         }
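         /* imm8 layout (per the Intel/AMD manuals): bits 7:6 select
            the source lane ("imm8_count_s", register form only),
            bits 5:4 select the destination lane, and bits 3:0 form a
            zero mask; math_INSERTPS is assumed to consume the latter
            two fields. */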
19060
19061         IRTemp vG = newTemp(Ity_V128);
19062         assign( vG, getXMMReg(rG) );
19063
19064         putXMMReg( rG, mkexpr(math_INSERTPS( vG, d2ins, imm8 )) );
19065         goto decode_success;
19066      }
19067      break;
19068
19069   case 0x22:
19070      /* 66 no-REX.W 0F 3A 22 /r ib = PINSRD xmm1, r/m32, imm8
19071         Extract Doubleword int from gen.reg/mem32 and insert into xmm1 */
19072      if (have66noF2noF3(pfx)
19073          && sz == 2 /* REX.W is NOT present */) {
19074         Int    imm8_10;
19075         IRTemp src_u32 = newTemp(Ity_I32);
19076         modrm = getUChar(delta);
19077         UInt rG = gregOfRexRM(pfx, modrm);
19078
19079         if ( epartIsReg( modrm ) ) {
19080            UInt rE = eregOfRexRM(pfx,modrm);
19081            imm8_10 = (Int)(getUChar(delta+1) & 3);
19082            assign( src_u32, getIReg32( rE ) );
19083            delta += 1+1;
19084            DIP( "pinsrd $%d, %s,%s\n",
19085                 imm8_10, nameIReg32(rE), nameXMMReg(rG) );
19086         } else {
19087            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19088            imm8_10 = (Int)(getUChar(delta+alen) & 3);
19089            assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
19090            delta += alen+1;
19091            DIP( "pinsrd $%d, %s,%s\n",
19092                 imm8_10, dis_buf, nameXMMReg(rG) );
19093         }
19094
19095         IRTemp src_vec = newTemp(Ity_V128);
19096         assign(src_vec, getXMMReg( rG ));
19097         IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
19098         putXMMReg( rG, mkexpr(res_vec) );
19099         goto decode_success;
19100      }
19101      /* 66 REX.W 0F 3A 22 /r ib = PINSRQ xmm1, r/m64, imm8
19102         Extract Quadword int from gen.reg/mem64 and insert into xmm1 */
19103      if (have66noF2noF3(pfx)
19104          && sz == 8 /* REX.W is present */) {
19105         Int imm8_0;
19106         IRTemp src_u64 = newTemp(Ity_I64);
19107         modrm = getUChar(delta);
19108         UInt rG = gregOfRexRM(pfx, modrm);
19109
19110         if ( epartIsReg( modrm ) ) {
19111            UInt rE = eregOfRexRM(pfx,modrm);
19112            imm8_0 = (Int)(getUChar(delta+1) & 1);
19113            assign( src_u64, getIReg64( rE ) );
19114            delta += 1+1;
19115            DIP( "pinsrq $%d, %s,%s\n",
19116                 imm8_0, nameIReg64(rE), nameXMMReg(rG) );
19117         } else {
19118            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
19119            imm8_0 = (Int)(getUChar(delta+alen) & 1);
19120            assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
19121            delta += alen+1;
19122            DIP( "pinsrq $%d, %s,%s\n",
19123                 imm8_0, dis_buf, nameXMMReg(rG) );
19124         }
19125
19126         IRTemp src_vec = newTemp(Ity_V128);
19127         assign(src_vec, getXMMReg( rG ));
19128         IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
19129         putXMMReg( rG, mkexpr(res_vec) );
19130         goto decode_success;
19131      }
19132      break;
19133
19134   case 0x40:
19135      /* 66 0F 3A 40 /r ib = DPPS xmm1, xmm2/m128, imm8
19136         Dot Product of Packed Single Precision Floating-Point Values (XMM) */
19137      if (have66noF2noF3(pfx) && sz == 2) {
19138         modrm = getUChar(delta);
19139         Int    imm8;
19140         IRTemp src_vec = newTemp(Ity_V128);
19141         IRTemp dst_vec = newTemp(Ity_V128);
19142         UInt   rG      = gregOfRexRM(pfx, modrm);
19143         assign( dst_vec, getXMMReg( rG ) );
19144         if ( epartIsReg( modrm ) ) {
19145            UInt rE = eregOfRexRM(pfx, modrm);
19146            imm8 = (Int)getUChar(delta+1);
19147            assign( src_vec, getXMMReg(rE) );
19148            delta += 1+1;
19149            DIP( "dpps $%d, %s,%s\n",
19150                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19151         } else {
19152            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19153                             1/* imm8 is 1 byte after the amode */ );
19154            gen_SEGV_if_not_16_aligned( addr );
19155            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19156            imm8 = (Int)getUChar(delta+alen);
19157            delta += alen+1;
19158            DIP( "dpps $%d, %s,%s\n",
19159                 imm8, dis_buf, nameXMMReg(rG) );
19160         }
19161         IRTemp res = math_DPPS_128( src_vec, dst_vec, imm8 );
19162         putXMMReg( rG, mkexpr(res) );
19163         goto decode_success;
19164      }
19165      break;
19166
19167   case 0x41:
19168      /* 66 0F 3A 41 /r ib = DPPD xmm1, xmm2/m128, imm8
19169         Dot Product of Packed Double Precision Floating-Point Values (XMM) */
19170      if (have66noF2noF3(pfx) && sz == 2) {
19171         modrm = getUChar(delta);
19172         Int    imm8;
19173         IRTemp src_vec = newTemp(Ity_V128);
19174         IRTemp dst_vec = newTemp(Ity_V128);
19175         UInt   rG      = gregOfRexRM(pfx, modrm);
19176         assign( dst_vec, getXMMReg( rG ) );
19177         if ( epartIsReg( modrm ) ) {
19178            UInt rE = eregOfRexRM(pfx, modrm);
19179            imm8 = (Int)getUChar(delta+1);
19180            assign( src_vec, getXMMReg(rE) );
19181            delta += 1+1;
19182            DIP( "dppd $%d, %s,%s\n",
19183                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
19184         } else {
19185            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19186                             1/* imm8 is 1 byte after the amode */ );
19187            gen_SEGV_if_not_16_aligned( addr );
19188            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19189            imm8 = (Int)getUChar(delta+alen);
19190            delta += alen+1;
19191            DIP( "dppd $%d, %s,%s\n",
19192                 imm8, dis_buf, nameXMMReg(rG) );
19193         }
19194         IRTemp res = math_DPPD_128( src_vec, dst_vec, imm8 );
19195         putXMMReg( rG, mkexpr(res) );
19196         goto decode_success;
19197      }
19198      break;
19199
19200   case 0x42:
19201      /* 66 0F 3A 42 /r ib = MPSADBW xmm1, xmm2/m128, imm8
         Multiple Packed Sums of Absolute Difference (XMM) */
19203      if (have66noF2noF3(pfx) && sz == 2) {
19204         Int    imm8;
19205         IRTemp src_vec = newTemp(Ity_V128);
19206         IRTemp dst_vec = newTemp(Ity_V128);
19207         modrm          = getUChar(delta);
19208         UInt   rG      = gregOfRexRM(pfx, modrm);
19209
19210         assign( dst_vec, getXMMReg(rG) );
19211
19212         if ( epartIsReg( modrm ) ) {
19213            UInt rE = eregOfRexRM(pfx, modrm);
19214
19215            imm8 = (Int)getUChar(delta+1);
19216            assign( src_vec, getXMMReg(rE) );
19217            delta += 1+1;
19218            DIP( "mpsadbw $%d, %s,%s\n", imm8,
19219                 nameXMMReg(rE), nameXMMReg(rG) );
19220         } else {
19221            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19222                             1/* imm8 is 1 byte after the amode */ );
19223            gen_SEGV_if_not_16_aligned( addr );
19224            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
19225            imm8 = (Int)getUChar(delta+alen);
19226            delta += alen+1;
19227            DIP( "mpsadbw $%d, %s,%s\n", imm8, dis_buf, nameXMMReg(rG) );
19228         }
19229
19230         putXMMReg( rG, mkexpr( math_MPSADBW_128(dst_vec, src_vec, imm8) ) );
19231         goto decode_success;
19232      }
19233      break;
19234
19235   case 0x44:
19236      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
19237       * Carry-less multiplication of selected XMM quadwords into XMM
19238       * registers (a.k.a multiplication of polynomials over GF(2))
19239       */
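      /* Per the manuals, imm8 bit 0 selects which 64-bit half of the
         destination operand enters the multiply and bit 4 selects
         the half of the source; math_PCLMULQDQ is assumed to
         implement exactly that selection. */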
19240      if (have66noF2noF3(pfx) && sz == 2) {
19241
19242         Int imm8;
19243         IRTemp svec = newTemp(Ity_V128);
19244         IRTemp dvec = newTemp(Ity_V128);
19245         modrm       = getUChar(delta);
19246         UInt   rG   = gregOfRexRM(pfx, modrm);
19247
19248         assign( dvec, getXMMReg(rG) );
19249
19250         if ( epartIsReg( modrm ) ) {
19251            UInt rE = eregOfRexRM(pfx, modrm);
19252            imm8 = (Int)getUChar(delta+1);
19253            assign( svec, getXMMReg(rE) );
19254            delta += 1+1;
19255            DIP( "pclmulqdq $%d, %s,%s\n", imm8,
19256                 nameXMMReg(rE), nameXMMReg(rG) );
19257         } else {
19258            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
19259                             1/* imm8 is 1 byte after the amode */ );
19260            gen_SEGV_if_not_16_aligned( addr );
19261            assign( svec, loadLE( Ity_V128, mkexpr(addr) ) );
19262            imm8 = (Int)getUChar(delta+alen);
19263            delta += alen+1;
19264            DIP( "pclmulqdq $%d, %s,%s\n",
19265                 imm8, dis_buf, nameXMMReg(rG) );
19266         }
19267
19268         putXMMReg( rG, mkexpr( math_PCLMULQDQ(dvec, svec, imm8) ) );
19269         goto decode_success;
19270      }
19271      break;
19272
19273   case 0x60:
19274   case 0x61:
19275   case 0x62:
19276   case 0x63:
19277      /* 66 0F 3A 63 /r ib = PCMPISTRI imm8, xmm2/m128, xmm1
19278         66 0F 3A 62 /r ib = PCMPISTRM imm8, xmm2/m128, xmm1
19279         66 0F 3A 61 /r ib = PCMPESTRI imm8, xmm2/m128, xmm1
19280         66 0F 3A 60 /r ib = PCMPESTRM imm8, xmm2/m128, xmm1
19281         (selected special cases that actually occur in glibc,
19282          not by any means a complete implementation.)
19283      */
19284      if (have66noF2noF3(pfx) && sz == 2) {
19285         Long delta0 = delta;
19286         delta = dis_PCMPxSTRx( vbi, pfx, delta, False/*!isAvx*/, opc );
19287         if (delta > delta0) goto decode_success;
         /* else fall through; dis_PCMPxSTRx failed to decode it */
19289      }
19290      break;
19291
19292   case 0xDF:
19293      /* 66 0F 3A DF /r ib = AESKEYGENASSIST imm8, xmm2/m128, xmm1 */
19294      if (have66noF2noF3(pfx) && sz == 2) {
19295         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, False/*!isAvx*/ );
19296         goto decode_success;
19297      }
19298      break;
19299
19300   default:
19301      break;
19302
19303   }
19304
19305  decode_failure:
19306   *decode_OK = False;
19307   return deltaIN;
19308
19309  decode_success:
19310   *decode_OK = True;
19311   return delta;
19312}
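
/* Editorial sketch (not referenced by the decoder): BLENDPS above
   defers the per-lane selection to math_BLENDPS_128.  Assuming the
   SSE4.1 semantics from the Intel/AMD manuals -- bit i of imm8
   picks, for 32-bit lane i, the source lane when set and the
   destination lane when clear -- the selection reduces to the
   following, kept here purely as illustration. */
static inline UInt blendps_lane_sketch ( UInt imm8, UInt lane,
                                         UInt srcLane, UInt dstLane )
{
   vassert(lane < 4);
   return ((imm8 >> lane) & 1) ? srcLane : dstLane;
}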
19313
19314
19315/*------------------------------------------------------------*/
19316/*---                                                      ---*/
19317/*--- Top-level post-escape decoders: dis_ESC_NONE         ---*/
19318/*---                                                      ---*/
19319/*------------------------------------------------------------*/
19320
19321__attribute__((noinline))
19322static
19323Long dis_ESC_NONE (
19324        /*MB_OUT*/DisResult* dres,
19325        /*MB_OUT*/Bool*      expect_CAS,
19326        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
19327        Bool         resteerCisOk,
19328        void*        callback_opaque,
19329        VexArchInfo* archinfo,
19330        VexAbiInfo*  vbi,
19331        Prefix pfx, Int sz, Long deltaIN
19332     )
19333{
19334   Long   d64   = 0;
19335   UChar  abyte = 0;
19336   IRTemp addr  = IRTemp_INVALID;
19337   IRTemp t1    = IRTemp_INVALID;
19338   IRTemp t2    = IRTemp_INVALID;
19339   IRTemp t3    = IRTemp_INVALID;
19340   IRTemp t4    = IRTemp_INVALID;
19341   IRTemp t5    = IRTemp_INVALID;
19342   IRType ty    = Ity_INVALID;
19343   UChar  modrm = 0;
19344   Int    am_sz = 0;
19345   Int    d_sz  = 0;
19346   Int    alen  = 0;
19347   HChar  dis_buf[50];
19348
19349   Long   delta = deltaIN;
19350   UChar  opc   = getUChar(delta); delta++;
19351
19352   /* delta now points at the modrm byte.  In most of the cases that
19353      follow, neither the F2 nor F3 prefixes are allowed.  However,
19354      for some basic arithmetic operations we have to allow F2/XACQ or
19355      F3/XREL in the case where the destination is memory and the LOCK
19356      prefix is also present.  Do this check by looking at the modrm
19357      byte but not advancing delta over it. */
19358   /* By default, F2 and F3 are not allowed, so let's start off with
19359      that setting. */
19360   Bool validF2orF3 = haveF2orF3(pfx) ? False : True;
19361   { UChar tmp_modrm = getUChar(delta);
19362     switch (opc) {
19363        case 0x00: /* ADD Gb,Eb */  case 0x01: /* ADD Gv,Ev */
19364        case 0x08: /* OR  Gb,Eb */  case 0x09: /* OR  Gv,Ev */
19365        case 0x10: /* ADC Gb,Eb */  case 0x11: /* ADC Gv,Ev */
19366        case 0x18: /* SBB Gb,Eb */  case 0x19: /* SBB Gv,Ev */
19367        case 0x20: /* AND Gb,Eb */  case 0x21: /* AND Gv,Ev */
19368        case 0x28: /* SUB Gb,Eb */  case 0x29: /* SUB Gv,Ev */
19369        case 0x30: /* XOR Gb,Eb */  case 0x31: /* XOR Gv,Ev */
19370           if (!epartIsReg(tmp_modrm)
19371               && haveF2orF3(pfx) && !haveF2andF3(pfx) && haveLOCK(pfx)) {
19372              /* dst is mem, and we have F2 or F3 but not both */
19373              validF2orF3 = True;
19374           }
19375           break;
19376        default:
19377           break;
19378     }
19379   }
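   /* Worked example (illustrative): for the bytes F0 F2 01 07
      ("xacquire lock add %eax,(%rdi)"), opc is 0x01, the modrm byte
      0x07 selects a memory E part, and F2 plus LOCK are present
      without F3 -- so validF2orF3 flips to True and the ADD Gv,Ev
      case below accepts the prefix combination. */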
19380
19381   /* Now, in the switch below, for the opc values examined by the
19382      switch above, use validF2orF3 rather than looking at pfx
19383      directly. */
19384   switch (opc) {
19385
19386   case 0x00: /* ADD Gb,Eb */
19387      if (!validF2orF3) goto decode_failure;
19388      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
19389      return delta;
19390   case 0x01: /* ADD Gv,Ev */
19391      if (!validF2orF3) goto decode_failure;
19392      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
19393      return delta;
19394
19395   case 0x02: /* ADD Eb,Gb */
19396      if (haveF2orF3(pfx)) goto decode_failure;
19397      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, 1, delta, "add" );
19398      return delta;
19399   case 0x03: /* ADD Ev,Gv */
19400      if (haveF2orF3(pfx)) goto decode_failure;
19401      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Add8, True, sz, delta, "add" );
19402      return delta;
19403
19404   case 0x04: /* ADD Ib, AL */
19405      if (haveF2orF3(pfx)) goto decode_failure;
19406      delta = dis_op_imm_A( 1, False, Iop_Add8, True, delta, "add" );
19407      return delta;
19408   case 0x05: /* ADD Iv, eAX */
19409      if (haveF2orF3(pfx)) goto decode_failure;
19410      delta = dis_op_imm_A(sz, False, Iop_Add8, True, delta, "add" );
19411      return delta;
19412
19413   case 0x08: /* OR Gb,Eb */
19414      if (!validF2orF3) goto decode_failure;
19415      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
19416      return delta;
19417   case 0x09: /* OR Gv,Ev */
19418      if (!validF2orF3) goto decode_failure;
19419      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
19420      return delta;
19421
19422   case 0x0A: /* OR Eb,Gb */
19423      if (haveF2orF3(pfx)) goto decode_failure;
19424      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, 1, delta, "or" );
19425      return delta;
19426   case 0x0B: /* OR Ev,Gv */
19427      if (haveF2orF3(pfx)) goto decode_failure;
19428      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Or8, True, sz, delta, "or" );
19429      return delta;
19430
19431   case 0x0C: /* OR Ib, AL */
19432      if (haveF2orF3(pfx)) goto decode_failure;
19433      delta = dis_op_imm_A( 1, False, Iop_Or8, True, delta, "or" );
19434      return delta;
19435   case 0x0D: /* OR Iv, eAX */
19436      if (haveF2orF3(pfx)) goto decode_failure;
19437      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
19438      return delta;
19439
19440   case 0x10: /* ADC Gb,Eb */
19441      if (!validF2orF3) goto decode_failure;
19442      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
19443      return delta;
19444   case 0x11: /* ADC Gv,Ev */
19445      if (!validF2orF3) goto decode_failure;
19446      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
19447      return delta;
19448
19449   case 0x12: /* ADC Eb,Gb */
19450      if (haveF2orF3(pfx)) goto decode_failure;
19451      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, 1, delta, "adc" );
19452      return delta;
19453   case 0x13: /* ADC Ev,Gv */
19454      if (haveF2orF3(pfx)) goto decode_failure;
19455      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Add8, True, sz, delta, "adc" );
19456      return delta;
19457
19458   case 0x14: /* ADC Ib, AL */
19459      if (haveF2orF3(pfx)) goto decode_failure;
19460      delta = dis_op_imm_A( 1, True, Iop_Add8, True, delta, "adc" );
19461      return delta;
19462   case 0x15: /* ADC Iv, eAX */
19463      if (haveF2orF3(pfx)) goto decode_failure;
19464      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
19465      return delta;
19466
19467   case 0x18: /* SBB Gb,Eb */
19468      if (!validF2orF3) goto decode_failure;
19469      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
19470      return delta;
19471   case 0x19: /* SBB Gv,Ev */
19472      if (!validF2orF3) goto decode_failure;
19473      delta = dis_op2_G_E ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
19474      return delta;
19475
19476   case 0x1A: /* SBB Eb,Gb */
19477      if (haveF2orF3(pfx)) goto decode_failure;
19478      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, 1, delta, "sbb" );
19479      return delta;
19480   case 0x1B: /* SBB Ev,Gv */
19481      if (haveF2orF3(pfx)) goto decode_failure;
19482      delta = dis_op2_E_G ( vbi, pfx, True, Iop_Sub8, True, sz, delta, "sbb" );
19483      return delta;
19484
19485   case 0x1C: /* SBB Ib, AL */
19486      if (haveF2orF3(pfx)) goto decode_failure;
19487      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
19488      return delta;
19489   case 0x1D: /* SBB Iv, eAX */
19490      if (haveF2orF3(pfx)) goto decode_failure;
19491      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
19492      return delta;
19493
19494   case 0x20: /* AND Gb,Eb */
19495      if (!validF2orF3) goto decode_failure;
19496      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
19497      return delta;
19498   case 0x21: /* AND Gv,Ev */
19499      if (!validF2orF3) goto decode_failure;
19500      delta = dis_op2_G_E ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
19501      return delta;
19502
19503   case 0x22: /* AND Eb,Gb */
19504      if (haveF2orF3(pfx)) goto decode_failure;
19505      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, 1, delta, "and" );
19506      return delta;
19507   case 0x23: /* AND Ev,Gv */
19508      if (haveF2orF3(pfx)) goto decode_failure;
19509      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, True, sz, delta, "and" );
19510      return delta;
19511
19512   case 0x24: /* AND Ib, AL */
19513      if (haveF2orF3(pfx)) goto decode_failure;
19514      delta = dis_op_imm_A( 1, False, Iop_And8, True, delta, "and" );
19515      return delta;
19516   case 0x25: /* AND Iv, eAX */
19517      if (haveF2orF3(pfx)) goto decode_failure;
19518      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
19519      return delta;
19520
19521   case 0x28: /* SUB Gb,Eb */
19522      if (!validF2orF3) goto decode_failure;
19523      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
19524      return delta;
19525   case 0x29: /* SUB Gv,Ev */
19526      if (!validF2orF3) goto decode_failure;
19527      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
19528      return delta;
19529
19530   case 0x2A: /* SUB Eb,Gb */
19531      if (haveF2orF3(pfx)) goto decode_failure;
19532      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, 1, delta, "sub" );
19533      return delta;
19534   case 0x2B: /* SUB Ev,Gv */
19535      if (haveF2orF3(pfx)) goto decode_failure;
19536      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, True, sz, delta, "sub" );
19537      return delta;
19538
19539   case 0x2C: /* SUB Ib, AL */
19540      if (haveF2orF3(pfx)) goto decode_failure;
19541      delta = dis_op_imm_A(1, False, Iop_Sub8, True, delta, "sub" );
19542      return delta;
19543   case 0x2D: /* SUB Iv, eAX */
19544      if (haveF2orF3(pfx)) goto decode_failure;
19545      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
19546      return delta;
19547
19548   case 0x30: /* XOR Gb,Eb */
19549      if (!validF2orF3) goto decode_failure;
19550      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
19551      return delta;
19552   case 0x31: /* XOR Gv,Ev */
19553      if (!validF2orF3) goto decode_failure;
19554      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
19555      return delta;
19556
19557   case 0x32: /* XOR Eb,Gb */
19558      if (haveF2orF3(pfx)) goto decode_failure;
19559      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, 1, delta, "xor" );
19560      return delta;
19561   case 0x33: /* XOR Ev,Gv */
19562      if (haveF2orF3(pfx)) goto decode_failure;
19563      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Xor8, True, sz, delta, "xor" );
19564      return delta;
19565
19566   case 0x34: /* XOR Ib, AL */
19567      if (haveF2orF3(pfx)) goto decode_failure;
19568      delta = dis_op_imm_A( 1, False, Iop_Xor8, True, delta, "xor" );
19569      return delta;
19570   case 0x35: /* XOR Iv, eAX */
19571      if (haveF2orF3(pfx)) goto decode_failure;
19572      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
19573      return delta;
19574
19575   case 0x38: /* CMP Gb,Eb */
19576      if (haveF2orF3(pfx)) goto decode_failure;
19577      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
19578      return delta;
19579   case 0x39: /* CMP Gv,Ev */
19580      if (haveF2orF3(pfx)) goto decode_failure;
19581      delta = dis_op2_G_E ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
19582      return delta;
19583
19584   case 0x3A: /* CMP Eb,Gb */
19585      if (haveF2orF3(pfx)) goto decode_failure;
19586      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, 1, delta, "cmp" );
19587      return delta;
19588   case 0x3B: /* CMP Ev,Gv */
19589      if (haveF2orF3(pfx)) goto decode_failure;
19590      delta = dis_op2_E_G ( vbi, pfx, False, Iop_Sub8, False, sz, delta, "cmp" );
19591      return delta;
19592
19593   case 0x3C: /* CMP Ib, AL */
19594      if (haveF2orF3(pfx)) goto decode_failure;
19595      delta = dis_op_imm_A( 1, False, Iop_Sub8, False, delta, "cmp" );
19596      return delta;
19597   case 0x3D: /* CMP Iv, eAX */
19598      if (haveF2orF3(pfx)) goto decode_failure;
19599      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
19600      return delta;
19601
19602   case 0x50: /* PUSH eAX */
19603   case 0x51: /* PUSH eCX */
19604   case 0x52: /* PUSH eDX */
19605   case 0x53: /* PUSH eBX */
19606   case 0x55: /* PUSH eBP */
19607   case 0x56: /* PUSH eSI */
19608   case 0x57: /* PUSH eDI */
19609   case 0x54: /* PUSH eSP */
19610      /* This is the Right Way, in that the value to be pushed is
19611         established before %rsp is changed, so that pushq %rsp
19612         correctly pushes the old value. */
19613      if (haveF2orF3(pfx)) goto decode_failure;
19614      vassert(sz == 2 || sz == 4 || sz == 8);
19615      if (sz == 4)
19616         sz = 8; /* there is no encoding for 32-bit push in 64-bit mode */
19617      ty = sz==2 ? Ity_I16 : Ity_I64;
19618      t1 = newTemp(ty);
19619      t2 = newTemp(Ity_I64);
19620      assign(t1, getIRegRexB(sz, pfx, opc-0x50));
19621      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(sz)));
19622      putIReg64(R_RSP, mkexpr(t2) );
19623      storeLE(mkexpr(t2),mkexpr(t1));
19624      DIP("push%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x50));
19625      return delta;
19626
19627   case 0x58: /* POP eAX */
19628   case 0x59: /* POP eCX */
19629   case 0x5A: /* POP eDX */
19630   case 0x5B: /* POP eBX */
19631   case 0x5D: /* POP eBP */
19632   case 0x5E: /* POP eSI */
19633   case 0x5F: /* POP eDI */
19634   case 0x5C: /* POP eSP */
19635      if (haveF2orF3(pfx)) goto decode_failure;
19636      vassert(sz == 2 || sz == 4 || sz == 8);
19637      if (sz == 4)
19638         sz = 8; /* there is no encoding for 32-bit pop in 64-bit mode */
19639      t1 = newTemp(szToITy(sz));
19640      t2 = newTemp(Ity_I64);
19641      assign(t2, getIReg64(R_RSP));
19642      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
19643      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
19644      putIRegRexB(sz, pfx, opc-0x58, mkexpr(t1));
19645      DIP("pop%c %s\n", nameISize(sz), nameIRegRexB(sz,pfx,opc-0x58));
19646      return delta;
19647
   case 0x63: /* MOVSXD Ev,Gv (movslq) */
19649      if (haveF2orF3(pfx)) goto decode_failure;
19650      if (haveREX(pfx) && 1==getRexW(pfx)) {
19651         vassert(sz == 8);
19652         /* movsx r/m32 to r64 */
19653         modrm = getUChar(delta);
19654         if (epartIsReg(modrm)) {
19655            delta++;
19656            putIRegG(8, pfx, modrm,
19657                             unop(Iop_32Sto64,
19658                                  getIRegE(4, pfx, modrm)));
19659            DIP("movslq %s,%s\n",
19660                nameIRegE(4, pfx, modrm),
19661                nameIRegG(8, pfx, modrm));
19662            return delta;
19663         } else {
19664            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
19665            delta += alen;
19666            putIRegG(8, pfx, modrm,
19667                             unop(Iop_32Sto64,
19668                                  loadLE(Ity_I32, mkexpr(addr))));
19669            DIP("movslq %s,%s\n", dis_buf,
19670                nameIRegG(8, pfx, modrm));
19671            return delta;
19672         }
19673      } else {
19674         goto decode_failure;
19675      }
19676
19677   case 0x68: /* PUSH Iv */
19678      if (haveF2orF3(pfx)) goto decode_failure;
19679      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
19680      if (sz == 4) sz = 8;
19681      d64 = getSDisp(imin(4,sz),delta);
19682      delta += imin(4,sz);
19683      goto do_push_I;
19684
19685   case 0x69: /* IMUL Iv, Ev, Gv */
19686      if (haveF2orF3(pfx)) goto decode_failure;
19687      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, sz );
19688      return delta;
19689
19690   case 0x6A: /* PUSH Ib, sign-extended to sz */
19691      if (haveF2orF3(pfx)) goto decode_failure;
19692      /* Note, sz==4 is not possible in 64-bit mode.  Hence ... */
19693      if (sz == 4) sz = 8;
19694      d64 = getSDisp8(delta); delta += 1;
19695      goto do_push_I;
19696   do_push_I:
19697      ty = szToITy(sz);
19698      t1 = newTemp(Ity_I64);
19699      t2 = newTemp(ty);
19700      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
19701      putIReg64(R_RSP, mkexpr(t1) );
      /* stop mkU16 asserting if d64 is a negative 16-bit number
         (bug #132813) */
19704      if (ty == Ity_I16)
19705         d64 &= 0xFFFF;
19706      storeLE( mkexpr(t1), mkU(ty,d64) );
19707      DIP("push%c $%lld\n", nameISize(sz), (Long)d64);
19708      return delta;
19709
19710   case 0x6B: /* IMUL Ib, Ev, Gv */
19711      delta = dis_imul_I_E_G ( vbi, pfx, sz, delta, 1 );
19712      return delta;
19713
   case 0x70:   /* JOb (jump overflow) */
   case 0x71:   /* JNOb (jump no overflow) */
19716   case 0x72:   /* JBb/JNAEb (jump below) */
19717   case 0x73:   /* JNBb/JAEb (jump not below) */
19718   case 0x74:   /* JZb/JEb (jump zero) */
19719   case 0x75:   /* JNZb/JNEb (jump not zero) */
19720   case 0x76:   /* JBEb/JNAb (jump below or equal) */
19721   case 0x77:   /* JNBEb/JAb (jump not below or equal) */
19722   case 0x78:   /* JSb (jump negative) */
   case 0x79:   /* JNSb (jump not negative) */
   case 0x7A:   /* JPb/JPEb (jump parity even) */
19725   case 0x7B:   /* JNP/JPO (jump parity odd) */
19726   case 0x7C:   /* JLb/JNGEb (jump less) */
19727   case 0x7D:   /* JGEb/JNLb (jump greater or equal) */
19728   case 0x7E:   /* JLEb/JNGb (jump less or equal) */
19729   case 0x7F: { /* JGb/JNLEb (jump greater) */
19730      Long   jmpDelta;
19731      const HChar* comment  = "";
19732      if (haveF3(pfx)) goto decode_failure;
19733      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
19734      jmpDelta = getSDisp8(delta);
19735      vassert(-128 <= jmpDelta && jmpDelta < 128);
19736      d64 = (guest_RIP_bbstart+delta+1) + jmpDelta;
19737      delta++;
19738      if (resteerCisOk
19739          && vex_control.guest_chase_cond
19740          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
19741          && jmpDelta < 0
19742          && resteerOkFn( callback_opaque, d64) ) {
19743         /* Speculation: assume this backward branch is taken.  So we
19744            need to emit a side-exit to the insn following this one,
19745            on the negation of the condition, and continue at the
19746            branch target address (d64).  If we wind up back at the
19747            first instruction of the trace, just stop; it's better to
19748            let the IR loop unroller handle that case. */
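         /* (Illustrative: AMD64Condcode pairs even/odd values as
            complements, so 1 ^ (opc - 0x70) negates the condition;
            e.g. opc 0x74 (jz) yields condcode 4, and 1^4 = 5 is the
            not-zero condition guarding the fall-through exit.) */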
19749         stmt( IRStmt_Exit(
19750                  mk_amd64g_calculate_condition(
19751                     (AMD64Condcode)(1 ^ (opc - 0x70))),
19752                  Ijk_Boring,
19753                  IRConst_U64(guest_RIP_bbstart+delta),
19754                  OFFB_RIP ) );
19755         dres->whatNext   = Dis_ResteerC;
19756         dres->continueAt = d64;
19757         comment = "(assumed taken)";
19758      }
19759      else
19760      if (resteerCisOk
19761          && vex_control.guest_chase_cond
19762          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
19763          && jmpDelta >= 0
19764          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
19765         /* Speculation: assume this forward branch is not taken.  So
19766            we need to emit a side-exit to d64 (the dest) and continue
19767            disassembling at the insn immediately following this
19768            one. */
19769         stmt( IRStmt_Exit(
19770                  mk_amd64g_calculate_condition((AMD64Condcode)(opc - 0x70)),
19771                  Ijk_Boring,
19772                  IRConst_U64(d64),
19773                  OFFB_RIP ) );
19774         dres->whatNext   = Dis_ResteerC;
19775         dres->continueAt = guest_RIP_bbstart+delta;
19776         comment = "(assumed not taken)";
19777      }
19778      else {
19779         /* Conservative default translation - end the block at this
19780            point. */
19781         jcc_01( dres, (AMD64Condcode)(opc - 0x70),
19782                 guest_RIP_bbstart+delta, d64 );
19783         vassert(dres->whatNext == Dis_StopHere);
19784      }
19785      DIP("j%s-8 0x%llx %s\n", name_AMD64Condcode(opc - 0x70), d64, comment);
19786      return delta;
19787   }
19788
19789   case 0x80: /* Grp1 Ib,Eb */
19790      modrm = getUChar(delta);
19791      /* Disallow F2/XACQ and F3/XREL for the non-mem case.  Allow
19792         just one for the mem case and also require LOCK in this case.
19793         Note that this erroneously allows XACQ/XREL on CMP since we
19794         don't check the subopcode here.  No big deal. */
19795      if (epartIsReg(modrm) && haveF2orF3(pfx))
19796         goto decode_failure;
19797      if (!epartIsReg(modrm) && haveF2andF3(pfx))
19798         goto decode_failure;
19799      if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
19800         goto decode_failure;
19801      am_sz = lengthAMode(pfx,delta);
19802      sz    = 1;
19803      d_sz  = 1;
19804      d64   = getSDisp8(delta + am_sz);
19805      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
19806      return delta;
19807
19808   case 0x81: /* Grp1 Iv,Ev */
19809      modrm = getUChar(delta);
19810      /* Same comment as for case 0x80 just above. */
19811      if (epartIsReg(modrm) && haveF2orF3(pfx))
19812         goto decode_failure;
19813      if (!epartIsReg(modrm) && haveF2andF3(pfx))
19814         goto decode_failure;
19815      if (!epartIsReg(modrm) && haveF2orF3(pfx) && !haveLOCK(pfx))
19816         goto decode_failure;
19817      am_sz = lengthAMode(pfx,delta);
19818      d_sz  = imin(sz,4);
19819      d64   = getSDisp(d_sz, delta + am_sz);
19820      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
19821      return delta;
19822
19823   case 0x83: /* Grp1 Ib,Ev */
19824      if (haveF2orF3(pfx)) goto decode_failure;
19825      modrm = getUChar(delta);
19826      am_sz = lengthAMode(pfx,delta);
19827      d_sz  = 1;
19828      d64   = getSDisp8(delta + am_sz);
19829      delta = dis_Grp1 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz, d64 );
19830      return delta;
19831
19832   case 0x84: /* TEST Eb,Gb */
19833      if (haveF2orF3(pfx)) goto decode_failure;
19834      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, 1, delta, "test" );
19835      return delta;
19836
19837   case 0x85: /* TEST Ev,Gv */
19838      if (haveF2orF3(pfx)) goto decode_failure;
19839      delta = dis_op2_E_G ( vbi, pfx, False, Iop_And8, False, sz, delta, "test" );
19840      return delta;
19841
19842   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
19843      prefix.  Therefore, generate CAS regardless of the presence or
19844      otherwise of a LOCK prefix. */
19845   case 0x86: /* XCHG Gb,Eb */
19846      sz = 1;
19847      /* Fall through ... */
19848   case 0x87: /* XCHG Gv,Ev */
19849      modrm = getUChar(delta);
19850      /* Check whether F2 or F3 are allowable.  For the mem case, one
         or the other but not both are.  We don't care about the
19852         presence of LOCK in this case -- XCHG is unusual in this
19853         respect. */
19854      if (haveF2orF3(pfx)) {
19855         if (epartIsReg(modrm)) {
19856            goto decode_failure;
19857         } else {
19858            if (haveF2andF3(pfx))
19859               goto decode_failure;
19860         }
19861      }
19862      ty = szToITy(sz);
19863      t1 = newTemp(ty); t2 = newTemp(ty);
19864      if (epartIsReg(modrm)) {
19865         assign(t1, getIRegE(sz, pfx, modrm));
19866         assign(t2, getIRegG(sz, pfx, modrm));
19867         putIRegG(sz, pfx, modrm, mkexpr(t1));
19868         putIRegE(sz, pfx, modrm, mkexpr(t2));
19869         delta++;
19870         DIP("xchg%c %s, %s\n",
19871             nameISize(sz), nameIRegG(sz, pfx, modrm),
19872                            nameIRegE(sz, pfx, modrm));
19873      } else {
19874         *expect_CAS = True;
19875         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
19876         assign( t1, loadLE(ty, mkexpr(addr)) );
19877         assign( t2, getIRegG(sz, pfx, modrm) );
19878         casLE( mkexpr(addr),
19879                mkexpr(t1), mkexpr(t2), guest_RIP_curr_instr );
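         /* casLE also arranges for a restart at guest_RIP_curr_instr
            should the CAS fail, so the memory exchange behaves
            atomically, as the architecture requires. */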
19880         putIRegG( sz, pfx, modrm, mkexpr(t1) );
19881         delta += alen;
19882         DIP("xchg%c %s, %s\n", nameISize(sz),
19883                                nameIRegG(sz, pfx, modrm), dis_buf);
19884      }
19885      return delta;
19886
19887   case 0x88: { /* MOV Gb,Eb */
19888      /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
19889      Bool ok = True;
19890      delta = dis_mov_G_E(vbi, pfx, 1, delta, &ok);
19891      if (!ok) goto decode_failure;
19892      return delta;
19893   }
19894
19895   case 0x89: { /* MOV Gv,Ev */
19896      /* We let dis_mov_G_E decide whether F3(XRELEASE) is allowable. */
19897      Bool ok = True;
19898      delta = dis_mov_G_E(vbi, pfx, sz, delta, &ok);
19899      if (!ok) goto decode_failure;
19900      return delta;
19901   }
19902
19903   case 0x8A: /* MOV Eb,Gb */
19904      if (haveF2orF3(pfx)) goto decode_failure;
19905      delta = dis_mov_E_G(vbi, pfx, 1, delta);
19906      return delta;
19907
19908   case 0x8B: /* MOV Ev,Gv */
19909      if (haveF2orF3(pfx)) goto decode_failure;
19910      delta = dis_mov_E_G(vbi, pfx, sz, delta);
19911      return delta;
19912
19913   case 0x8D: /* LEA M,Gv */
19914      if (haveF2orF3(pfx)) goto decode_failure;
19915      if (sz != 4 && sz != 8)
19916         goto decode_failure;
19917      modrm = getUChar(delta);
19918      if (epartIsReg(modrm))
19919         goto decode_failure;
19920      /* NOTE!  this is the one place where a segment override prefix
19921         has no effect on the address calculation.  Therefore we clear
19922         any segment override bits in pfx. */
19923      addr = disAMode ( &alen, vbi, clearSegBits(pfx), delta, dis_buf, 0 );
19924      delta += alen;
19925      /* This is a hack.  But it isn't clear that really doing the
19926         calculation at 32 bits is really worth it.  Hence for leal,
19927         do the full 64-bit calculation and then truncate it. */
19928      putIRegG( sz, pfx, modrm,
19929                         sz == 4
19930                            ? unop(Iop_64to32, mkexpr(addr))
19931                            : mkexpr(addr)
19932              );
19933      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
19934                            nameIRegG(sz,pfx,modrm));
19935      return delta;
19936
19937   case 0x8F: { /* POPQ m64 / POPW m16 */
19938      Int   len;
19939      UChar rm;
19940      /* There is no encoding for 32-bit pop in 64-bit mode.
19941         So sz==4 actually means sz==8. */
19942      if (haveF2orF3(pfx)) goto decode_failure;
19943      vassert(sz == 2 || sz == 4
19944              || /* tolerate redundant REX.W, see #210481 */ sz == 8);
19945      if (sz == 4) sz = 8;
19946      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
19947
19948      rm = getUChar(delta);
19949
19950      /* make sure this instruction is correct POP */
19951      if (epartIsReg(rm) || gregLO3ofRM(rm) != 0)
19952         goto decode_failure;
19953      /* and has correct size */
19954      vassert(sz == 8);
19955
19956      t1 = newTemp(Ity_I64);
19957      t3 = newTemp(Ity_I64);
19958      assign( t1, getIReg64(R_RSP) );
19959      assign( t3, loadLE(Ity_I64, mkexpr(t1)) );
19960
19961      /* Increase RSP; must be done before the STORE.  Intel manual
19962         says: If the RSP register is used as a base register for
19963         addressing a destination operand in memory, the POP
19964         instruction computes the effective address of the operand
19965         after it increments the RSP register.  */
19966      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(sz)) );
19967
19968      addr = disAMode ( &len, vbi, pfx, delta, dis_buf, 0 );
19969      storeLE( mkexpr(addr), mkexpr(t3) );
19970
      DIP("popq %s\n", dis_buf);
19972
19973      delta += len;
19974      return delta;
19975   }
19976
19977   case 0x90: /* XCHG eAX,eAX */
19978      /* detect and handle F3 90 (rep nop) specially */
19979      if (!have66(pfx) && !haveF2(pfx) && haveF3(pfx)) {
19980         DIP("rep nop (P4 pause)\n");
19981         /* "observe" the hint.  The Vex client needs to be careful not
19982            to cause very long delays as a result, though. */
19983         jmp_lit(dres, Ijk_Yield, guest_RIP_bbstart+delta);
19984         vassert(dres->whatNext == Dis_StopHere);
19985         return delta;
19986      }
19987      /* detect and handle NOPs specially */
19988      if (/* F2/F3 probably change meaning completely */
19989          !haveF2orF3(pfx)
19990          /* If REX.B is 1, we're not exchanging rAX with itself */
19991          && getRexB(pfx)==0 ) {
19992         DIP("nop\n");
19993         return delta;
19994      }
19995      /* else fall through to normal case. */
19996   case 0x91: /* XCHG rAX,rCX */
19997   case 0x92: /* XCHG rAX,rDX */
19998   case 0x93: /* XCHG rAX,rBX */
19999   case 0x94: /* XCHG rAX,rSP */
20000   case 0x95: /* XCHG rAX,rBP */
20001   case 0x96: /* XCHG rAX,rSI */
20002   case 0x97: /* XCHG rAX,rDI */
20003      /* guard against mutancy */
20004      if (haveF2orF3(pfx)) goto decode_failure;
20005      codegen_xchg_rAX_Reg ( pfx, sz, opc - 0x90 );
20006      return delta;
20007
20008   case 0x98: /* CBW */
20009      if (haveF2orF3(pfx)) goto decode_failure;
20010      if (sz == 8) {
20011         putIRegRAX( 8, unop(Iop_32Sto64, getIRegRAX(4)) );
         DIP(/*"cdqe\n"*/"cltq\n");
20013         return delta;
20014      }
20015      if (sz == 4) {
20016         putIRegRAX( 4, unop(Iop_16Sto32, getIRegRAX(2)) );
20017         DIP("cwtl\n");
20018         return delta;
20019      }
20020      if (sz == 2) {
20021         putIRegRAX( 2, unop(Iop_8Sto16, getIRegRAX(1)) );
20022         DIP("cbw\n");
20023         return delta;
20024      }
20025      goto decode_failure;
20026
20027   case 0x99: /* CWD/CDQ/CQO */
20028      if (haveF2orF3(pfx)) goto decode_failure;
20029      vassert(sz == 2 || sz == 4 || sz == 8);
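      /* rDX becomes the sign-extension of rAX: arithmetic-shifting
         rAX right by (width-1) bits replicates the sign bit across
         the whole register, giving all-zeroes or all-ones. */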
20030      ty = szToITy(sz);
20031      putIRegRDX( sz,
20032                  binop(mkSizedOp(ty,Iop_Sar8),
20033                        getIRegRAX(sz),
20034                        mkU8(sz == 2 ? 15 : (sz == 4 ? 31 : 63))) );
20035      DIP(sz == 2 ? "cwd\n"
20036                  : (sz == 4 ? /*"cdq\n"*/ "cltd\n"
20037                             : "cqo\n"));
20038      return delta;
20039
20040   case 0x9B: /* FWAIT (X87 insn) */
20041      /* ignore? */
20042      DIP("fwait\n");
20043      return delta;
20044
20045   case 0x9C: /* PUSHF */ {
20046      /* Note.  There is no encoding for a 32-bit pushf in 64-bit
20047         mode.  So sz==4 actually means sz==8. */
20048      /* 24 July 06: has also been seen with a redundant REX prefix,
20049         so must also allow sz==8. */
20050      if (haveF2orF3(pfx)) goto decode_failure;
20051      vassert(sz == 2 || sz == 4 || sz == 8);
20052      if (sz == 4) sz = 8;
20053      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
20054
20055      t1 = newTemp(Ity_I64);
20056      assign( t1, binop(Iop_Sub64,getIReg64(R_RSP),mkU64(sz)) );
20057      putIReg64(R_RSP, mkexpr(t1) );
20058
20059      t2 = newTemp(Ity_I64);
20060      assign( t2, mk_amd64g_calculate_rflags_all() );
20061
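      /* The thunk-derived value in t2 carries only the arithmetic
         flags; DF (rflags[10]), AC (rflags[18]) and ID (rflags[21])
         live in dedicated guest-state fields and are merged in
         below. */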
20062      /* Patch in the D flag.  This can simply be a copy of bit 10 of
20063         baseBlock[OFFB_DFLAG]. */
20064      t3 = newTemp(Ity_I64);
20065      assign( t3, binop(Iop_Or64,
20066                        mkexpr(t2),
20067                        binop(Iop_And64,
20068                              IRExpr_Get(OFFB_DFLAG,Ity_I64),
20069                              mkU64(1<<10)))
20070            );
20071
20072      /* And patch in the ID flag. */
20073      t4 = newTemp(Ity_I64);
20074      assign( t4, binop(Iop_Or64,
20075                        mkexpr(t3),
20076                        binop(Iop_And64,
20077                              binop(Iop_Shl64, IRExpr_Get(OFFB_IDFLAG,Ity_I64),
20078                                               mkU8(21)),
20079                              mkU64(1<<21)))
20080            );
20081
20082      /* And patch in the AC flag too. */
20083      t5 = newTemp(Ity_I64);
20084      assign( t5, binop(Iop_Or64,
20085                        mkexpr(t4),
20086                        binop(Iop_And64,
20087                              binop(Iop_Shl64, IRExpr_Get(OFFB_ACFLAG,Ity_I64),
20088                                               mkU8(18)),
20089                              mkU64(1<<18)))
20090            );
20091
20092      /* if sz==2, the stored value needs to be narrowed. */
20093      if (sz == 2)
20094        storeLE( mkexpr(t1), unop(Iop_32to16,
20095                             unop(Iop_64to32,mkexpr(t5))) );
20096      else
20097        storeLE( mkexpr(t1), mkexpr(t5) );
20098
20099      DIP("pushf%c\n", nameISize(sz));
20100      return delta;
20101   }
20102
20103   case 0x9D: /* POPF */
20104      /* Note.  There is no encoding for a 32-bit popf in 64-bit mode.
20105         So sz==4 actually means sz==8. */
20106      if (haveF2orF3(pfx)) goto decode_failure;
20107      vassert(sz == 2 || sz == 4);
20108      if (sz == 4) sz = 8;
20109      if (sz != 8) goto decode_failure; // until we know a sz==2 test case exists
20110      t1 = newTemp(Ity_I64); t2 = newTemp(Ity_I64);
20111      assign(t2, getIReg64(R_RSP));
20112      assign(t1, widenUto64(loadLE(szToITy(sz),mkexpr(t2))));
20113      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t2), mkU64(sz)));
20114      /* t1 is the flag word.  Mask out everything except OSZACP and
20115         set the flags thunk to AMD64G_CC_OP_COPY. */
20116      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
20117      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
20118      stmt( IRStmt_Put( OFFB_CC_DEP1,
20119                        binop(Iop_And64,
20120                              mkexpr(t1),
20121                              mkU64( AMD64G_CC_MASK_C | AMD64G_CC_MASK_P
20122                                     | AMD64G_CC_MASK_A | AMD64G_CC_MASK_Z
20123                                     | AMD64G_CC_MASK_S| AMD64G_CC_MASK_O )
20124                             )
20125                       )
20126          );
20127
20128      /* Also need to set the D flag, which is held in bit 10 of t1.
20129         If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
20130      stmt( IRStmt_Put(
20131               OFFB_DFLAG,
20132               IRExpr_ITE(
20133                  unop(Iop_64to1,
20134                       binop(Iop_And64,
20135                             binop(Iop_Shr64, mkexpr(t1), mkU8(10)),
20136                             mkU64(1))),
20137                  mkU64(0xFFFFFFFFFFFFFFFFULL),
20138                  mkU64(1)))
20139          );
20140
20141      /* And set the ID flag */
20142      stmt( IRStmt_Put(
20143               OFFB_IDFLAG,
20144               IRExpr_ITE(
20145                  unop(Iop_64to1,
20146                       binop(Iop_And64,
20147                             binop(Iop_Shr64, mkexpr(t1), mkU8(21)),
20148                             mkU64(1))),
20149                  mkU64(1),
20150                  mkU64(0)))
20151          );
20152
20153      /* And set the AC flag too */
20154      stmt( IRStmt_Put(
20155               OFFB_ACFLAG,
20156               IRExpr_ITE(
20157                  unop(Iop_64to1,
20158                       binop(Iop_And64,
20159                             binop(Iop_Shr64, mkexpr(t1), mkU8(18)),
20160                             mkU64(1))),
20161                  mkU64(1),
20162                  mkU64(0)))
20163          );
20164
20165      DIP("popf%c\n", nameISize(sz));
20166      return delta;
20167
20168   case 0x9E: /* SAHF */
20169      codegen_SAHF();
20170      DIP("sahf\n");
20171      return delta;
20172
20173   case 0x9F: /* LAHF */
20174      codegen_LAHF();
20175      DIP("lahf\n");
20176      return delta;
20177
20178   case 0xA0: /* MOV Ob,AL */
20179      if (have66orF2orF3(pfx)) goto decode_failure;
20180      sz = 1;
20181      /* Fall through ... */
20182   case 0xA1: /* MOV Ov,eAX */
20183      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
20184         goto decode_failure;
20185      d64 = getDisp64(delta);
20186      delta += 8;
20187      ty = szToITy(sz);
20188      addr = newTemp(Ity_I64);
20189      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
20190      putIRegRAX(sz, loadLE( ty, mkexpr(addr) ));
20191      DIP("mov%c %s0x%llx, %s\n", nameISize(sz),
20192                                  segRegTxt(pfx), d64,
20193                                  nameIRegRAX(sz));
20194      return delta;
20195
20196   case 0xA2: /* MOV AL,Ob */
20197      if (have66orF2orF3(pfx)) goto decode_failure;
20198      sz = 1;
20199      /* Fall through ... */
20200   case 0xA3: /* MOV eAX,Ov */
20201      if (sz != 8 && sz != 4 && sz != 2 && sz != 1)
20202         goto decode_failure;
20203      d64 = getDisp64(delta);
20204      delta += 8;
20205      ty = szToITy(sz);
20206      addr = newTemp(Ity_I64);
20207      assign( addr, handleAddrOverrides(vbi, pfx, mkU64(d64)) );
20208      storeLE( mkexpr(addr), getIRegRAX(sz) );
20209      DIP("mov%c %s, %s0x%llx\n", nameISize(sz), nameIRegRAX(sz),
20210                                  segRegTxt(pfx), d64);
20211      return delta;
20212
20213   case 0xA4:
20214   case 0xA5:
      /* F3 A4/A5: rep movsb/rep movs{w,l,q} */
20216      if (haveF3(pfx) && !haveF2(pfx)) {
20217         if (opc == 0xA4)
20218            sz = 1;
20219         dis_REP_op ( dres, AMD64CondAlways, dis_MOVS, sz,
20220                      guest_RIP_curr_instr,
20221                      guest_RIP_bbstart+delta, "rep movs", pfx );
         dres->whatNext = Dis_StopHere;
         return delta;
20224      }
      /* A4/A5: movsb/movs{w,l,q} */
20226      if (!haveF3(pfx) && !haveF2(pfx)) {
20227         if (opc == 0xA4)
20228            sz = 1;
20229         dis_string_op( dis_MOVS, sz, "movs", pfx );
20230         return delta;
20231      }
20232      goto decode_failure;
20233
20234   case 0xA6:
20235   case 0xA7:
      /* F3 A6/A7: repe cmpsb/repe cmps{w,l,q} */
20237      if (haveF3(pfx) && !haveF2(pfx)) {
20238         if (opc == 0xA6)
20239            sz = 1;
20240         dis_REP_op ( dres, AMD64CondZ, dis_CMPS, sz,
20241                      guest_RIP_curr_instr,
20242                      guest_RIP_bbstart+delta, "repe cmps", pfx );
20243         dres->whatNext = Dis_StopHere;
20244         return delta;
20245      }
20246      goto decode_failure;
20247
20248   case 0xAA:
20249   case 0xAB:
20250      /* F3 AA/AB: rep stosb/rep stos{w,l,q} */
20251      if (haveF3(pfx) && !haveF2(pfx)) {
20252         if (opc == 0xAA)
20253            sz = 1;
20254         dis_REP_op ( dres, AMD64CondAlways, dis_STOS, sz,
20255                      guest_RIP_curr_instr,
20256                      guest_RIP_bbstart+delta, "rep stos", pfx );
20257         vassert(dres->whatNext == Dis_StopHere);
20258         return delta;
20259      }
20260      /* AA/AB: stosb/stos{w,l,q} */
20261      if (!haveF3(pfx) && !haveF2(pfx)) {
20262         if (opc == 0xAA)
20263            sz = 1;
20264         dis_string_op( dis_STOS, sz, "stos", pfx );
20265         return delta;
20266      }
20267      goto decode_failure;
20268
20269   case 0xA8: /* TEST Ib, AL */
20270      if (haveF2orF3(pfx)) goto decode_failure;
20271      delta = dis_op_imm_A( 1, False, Iop_And8, False, delta, "test" );
20272      return delta;
20273   case 0xA9: /* TEST Iv, eAX */
20274      if (haveF2orF3(pfx)) goto decode_failure;
20275      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
20276      return delta;
20277
20278   case 0xAC: /* LODS, no REP prefix */
20279   case 0xAD:
20280      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", pfx );
20281      return delta;
20282
20283   case 0xAE:
20284   case 0xAF:
20285      /* F2 AE/AF: repne scasb/repne scas{w,l,q} */
20286      if (haveF2(pfx) && !haveF3(pfx)) {
20287         if (opc == 0xAE)
20288            sz = 1;
20289         dis_REP_op ( dres, AMD64CondNZ, dis_SCAS, sz,
20290                      guest_RIP_curr_instr,
20291                      guest_RIP_bbstart+delta, "repne scas", pfx );
20292         vassert(dres->whatNext == Dis_StopHere);
20293         return delta;
20294      }
20295      /* F3 AE/AF: repe scasb/repe scas{w,l,q} */
20296      if (!haveF2(pfx) && haveF3(pfx)) {
20297         if (opc == 0xAE)
20298            sz = 1;
20299         dis_REP_op ( dres, AMD64CondZ, dis_SCAS, sz,
20300                      guest_RIP_curr_instr,
20301                      guest_RIP_bbstart+delta, "repe scas", pfx );
20302         vassert(dres->whatNext == Dis_StopHere);
20303         return delta;
20304      }
20305      /* AE/AF: scasb/scas{w,l,q} */
20306      if (!haveF2(pfx) && !haveF3(pfx)) {
20307         if (opc == 0xAE)
20308            sz = 1;
20309         dis_string_op( dis_SCAS, sz, "scas", pfx );
20310         return delta;
20311      }
20312      goto decode_failure;
20313
20314   /* XXXX be careful here with moves to AH/BH/CH/DH */
20315   case 0xB0: /* MOV imm,AL */
20316   case 0xB1: /* MOV imm,CL */
20317   case 0xB2: /* MOV imm,DL */
20318   case 0xB3: /* MOV imm,BL */
20319   case 0xB4: /* MOV imm,AH */
20320   case 0xB5: /* MOV imm,CH */
20321   case 0xB6: /* MOV imm,DH */
20322   case 0xB7: /* MOV imm,BH */
20323      if (haveF2orF3(pfx)) goto decode_failure;
20324      d64 = getUChar(delta);
20325      delta += 1;
20326      putIRegRexB(1, pfx, opc-0xB0, mkU8(d64));
20327      DIP("movb $%lld,%s\n", d64, nameIRegRexB(1,pfx,opc-0xB0));
20328      return delta;
20329
20330   case 0xB8: /* MOV imm,eAX */
20331   case 0xB9: /* MOV imm,eCX */
20332   case 0xBA: /* MOV imm,eDX */
20333   case 0xBB: /* MOV imm,eBX */
20334   case 0xBC: /* MOV imm,eSP */
20335   case 0xBD: /* MOV imm,eBP */
20336   case 0xBE: /* MOV imm,eSI */
20337   case 0xBF: /* MOV imm,eDI */
20338      /* This is the one-and-only place where 64-bit literals are
20339         allowed in the instruction stream. */
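      /* For example, the 10-byte sequence
            48 B8 EF CD AB 89 67 45 23 01
         decodes as movabsq $0x0123456789abcdef, %rax. */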
20340      if (haveF2orF3(pfx)) goto decode_failure;
20341      if (sz == 8) {
20342         d64 = getDisp64(delta);
20343         delta += 8;
20344         putIRegRexB(8, pfx, opc-0xB8, mkU64(d64));
20345         DIP("movabsq $%lld,%s\n", (Long)d64,
20346                                   nameIRegRexB(8,pfx,opc-0xB8));
20347      } else {
20348         d64 = getSDisp(imin(4,sz),delta);
20349         delta += imin(4,sz);
20350         putIRegRexB(sz, pfx, opc-0xB8,
20351                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
20352         DIP("mov%c $%lld,%s\n", nameISize(sz),
20353                                 (Long)d64,
20354                                 nameIRegRexB(sz,pfx,opc-0xB8));
20355      }
20356      return delta;
20357
20358   case 0xC0: { /* Grp2 Ib,Eb */
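      /* Grp2 = the rotate/shift ops (rol/ror/rcl/rcr/shl/shr/sar),
         selected by the reg field of the modrm byte; dis_Grp2 does
         the dispatch. */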
20359      Bool decode_OK = True;
20360      if (haveF2orF3(pfx)) goto decode_failure;
20361      modrm = getUChar(delta);
20362      am_sz = lengthAMode(pfx,delta);
20363      d_sz  = 1;
20364      d64   = getUChar(delta + am_sz);
20365      sz    = 1;
20366      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20367                         mkU8(d64 & 0xFF), NULL, &decode_OK );
20368      if (!decode_OK) goto decode_failure;
20369      return delta;
20370   }
20371
20372   case 0xC1: { /* Grp2 Ib,Ev */
20373      Bool decode_OK = True;
20374      if (haveF2orF3(pfx)) goto decode_failure;
20375      modrm = getUChar(delta);
20376      am_sz = lengthAMode(pfx,delta);
20377      d_sz  = 1;
20378      d64   = getUChar(delta + am_sz);
20379      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20380                         mkU8(d64 & 0xFF), NULL, &decode_OK );
20381      if (!decode_OK) goto decode_failure;
20382      return delta;
20383   }
20384
20385   case 0xC2: /* RET imm16 */
20386      if (have66orF3(pfx)) goto decode_failure;
20387      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20388      d64 = getUDisp16(delta);
20389      delta += 2;
20390      dis_ret(dres, vbi, d64);
20391      DIP("ret $%lld\n", d64);
20392      return delta;
20393
20394   case 0xC3: /* RET */
20395      if (have66(pfx)) goto decode_failure;
20396      /* F3 is acceptable on AMD. */
20397      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20398      dis_ret(dres, vbi, 0);
20399      DIP(haveF3(pfx) ? "rep ; ret\n" : "ret\n");
20400      return delta;
20401
20402   case 0xC6: /* C6 /0 = MOV Ib,Eb */
20403      sz = 1;
20404      goto maybe_do_Mov_I_E;
20405   case 0xC7: /* C7 /0 = MOV Iv,Ev */
20406      goto maybe_do_Mov_I_E;
20407   maybe_do_Mov_I_E:
20408      modrm = getUChar(delta);
20409      if (gregLO3ofRM(modrm) == 0) {
20410         if (epartIsReg(modrm)) {
20411            /* Neither F2 nor F3 are allowable. */
20412            if (haveF2orF3(pfx)) goto decode_failure;
20413            delta++; /* mod/rm byte */
20414            d64 = getSDisp(imin(4,sz),delta);
20415            delta += imin(4,sz);
20416            putIRegE(sz, pfx, modrm,
20417                         mkU(szToITy(sz), d64 & mkSizeMask(sz)));
20418            DIP("mov%c $%lld, %s\n", nameISize(sz),
20419                                     (Long)d64,
20420                                     nameIRegE(sz,pfx,modrm));
20421         } else {
20422            if (haveF2(pfx)) goto decode_failure;
20423            /* F3(XRELEASE) is allowable here */
20424            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf,
20425                              /*xtra*/imin(4,sz) );
20426            delta += alen;
20427            d64 = getSDisp(imin(4,sz),delta);
20428            delta += imin(4,sz);
20429            storeLE(mkexpr(addr),
20430                    mkU(szToITy(sz), d64 & mkSizeMask(sz)));
20431            DIP("mov%c $%lld, %s\n", nameISize(sz), (Long)d64, dis_buf);
20432         }
20433         return delta;
20434      }
20435      /* BEGIN HACKY SUPPORT FOR xbegin */
20436      if (opc == 0xC7 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 4
20437          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
20438         delta++; /* mod/rm byte */
20439         d64 = getSDisp(4,delta);
20440         delta += 4;
20441         guest_RIP_next_mustcheck = True;
20442         guest_RIP_next_assumed   = guest_RIP_bbstart + delta;
20443         Addr64 failAddr = guest_RIP_bbstart + delta + d64;
20444         /* EAX contains the failure status code.  Bit 3 is "Set if an
20445            internal buffer overflowed", which seems like the
20446            least-bogus choice we can make here. */
20447         putIRegRAX(4, mkU32(1<<3));
20448         /* And jump to the fail address. */
20449         jmp_lit(dres, Ijk_Boring, failAddr);
20450         vassert(dres->whatNext == Dis_StopHere);
20451         DIP("xbeginq 0x%llx\n", failAddr);
20452         return delta;
20453      }
20454      /* END HACKY SUPPORT FOR xbegin */
20455      /* BEGIN HACKY SUPPORT FOR xabort */
20456      if (opc == 0xC6 && modrm == 0xF8 && !have66orF2orF3(pfx) && sz == 1
20457          && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
20458         delta++; /* mod/rm byte */
20459         abyte = getUChar(delta); delta++;
20460         /* There is never a real transaction in progress, so do nothing. */
         DIP("xabort $%d\n", (Int)abyte);
20462         return delta;
20463      }
20464      /* END HACKY SUPPORT FOR xabort */
20465      goto decode_failure;
20466
20467   case 0xC8: /* ENTER */
20468      /* Same comments re operand size as for LEAVE below apply.
20469         Also, only handles the case "enter $imm16, $0"; other cases
20470         for the second operand (nesting depth) are not handled. */
20471      if (sz != 4)
20472         goto decode_failure;
20473      d64 = getUDisp16(delta);
20474      delta += 2;
20475      vassert(d64 >= 0 && d64 <= 0xFFFF);
20476      if (getUChar(delta) != 0)
20477         goto decode_failure;
20478      delta++;
20479      /* Intel docs seem to suggest:
20480           push rbp
20481           temp = rsp
20482           rbp = temp
20483           rsp = rsp - imm16
20484      */
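      /* So, in effect, "enter $0x20, $0" behaves like
            push %rbp ; mov %rsp, %rbp ; sub $0x20, %rsp
         (only nesting depth 0 is accepted; see the check above). */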
20485      t1 = newTemp(Ity_I64);
20486      assign(t1, getIReg64(R_RBP));
20487      t2 = newTemp(Ity_I64);
20488      assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
20489      putIReg64(R_RSP, mkexpr(t2));
20490      storeLE(mkexpr(t2), mkexpr(t1));
20491      putIReg64(R_RBP, mkexpr(t2));
20492      if (d64 > 0) {
20493         putIReg64(R_RSP, binop(Iop_Sub64, mkexpr(t2), mkU64(d64)));
20494      }
20495      DIP("enter $%u, $0\n", (UInt)d64);
20496      return delta;
20497
20498   case 0xC9: /* LEAVE */
20499      /* In 64-bit mode this defaults to a 64-bit operand size.  There
20500         is no way to encode a 32-bit variant.  Hence sz==4 but we do
20501         it as if sz=8. */
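      /* In effect: mov %rbp, %rsp ; pop %rbp */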
20502      if (sz != 4)
20503         goto decode_failure;
20504      t1 = newTemp(Ity_I64);
20505      t2 = newTemp(Ity_I64);
20506      assign(t1, getIReg64(R_RBP));
      /* The first PUT RSP looks redundant, but we need it because
         RSP must always be up-to-date for Memcheck to work... */
20509      putIReg64(R_RSP, mkexpr(t1));
20510      assign(t2, loadLE(Ity_I64,mkexpr(t1)));
20511      putIReg64(R_RBP, mkexpr(t2));
20512      putIReg64(R_RSP, binop(Iop_Add64, mkexpr(t1), mkU64(8)) );
20513      DIP("leave\n");
20514      return delta;
20515
20516   case 0xCC: /* INT 3 */
20517      jmp_lit(dres, Ijk_SigTRAP, guest_RIP_bbstart + delta);
20518      vassert(dres->whatNext == Dis_StopHere);
20519      DIP("int $0x3\n");
20520      return delta;
20521
20522   case 0xD0: { /* Grp2 1,Eb */
20523      Bool decode_OK = True;
20524      if (haveF2orF3(pfx)) goto decode_failure;
20525      modrm = getUChar(delta);
20526      am_sz = lengthAMode(pfx,delta);
20527      d_sz  = 0;
20528      d64   = 1;
20529      sz    = 1;
20530      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20531                         mkU8(d64), NULL, &decode_OK );
20532      if (!decode_OK) goto decode_failure;
20533      return delta;
20534   }
20535
20536   case 0xD1: { /* Grp2 1,Ev */
20537      Bool decode_OK = True;
20538      if (haveF2orF3(pfx)) goto decode_failure;
20539      modrm = getUChar(delta);
20540      am_sz = lengthAMode(pfx,delta);
20541      d_sz  = 0;
20542      d64   = 1;
20543      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20544                         mkU8(d64), NULL, &decode_OK );
20545      if (!decode_OK) goto decode_failure;
20546      return delta;
20547   }
20548
20549   case 0xD2: { /* Grp2 CL,Eb */
20550      Bool decode_OK = True;
20551      if (haveF2orF3(pfx)) goto decode_failure;
20552      modrm = getUChar(delta);
20553      am_sz = lengthAMode(pfx,delta);
20554      d_sz  = 0;
20555      sz    = 1;
20556      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20557                         getIRegCL(), "%cl", &decode_OK );
20558      if (!decode_OK) goto decode_failure;
20559      return delta;
20560   }
20561
20562   case 0xD3: { /* Grp2 CL,Ev */
20563      Bool decode_OK = True;
20564      if (haveF2orF3(pfx)) goto decode_failure;
20565      modrm = getUChar(delta);
20566      am_sz = lengthAMode(pfx,delta);
20567      d_sz  = 0;
20568      delta = dis_Grp2 ( vbi, pfx, delta, modrm, am_sz, d_sz, sz,
20569                         getIRegCL(), "%cl", &decode_OK );
20570      if (!decode_OK) goto decode_failure;
20571      return delta;
20572   }
20573
20574   case 0xD8: /* X87 instructions */
20575   case 0xD9:
20576   case 0xDA:
20577   case 0xDB:
20578   case 0xDC:
20579   case 0xDD:
20580   case 0xDE:
20581   case 0xDF: {
20582      Bool redundantREXWok = False;
20583
20584      if (haveF2orF3(pfx))
20585         goto decode_failure;
20586
20587      /* kludge to tolerate redundant rex.w prefixes (should do this
20588         properly one day) */
20589      /* mono 1.1.18.1 produces 48 D9 FA, which is rex.w fsqrt */
20590      if ( (opc == 0xD9 && getUChar(delta+0) == 0xFA)/*fsqrt*/ )
20591         redundantREXWok = True;
20592
20593      Bool size_OK = False;
20594      if ( sz == 4 )
20595         size_OK = True;
20596      else if ( sz == 8 )
20597         size_OK = redundantREXWok;
20598      else if ( sz == 2 ) {
         UChar mod_rm = getUChar(delta+0);
         Int   reg    = gregLO3ofRM(mod_rm);
20601         /* The HotSpot JVM uses these */
20602         if ( (opc == 0xDD) && (reg == 0 /* FLDL   */ ||
20603                                reg == 4 /* FNSAVE */ ||
20604                                reg == 6 /* FRSTOR */ ) )
20605            size_OK = True;
20606      }
20607      /* AMD manual says 0x66 size override is ignored, except where
20608         it is meaningful */
20609      if (!size_OK)
20610         goto decode_failure;
20611
20612      Bool decode_OK = False;
20613      delta = dis_FPU ( &decode_OK, vbi, pfx, delta );
20614      if (!decode_OK)
20615         goto decode_failure;
20616
20617      return delta;
20618   }
20619
20620   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
20621   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
20622   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
    { /* The docs say this uses rCX as the count, with its width
         selected by the address-size override, not the operand-size
         one. */
20625      IRExpr* zbit  = NULL;
20626      IRExpr* count = NULL;
20627      IRExpr* cond  = NULL;
20628      const HChar* xtra = NULL;
20629
20630      if (have66orF2orF3(pfx) || 1==getRexW(pfx)) goto decode_failure;
20631      /* So at this point we've rejected any variants which appear to
20632         be governed by the usual operand-size modifiers.  Hence only
20633         the address size prefix can have an effect.  It changes the
20634         size from 64 (default) to 32. */
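      /* So, in effect, "67 E2 disp8" decrements ECX rather than RCX,
         and tests ECX for zero. */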
20635      d64 = guest_RIP_bbstart+delta+1 + getSDisp8(delta);
20636      delta++;
20637      if (haveASO(pfx)) {
20638         /* 64to32 of 64-bit get is merely a get-put improvement
20639            trick. */
20640         putIReg32(R_RCX, binop(Iop_Sub32,
20641                                unop(Iop_64to32, getIReg64(R_RCX)),
20642                                mkU32(1)));
20643      } else {
20644         putIReg64(R_RCX, binop(Iop_Sub64, getIReg64(R_RCX), mkU64(1)));
20645      }
20646
20647      /* This is correct, both for 32- and 64-bit versions.  If we're
20648         doing a 32-bit dec and the result is zero then the default
20649         zero extension rule will cause the upper 32 bits to be zero
20650         too.  Hence a 64-bit check against zero is OK. */
20651      count = getIReg64(R_RCX);
20652      cond = binop(Iop_CmpNE64, count, mkU64(0));
20653      switch (opc) {
20654         case 0xE2:
20655            xtra = "";
20656            break;
20657         case 0xE1:
20658            xtra = "e";
20659            zbit = mk_amd64g_calculate_condition( AMD64CondZ );
20660            cond = mkAnd1(cond, zbit);
20661            break;
20662         case 0xE0:
20663            xtra = "ne";
20664            zbit = mk_amd64g_calculate_condition( AMD64CondNZ );
20665            cond = mkAnd1(cond, zbit);
20666            break;
20667         default:
20668            vassert(0);
20669      }
20670      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U64(d64), OFFB_RIP) );
20671
20672      DIP("loop%s%s 0x%llx\n", xtra, haveASO(pfx) ? "l" : "", d64);
20673      return delta;
20674    }
20675
20676   case 0xE3:
      /* JRCXZ or JECXZ, depending on the address-size override. */
20678      if (have66orF2orF3(pfx)) goto decode_failure;
20679      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
20680      delta++;
20681      if (haveASO(pfx)) {
20682         /* 32-bit */
20683         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
20684                                  unop(Iop_32Uto64, getIReg32(R_RCX)),
20685                                  mkU64(0)),
20686                            Ijk_Boring,
20687                            IRConst_U64(d64),
20688                            OFFB_RIP
20689             ));
20690         DIP("jecxz 0x%llx\n", d64);
20691      } else {
20692         /* 64-bit */
20693         stmt( IRStmt_Exit( binop(Iop_CmpEQ64,
20694                                  getIReg64(R_RCX),
20695                                  mkU64(0)),
20696                            Ijk_Boring,
20697                            IRConst_U64(d64),
20698                            OFFB_RIP
20699               ));
20700         DIP("jrcxz 0x%llx\n", d64);
20701      }
20702      return delta;
20703
20704   case 0xE4: /* IN imm8, AL */
20705      sz = 1;
20706      t1 = newTemp(Ity_I64);
20707      abyte = getUChar(delta); delta++;
20708      assign(t1, mkU64( abyte & 0xFF ));
20709      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
20710      goto do_IN;
20711   case 0xE5: /* IN imm8, eAX */
20712      if (!(sz == 2 || sz == 4)) goto decode_failure;
20713      t1 = newTemp(Ity_I64);
20714      abyte = getUChar(delta); delta++;
20715      assign(t1, mkU64( abyte & 0xFF ));
20716      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIRegRAX(sz));
20717      goto do_IN;
20718   case 0xEC: /* IN %DX, AL */
20719      sz = 1;
20720      t1 = newTemp(Ity_I64);
20721      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
20722      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
20723                                         nameIRegRAX(sz));
20724      goto do_IN;
20725   case 0xED: /* IN %DX, eAX */
20726      if (!(sz == 2 || sz == 4)) goto decode_failure;
20727      t1 = newTemp(Ity_I64);
20728      assign(t1, unop(Iop_16Uto64, getIRegRDX(2)));
20729      DIP("in%c %s,%s\n", nameISize(sz), nameIRegRDX(2),
20730                                         nameIRegRAX(sz));
20731      goto do_IN;
20732   do_IN: {
20733      /* At this point, sz indicates the width, and t1 is a 64-bit
20734         value giving port number. */
20735      IRDirty* d;
20736      if (haveF2orF3(pfx)) goto decode_failure;
20737      vassert(sz == 1 || sz == 2 || sz == 4);
20738      ty = szToITy(sz);
20739      t2 = newTemp(Ity_I64);
20740      d = unsafeIRDirty_1_N(
20741             t2,
20742             0/*regparms*/,
20743             "amd64g_dirtyhelper_IN",
20744             &amd64g_dirtyhelper_IN,
20745             mkIRExprVec_2( mkexpr(t1), mkU64(sz) )
20746          );
20747      /* do the call, dumping the result in t2. */
20748      stmt( IRStmt_Dirty(d) );
20749      putIRegRAX(sz, narrowTo( ty, mkexpr(t2) ) );
20750      return delta;
20751   }
20752
20753   case 0xE6: /* OUT AL, imm8 */
20754      sz = 1;
20755      t1 = newTemp(Ity_I64);
20756      abyte = getUChar(delta); delta++;
20757      assign( t1, mkU64( abyte & 0xFF ) );
20758      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
20759      goto do_OUT;
20760   case 0xE7: /* OUT eAX, imm8 */
20761      if (!(sz == 2 || sz == 4)) goto decode_failure;
20762      t1 = newTemp(Ity_I64);
20763      abyte = getUChar(delta); delta++;
20764      assign( t1, mkU64( abyte & 0xFF ) );
20765      DIP("out%c %s,$%d\n", nameISize(sz), nameIRegRAX(sz), (Int)abyte);
20766      goto do_OUT;
20767   case 0xEE: /* OUT AL, %DX */
20768      sz = 1;
20769      t1 = newTemp(Ity_I64);
20770      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
20771      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
20772                                          nameIRegRDX(2));
20773      goto do_OUT;
20774   case 0xEF: /* OUT eAX, %DX */
20775      if (!(sz == 2 || sz == 4)) goto decode_failure;
20776      t1 = newTemp(Ity_I64);
20777      assign( t1, unop(Iop_16Uto64, getIRegRDX(2)) );
20778      DIP("out%c %s,%s\n", nameISize(sz), nameIRegRAX(sz),
20779                                          nameIRegRDX(2));
20780      goto do_OUT;
20781   do_OUT: {
20782      /* At this point, sz indicates the width, and t1 is a 64-bit
20783         value giving port number. */
20784      IRDirty* d;
20785      if (haveF2orF3(pfx)) goto decode_failure;
20786      vassert(sz == 1 || sz == 2 || sz == 4);
20787      ty = szToITy(sz);
20788      d = unsafeIRDirty_0_N(
20789             0/*regparms*/,
20790             "amd64g_dirtyhelper_OUT",
20791             &amd64g_dirtyhelper_OUT,
20792             mkIRExprVec_3( mkexpr(t1),
20793                            widenUto64( getIRegRAX(sz) ),
20794                            mkU64(sz) )
20795          );
20796      stmt( IRStmt_Dirty(d) );
20797      return delta;
20798   }
20799
20800   case 0xE8: /* CALL J4 */
20801      if (haveF3(pfx)) goto decode_failure;
20802      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20803      d64 = getSDisp32(delta); delta += 4;
20804      d64 += (guest_RIP_bbstart+delta);
20805      /* (guest_RIP_bbstart+delta) == return-to addr, d64 == call-to addr */
20806      t1 = newTemp(Ity_I64);
20807      assign(t1, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
20808      putIReg64(R_RSP, mkexpr(t1));
20809      storeLE( mkexpr(t1), mkU64(guest_RIP_bbstart+delta));
20810      t2 = newTemp(Ity_I64);
20811      assign(t2, mkU64((Addr64)d64));
20812      make_redzone_AbiHint(vbi, t1, t2/*nia*/, "call-d32");
20813      if (resteerOkFn( callback_opaque, (Addr64)d64) ) {
20814         /* follow into the call target. */
20815         dres->whatNext   = Dis_ResteerU;
20816         dres->continueAt = d64;
20817      } else {
20818         jmp_lit(dres, Ijk_Call, d64);
20819         vassert(dres->whatNext == Dis_StopHere);
20820      }
20821      DIP("call 0x%llx\n",d64);
20822      return delta;
20823
20824   case 0xE9: /* Jv (jump, 16/32 offset) */
20825      if (haveF3(pfx)) goto decode_failure;
20826      if (sz != 4)
20827         goto decode_failure; /* JRS added 2004 July 11 */
20828      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20829      d64 = (guest_RIP_bbstart+delta+sz) + getSDisp(sz,delta);
20830      delta += sz;
20831      if (resteerOkFn(callback_opaque,d64)) {
20832         dres->whatNext   = Dis_ResteerU;
20833         dres->continueAt = d64;
20834      } else {
20835         jmp_lit(dres, Ijk_Boring, d64);
20836         vassert(dres->whatNext == Dis_StopHere);
20837      }
20838      DIP("jmp 0x%llx\n", d64);
20839      return delta;
20840
20841   case 0xEB: /* Jb (jump, byte offset) */
20842      if (haveF3(pfx)) goto decode_failure;
20843      if (sz != 4)
20844         goto decode_failure; /* JRS added 2004 July 11 */
20845      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
20846      d64 = (guest_RIP_bbstart+delta+1) + getSDisp8(delta);
20847      delta++;
20848      if (resteerOkFn(callback_opaque,d64)) {
20849         dres->whatNext   = Dis_ResteerU;
20850         dres->continueAt = d64;
20851      } else {
20852         jmp_lit(dres, Ijk_Boring, d64);
20853         vassert(dres->whatNext == Dis_StopHere);
20854      }
20855      DIP("jmp-8 0x%llx\n", d64);
20856      return delta;
20857
20858   case 0xF5: /* CMC */
20859   case 0xF8: /* CLC */
20860   case 0xF9: /* STC */
20861      t1 = newTemp(Ity_I64);
20862      t2 = newTemp(Ity_I64);
20863      assign( t1, mk_amd64g_calculate_rflags_all() );
20864      switch (opc) {
20865         case 0xF5:
20866            assign( t2, binop(Iop_Xor64, mkexpr(t1),
20867                                         mkU64(AMD64G_CC_MASK_C)));
20868            DIP("cmc\n");
20869            break;
20870         case 0xF8:
20871            assign( t2, binop(Iop_And64, mkexpr(t1),
20872                                         mkU64(~AMD64G_CC_MASK_C)));
20873            DIP("clc\n");
20874            break;
20875         case 0xF9:
20876            assign( t2, binop(Iop_Or64, mkexpr(t1),
20877                                        mkU64(AMD64G_CC_MASK_C)));
20878            DIP("stc\n");
20879            break;
20880         default:
20881            vpanic("disInstr(x64)(cmc/clc/stc)");
20882      }
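      /* Write the result back via the COPY thunk: with
         AMD64G_CC_OP_COPY, DEP1 holds the literal rflags value
         itself. */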
20883      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
20884      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
20885      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t2) ));
20886      /* Set NDEP even though it isn't used.  This makes redundant-PUT
20887         elimination of previous stores to this field work better. */
20888      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
20889      return delta;
20890
20891   case 0xF6: { /* Grp3 Eb */
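      /* Grp3 = test/not/neg/mul/imul/div/idiv, selected by the reg
         field of the modrm byte; dis_Grp3 does the dispatch. */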
20892      Bool decode_OK = True;
20893      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
20894      /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
20895      delta = dis_Grp3 ( vbi, pfx, 1, delta, &decode_OK );
20896      if (!decode_OK) goto decode_failure;
20897      return delta;
20898   }
20899
20900   case 0xF7: { /* Grp3 Ev */
20901      Bool decode_OK = True;
20902      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
20903      /* We now let dis_Grp3 itself decide if F2 and/or F3 are valid */
20904      delta = dis_Grp3 ( vbi, pfx, sz, delta, &decode_OK );
20905      if (!decode_OK) goto decode_failure;
20906      return delta;
20907   }
20908
20909   case 0xFC: /* CLD */
20910      if (haveF2orF3(pfx)) goto decode_failure;
20911      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(1)) );
20912      DIP("cld\n");
20913      return delta;
20914
20915   case 0xFD: /* STD */
20916      if (haveF2orF3(pfx)) goto decode_failure;
20917      stmt( IRStmt_Put( OFFB_DFLAG, mkU64(-1ULL)) );
20918      DIP("std\n");
20919      return delta;
20920
20921   case 0xFE: { /* Grp4 Eb */
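      /* Grp4 = inc/dec on a byte operand, selected by the reg field
         of the modrm byte; dis_Grp4 does the dispatch. */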
20922      Bool decode_OK = True;
20923      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
20924      /* We now let dis_Grp4 itself decide if F2 and/or F3 are valid */
20925      delta = dis_Grp4 ( vbi, pfx, delta, &decode_OK );
20926      if (!decode_OK) goto decode_failure;
20927      return delta;
20928   }
20929
20930   case 0xFF: { /* Grp5 Ev */
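      /* Grp5 = inc/dec/call/callfar/jmp/jmpfar/push, selected by the
         reg field of the modrm byte; dis_Grp5 does the dispatch. */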
20931      Bool decode_OK = True;
20932      /* RM'd: if (haveF2orF3(pfx)) goto decode_failure; */
20933      /* We now let dis_Grp5 itself decide if F2 and/or F3 are valid */
20934      delta = dis_Grp5 ( vbi, pfx, sz, delta, dres, &decode_OK );
20935      if (!decode_OK) goto decode_failure;
20936      return delta;
20937   }
20938
20939   default:
20940      break;
20941
20942   }
20943
20944  decode_failure:
20945   return deltaIN; /* fail */
20946}
20947
20948
20949/*------------------------------------------------------------*/
20950/*---                                                      ---*/
20951/*--- Top-level post-escape decoders: dis_ESC_0F           ---*/
20952/*---                                                      ---*/
20953/*------------------------------------------------------------*/
20954
20955static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
20956{
20957   IRTemp t2 = newTemp(ty);
20958   if (ty == Ity_I64) {
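      /* Swap adjacent bytes, then adjacent 16-bit units, then the
         two 32-bit halves; for example
            0x0102030405060708 -> 0x0201040306050807
                               -> 0x0403020108070605
                               -> 0x0807060504030201  */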
20959      IRTemp m8  = newTemp(Ity_I64);
20960      IRTemp s8  = newTemp(Ity_I64);
20961      IRTemp m16 = newTemp(Ity_I64);
20962      IRTemp s16 = newTemp(Ity_I64);
20963      IRTemp m32 = newTemp(Ity_I64);
20964      assign( m8, mkU64(0xFF00FF00FF00FF00ULL) );
20965      assign( s8,
20966              binop(Iop_Or64,
20967                    binop(Iop_Shr64,
20968                          binop(Iop_And64,mkexpr(t1),mkexpr(m8)),
20969                          mkU8(8)),
20970                    binop(Iop_And64,
20971                          binop(Iop_Shl64,mkexpr(t1),mkU8(8)),
20972                          mkexpr(m8))
20973                   )
20974            );
20975
20976      assign( m16, mkU64(0xFFFF0000FFFF0000ULL) );
20977      assign( s16,
20978              binop(Iop_Or64,
20979                    binop(Iop_Shr64,
20980                          binop(Iop_And64,mkexpr(s8),mkexpr(m16)),
20981                          mkU8(16)),
20982                    binop(Iop_And64,
20983                          binop(Iop_Shl64,mkexpr(s8),mkU8(16)),
20984                          mkexpr(m16))
20985                   )
20986            );
20987
20988      assign( m32, mkU64(0xFFFFFFFF00000000ULL) );
20989      assign( t2,
20990              binop(Iop_Or64,
20991                    binop(Iop_Shr64,
20992                          binop(Iop_And64,mkexpr(s16),mkexpr(m32)),
20993                          mkU8(32)),
20994                    binop(Iop_And64,
20995                          binop(Iop_Shl64,mkexpr(s16),mkU8(32)),
20996                          mkexpr(m32))
20997                   )
20998            );
20999      return t2;
21000   }
21001   if (ty == Ity_I32) {
21002      assign( t2,
21003         binop(
21004            Iop_Or32,
21005            binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
21006            binop(
21007               Iop_Or32,
21008               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
21009                                mkU32(0x00FF0000)),
21010               binop(Iop_Or32,
21011                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
21012                                      mkU32(0x0000FF00)),
21013                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
21014                                      mkU32(0x000000FF) )
21015            )))
21016      );
21017      return t2;
21018   }
21019   if (ty == Ity_I16) {
21020      assign(t2,
21021             binop(Iop_Or16,
21022                   binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
21023                   binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
21024      return t2;
21025   }
21026   vassert(0);
21027   /*NOTREACHED*/
21028   return IRTemp_INVALID;
21029}
21030
21031
21032__attribute__((noinline))
21033static
21034Long dis_ESC_0F (
21035        /*MB_OUT*/DisResult* dres,
21036        /*MB_OUT*/Bool*      expect_CAS,
21037        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
21038        Bool         resteerCisOk,
21039        void*        callback_opaque,
21040        VexArchInfo* archinfo,
21041        VexAbiInfo*  vbi,
21042        Prefix pfx, Int sz, Long deltaIN
21043     )
21044{
21045   Long   d64   = 0;
21046   IRTemp addr  = IRTemp_INVALID;
21047   IRTemp t1    = IRTemp_INVALID;
21048   IRTemp t2    = IRTemp_INVALID;
21049   UChar  modrm = 0;
21050   Int    am_sz = 0;
21051   Int    alen  = 0;
21052   HChar  dis_buf[50];
21053
21054   /* In the first switch, look for ordinary integer insns. */
21055   Long   delta = deltaIN;
21056   UChar  opc   = getUChar(delta);
21057   delta++;
21058   switch (opc) { /* first switch */
21059
21060   case 0x01:
21061   {
21062      modrm = getUChar(delta);
21063      /* 0F 01 /0 -- SGDT */
21064      /* 0F 01 /1 -- SIDT */
21065      if (!epartIsReg(modrm)
21066          && (gregLO3ofRM(modrm) == 0 || gregLO3ofRM(modrm) == 1)) {
21067         /* This is really revolting, but ... since each processor
21068            (core) only has one IDT and one GDT, just let the guest
21069            see it (pass-through semantics).  I can't see any way to
21070            construct a faked-up value, so don't bother to try. */
21071         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21072         delta += alen;
21073         switch (gregLO3ofRM(modrm)) {
21074            case 0: DIP("sgdt %s\n", dis_buf); break;
21075            case 1: DIP("sidt %s\n", dis_buf); break;
21076            default: vassert(0); /*NOTREACHED*/
21077         }
21078         IRDirty* d = unsafeIRDirty_0_N (
21079                          0/*regparms*/,
21080                          "amd64g_dirtyhelper_SxDT",
21081                          &amd64g_dirtyhelper_SxDT,
21082                          mkIRExprVec_2( mkexpr(addr),
21083                                         mkU64(gregLO3ofRM(modrm)) )
21084                      );
21085         /* declare we're writing memory */
21086         d->mFx   = Ifx_Write;
21087         d->mAddr = mkexpr(addr);
21088         d->mSize = 6;
21089         stmt( IRStmt_Dirty(d) );
21090         return delta;
21091      }
21092      /* 0F 01 D0 = XGETBV */
21093      if (modrm == 0xD0 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21094         delta += 1;
21095         DIP("xgetbv\n");
         /* Fault (SEGV) if ECX isn't zero.  Intel docs say #GP and I
            am not sure if that translates into SEGV or to something
            else, in user space. */
21099         t1 = newTemp(Ity_I32);
21100         assign( t1, getIReg32(R_RCX) );
21101         stmt( IRStmt_Exit(binop(Iop_CmpNE32, mkexpr(t1), mkU32(0)),
21102                           Ijk_SigSEGV,
21103                           IRConst_U64(guest_RIP_curr_instr),
21104                           OFFB_RIP
21105         ));
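         /* Return a constant XCR0 value: bits 0, 1 and 2 set, that
            is, x87, SSE and AVX state enabled, hence the value 7,
            with the upper 32 bits (EDX) zero. */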
21106         putIRegRAX(4, mkU32(7));
21107         putIRegRDX(4, mkU32(0));
21108         return delta;
21109      }
21110      /* BEGIN HACKY SUPPORT FOR xtest */
21111      /* 0F 01 D6 = XTEST */
21112      if (modrm == 0xD6 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
         /* Sets ZF because there is never a transaction in progress;
            CF, OF, SF, PF and AF are always cleared by xtest. */
21115         delta += 1;
21116         DIP("xtest\n");
21117         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
21118         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
21119         stmt( IRStmt_Put( OFFB_CC_DEP1, mkU64(AMD64G_CC_MASK_Z) ));
21120         /* Set NDEP even though it isn't used.  This makes redundant-PUT
21121            elimination of previous stores to this field work better. */
21122         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
21123         return delta;
21124      }
21125      /* END HACKY SUPPORT FOR xtest */
21126      /* 0F 01 F9 = RDTSCP */
21127      if (modrm == 0xF9 && (archinfo->hwcaps & VEX_HWCAPS_AMD64_RDTSCP)) {
21128         delta += 1;
21129         /* Uses dirty helper:
21130            void amd64g_dirtyhelper_RDTSCP ( VexGuestAMD64State* )
21131            declared to wr rax, rcx, rdx
21132         */
21133         const HChar* fName = "amd64g_dirtyhelper_RDTSCP";
21134         void*        fAddr = &amd64g_dirtyhelper_RDTSCP;
21135         IRDirty* d
21136            = unsafeIRDirty_0_N ( 0/*regparms*/,
21137                                  fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
21138         /* declare guest state effects */
21139         d->nFxState = 3;
21140         vex_bzero(&d->fxState, sizeof(d->fxState));
21141         d->fxState[0].fx     = Ifx_Write;
21142         d->fxState[0].offset = OFFB_RAX;
21143         d->fxState[0].size   = 8;
21144         d->fxState[1].fx     = Ifx_Write;
21145         d->fxState[1].offset = OFFB_RCX;
21146         d->fxState[1].size   = 8;
21147         d->fxState[2].fx     = Ifx_Write;
21148         d->fxState[2].offset = OFFB_RDX;
21149         d->fxState[2].size   = 8;
21150         /* execute the dirty call, side-effecting guest state */
21151         stmt( IRStmt_Dirty(d) );
21152         /* RDTSCP is a serialising insn.  So, just in case someone is
21153            using it as a memory fence ... */
21154         stmt( IRStmt_MBE(Imbe_Fence) );
21155         DIP("rdtscp\n");
21156         return delta;
21157      }
21158      /* else decode failed */
21159      break;
21160   }
21161
21162   case 0x05: /* SYSCALL */
21163      guest_RIP_next_mustcheck = True;
21164      guest_RIP_next_assumed = guest_RIP_bbstart + delta;
21165      putIReg64( R_RCX, mkU64(guest_RIP_next_assumed) );
21166      /* It's important that all guest state is up-to-date
21167         at this point.  So we declare an end-of-block here, which
21168         forces any cached guest state to be flushed. */
21169      jmp_lit(dres, Ijk_Sys_syscall, guest_RIP_next_assumed);
21170      vassert(dres->whatNext == Dis_StopHere);
21171      DIP("syscall\n");
21172      return delta;
21173
21174   case 0x0B: /* UD2 */
21175      stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
21176      jmp_lit(dres, Ijk_NoDecode, guest_RIP_curr_instr);
21177      vassert(dres->whatNext == Dis_StopHere);
21178      DIP("ud2\n");
21179      return delta;
21180
21181   case 0x0D: /* 0F 0D /0 -- prefetch mem8 */
21182              /* 0F 0D /1 -- prefetchw mem8 */
21183      if (have66orF2orF3(pfx)) goto decode_failure;
21184      modrm = getUChar(delta);
21185      if (epartIsReg(modrm)) goto decode_failure;
21186      if (gregLO3ofRM(modrm) != 0 && gregLO3ofRM(modrm) != 1)
21187         goto decode_failure;
21188      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21189      delta += alen;
21190      switch (gregLO3ofRM(modrm)) {
21191         case 0: DIP("prefetch %s\n", dis_buf); break;
21192         case 1: DIP("prefetchw %s\n", dis_buf); break;
21193         default: vassert(0); /*NOTREACHED*/
21194      }
21195      return delta;
21196
21197   case 0x1F:
21198      if (haveF2orF3(pfx)) goto decode_failure;
21199      modrm = getUChar(delta);
21200      if (epartIsReg(modrm)) goto decode_failure;
21201      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21202      delta += alen;
21203      DIP("nop%c %s\n", nameISize(sz), dis_buf);
21204      return delta;
21205
21206   case 0x31: { /* RDTSC */
21207      IRTemp   val  = newTemp(Ity_I64);
21208      IRExpr** args = mkIRExprVec_0();
21209      IRDirty* d    = unsafeIRDirty_1_N (
21210                         val,
21211                         0/*regparms*/,
21212                         "amd64g_dirtyhelper_RDTSC",
21213                         &amd64g_dirtyhelper_RDTSC,
21214                         args
21215                      );
21216      if (have66orF2orF3(pfx)) goto decode_failure;
21217      /* execute the dirty call, dumping the result in val. */
21218      stmt( IRStmt_Dirty(d) );
21219      putIRegRDX(4, unop(Iop_64HIto32, mkexpr(val)));
21220      putIRegRAX(4, unop(Iop_64to32, mkexpr(val)));
21221      DIP("rdtsc\n");
21222      return delta;
21223   }
21224
   case 0x40: /* CMOVOb (cmov overflow) */
   case 0x41: /* CMOVNOb (cmov no overflow) */
21227   case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
21228   case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
21229   case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
21230   case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
21231   case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
21232   case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
21233   case 0x48: /* CMOVSb (cmov negative) */
   case 0x49: /* CMOVNSb (cmov not negative) */
21235   case 0x4A: /* CMOVP (cmov parity even) */
21236   case 0x4B: /* CMOVNP (cmov parity odd) */
21237   case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
21238   case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
21239   case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
21240   case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
21241      if (haveF2orF3(pfx)) goto decode_failure;
21242      delta = dis_cmov_E_G(vbi, pfx, sz, (AMD64Condcode)(opc - 0x40), delta);
21243      return delta;
21244
   case 0x80:   /* JOb (jump overflow) */
   case 0x81:   /* JNOb (jump no overflow) */
21247   case 0x82:   /* JBb/JNAEb (jump below) */
21248   case 0x83:   /* JNBb/JAEb (jump not below) */
21249   case 0x84:   /* JZb/JEb (jump zero) */
21250   case 0x85:   /* JNZb/JNEb (jump not zero) */
21251   case 0x86:   /* JBEb/JNAb (jump below or equal) */
21252   case 0x87:   /* JNBEb/JAb (jump not below or equal) */
21253   case 0x88:   /* JSb (jump negative) */
   case 0x89:   /* JNSb (jump not negative) */
21255   case 0x8A:   /* JP (jump parity even) */
21256   case 0x8B:   /* JNP/JPO (jump parity odd) */
21257   case 0x8C:   /* JLb/JNGEb (jump less) */
21258   case 0x8D:   /* JGEb/JNLb (jump greater or equal) */
21259   case 0x8E:   /* JLEb/JNGb (jump less or equal) */
21260   case 0x8F: { /* JGb/JNLEb (jump greater) */
21261      Long   jmpDelta;
21262      const HChar* comment  = "";
21263      if (haveF3(pfx)) goto decode_failure;
21264      if (haveF2(pfx)) DIP("bnd ; "); /* MPX bnd prefix. */
21265      jmpDelta = getSDisp32(delta);
21266      d64 = (guest_RIP_bbstart+delta+4) + jmpDelta;
21267      delta += 4;
21268      if (resteerCisOk
21269          && vex_control.guest_chase_cond
21270          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
21271          && jmpDelta < 0
21272          && resteerOkFn( callback_opaque, d64) ) {
21273         /* Speculation: assume this backward branch is taken.  So
21274            we need to emit a side-exit to the insn following this
21275            one, on the negation of the condition, and continue at
21276            the branch target address (d64).  If we wind up back at
21277            the first instruction of the trace, just stop; it's
21278            better to let the IR loop unroller handle that case. */
21279         stmt( IRStmt_Exit(
21280                  mk_amd64g_calculate_condition(
21281                     (AMD64Condcode)(1 ^ (opc - 0x80))),
21282                  Ijk_Boring,
21283                  IRConst_U64(guest_RIP_bbstart+delta),
21284                  OFFB_RIP
21285             ));
21286         dres->whatNext   = Dis_ResteerC;
21287         dres->continueAt = d64;
21288         comment = "(assumed taken)";
21289      }
21290      else
21291      if (resteerCisOk
21292          && vex_control.guest_chase_cond
21293          && (Addr64)d64 != (Addr64)guest_RIP_bbstart
21294          && jmpDelta >= 0
21295          && resteerOkFn( callback_opaque, guest_RIP_bbstart+delta ) ) {
21296         /* Speculation: assume this forward branch is not taken.
21297            So we need to emit a side-exit to d64 (the dest) and
21298            continue disassembling at the insn immediately
21299            following this one. */
21300         stmt( IRStmt_Exit(
21301                  mk_amd64g_calculate_condition((AMD64Condcode)
21302                                                (opc - 0x80)),
21303                  Ijk_Boring,
21304                  IRConst_U64(d64),
21305                  OFFB_RIP
21306             ));
21307         dres->whatNext   = Dis_ResteerC;
21308         dres->continueAt = guest_RIP_bbstart+delta;
21309         comment = "(assumed not taken)";
21310      }
21311      else {
21312         /* Conservative default translation - end the block at
21313            this point. */
21314         jcc_01( dres, (AMD64Condcode)(opc - 0x80),
21315                 guest_RIP_bbstart+delta, d64 );
21316         vassert(dres->whatNext == Dis_StopHere);
21317      }
21318      DIP("j%s-32 0x%llx %s\n", name_AMD64Condcode(opc - 0x80), d64, comment);
21319      return delta;
21320   }
21321
   case 0x90: /* set-Ob (set if overflow) */
   case 0x91: /* set-NOb (set if no overflow) */
21324   case 0x92: /* set-Bb/set-NAEb (set if below) */
21325   case 0x93: /* set-NBb/set-AEb (set if not below) */
21326   case 0x94: /* set-Zb/set-Eb (set if zero) */
21327   case 0x95: /* set-NZb/set-NEb (set if not zero) */
21328   case 0x96: /* set-BEb/set-NAb (set if below or equal) */
21329   case 0x97: /* set-NBEb/set-Ab (set if not below or equal) */
21330   case 0x98: /* set-Sb (set if negative) */
   case 0x99: /* set-NSb (set if not negative) */
21332   case 0x9A: /* set-P (set if parity even) */
21333   case 0x9B: /* set-NP (set if parity odd) */
21334   case 0x9C: /* set-Lb/set-NGEb (set if less) */
21335   case 0x9D: /* set-GEb/set-NLb (set if greater or equal) */
21336   case 0x9E: /* set-LEb/set-NGb (set if less or equal) */
21337   case 0x9F: /* set-Gb/set-NLEb (set if greater) */
21338      if (haveF2orF3(pfx)) goto decode_failure;
21339      t1 = newTemp(Ity_I8);
21340      assign( t1, unop(Iop_1Uto8,mk_amd64g_calculate_condition(opc-0x90)) );
21341      modrm = getUChar(delta);
21342      if (epartIsReg(modrm)) {
21343         delta++;
21344         putIRegE(1, pfx, modrm, mkexpr(t1));
21345         DIP("set%s %s\n", name_AMD64Condcode(opc-0x90),
21346                           nameIRegE(1,pfx,modrm));
21347      } else {
21348         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21349         delta += alen;
21350         storeLE( mkexpr(addr), mkexpr(t1) );
21351         DIP("set%s %s\n", name_AMD64Condcode(opc-0x90), dis_buf);
21352      }
21353      return delta;
21354
21355   case 0x1A:
21356   case 0x1B: { /* Future MPX instructions, currently NOPs.
21357                   BNDMK b, m     F3 0F 1B
21358                   BNDCL b, r/m   F3 0F 1A
21359                   BNDCU b, r/m   F2 0F 1A
21360                   BNDCN b, r/m   F2 0F 1B
21361                   BNDMOV b, b/m  66 0F 1A
21362                   BNDMOV b/m, b  66 0F 1B
21363                   BNDLDX b, mib     0F 1A
21364                   BNDSTX mib, b     0F 1B */
21365
21366      /* All instructions have two operands. One operand is always the
21367         bnd register number (bnd0-bnd3, other register numbers are
21368         ignored when MPX isn't enabled, but should generate an
21369         exception if MPX is enabled) given by gregOfRexRM. The other
21370         operand is either a ModRM:reg, ModRM:r/m or a SIB encoded
21371         address, all of which can be decoded by using either
21372         eregOfRexRM or disAMode. */
21373
21374      modrm = getUChar(delta);
      Int bnd = gregOfRexRM(pfx,modrm);
21376      const HChar *oper;
21377      if (epartIsReg(modrm)) {
21378         oper = nameIReg64 (eregOfRexRM(pfx,modrm));
21379         delta += 1;
21380      } else {
21381         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21382         delta += alen;
21383         oper = dis_buf;
21384      }
21385
21386      if (haveF3no66noF2 (pfx)) {
21387         if (opc == 0x1B) {
21388            DIP ("bndmk %s, %%bnd%d\n", oper, bnd);
21389         } else /* opc == 0x1A */ {
21390            DIP ("bndcl %s, %%bnd%d\n", oper, bnd);
21391         }
21392      } else if (haveF2no66noF3 (pfx)) {
21393         if (opc == 0x1A) {
21394            DIP ("bndcu %s, %%bnd%d\n", oper, bnd);
21395         } else /* opc == 0x1B */ {
21396            DIP ("bndcn %s, %%bnd%d\n", oper, bnd);
21397         }
21398      } else if (have66noF2noF3 (pfx)) {
21399         if (opc == 0x1A) {
21400            DIP ("bndmov %s, %%bnd%d\n", oper, bnd);
21401         } else /* opc == 0x1B */ {
21402            DIP ("bndmov %%bnd%d, %s\n", bnd, oper);
21403         }
21404      } else if (haveNo66noF2noF3 (pfx)) {
21405         if (opc == 0x1A) {
21406            DIP ("bndldx %s, %%bnd%d\n", oper, bnd);
21407         } else /* opc == 0x1B */ {
21408            DIP ("bndstx %%bnd%d, %s\n", bnd, oper);
21409         }
21410      } else goto decode_failure;
21411
21412      return delta;
21413   }
21414
21415   case 0xA2: { /* CPUID */
21416      /* Uses dirty helper:
21417            void amd64g_dirtyhelper_CPUID ( VexGuestAMD64State* )
21418         declared to mod rax, wr rbx, rcx, rdx
21419      */
21420      IRDirty* d     = NULL;
21421      const HChar*   fName = NULL;
21422      void*    fAddr = NULL;
21423      if (haveF2orF3(pfx)) goto decode_failure;
      /* This isn't entirely correct: CPUID should depend on the VEX
         capabilities, not on the underlying CPU.  See bug #324882. */
21426      if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
21427          (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16) &&
21428          (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX)) {
21429         fName = "amd64g_dirtyhelper_CPUID_avx_and_cx16";
21430         fAddr = &amd64g_dirtyhelper_CPUID_avx_and_cx16;
21431         /* This is a Core-i5-2300-like machine */
21432      }
21433      else if ((archinfo->hwcaps & VEX_HWCAPS_AMD64_SSE3) &&
21434               (archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16)) {
21435         fName = "amd64g_dirtyhelper_CPUID_sse42_and_cx16";
21436         fAddr = &amd64g_dirtyhelper_CPUID_sse42_and_cx16;
21437         /* This is a Core-i5-670-like machine */
21438      }
21439      else {
21440         /* Give a CPUID for at least a baseline machine, SSE2
21441            only, and no CX16 */
21442         fName = "amd64g_dirtyhelper_CPUID_baseline";
21443         fAddr = &amd64g_dirtyhelper_CPUID_baseline;
21444      }
21445
21446      vassert(fName); vassert(fAddr);
21447      d = unsafeIRDirty_0_N ( 0/*regparms*/,
21448                              fName, fAddr, mkIRExprVec_1(IRExpr_BBPTR()) );
21449      /* declare guest state effects */
21450      d->nFxState = 4;
21451      vex_bzero(&d->fxState, sizeof(d->fxState));
21452      d->fxState[0].fx     = Ifx_Modify;
21453      d->fxState[0].offset = OFFB_RAX;
21454      d->fxState[0].size   = 8;
21455      d->fxState[1].fx     = Ifx_Write;
21456      d->fxState[1].offset = OFFB_RBX;
21457      d->fxState[1].size   = 8;
21458      d->fxState[2].fx     = Ifx_Modify;
21459      d->fxState[2].offset = OFFB_RCX;
21460      d->fxState[2].size   = 8;
21461      d->fxState[3].fx     = Ifx_Write;
21462      d->fxState[3].offset = OFFB_RDX;
21463      d->fxState[3].size   = 8;
21464      /* execute the dirty call, side-effecting guest state */
21465      stmt( IRStmt_Dirty(d) );
21466      /* CPUID is a serialising insn.  So, just in case someone is
21467         using it as a memory fence ... */
21468      stmt( IRStmt_MBE(Imbe_Fence) );
21469      DIP("cpuid\n");
21470      return delta;
21471   }
21472
21473   case 0xA3: { /* BT Gv,Ev */
21474      /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
21475      Bool ok = True;
21476      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
21477      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpNone, &ok );
21478      if (!ok) goto decode_failure;
21479      return delta;
21480   }
21481
21482   case 0xA4: /* SHLDv imm8,Gv,Ev */
21483      modrm = getUChar(delta);
      d64   = delta + lengthAMode(pfx, delta); /* offset of the imm8 */
21485      vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
21486      delta = dis_SHLRD_Gv_Ev (
21487                 vbi, pfx, delta, modrm, sz,
21488                 mkU8(getUChar(d64)), True, /* literal */
21489                 dis_buf, True /* left */ );
21490      return delta;
21491
21492   case 0xA5: /* SHLDv %cl,Gv,Ev */
21493      modrm = getUChar(delta);
21494      delta = dis_SHLRD_Gv_Ev (
21495                 vbi, pfx, delta, modrm, sz,
21496                 getIRegCL(), False, /* not literal */
21497                 "%cl", True /* left */ );
21498      return delta;
21499
21500   case 0xAB: { /* BTS Gv,Ev */
21501      /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
21502      Bool ok = True;
21503      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
21504      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpSet, &ok );
21505      if (!ok) goto decode_failure;
21506      return delta;
21507   }
21508
21509   case 0xAC: /* SHRDv imm8,Gv,Ev */
21510      modrm = getUChar(delta);
      d64   = delta + lengthAMode(pfx, delta); /* offset of the imm8 */
21512      vex_sprintf(dis_buf, "$%d", (Int)getUChar(d64));
21513      delta = dis_SHLRD_Gv_Ev (
21514                 vbi, pfx, delta, modrm, sz,
21515                 mkU8(getUChar(d64)), True, /* literal */
21516                 dis_buf, False /* right */ );
21517      return delta;
21518
21519   case 0xAD: /* SHRDv %cl,Gv,Ev */
21520      modrm = getUChar(delta);
21521      delta = dis_SHLRD_Gv_Ev (
21522                 vbi, pfx, delta, modrm, sz,
21523                 getIRegCL(), False, /* not literal */
21524                 "%cl", False /* right */);
21525      return delta;
21526
21527   case 0xAF: /* IMUL Ev, Gv */
21528      if (haveF2orF3(pfx)) goto decode_failure;
21529      delta = dis_mul_E_G ( vbi, pfx, sz, delta );
21530      return delta;
21531
21532   case 0xB0: { /* CMPXCHG Gb,Eb */
21533      Bool ok = True;
21534      /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
21535      delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, 1, delta );
21536      if (!ok) goto decode_failure;
21537      return delta;
21538   }
21539
21540   case 0xB1: { /* CMPXCHG Gv,Ev (allowed in 16,32,64 bit) */
21541      Bool ok = True;
21542      /* We let dis_cmpxchg_G_E decide whether F2 or F3 are allowable. */
21543      if (sz != 2 && sz != 4 && sz != 8) goto decode_failure;
21544      delta = dis_cmpxchg_G_E ( &ok, vbi, pfx, sz, delta );
21545      if (!ok) goto decode_failure;
21546      return delta;
21547   }
21548
21549   case 0xB3: { /* BTR Gv,Ev */
21550      /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
21551      Bool ok = True;
21552      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
21553      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpReset, &ok );
21554      if (!ok) goto decode_failure;
21555      return delta;
21556   }
21557
21558   case 0xB6: /* MOVZXb Eb,Gv */
21559      if (haveF2orF3(pfx)) goto decode_failure;
21560      if (sz != 2 && sz != 4 && sz != 8)
21561         goto decode_failure;
21562      delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, False );
21563      return delta;
21564
21565   case 0xB7: /* MOVZXw Ew,Gv */
21566      if (haveF2orF3(pfx)) goto decode_failure;
21567      if (sz != 4 && sz != 8)
21568         goto decode_failure;
21569      delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, False );
21570      return delta;
21571
21572   case 0xBA: { /* Grp8 Ib,Ev */
21573      /* We let dis_Grp8_Imm decide whether F2 or F3 are allowable. */
21574      Bool decode_OK = False;
21575      modrm = getUChar(delta);
21576      am_sz = lengthAMode(pfx,delta);
21577      d64   = getSDisp8(delta + am_sz);
21578      delta = dis_Grp8_Imm ( vbi, pfx, delta, modrm, am_sz, sz, d64,
21579                             &decode_OK );
21580      if (!decode_OK)
21581         goto decode_failure;
21582      return delta;
21583   }
21584
21585   case 0xBB: { /* BTC Gv,Ev */
21586      /* We let dis_bt_G_E decide whether F2 or F3 are allowable. */
21587      Bool ok = False;
21588      if (sz != 8 && sz != 4 && sz != 2) goto decode_failure;
21589      delta = dis_bt_G_E ( vbi, pfx, sz, delta, BtOpComp, &ok );
21590      if (!ok) goto decode_failure;
21591      return delta;
21592   }
21593
21594   case 0xBC: /* BSF Gv,Ev */
21595      if (!haveF2orF3(pfx)
21596          || (haveF3noF2(pfx)
21597              && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_BMI))) {
21598         /* no-F2 no-F3 0F BC = BSF
21599                  or F3 0F BC = REP; BSF on older CPUs.  */
21600         delta = dis_bs_E_G ( vbi, pfx, sz, delta, True );
21601         return delta;
21602      }
21603      /* Fall through, since F3 0F BC is TZCNT, and needs to
21604         be handled by dis_ESC_0F__SSE4. */
21605      break;
21606
21607   case 0xBD: /* BSR Gv,Ev */
21608      if (!haveF2orF3(pfx)
21609          || (haveF3noF2(pfx)
21610              && 0 == (archinfo->hwcaps & VEX_HWCAPS_AMD64_LZCNT))) {
21611         /* no-F2 no-F3 0F BD = BSR
21612                  or F3 0F BD = REP; BSR on older CPUs.  */
21613         delta = dis_bs_E_G ( vbi, pfx, sz, delta, False );
21614         return delta;
21615      }
21616      /* Fall through, since F3 0F BD is LZCNT, and needs to
21617         be handled by dis_ESC_0F__SSE4. */
21618      break;
21619
21620   case 0xBE: /* MOVSXb Eb,Gv */
21621      if (haveF2orF3(pfx)) goto decode_failure;
21622      if (sz != 2 && sz != 4 && sz != 8)
21623         goto decode_failure;
21624      delta = dis_movx_E_G ( vbi, pfx, delta, 1, sz, True );
21625      return delta;
21626
21627   case 0xBF: /* MOVSXw Ew,Gv */
21628      if (haveF2orF3(pfx)) goto decode_failure;
21629      if (sz != 4 && sz != 8)
21630         goto decode_failure;
21631      delta = dis_movx_E_G ( vbi, pfx, delta, 2, sz, True );
21632      return delta;
21633
21634   case 0xC0: { /* XADD Gb,Eb */
21635      Bool decode_OK = False;
21636      delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, 1, delta );
21637      if (!decode_OK)
21638         goto decode_failure;
21639      return delta;
21640   }
21641
21642   case 0xC1: { /* XADD Gv,Ev */
21643      Bool decode_OK = False;
21644      delta = dis_xadd_G_E ( &decode_OK, vbi, pfx, sz, delta );
21645      if (!decode_OK)
21646         goto decode_failure;
21647      return delta;
21648   }
21649
21650   case 0xC7: { /* CMPXCHG8B Ev, CMPXCHG16B Ev */
21651      IRType  elemTy     = sz==4 ? Ity_I32 : Ity_I64;
21652      IRTemp  expdHi     = newTemp(elemTy);
21653      IRTemp  expdLo     = newTemp(elemTy);
21654      IRTemp  dataHi     = newTemp(elemTy);
21655      IRTemp  dataLo     = newTemp(elemTy);
21656      IRTemp  oldHi      = newTemp(elemTy);
21657      IRTemp  oldLo      = newTemp(elemTy);
21658      IRTemp  flags_old  = newTemp(Ity_I64);
21659      IRTemp  flags_new  = newTemp(Ity_I64);
21660      IRTemp  success    = newTemp(Ity_I1);
21661      IROp    opOR       = sz==4 ? Iop_Or32    : Iop_Or64;
21662      IROp    opXOR      = sz==4 ? Iop_Xor32   : Iop_Xor64;
21663      IROp    opCasCmpEQ = sz==4 ? Iop_CasCmpEQ32 : Iop_CasCmpEQ64;
21664      IRExpr* zero       = sz==4 ? mkU32(0)    : mkU64(0);
21665      IRTemp expdHi64    = newTemp(Ity_I64);
21666      IRTemp expdLo64    = newTemp(Ity_I64);
21667
21668      /* Translate this using a DCAS, even if there is no LOCK
21669         prefix.  Life is too short to bother with generating two
21670         different translations for the with/without-LOCK-prefix
21671         cases. */
21672      *expect_CAS = True;
21673
21674      /* Decode, and generate address. */
21675      if (have66(pfx)) goto decode_failure;
21676      if (sz != 4 && sz != 8) goto decode_failure;
21677      if (sz == 8 && !(archinfo->hwcaps & VEX_HWCAPS_AMD64_CX16))
21678         goto decode_failure;
21679      modrm = getUChar(delta);
21680      if (epartIsReg(modrm)) goto decode_failure;
21681      if (gregLO3ofRM(modrm) != 1) goto decode_failure;
21682      if (haveF2orF3(pfx)) {
21683         /* Since the e-part is memory only, F2 or F3 (one or the
21684            other) is acceptable if LOCK is also present.  But only
21685            for cmpxchg8b. */
21686         if (sz == 8) goto decode_failure;
21687         if (haveF2andF3(pfx) || !haveLOCK(pfx)) goto decode_failure;
21688      }
21689
21690      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21691      delta += alen;
21692
21693      /* cmpxchg16b requires an alignment check. */
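      /* (The hardware requires the 16-byte operand of cmpxchg16b to
         be 16-aligned, and #GP's otherwise; we deliver SEGV in that
         case.) */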
21694      if (sz == 8)
21695         gen_SEGV_if_not_16_aligned( addr );
21696
21697      /* Get the expected and new values. */
21698      assign( expdHi64, getIReg64(R_RDX) );
21699      assign( expdLo64, getIReg64(R_RAX) );
21700
21701      /* These are the correctly-sized expected and new values.
21702         However, we also get expdHi64/expdLo64 above as 64-bits
21703         regardless, because we will need them later in the 32-bit
21704         case (paradoxically). */
21705      assign( expdHi, sz==4 ? unop(Iop_64to32, mkexpr(expdHi64))
21706                            : mkexpr(expdHi64) );
21707      assign( expdLo, sz==4 ? unop(Iop_64to32, mkexpr(expdLo64))
21708                            : mkexpr(expdLo64) );
21709      assign( dataHi, sz==4 ? getIReg32(R_RCX) : getIReg64(R_RCX) );
21710      assign( dataLo, sz==4 ? getIReg32(R_RBX) : getIReg64(R_RBX) );
21711
21712      /* Do the DCAS */
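      /* mkIRCAS describes a double CAS: oldHi:oldLo receive the
         values observed at addr, and the store of dataHi:dataLo
         happens only if those observed values equal
         expdHi:expdLo. */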
21713      stmt( IRStmt_CAS(
21714               mkIRCAS( oldHi, oldLo,
21715                        Iend_LE, mkexpr(addr),
21716                        mkexpr(expdHi), mkexpr(expdLo),
21717                        mkexpr(dataHi), mkexpr(dataLo)
21718            )));
21719
21720      /* success when oldHi:oldLo == expdHi:expdLo */
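      /* ((oldHi ^ expdHi) | (oldLo ^ expdLo)) == 0 holds exactly
         when both halves match, so a single comparison suffices. */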
21721      assign( success,
21722              binop(opCasCmpEQ,
21723                    binop(opOR,
21724                          binop(opXOR, mkexpr(oldHi), mkexpr(expdHi)),
21725                          binop(opXOR, mkexpr(oldLo), mkexpr(expdLo))
21726                    ),
21727                    zero
21728              ));
21729
21730      /* If the DCAS is successful, that is to say oldHi:oldLo ==
21731         expdHi:expdLo, then put expdHi:expdLo back in RDX:RAX,
21732         which is where they came from originally.  Both the actual
21733         contents of these two regs, and any shadow values, are
21734         unchanged.  If the DCAS fails then we're putting into
21735         RDX:RAX the value seen in memory. */
21736      /* Now of course there's a complication in the 32-bit case
21737         (bah!): if the DCAS succeeds, we need to leave RDX:RAX
21738         unchanged; but if we use the same scheme as in the 64-bit
21739         case, we get hit by the standard rule that a write to the
21740         bottom 32 bits of an integer register zeros the upper 32
21741         bits.  And so the upper halves of RDX and RAX mysteriously
21742         become zero.  So we have to stuff back in the original
21743         64-bit values which we previously stashed in
21744         expdHi64:expdLo64, even if we're doing a cmpxchg8b. */
21745      /* It's just _so_ much fun ... */
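      /* Illustration (values invented): for cmpxchg8b, suppose
         RDX = 0x1111111122222222 and the DCAS succeeds.  RDX must be
         left unchanged; but a 32-bit put of expdHi (= EDX =
         0x22222222) would zero RDX[63:32], leaving
         0x0000000022222222.  Hence the unconditional 64-bit puts of
         the stashed expdHi64/expdLo64 below. */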
21746      putIRegRDX( 8,
21747                  IRExpr_ITE( mkexpr(success),
21748                              mkexpr(expdHi64),
21749                              sz == 4 ? unop(Iop_32Uto64, mkexpr(oldHi))
21750                                      : mkexpr(oldHi)
21751                ));
21752      putIRegRAX( 8,
21753                  IRExpr_ITE( mkexpr(success),
21754                              mkexpr(expdLo64),
21755                              sz == 4 ? unop(Iop_32Uto64, mkexpr(oldLo))
21756                                      : mkexpr(oldLo)
21757                ));
21758
21759      /* Copy the success bit into the Z flag and leave the others
21760         unchanged */
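      /* That is, flags_new = (flags_old & ~AMD64G_CC_MASK_Z)
                              | (1Uto64(success) << AMD64G_CC_SHIFT_Z). */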
21761      assign( flags_old, widenUto64(mk_amd64g_calculate_rflags_all()));
21762      assign(
21763         flags_new,
21764         binop(Iop_Or64,
21765               binop(Iop_And64, mkexpr(flags_old),
21766                                mkU64(~AMD64G_CC_MASK_Z)),
21767               binop(Iop_Shl64,
21768                     binop(Iop_And64,
21769                           unop(Iop_1Uto64, mkexpr(success)), mkU64(1)),
21770                     mkU8(AMD64G_CC_SHIFT_Z)) ));
21771
21772      stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(AMD64G_CC_OP_COPY) ));
21773      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
21774      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0) ));
21775      /* Set NDEP even though it isn't used.  This makes
21776         redundant-PUT elimination of previous stores to this field
21777         work better. */
21778      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU64(0) ));
21779
21780      /* Sheesh.  Aren't you glad it was me and not you that had to
21781         write and validate all this grunge? */
21782
21783      DIP("cmpxchg8b %s\n", dis_buf);
21784      return delta;
21785   }
21786
21787   case 0xC8: /* BSWAP %eax */
21788   case 0xC9:
21789   case 0xCA:
21790   case 0xCB:
21791   case 0xCC:
21792   case 0xCD:
21793   case 0xCE:
21794   case 0xCF: /* BSWAP %edi */
21795      if (haveF2orF3(pfx)) goto decode_failure;
21796      /* According to the AMD64 docs, this insn can have size 4 or
21797         8. */
21798      if (sz == 4) {
21799         t1 = newTemp(Ity_I32);
21800         assign( t1, getIRegRexB(4, pfx, opc-0xC8) );
21801         t2 = math_BSWAP( t1, Ity_I32 );
21802         putIRegRexB(4, pfx, opc-0xC8, mkexpr(t2));
21803         DIP("bswapl %s\n", nameIRegRexB(4, pfx, opc-0xC8));
21804         return delta;
21805      }
      if (sz == 8) {
         t1 = newTemp(Ity_I64);
         assign( t1, getIRegRexB(8, pfx, opc-0xC8) );
         t2 = math_BSWAP( t1, Ity_I64 );
21811         putIRegRexB(8, pfx, opc-0xC8, mkexpr(t2));
21812         DIP("bswapq %s\n", nameIRegRexB(8, pfx, opc-0xC8));
21813         return delta;
21814      }
21815      goto decode_failure;
21816
21817   default:
21818      break;
21819
21820   } /* first switch */
21821
21822
21823   /* =-=-=-=-=-=-=-=-= MMXery =-=-=-=-=-=-=-=-= */
21824   /* In the second switch, pick off MMX insns. */
21825
21826   if (!have66orF2orF3(pfx)) {
21827      /* So there's no SIMD prefix. */
21828
21829      vassert(sz == 4 || sz == 8);
21830
21831      switch (opc) { /* second switch */
21832
21833      case 0x71:
21834      case 0x72:
21835      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
21836
21837      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
21838      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
21839      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
21840      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
21841
21842      case 0xFC:
21843      case 0xFD:
21844      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
21845
21846      case 0xEC:
21847      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
21848
21849      case 0xDC:
21850      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
21851
21852      case 0xF8:
21853      case 0xF9:
21854      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
21855
21856      case 0xE8:
21857      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
21858
21859      case 0xD8:
21860      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
21861
21862      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
21863      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
21864
21865      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
21866
21867      case 0x74:
21868      case 0x75:
21869      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
21870
21871      case 0x64:
21872      case 0x65:
21873      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
21874
21875      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
21876      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
21877      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
21878
21879      case 0x68:
21880      case 0x69:
21881      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
21882
21883      case 0x60:
21884      case 0x61:
21885      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
21886
21887      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
21888      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
21889      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
21890      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
21891
21892      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
21893      case 0xF2:
21894      case 0xF3:
21895
21896      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
21897      case 0xD2:
21898      case 0xD3:
21899
21900      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
21901      case 0xE2: {
21902         Bool decode_OK = False;
21903         delta = dis_MMX ( &decode_OK, vbi, pfx, sz, deltaIN );
21904         if (decode_OK)
21905            return delta;
21906         goto decode_failure;
21907      }
21908
21909      default:
21910         break;
21911      } /* second switch */
21912
21913   }
21914
21915   /* A couple of MMX corner cases */
21916   if (opc == 0x0E/* FEMMS */ || opc == 0x77/* EMMS */) {
21917      if (sz != 4)
21918         goto decode_failure;
21919      do_EMMS_preamble();
21920      DIP("{f}emms\n");
21921      return delta;
21922   }
21923
21924   /* =-=-=-=-=-=-=-=-= SSE2ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSE or SSE2 instruction.  We can try this
      without checking the guest hwcaps because SSE2 is a baseline
      facility in 64-bit mode. */
21928   {
21929      Bool decode_OK = False;
21930      delta = dis_ESC_0F__SSE2 ( &decode_OK, vbi, pfx, sz, deltaIN, dres );
21931      if (decode_OK)
21932         return delta;
21933   }
21934
21935   /* =-=-=-=-=-=-=-=-= SSE3ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSE3 instruction.  FIXME: check guest hwcaps
      first. */
21938   {
21939      Bool decode_OK = False;
21940      delta = dis_ESC_0F__SSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
21941      if (decode_OK)
21942         return delta;
21943   }
21944
21945   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
   /* Perhaps it's an SSE4 instruction.  FIXME: check guest hwcaps
      first. */
21948   {
21949      Bool decode_OK = False;
21950      delta = dis_ESC_0F__SSE4 ( &decode_OK,
21951                                 archinfo, vbi, pfx, sz, deltaIN );
21952      if (decode_OK)
21953         return delta;
21954   }
21955
21956  decode_failure:
21957   return deltaIN; /* fail */
21958}
21959
21960
21961/*------------------------------------------------------------*/
21962/*---                                                      ---*/
21963/*--- Top-level post-escape decoders: dis_ESC_0F38         ---*/
21964/*---                                                      ---*/
21965/*------------------------------------------------------------*/
21966
21967__attribute__((noinline))
21968static
21969Long dis_ESC_0F38 (
21970        /*MB_OUT*/DisResult* dres,
21971        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
21972        Bool         resteerCisOk,
21973        void*        callback_opaque,
21974        VexArchInfo* archinfo,
21975        VexAbiInfo*  vbi,
21976        Prefix pfx, Int sz, Long deltaIN
21977     )
21978{
21979   Long   delta = deltaIN;
21980   UChar  opc   = getUChar(delta);
21981   delta++;
21982   switch (opc) {
21983
21984   case 0xF0:   /* 0F 38 F0 = MOVBE m16/32/64(E), r16/32/64(G) */
21985   case 0xF1: { /* 0F 38 F1 = MOVBE r16/32/64(G), m16/32/64(E) */
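      /* MOVBE is a byte-swapping load/store: the load form (F0)
         reads memory, byte-reverses the value and writes it to G;
         the store form (F1) byte-reverses G and writes it to memory.
         The memory operand is mandatory -- reg-reg forms are
         undefined, hence the epartIsReg check below. */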
21986      if (!haveF2orF3(pfx) && !haveVEX(pfx)
21987          && (sz == 2 || sz == 4 || sz == 8)) {
21988         IRTemp addr  = IRTemp_INVALID;
21989         UChar  modrm = 0;
21990         Int    alen  = 0;
21991         HChar  dis_buf[50];
21992         modrm = getUChar(delta);
21993         if (epartIsReg(modrm)) break;
21994         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
21995         delta += alen;
21996         IRType ty = szToITy(sz);
21997         IRTemp src = newTemp(ty);
21998         if (opc == 0xF0) { /* LOAD */
21999            assign(src, loadLE(ty, mkexpr(addr)));
22000            IRTemp dst = math_BSWAP(src, ty);
22001            putIRegG(sz, pfx, modrm, mkexpr(dst));
22002            DIP("movbe %s,%s\n", dis_buf, nameIRegG(sz, pfx, modrm));
22003         } else { /* STORE */
22004            assign(src, getIRegG(sz, pfx, modrm));
22005            IRTemp dst = math_BSWAP(src, ty);
22006            storeLE(mkexpr(addr), mkexpr(dst));
22007            DIP("movbe %s,%s\n", nameIRegG(sz, pfx, modrm), dis_buf);
22008         }
22009         return delta;
22010      }
22011      /* else fall through; maybe one of the decoders below knows what
22012         it is. */
22013      break;
22014   }
22015
22016   default:
22017      break;
22018
22019   }
22020
22021   /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
22022   /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
22023      rather than proceeding indiscriminately. */
22024   {
22025      Bool decode_OK = False;
22026      delta = dis_ESC_0F38__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
22027      if (decode_OK)
22028         return delta;
22029   }
22030
22031   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
22032   /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
22033      rather than proceeding indiscriminately. */
22034   {
22035      Bool decode_OK = False;
22036      delta = dis_ESC_0F38__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
22037      if (decode_OK)
22038         return delta;
22039   }
22040
22041  /*decode_failure:*/
22042   return deltaIN; /* fail */
22043}
22044
22045
22046/*------------------------------------------------------------*/
22047/*---                                                      ---*/
22048/*--- Top-level post-escape decoders: dis_ESC_0F3A         ---*/
22049/*---                                                      ---*/
22050/*------------------------------------------------------------*/
22051
22052__attribute__((noinline))
22053static
22054Long dis_ESC_0F3A (
22055        /*MB_OUT*/DisResult* dres,
22056        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
22057        Bool         resteerCisOk,
22058        void*        callback_opaque,
22059        VexArchInfo* archinfo,
22060        VexAbiInfo*  vbi,
22061        Prefix pfx, Int sz, Long deltaIN
22062     )
22063{
22064   Long   delta = deltaIN;
22065   UChar  opc   = getUChar(delta);
22066   delta++;
22067   switch (opc) {
22068
22069   default:
22070      break;
22071
22072   }
22073
22074   /* =-=-=-=-=-=-=-=-= SSSE3ery =-=-=-=-=-=-=-=-= */
22075   /* Perhaps it's an SSSE3 instruction.  FIXME: consult guest hwcaps
22076      rather than proceeding indiscriminately. */
22077   {
22078      Bool decode_OK = False;
22079      delta = dis_ESC_0F3A__SupSSE3 ( &decode_OK, vbi, pfx, sz, deltaIN );
22080      if (decode_OK)
22081         return delta;
22082   }
22083
22084   /* =-=-=-=-=-=-=-=-= SSE4ery =-=-=-=-=-=-=-=-= */
22085   /* Perhaps it's an SSE4 instruction.  FIXME: consult guest hwcaps
22086      rather than proceeding indiscriminately. */
22087   {
22088      Bool decode_OK = False;
22089      delta = dis_ESC_0F3A__SSE4 ( &decode_OK, vbi, pfx, sz, deltaIN );
22090      if (decode_OK)
22091         return delta;
22092   }
22093
22094   return deltaIN; /* fail */
22095}
22096
22097
22098/*------------------------------------------------------------*/
22099/*---                                                      ---*/
22100/*--- Top-level post-escape decoders: dis_ESC_0F__VEX      ---*/
22101/*---                                                      ---*/
22102/*------------------------------------------------------------*/
22103
22104/* FIXME: common up with the _256_ version below? */
22105static
22106Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG (
22107        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
22108        Prefix pfx, Long delta, const HChar* name,
        /* The actual operation.  Use either 'op' or 'opFn',
           but not both. */
22111        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
22112        Bool invertLeftArg,
22113        Bool swapArgs
22114     )
22115{
22116   UChar  modrm = getUChar(delta);
22117   UInt   rD    = gregOfRexRM(pfx, modrm);
22118   UInt   rSL   = getVexNvvvv(pfx);
22119   IRTemp tSL   = newTemp(Ity_V128);
22120   IRTemp tSR   = newTemp(Ity_V128);
22121   IRTemp addr  = IRTemp_INVALID;
22122   HChar  dis_buf[50];
22123   Int    alen  = 0;
22124   vassert(0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*WIG?*/);
22125
22126   assign(tSL, invertLeftArg ? unop(Iop_NotV128, getXMMReg(rSL))
22127                             : getXMMReg(rSL));
22128
22129   if (epartIsReg(modrm)) {
22130      UInt rSR = eregOfRexRM(pfx, modrm);
22131      delta += 1;
22132      assign(tSR, getXMMReg(rSR));
22133      DIP("%s %s,%s,%s\n",
22134          name, nameXMMReg(rSR), nameXMMReg(rSL), nameXMMReg(rD));
22135   } else {
22136      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
22137      delta += alen;
22138      assign(tSR, loadLE(Ity_V128, mkexpr(addr)));
22139      DIP("%s %s,%s,%s\n",
22140          name, dis_buf, nameXMMReg(rSL), nameXMMReg(rD));
22141   }
22142
22143   IRTemp res = IRTemp_INVALID;
22144   if (op != Iop_INVALID) {
22145      vassert(opFn == NULL);
22146      res = newTemp(Ity_V128);
22147      if (requiresRMode(op)) {
22148         IRTemp rm = newTemp(Ity_I32);
22149         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
22150         assign(res, swapArgs
22151                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
22152                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
22153      } else {
22154         assign(res, swapArgs
22155                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
22156                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
22157      }
22158   } else {
22159      vassert(opFn != NULL);
22160      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
22161   }
22162
22163   putYMMRegLoAndZU(rD, mkexpr(res));
22164
22165   *uses_vvvv = True;
22166   return delta;
22167}
22168
22169
22170/* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, with a simple IROp
22171   for the operation, no inversion of the left arg, and no swapping of
22172   args. */
22173static
22174Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple (
22175        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
22176        Prefix pfx, Long delta, const HChar* name,
22177        IROp op
22178     )
22179{
22180   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22181             uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
22182}
22183
22184
22185/* Handle a VEX_NDS_128_66_0F_WIG (3-addr) insn, using the given IR
22186   generator to compute the result, no inversion of the left
22187   arg, and no swapping of args. */
22188static
22189Long dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex (
22190        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
22191        Prefix pfx, Long delta, const HChar* name,
22192        IRTemp(*opFn)(IRTemp,IRTemp)
22193     )
22194{
22195   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22196             uses_vvvv, vbi, pfx, delta, name,
22197             Iop_INVALID, opFn, False, False );
22198}
22199
22200
22201/* Vector by scalar shift of V by the amount specified at the bottom
22202   of E. */
22203static ULong dis_AVX128_shiftV_byE ( VexAbiInfo* vbi,
22204                                     Prefix pfx, Long delta,
22205                                     const HChar* opname, IROp op )
22206{
22207   HChar   dis_buf[50];
22208   Int     alen, size;
22209   IRTemp  addr;
22210   Bool    shl, shr, sar;
22211   UChar   modrm = getUChar(delta);
22212   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);
22214   IRTemp  g0    = newTemp(Ity_V128);
22215   IRTemp  g1    = newTemp(Ity_V128);
22216   IRTemp  amt   = newTemp(Ity_I64);
22217   IRTemp  amt8  = newTemp(Ity_I8);
22218   if (epartIsReg(modrm)) {
22219      UInt rE = eregOfRexRM(pfx,modrm);
22220      assign( amt, getXMMRegLane64(rE, 0) );
22221      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
22222          nameXMMReg(rV), nameXMMReg(rG) );
22223      delta++;
22224   } else {
22225      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22226      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
22227      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
22228      delta += alen;
22229   }
22230   assign( g0, getXMMReg(rV) );
22231   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
22232
22233   shl = shr = sar = False;
22234   size = 0;
22235   switch (op) {
      case Iop_ShlN16x8: shl = True; size = 16; break;
22237      case Iop_ShlN32x4: shl = True; size = 32; break;
22238      case Iop_ShlN64x2: shl = True; size = 64; break;
22239      case Iop_SarN16x8: sar = True; size = 16; break;
22240      case Iop_SarN32x4: sar = True; size = 32; break;
22241      case Iop_ShrN16x8: shr = True; size = 16; break;
22242      case Iop_ShrN32x4: shr = True; size = 32; break;
22243      case Iop_ShrN64x2: shr = True; size = 64; break;
22244      default: vassert(0);
22245   }
22246
22247   if (shl || shr) {
22248     assign(
22249        g1,
22250        IRExpr_ITE(
22251           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
22252           binop(op, mkexpr(g0), mkexpr(amt8)),
22253           mkV128(0x0000)
22254        )
22255     );
22256   } else
22257   if (sar) {
22258     assign(
22259        g1,
22260        IRExpr_ITE(
22261           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
22262           binop(op, mkexpr(g0), mkexpr(amt8)),
22263           binop(op, mkexpr(g0), mkU8(size-1))
22264        )
22265     );
22266   } else {
22267      vassert(0);
22268   }
22269
22270   putYMMRegLoAndZU( rG, mkexpr(g1) );
22271   return delta;
22272}
22273
22274
22275/* Vector by scalar shift of V by the amount specified at the bottom
22276   of E. */
22277static ULong dis_AVX256_shiftV_byE ( VexAbiInfo* vbi,
22278                                     Prefix pfx, Long delta,
22279                                     const HChar* opname, IROp op )
22280{
22281   HChar   dis_buf[50];
22282   Int     alen, size;
22283   IRTemp  addr;
22284   Bool    shl, shr, sar;
22285   UChar   modrm = getUChar(delta);
22286   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);
22288   IRTemp  g0    = newTemp(Ity_V256);
22289   IRTemp  g1    = newTemp(Ity_V256);
22290   IRTemp  amt   = newTemp(Ity_I64);
22291   IRTemp  amt8  = newTemp(Ity_I8);
22292   if (epartIsReg(modrm)) {
22293      UInt rE = eregOfRexRM(pfx,modrm);
22294      assign( amt, getXMMRegLane64(rE, 0) );
22295      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
22296          nameYMMReg(rV), nameYMMReg(rG) );
22297      delta++;
22298   } else {
22299      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22300      assign( amt, loadLE(Ity_I64, mkexpr(addr)) );
22301      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
22302      delta += alen;
22303   }
22304   assign( g0, getYMMReg(rV) );
22305   assign( amt8, unop(Iop_64to8, mkexpr(amt)) );
22306
22307   shl = shr = sar = False;
22308   size = 0;
22309   switch (op) {
      case Iop_ShlN16x16: shl = True; size = 16; break;
22311      case Iop_ShlN32x8:  shl = True; size = 32; break;
22312      case Iop_ShlN64x4:  shl = True; size = 64; break;
22313      case Iop_SarN16x16: sar = True; size = 16; break;
22314      case Iop_SarN32x8:  sar = True; size = 32; break;
22315      case Iop_ShrN16x16: shr = True; size = 16; break;
22316      case Iop_ShrN32x8:  shr = True; size = 32; break;
22317      case Iop_ShrN64x4:  shr = True; size = 64; break;
22318      default: vassert(0);
22319   }
22320
22321   if (shl || shr) {
22322     assign(
22323        g1,
22324        IRExpr_ITE(
22325           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
22326           binop(op, mkexpr(g0), mkexpr(amt8)),
22327           binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
22328        )
22329     );
22330   } else
22331   if (sar) {
22332     assign(
22333        g1,
22334        IRExpr_ITE(
22335           binop(Iop_CmpLT64U, mkexpr(amt), mkU64(size)),
22336           binop(op, mkexpr(g0), mkexpr(amt8)),
22337           binop(op, mkexpr(g0), mkU8(size-1))
22338        )
22339     );
22340   } else {
22341      vassert(0);
22342   }
22343
22344   putYMMReg( rG, mkexpr(g1) );
22345   return delta;
22346}
22347
22348
/* Vector by vector shift of each lane of V by the amount in the
   corresponding lane of E.  Vector by vector shifts are defined for
   all shift amounts, so not using Iop_S*x* here (and SSE2 doesn't
   support variable shifts anyway).  */
22353static ULong dis_AVX_var_shiftV_byE ( VexAbiInfo* vbi,
22354                                      Prefix pfx, Long delta,
22355                                      const HChar* opname, IROp op, Bool isYMM )
22356{
22357   HChar   dis_buf[50];
22358   Int     alen, size, i;
22359   IRTemp  addr;
22360   UChar   modrm = getUChar(delta);
22361   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);
22363   IRTemp  sV    = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
22364   IRTemp  amt   = isYMM ? newTemp(Ity_V256) : newTemp(Ity_V128);
22365   IRTemp  amts[8], sVs[8], res[8];
22366   if (epartIsReg(modrm)) {
22367      UInt rE = eregOfRexRM(pfx,modrm);
22368      assign( amt, isYMM ? getYMMReg(rE) : getXMMReg(rE) );
22369      if (isYMM) {
22370         DIP("%s %s,%s,%s\n", opname, nameYMMReg(rE),
22371             nameYMMReg(rV), nameYMMReg(rG) );
22372      } else {
22373         DIP("%s %s,%s,%s\n", opname, nameXMMReg(rE),
22374             nameXMMReg(rV), nameXMMReg(rG) );
22375      }
22376      delta++;
22377   } else {
22378      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22379      assign( amt, loadLE(isYMM ? Ity_V256 : Ity_V128, mkexpr(addr)) );
22380      if (isYMM) {
22381         DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV),
22382             nameYMMReg(rG) );
22383      } else {
22384         DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV),
22385             nameXMMReg(rG) );
22386      }
22387      delta += alen;
22388   }
22389   assign( sV, isYMM ? getYMMReg(rV) : getXMMReg(rV) );
22390
22391   size = 0;
22392   switch (op) {
22393      case Iop_Shl32: size = 32; break;
22394      case Iop_Shl64: size = 64; break;
22395      case Iop_Sar32: size = 32; break;
22396      case Iop_Shr32: size = 32; break;
22397      case Iop_Shr64: size = 64; break;
22398      default: vassert(0);
22399   }
22400
22401   for (i = 0; i < 8; i++) {
22402      sVs[i] = IRTemp_INVALID;
22403      amts[i] = IRTemp_INVALID;
22404   }
22405   switch (size) {
22406      case 32:
22407         if (isYMM) {
22408            breakupV256to32s( sV, &sVs[7], &sVs[6], &sVs[5], &sVs[4],
22409                                  &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
22410            breakupV256to32s( amt, &amts[7], &amts[6], &amts[5], &amts[4],
22411                                   &amts[3], &amts[2], &amts[1], &amts[0] );
22412         } else {
22413            breakupV128to32s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
22414            breakupV128to32s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
22415        }
22416         break;
22417      case 64:
22418         if (isYMM) {
22419            breakupV256to64s( sV, &sVs[3], &sVs[2], &sVs[1], &sVs[0] );
22420            breakupV256to64s( amt, &amts[3], &amts[2], &amts[1], &amts[0] );
22421         } else {
22422            breakupV128to64s( sV, &sVs[1], &sVs[0] );
22423            breakupV128to64s( amt, &amts[1], &amts[0] );
22424         }
22425         break;
22426      default: vassert(0);
22427   }
22428   for (i = 0; i < 8; i++)
22429      if (sVs[i] != IRTemp_INVALID) {
22430         res[i] = size == 32 ? newTemp(Ity_I32) : newTemp(Ity_I64);
22431         assign( res[i],
22432                 IRExpr_ITE(
22433                    binop(size == 32 ? Iop_CmpLT32U : Iop_CmpLT64U,
22434                          mkexpr(amts[i]),
22435                          size == 32 ? mkU32(size) : mkU64(size)),
22436                    binop(op, mkexpr(sVs[i]),
22437                               unop(size == 32 ? Iop_32to8 : Iop_64to8,
22438                                    mkexpr(amts[i]))),
22439                    op == Iop_Sar32 ? binop(op, mkexpr(sVs[i]), mkU8(size-1))
22440                                    : size == 32 ? mkU32(0) : mkU64(0)
22441         ));
22442      }
22443   switch (size) {
22444      case 32:
22445         for (i = 0; i < 8; i++)
22446            putYMMRegLane32( rG, i, (i < 4 || isYMM)
22447                                    ? mkexpr(res[i]) : mkU32(0) );
22448         break;
22449      case 64:
22450         for (i = 0; i < 4; i++)
22451            putYMMRegLane64( rG, i, (i < 2 || isYMM)
22452                                    ? mkexpr(res[i]) : mkU64(0) );
22453         break;
22454      default: vassert(0);
22455   }
22456
22457   return delta;
22458}
22459
22460
22461/* Vector by scalar shift of E into V, by an immediate byte.  Modified
22462   version of dis_SSE_shiftE_imm. */
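/* Example: in AT&T syntax, "vpsllw $4,%xmm2,%xmm1" shifts each
   16-bit lane of %xmm2 left by 4 into %xmm1 (the destination comes
   from VEX.vvvv here), zeroing %ymm1[255:128]. */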
22463static
22464Long dis_AVX128_shiftE_to_V_imm( Prefix pfx,
22465                                 Long delta, const HChar* opname, IROp op )
22466{
22467   Bool    shl, shr, sar;
22468   UChar   rm   = getUChar(delta);
22469   IRTemp  e0   = newTemp(Ity_V128);
22470   IRTemp  e1   = newTemp(Ity_V128);
22471   UInt    rD   = getVexNvvvv(pfx);
22472   UChar   amt, size;
22473   vassert(epartIsReg(rm));
22474   vassert(gregLO3ofRM(rm) == 2
22475           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
22476   amt = getUChar(delta+1);
22477   delta += 2;
22478   DIP("%s $%d,%s,%s\n", opname,
22479                         (Int)amt,
22480                         nameXMMReg(eregOfRexRM(pfx,rm)),
22481                         nameXMMReg(rD));
22482   assign( e0, getXMMReg(eregOfRexRM(pfx,rm)) );
22483
22484   shl = shr = sar = False;
22485   size = 0;
22486   switch (op) {
22487      case Iop_ShlN16x8: shl = True; size = 16; break;
22488      case Iop_ShlN32x4: shl = True; size = 32; break;
22489      case Iop_ShlN64x2: shl = True; size = 64; break;
22490      case Iop_SarN16x8: sar = True; size = 16; break;
22491      case Iop_SarN32x4: sar = True; size = 32; break;
22492      case Iop_ShrN16x8: shr = True; size = 16; break;
22493      case Iop_ShrN32x4: shr = True; size = 32; break;
22494      case Iop_ShrN64x2: shr = True; size = 64; break;
22495      default: vassert(0);
22496   }
22497
22498   if (shl || shr) {
22499     assign( e1, amt >= size
22500                    ? mkV128(0x0000)
22501                    : binop(op, mkexpr(e0), mkU8(amt))
22502     );
22503   } else
22504   if (sar) {
22505     assign( e1, amt >= size
22506                    ? binop(op, mkexpr(e0), mkU8(size-1))
22507                    : binop(op, mkexpr(e0), mkU8(amt))
22508     );
22509   } else {
22510      vassert(0);
22511   }
22512
22513   putYMMRegLoAndZU( rD, mkexpr(e1) );
22514   return delta;
22515}
22516
22517
22518/* Vector by scalar shift of E into V, by an immediate byte.  Modified
22519   version of dis_AVX128_shiftE_to_V_imm. */
22520static
22521Long dis_AVX256_shiftE_to_V_imm( Prefix pfx,
22522                                 Long delta, const HChar* opname, IROp op )
22523{
22524   Bool    shl, shr, sar;
22525   UChar   rm   = getUChar(delta);
22526   IRTemp  e0   = newTemp(Ity_V256);
22527   IRTemp  e1   = newTemp(Ity_V256);
22528   UInt    rD   = getVexNvvvv(pfx);
22529   UChar   amt, size;
22530   vassert(epartIsReg(rm));
22531   vassert(gregLO3ofRM(rm) == 2
22532           || gregLO3ofRM(rm) == 4 || gregLO3ofRM(rm) == 6);
22533   amt = getUChar(delta+1);
22534   delta += 2;
22535   DIP("%s $%d,%s,%s\n", opname,
22536                         (Int)amt,
22537                         nameYMMReg(eregOfRexRM(pfx,rm)),
22538                         nameYMMReg(rD));
22539   assign( e0, getYMMReg(eregOfRexRM(pfx,rm)) );
22540
22541   shl = shr = sar = False;
22542   size = 0;
22543   switch (op) {
22544      case Iop_ShlN16x16: shl = True; size = 16; break;
22545      case Iop_ShlN32x8:  shl = True; size = 32; break;
22546      case Iop_ShlN64x4:  shl = True; size = 64; break;
22547      case Iop_SarN16x16: sar = True; size = 16; break;
22548      case Iop_SarN32x8:  sar = True; size = 32; break;
22549      case Iop_ShrN16x16: shr = True; size = 16; break;
22550      case Iop_ShrN32x8:  shr = True; size = 32; break;
22551      case Iop_ShrN64x4:  shr = True; size = 64; break;
22552      default: vassert(0);
22553   }
22554
22555
22556   if (shl || shr) {
22557     assign( e1, amt >= size
22558                    ? binop(Iop_V128HLtoV256, mkV128(0), mkV128(0))
22559                    : binop(op, mkexpr(e0), mkU8(amt))
22560     );
22561   } else
22562   if (sar) {
22563     assign( e1, amt >= size
22564                    ? binop(op, mkexpr(e0), mkU8(size-1))
22565                    : binop(op, mkexpr(e0), mkU8(amt))
22566     );
22567   } else {
22568      vassert(0);
22569   }
22570
22571   putYMMReg( rD, mkexpr(e1) );
22572   return delta;
22573}
22574
22575
22576/* Lower 64-bit lane only AVX128 binary operation:
22577   G[63:0]    = V[63:0] `op` E[63:0]
22578   G[127:64]  = V[127:64]
22579   G[255:128] = 0.
22580   The specified op must be of the 64F0x2 kind, so that it
22581   copies the upper half of the left operand to the result.
22582*/
22583static Long dis_AVX128_E_V_to_G_lo64 ( /*OUT*/Bool* uses_vvvv,
22584                                       VexAbiInfo* vbi,
22585                                       Prefix pfx, Long delta,
22586                                       const HChar* opname, IROp op )
22587{
22588   HChar   dis_buf[50];
22589   Int     alen;
22590   IRTemp  addr;
22591   UChar   rm    = getUChar(delta);
22592   UInt    rG    = gregOfRexRM(pfx,rm);
22593   UInt    rV    = getVexNvvvv(pfx);
22594   IRExpr* vpart = getXMMReg(rV);
22595   if (epartIsReg(rm)) {
22596      UInt rE = eregOfRexRM(pfx,rm);
22597      putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
22598      DIP("%s %s,%s,%s\n", opname,
22599          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
22600      delta = delta+1;
22601   } else {
22602      /* We can only do a 64-bit memory read, so the upper half of the
22603         E operand needs to be made simply of zeroes. */
22604      IRTemp epart = newTemp(Ity_V128);
22605      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22606      assign( epart, unop( Iop_64UtoV128,
22607                           loadLE(Ity_I64, mkexpr(addr))) );
22608      putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
22609      DIP("%s %s,%s,%s\n", opname,
22610          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
22611      delta = delta+alen;
22612   }
22613   putYMMRegLane128( rG, 1, mkV128(0) );
22614   *uses_vvvv = True;
22615   return delta;
22616}
22617
22618
22619/* Lower 64-bit lane only AVX128 unary operation:
22620   G[63:0]    = op(E[63:0])
22621   G[127:64]  = V[127:64]
22622   G[255:128] = 0
22623   The specified op must be of the 64F0x2 kind, so that it
22624   copies the upper half of the operand to the result.
22625*/
22626static Long dis_AVX128_E_V_to_G_lo64_unary ( /*OUT*/Bool* uses_vvvv,
22627                                             VexAbiInfo* vbi,
22628                                             Prefix pfx, Long delta,
22629                                             const HChar* opname, IROp op )
22630{
22631   HChar   dis_buf[50];
22632   Int     alen;
22633   IRTemp  addr;
22634   UChar   rm  = getUChar(delta);
22635   UInt    rG  = gregOfRexRM(pfx,rm);
22636   UInt    rV  = getVexNvvvv(pfx);
22637   IRTemp  e64 = newTemp(Ity_I64);
22638
22639   /* Fetch E[63:0] */
22640   if (epartIsReg(rm)) {
22641      UInt rE = eregOfRexRM(pfx,rm);
22642      assign(e64, getXMMRegLane64(rE, 0));
22643      DIP("%s %s,%s,%s\n", opname,
22644          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
22645      delta += 1;
22646   } else {
22647      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22648      assign(e64, loadLE(Ity_I64, mkexpr(addr)));
22649      DIP("%s %s,%s,%s\n", opname,
22650          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
22651      delta += alen;
22652   }
22653
22654   /* Create a value 'arg' as V[127:64]++E[63:0] */
22655   IRTemp arg = newTemp(Ity_V128);
22656   assign(arg,
22657          binop(Iop_SetV128lo64,
22658                getXMMReg(rV), mkexpr(e64)));
22659   /* and apply op to it */
22660   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
22661   *uses_vvvv = True;
22662   return delta;
22663}
22664
22665
22666/* Lower 32-bit lane only AVX128 unary operation:
22667   G[31:0]    = op(E[31:0])
22668   G[127:32]  = V[127:32]
22669   G[255:128] = 0
22670   The specified op must be of the 32F0x4 kind, so that it
22671   copies the upper 3/4 of the operand to the result.
22672*/
22673static Long dis_AVX128_E_V_to_G_lo32_unary ( /*OUT*/Bool* uses_vvvv,
22674                                             VexAbiInfo* vbi,
22675                                             Prefix pfx, Long delta,
22676                                             const HChar* opname, IROp op )
22677{
22678   HChar   dis_buf[50];
22679   Int     alen;
22680   IRTemp  addr;
22681   UChar   rm  = getUChar(delta);
22682   UInt    rG  = gregOfRexRM(pfx,rm);
22683   UInt    rV  = getVexNvvvv(pfx);
22684   IRTemp  e32 = newTemp(Ity_I32);
22685
22686   /* Fetch E[31:0] */
22687   if (epartIsReg(rm)) {
22688      UInt rE = eregOfRexRM(pfx,rm);
22689      assign(e32, getXMMRegLane32(rE, 0));
22690      DIP("%s %s,%s,%s\n", opname,
22691          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
22692      delta += 1;
22693   } else {
22694      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22695      assign(e32, loadLE(Ity_I32, mkexpr(addr)));
22696      DIP("%s %s,%s,%s\n", opname,
22697          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
22698      delta += alen;
22699   }
22700
22701   /* Create a value 'arg' as V[127:32]++E[31:0] */
22702   IRTemp arg = newTemp(Ity_V128);
22703   assign(arg,
22704          binop(Iop_SetV128lo32,
22705                getXMMReg(rV), mkexpr(e32)));
22706   /* and apply op to it */
22707   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
22708   *uses_vvvv = True;
22709   return delta;
22710}
22711
22712
22713/* Lower 32-bit lane only AVX128 binary operation:
22714   G[31:0]    = V[31:0] `op` E[31:0]
22715   G[127:32]  = V[127:32]
22716   G[255:128] = 0.
22717   The specified op must be of the 32F0x4 kind, so that it
22718   copies the upper 3/4 of the left operand to the result.
22719*/
22720static Long dis_AVX128_E_V_to_G_lo32 ( /*OUT*/Bool* uses_vvvv,
22721                                       VexAbiInfo* vbi,
22722                                       Prefix pfx, Long delta,
22723                                       const HChar* opname, IROp op )
22724{
22725   HChar   dis_buf[50];
22726   Int     alen;
22727   IRTemp  addr;
22728   UChar   rm    = getUChar(delta);
22729   UInt    rG    = gregOfRexRM(pfx,rm);
22730   UInt    rV    = getVexNvvvv(pfx);
22731   IRExpr* vpart = getXMMReg(rV);
22732   if (epartIsReg(rm)) {
22733      UInt rE = eregOfRexRM(pfx,rm);
22734      putXMMReg( rG, binop(op, vpart, getXMMReg(rE)) );
22735      DIP("%s %s,%s,%s\n", opname,
22736          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
22737      delta = delta+1;
22738   } else {
22739      /* We can only do a 32-bit memory read, so the upper 3/4 of the
22740         E operand needs to be made simply of zeroes. */
22741      IRTemp epart = newTemp(Ity_V128);
22742      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22743      assign( epart, unop( Iop_32UtoV128,
22744                           loadLE(Ity_I32, mkexpr(addr))) );
22745      putXMMReg( rG, binop(op, vpart, mkexpr(epart)) );
22746      DIP("%s %s,%s,%s\n", opname,
22747          dis_buf, nameXMMReg(rV), nameXMMReg(rG));
22748      delta = delta+alen;
22749   }
22750   putYMMRegLane128( rG, 1, mkV128(0) );
22751   *uses_vvvv = True;
22752   return delta;
22753}
22754
22755
22756/* All-lanes AVX128 binary operation:
22757   G[127:0]   = V[127:0] `op` E[127:0]
22758   G[255:128] = 0.
22759*/
22760static Long dis_AVX128_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
22761                                  VexAbiInfo* vbi,
22762                                  Prefix pfx, Long delta,
22763                                  const HChar* opname, IROp op )
22764{
22765   return dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
22766             uses_vvvv, vbi, pfx, delta, opname, op,
22767             NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
22768   );
22769}
22770
22771
22772/* Handles AVX128 32F/64F comparisons.  A derivative of
22773   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
22774   original delta to indicate failure. */
22775static
22776Long dis_AVX128_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
22777                               VexAbiInfo* vbi,
22778                               Prefix pfx, Long delta,
22779                               const HChar* opname, Bool all_lanes, Int sz )
22780{
22781   vassert(sz == 4 || sz == 8);
22782   Long    deltaIN = delta;
22783   HChar   dis_buf[50];
22784   Int     alen;
22785   UInt    imm8;
22786   IRTemp  addr;
22787   Bool    preSwap = False;
22788   IROp    op      = Iop_INVALID;
22789   Bool    postNot = False;
22790   IRTemp  plain   = newTemp(Ity_V128);
22791   UChar   rm      = getUChar(delta);
22792   UInt    rG      = gregOfRexRM(pfx, rm);
22793   UInt    rV      = getVexNvvvv(pfx);
22794   IRTemp argL     = newTemp(Ity_V128);
22795   IRTemp argR     = newTemp(Ity_V128);
22796
22797   assign(argL, getXMMReg(rV));
22798   if (epartIsReg(rm)) {
22799      imm8 = getUChar(delta+1);
22800      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
22801      if (!ok) return deltaIN; /* FAIL */
22802      UInt rE = eregOfRexRM(pfx,rm);
22803      assign(argR, getXMMReg(rE));
22804      delta += 1+1;
22805      DIP("%s $%d,%s,%s,%s\n",
22806          opname, (Int)imm8,
22807          nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
22808   } else {
22809      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
22810      imm8 = getUChar(delta+alen);
22811      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8, all_lanes, sz);
22812      if (!ok) return deltaIN; /* FAIL */
22813      assign(argR,
22814             all_lanes   ? loadLE(Ity_V128, mkexpr(addr))
22815             : sz == 8   ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
22816             : /*sz==4*/   unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr))));
22817      delta += alen+1;
22818      DIP("%s $%d,%s,%s,%s\n",
22819          opname, (Int)imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
22820   }
22821
22822   assign(plain, preSwap ? binop(op, mkexpr(argR), mkexpr(argL))
22823                         : binop(op, mkexpr(argL), mkexpr(argR)));
22824
22825   if (all_lanes) {
22826      /* This is simple: just invert the result, if necessary, and
22827         have done. */
22828      if (postNot) {
22829         putYMMRegLoAndZU( rG, unop(Iop_NotV128, mkexpr(plain)) );
22830      } else {
22831         putYMMRegLoAndZU( rG, mkexpr(plain) );
22832      }
22833   }
22834   else
22835   if (!preSwap) {
      /* More complex.  It's a one-lane-only operation, hence we may
         need to invert only that one lane.  But at least the other
         lanes are correctly "in" the result, having been copied from
         the left operand (argL). */
22840      if (postNot) {
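         /* Note that mkV128 takes a 16-bit mask with one bit per
            byte of the vector, so 0x000F selects the low 32 bits
            and 0x00FF the low 64 bits. */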
22841         IRExpr* mask = mkV128(sz==4 ? 0x000F : 0x00FF);
22842         putYMMRegLoAndZU( rG, binop(Iop_XorV128, mkexpr(plain),
22843                                                  mask) );
22844      } else {
22845         putYMMRegLoAndZU( rG, mkexpr(plain) );
22846      }
22847   }
22848   else {
22849      /* This is the most complex case.  One-lane-only, but the args
22850         were swapped.  So we have to possibly invert the bottom lane,
22851         and (definitely) we have to copy the upper lane(s) from argL
22852         since, due to the swapping, what's currently there is from
22853         argR, which is not correct. */
22854      IRTemp res     = newTemp(Ity_V128);
22855      IRTemp mask    = newTemp(Ity_V128);
22856      IRTemp notMask = newTemp(Ity_V128);
22857      assign(mask,    mkV128(sz==4 ? 0x000F : 0x00FF));
22858      assign(notMask, mkV128(sz==4 ? 0xFFF0 : 0xFF00));
22859      if (postNot) {
22860         assign(res,
22861                binop(Iop_OrV128,
22862                      binop(Iop_AndV128,
22863                            unop(Iop_NotV128, mkexpr(plain)),
22864                            mkexpr(mask)),
22865                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
22866      } else {
22867         assign(res,
22868                binop(Iop_OrV128,
22869                      binop(Iop_AndV128,
22870                            mkexpr(plain),
22871                            mkexpr(mask)),
22872                      binop(Iop_AndV128, mkexpr(argL), mkexpr(notMask))));
22873      }
22874      putYMMRegLoAndZU( rG, mkexpr(res) );
22875   }
22876
22877   *uses_vvvv = True;
22878   return delta;
22879}
22880
22881
22882/* Handles AVX256 32F/64F comparisons.  A derivative of
22883   dis_SSEcmp_E_to_G.  It can fail, in which case it returns the
22884   original delta to indicate failure. */
22885static
22886Long dis_AVX256_cmp_V_E_to_G ( /*OUT*/Bool* uses_vvvv,
22887                               VexAbiInfo* vbi,
22888                               Prefix pfx, Long delta,
22889                               const HChar* opname, Int sz )
22890{
22891   vassert(sz == 4 || sz == 8);
22892   Long    deltaIN = delta;
22893   HChar   dis_buf[50];
22894   Int     alen;
22895   UInt    imm8;
22896   IRTemp  addr;
22897   Bool    preSwap = False;
22898   IROp    op      = Iop_INVALID;
22899   Bool    postNot = False;
22900   IRTemp  plain   = newTemp(Ity_V256);
22901   UChar   rm      = getUChar(delta);
22902   UInt    rG      = gregOfRexRM(pfx, rm);
22903   UInt    rV      = getVexNvvvv(pfx);
22904   IRTemp argL     = newTemp(Ity_V256);
22905   IRTemp argR     = newTemp(Ity_V256);
22906   IRTemp argLhi   = IRTemp_INVALID;
22907   IRTemp argLlo   = IRTemp_INVALID;
22908   IRTemp argRhi   = IRTemp_INVALID;
22909   IRTemp argRlo   = IRTemp_INVALID;
22910
22911   assign(argL, getYMMReg(rV));
22912   if (epartIsReg(rm)) {
22913      imm8 = getUChar(delta+1);
22914      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
22915                             True/*all_lanes*/, sz);
22916      if (!ok) return deltaIN; /* FAIL */
22917      UInt rE = eregOfRexRM(pfx,rm);
22918      assign(argR, getYMMReg(rE));
22919      delta += 1+1;
22920      DIP("%s $%d,%s,%s,%s\n",
22921          opname, (Int)imm8,
22922          nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
22923   } else {
22924      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
22925      imm8 = getUChar(delta+alen);
22926      Bool ok = findSSECmpOp(&preSwap, &op, &postNot, imm8,
22927                             True/*all_lanes*/, sz);
22928      if (!ok) return deltaIN; /* FAIL */
22929      assign(argR, loadLE(Ity_V256, mkexpr(addr)) );
22930      delta += alen+1;
22931      DIP("%s $%d,%s,%s,%s\n",
22932          opname, (Int)imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
22933   }
22934
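   /* There is no single 256-bit compare IROp, so split both args
      into 128-bit halves, apply the 128-bit op to each half, and
      reassemble. */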
22935   breakupV256toV128s( preSwap ? argR : argL, &argLhi, &argLlo );
22936   breakupV256toV128s( preSwap ? argL : argR, &argRhi, &argRlo );
22937   assign(plain, binop( Iop_V128HLtoV256,
22938                        binop(op, mkexpr(argLhi), mkexpr(argRhi)),
22939                        binop(op, mkexpr(argLlo), mkexpr(argRlo)) ) );
22940
22941   /* This is simple: just invert the result, if necessary, and
22942      have done. */
22943   if (postNot) {
22944      putYMMReg( rG, unop(Iop_NotV256, mkexpr(plain)) );
22945   } else {
22946      putYMMReg( rG, mkexpr(plain) );
22947   }
22948
22949   *uses_vvvv = True;
22950   return delta;
22951}
22952
22953
22954/* Handles AVX128 unary E-to-G all-lanes operations. */
22955static
22956Long dis_AVX128_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
22957                               VexAbiInfo* vbi,
22958                               Prefix pfx, Long delta,
22959                               const HChar* opname,
22960                               IRTemp (*opFn)(IRTemp) )
22961{
22962   HChar  dis_buf[50];
22963   Int    alen;
22964   IRTemp addr;
22965   IRTemp res  = newTemp(Ity_V128);
22966   IRTemp arg  = newTemp(Ity_V128);
22967   UChar  rm   = getUChar(delta);
22968   UInt   rG   = gregOfRexRM(pfx, rm);
22969   if (epartIsReg(rm)) {
22970      UInt rE = eregOfRexRM(pfx,rm);
22971      assign(arg, getXMMReg(rE));
22972      delta += 1;
22973      DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
22974   } else {
22975      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
22976      assign(arg, loadLE(Ity_V128, mkexpr(addr)));
22977      delta += alen;
22978      DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
22979   }
22980   res = opFn(arg);
22981   putYMMRegLoAndZU( rG, mkexpr(res) );
22982   *uses_vvvv = False;
22983   return delta;
22984}
22985
22986
/* Handles AVX128 unary E-to-G all-lanes operations, where the op is
   given as a single IROp rather than as a generator function. */
22988static
22989Long dis_AVX128_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
22990                                   VexAbiInfo* vbi,
22991                                   Prefix pfx, Long delta,
22992                                   const HChar* opname, IROp op )
22993{
22994   HChar  dis_buf[50];
22995   Int    alen;
22996   IRTemp addr;
22997   IRTemp arg  = newTemp(Ity_V128);
22998   UChar  rm   = getUChar(delta);
22999   UInt   rG   = gregOfRexRM(pfx, rm);
23000   if (epartIsReg(rm)) {
23001      UInt rE = eregOfRexRM(pfx,rm);
23002      assign(arg, getXMMReg(rE));
23003      delta += 1;
23004      DIP("%s %s,%s\n", opname, nameXMMReg(rE), nameXMMReg(rG));
23005   } else {
23006      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23007      assign(arg, loadLE(Ity_V128, mkexpr(addr)));
23008      delta += alen;
23009      DIP("%s %s,%s\n", opname, dis_buf, nameXMMReg(rG));
23010   }
23011   putYMMRegLoAndZU( rG, unop(op, mkexpr(arg)) );
23012   *uses_vvvv = False;
23013   return delta;
23014}
23015
23016
23017/* FIXME: common up with the _128_ version above? */
23018static
23019Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG (
23020        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
23021        Prefix pfx, Long delta, const HChar* name,
        /* The actual operation.  Use either 'op' or 'opFn',
           but not both. */
23024        IROp op, IRTemp(*opFn)(IRTemp,IRTemp),
23025        Bool invertLeftArg,
23026        Bool swapArgs
23027     )
23028{
23029   UChar  modrm = getUChar(delta);
23030   UInt   rD    = gregOfRexRM(pfx, modrm);
23031   UInt   rSL   = getVexNvvvv(pfx);
23032   IRTemp tSL   = newTemp(Ity_V256);
23033   IRTemp tSR   = newTemp(Ity_V256);
23034   IRTemp addr  = IRTemp_INVALID;
23035   HChar  dis_buf[50];
23036   Int    alen  = 0;
23037   vassert(1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*WIG?*/);
23038
23039   assign(tSL, invertLeftArg ? unop(Iop_NotV256, getYMMReg(rSL))
23040                             : getYMMReg(rSL));
23041
23042   if (epartIsReg(modrm)) {
23043      UInt rSR = eregOfRexRM(pfx, modrm);
23044      delta += 1;
23045      assign(tSR, getYMMReg(rSR));
23046      DIP("%s %s,%s,%s\n",
23047          name, nameYMMReg(rSR), nameYMMReg(rSL), nameYMMReg(rD));
23048   } else {
23049      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
23050      delta += alen;
23051      assign(tSR, loadLE(Ity_V256, mkexpr(addr)));
23052      DIP("%s %s,%s,%s\n",
23053          name, dis_buf, nameYMMReg(rSL), nameYMMReg(rD));
23054   }
23055
23056   IRTemp res = IRTemp_INVALID;
23057   if (op != Iop_INVALID) {
23058      vassert(opFn == NULL);
23059      res = newTemp(Ity_V256);
23060      if (requiresRMode(op)) {
23061         IRTemp rm = newTemp(Ity_I32);
23062         assign(rm, get_FAKE_roundingmode()); /* XXXROUNDINGFIXME */
23063         assign(res, swapArgs
23064                        ? triop(op, mkexpr(rm), mkexpr(tSR), mkexpr(tSL))
23065                        : triop(op, mkexpr(rm), mkexpr(tSL), mkexpr(tSR)));
23066      } else {
23067         assign(res, swapArgs
23068                        ? binop(op, mkexpr(tSR), mkexpr(tSL))
23069                        : binop(op, mkexpr(tSL), mkexpr(tSR)));
23070      }
23071   } else {
23072      vassert(opFn != NULL);
23073      res = swapArgs ? opFn(tSR, tSL) : opFn(tSL, tSR);
23074   }
23075
23076   putYMMReg(rD, mkexpr(res));
23077
23078   *uses_vvvv = True;
23079   return delta;
23080}
23081
23082
23083/* All-lanes AVX256 binary operation:
23084   G[255:0] = V[255:0] `op` E[255:0]
23085*/
23086static Long dis_AVX256_E_V_to_G ( /*OUT*/Bool* uses_vvvv,
23087                                  VexAbiInfo* vbi,
23088                                  Prefix pfx, Long delta,
23089                                  const HChar* opname, IROp op )
23090{
23091   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23092             uses_vvvv, vbi, pfx, delta, opname, op,
23093             NULL, False/*!invertLeftArg*/, False/*!swapArgs*/
23094   );
23095}
23096
23097
23098/* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, with a simple IROp
23099   for the operation, no inversion of the left arg, and no swapping of
23100   args. */
23101static
23102Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple (
23103        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
23104        Prefix pfx, Long delta, const HChar* name,
23105        IROp op
23106     )
23107{
23108   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23109             uses_vvvv, vbi, pfx, delta, name, op, NULL, False, False);
23110}
23111
23112
23113/* Handle a VEX_NDS_256_66_0F_WIG (3-addr) insn, using the given IR
23114   generator to compute the result, no inversion of the left
23115   arg, and no swapping of args. */
23116static
23117Long dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex (
23118        /*OUT*/Bool* uses_vvvv, VexAbiInfo* vbi,
23119        Prefix pfx, Long delta, const HChar* name,
23120        IRTemp(*opFn)(IRTemp,IRTemp)
23121     )
23122{
23123   return dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
23124             uses_vvvv, vbi, pfx, delta, name,
23125             Iop_INVALID, opFn, False, False );
23126}
23127
23128
23129/* Handles AVX256 unary E-to-G all-lanes operations. */
23130static
23131Long dis_AVX256_E_to_G_unary ( /*OUT*/Bool* uses_vvvv,
23132                               VexAbiInfo* vbi,
23133                               Prefix pfx, Long delta,
23134                               const HChar* opname,
23135                               IRTemp (*opFn)(IRTemp) )
23136{
23137   HChar  dis_buf[50];
23138   Int    alen;
23139   IRTemp addr;
23140   IRTemp res  = newTemp(Ity_V256);
23141   IRTemp arg  = newTemp(Ity_V256);
23142   UChar  rm   = getUChar(delta);
23143   UInt   rG   = gregOfRexRM(pfx, rm);
23144   if (epartIsReg(rm)) {
23145      UInt rE = eregOfRexRM(pfx,rm);
23146      assign(arg, getYMMReg(rE));
23147      delta += 1;
23148      DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
23149   } else {
23150      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23151      assign(arg, loadLE(Ity_V256, mkexpr(addr)));
23152      delta += alen;
23153      DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
23154   }
23155   res = opFn(arg);
23156   putYMMReg( rG, mkexpr(res) );
23157   *uses_vvvv = False;
23158   return delta;
23159}
23160
23161
/* Handles AVX256 unary E-to-G all-lanes operations, where the op is
   given as a single IROp rather than as a generator function. */
23163static
23164Long dis_AVX256_E_to_G_unary_all ( /*OUT*/Bool* uses_vvvv,
23165                                   VexAbiInfo* vbi,
23166                                   Prefix pfx, Long delta,
23167                                   const HChar* opname, IROp op )
23168{
23169   HChar  dis_buf[50];
23170   Int    alen;
23171   IRTemp addr;
23172   IRTemp arg  = newTemp(Ity_V256);
23173   UChar  rm   = getUChar(delta);
23174   UInt   rG   = gregOfRexRM(pfx, rm);
23175   if (epartIsReg(rm)) {
23176      UInt rE = eregOfRexRM(pfx,rm);
23177      assign(arg, getYMMReg(rE));
23178      delta += 1;
23179      DIP("%s %s,%s\n", opname, nameYMMReg(rE), nameYMMReg(rG));
23180   } else {
23181      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23182      assign(arg, loadLE(Ity_V256, mkexpr(addr)));
23183      delta += alen;
23184      DIP("%s %s,%s\n", opname, dis_buf, nameYMMReg(rG));
23185   }
23186   putYMMReg( rG, unop(op, mkexpr(arg)) );
23187   *uses_vvvv = False;
23188   return delta;
23189}
23190
23191
23192/* The use of ReinterpF64asI64 is ugly.  Surely could do better if we
23193   had a variant of Iop_64x4toV256 that took F64s as args instead. */
23194static Long dis_CVTDQ2PD_256 ( VexAbiInfo* vbi, Prefix pfx,
23195                               Long delta )
23196{
23197   IRTemp addr  = IRTemp_INVALID;
23198   Int    alen  = 0;
23199   HChar  dis_buf[50];
23200   UChar  modrm = getUChar(delta);
23201   IRTemp sV    = newTemp(Ity_V128);
23202   UInt   rG    = gregOfRexRM(pfx,modrm);
23203   if (epartIsReg(modrm)) {
23204      UInt rE = eregOfRexRM(pfx,modrm);
23205      assign( sV, getXMMReg(rE) );
23206      delta += 1;
23207      DIP("vcvtdq2pd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
23208   } else {
23209      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23210      assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
23211      delta += alen;
23212      DIP("vcvtdq2pd %s,%s\n", dis_buf, nameYMMReg(rG) );
23213   }
23214   IRTemp s3, s2, s1, s0;
23215   s3 = s2 = s1 = s0 = IRTemp_INVALID;
23216   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
23217   IRExpr* res
23218      = IRExpr_Qop(
23219           Iop_64x4toV256,
23220           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s3))),
23221           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s2))),
23222           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s1))),
23223           unop(Iop_ReinterpF64asI64, unop(Iop_I32StoF64, mkexpr(s0)))
23224        );
23225   putYMMReg(rG, res);
23226   return delta;
23227}
23228
23229
23230static Long dis_CVTPD2PS_256 ( VexAbiInfo* vbi, Prefix pfx,
23231                               Long delta )
23232{
23233   IRTemp addr  = IRTemp_INVALID;
23234   Int    alen  = 0;
23235   HChar  dis_buf[50];
23236   UChar  modrm = getUChar(delta);
23237   UInt   rG    = gregOfRexRM(pfx,modrm);
23238   IRTemp argV  = newTemp(Ity_V256);
23239   IRTemp rmode = newTemp(Ity_I32);
23240   if (epartIsReg(modrm)) {
23241      UInt rE = eregOfRexRM(pfx,modrm);
23242      assign( argV, getYMMReg(rE) );
23243      delta += 1;
23244      DIP("vcvtpd2psy %s,%s\n", nameYMMReg(rE), nameXMMReg(rG));
23245   } else {
23246      addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23247      assign( argV, loadLE(Ity_V256, mkexpr(addr)) );
23248      delta += alen;
23249      DIP("vcvtpd2psy %s,%s\n", dis_buf, nameXMMReg(rG) );
23250   }
23251
23252   assign( rmode, get_sse_roundingmode() );
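   /* Unlike the exact I32->F64 widening in dis_CVTDQ2PD_256 above,
      F64->F32 narrowing can lose precision, so each lane is converted
      under the current SSE rounding mode. */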
23253   IRTemp t3, t2, t1, t0;
23254   t3 = t2 = t1 = t0 = IRTemp_INVALID;
23255   breakupV256to64s( argV, &t3, &t2, &t1, &t0 );
23256#  define CVT(_t)  binop( Iop_F64toF32, mkexpr(rmode), \
23257                          unop(Iop_ReinterpI64asF64, mkexpr(_t)) )
23258   putXMMRegLane32F( rG, 3, CVT(t3) );
23259   putXMMRegLane32F( rG, 2, CVT(t2) );
23260   putXMMRegLane32F( rG, 1, CVT(t1) );
23261   putXMMRegLane32F( rG, 0, CVT(t0) );
23262#  undef CVT
23263   putYMMRegLane128( rG, 1, mkV128(0) );
23264   return delta;
23265}
23266
23267
static IRTemp math_VPUNPCK_YMM ( IRTemp tL, IRTemp tR, IROp op )
23269{
23270   IRTemp tLhi, tLlo, tRhi, tRlo;
23271   tLhi = tLlo = tRhi = tRlo = IRTemp_INVALID;
23272   IRTemp res = newTemp(Ity_V256);
23273   breakupV256toV128s( tL, &tLhi, &tLlo );
23274   breakupV256toV128s( tR, &tRhi, &tRlo );
23275   assign( res, binop( Iop_V128HLtoV256,
23276                       binop( op, mkexpr(tRhi), mkexpr(tLhi) ),
23277                       binop( op, mkexpr(tRlo), mkexpr(tLlo) ) ) );
23278   return res;
23279}
23280
23281
23282static IRTemp math_VPUNPCKLBW_YMM ( IRTemp tL, IRTemp tR )
23283{
23284   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO8x16 );
23285}
23286
23287
23288static IRTemp math_VPUNPCKLWD_YMM ( IRTemp tL, IRTemp tR )
23289{
23290   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO16x8 );
23291}
23292
23293
23294static IRTemp math_VPUNPCKLDQ_YMM ( IRTemp tL, IRTemp tR )
23295{
23296   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO32x4 );
23297}
23298
23299
23300static IRTemp math_VPUNPCKLQDQ_YMM ( IRTemp tL, IRTemp tR )
23301{
23302   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveLO64x2 );
23303}
23304
23305
23306static IRTemp math_VPUNPCKHBW_YMM ( IRTemp tL, IRTemp tR )
23307{
23308   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI8x16 );
23309}
23310
23311
23312static IRTemp math_VPUNPCKHWD_YMM ( IRTemp tL, IRTemp tR )
23313{
23314   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI16x8 );
23315}
23316
23317
23318static IRTemp math_VPUNPCKHDQ_YMM ( IRTemp tL, IRTemp tR )
23319{
23320   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI32x4 );
23321}
23322
23323
23324static IRTemp math_VPUNPCKHQDQ_YMM ( IRTemp tL, IRTemp tR )
23325{
23326   return math_VPUNPCK_YMM( tL, tR, Iop_InterleaveHI64x2 );
23327}
23328
23329
23330static IRTemp math_VPACKSSWB_YMM ( IRTemp tL, IRTemp tR )
23331{
23332   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Sx16 );
23333}
23334
23335
23336static IRTemp math_VPACKUSWB_YMM ( IRTemp tL, IRTemp tR )
23337{
23338   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin16Sto8Ux16 );
23339}
23340
23341
23342static IRTemp math_VPACKSSDW_YMM ( IRTemp tL, IRTemp tR )
23343{
23344   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Sx8 );
23345}
23346
23347
23348static IRTemp math_VPACKUSDW_YMM ( IRTemp tL, IRTemp tR )
23349{
23350   return math_VPUNPCK_YMM( tL, tR, Iop_QNarrowBin32Sto16Ux8 );
23351}
23352
23353
23354__attribute__((noinline))
23355static
23356Long dis_ESC_0F__VEX (
23357        /*MB_OUT*/DisResult* dres,
23358        /*OUT*/   Bool*      uses_vvvv,
23359        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
23360        Bool         resteerCisOk,
23361        void*        callback_opaque,
23362        VexArchInfo* archinfo,
23363        VexAbiInfo*  vbi,
23364        Prefix pfx, Int sz, Long deltaIN
23365     )
23366{
23367   IRTemp addr  = IRTemp_INVALID;
23368   Int    alen  = 0;
23369   HChar  dis_buf[50];
23370   Long   delta = deltaIN;
23371   UChar  opc   = getUChar(delta);
23372   delta++;
23373   *uses_vvvv = False;
23374
23375   switch (opc) {
23376
23377   case 0x10:
23378      /* VMOVSD m64, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
23379      /* Move 64 bits from E (mem only) to G (lo half xmm).
23380         Bits 255-64 of the dest are zeroed out. */
23381      if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
23382         UChar modrm = getUChar(delta);
23383         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23384         UInt   rG   = gregOfRexRM(pfx,modrm);
23385         IRTemp z128 = newTemp(Ity_V128);
23386         assign(z128, mkV128(0));
23387         putXMMReg( rG, mkexpr(z128) );
23388         /* FIXME: ALIGNMENT CHECK? */
23389         putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
23390         putYMMRegLane128( rG, 1, mkexpr(z128) );
23391         DIP("vmovsd %s,%s\n", dis_buf, nameXMMReg(rG));
23392         delta += alen;
23393         goto decode_success;
23394      }
23395      /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 10 /r */
23396      /* Reg form. */
23397      if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
23398         UChar modrm = getUChar(delta);
23399         UInt  rG    = gregOfRexRM(pfx, modrm);
23400         UInt  rE    = eregOfRexRM(pfx, modrm);
23401         UInt  rV    = getVexNvvvv(pfx);
23402         delta++;
23403         DIP("vmovsd %s,%s,%s\n",
23404             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
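         /* dest[63:0] = rE[63:0], dest[127:64] = rV[127:64]; the
            upper 128 bits of the YMM register are zeroed. */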
23405         IRTemp res = newTemp(Ity_V128);
23406         assign(res, binop(Iop_64HLtoV128,
23407                           getXMMRegLane64(rV, 1),
23408                           getXMMRegLane64(rE, 0)));
23409         putYMMRegLoAndZU(rG, mkexpr(res));
23410         *uses_vvvv = True;
23411         goto decode_success;
23412      }
23413      /* VMOVSS m32, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
23414      /* Move 32 bits from E (mem only) to G (lo half xmm).
23415         Bits 255-32 of the dest are zeroed out. */
23416      if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
23417         UChar modrm = getUChar(delta);
23418         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23419         UInt   rG   = gregOfRexRM(pfx,modrm);
23420         IRTemp z128 = newTemp(Ity_V128);
23421         assign(z128, mkV128(0));
23422         putXMMReg( rG, mkexpr(z128) );
23423         /* FIXME: ALIGNMENT CHECK? */
23424         putXMMRegLane32( rG, 0, loadLE(Ity_I32, mkexpr(addr)) );
23425         putYMMRegLane128( rG, 1, mkexpr(z128) );
23426         DIP("vmovss %s,%s\n", dis_buf, nameXMMReg(rG));
23427         delta += alen;
23428         goto decode_success;
23429      }
23430      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 10 /r */
23431      /* Reg form. */
23432      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
23433         UChar modrm = getUChar(delta);
23434         UInt  rG    = gregOfRexRM(pfx, modrm);
23435         UInt  rE    = eregOfRexRM(pfx, modrm);
23436         UInt  rV    = getVexNvvvv(pfx);
23437         delta++;
23438         DIP("vmovss %s,%s,%s\n",
23439             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
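         /* dest[31:0] = rE[31:0], dest[127:32] = rV[127:32]; the
            upper 128 bits of the YMM register are zeroed. */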
23440         IRTemp res = newTemp(Ity_V128);
23441         assign( res, binop( Iop_64HLtoV128,
23442                             getXMMRegLane64(rV, 1),
23443                             binop(Iop_32HLto64,
23444                                   getXMMRegLane32(rV, 1),
23445                                   getXMMRegLane32(rE, 0)) ) );
23446         putYMMRegLoAndZU(rG, mkexpr(res));
23447         *uses_vvvv = True;
23448         goto decode_success;
23449      }
23450      /* VMOVUPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 10 /r */
23451      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23452         UChar modrm = getUChar(delta);
23453         UInt  rG    = gregOfRexRM(pfx, modrm);
23454         if (epartIsReg(modrm)) {
23455            UInt rE = eregOfRexRM(pfx,modrm);
23456            putYMMRegLoAndZU( rG, getXMMReg( rE ));
23457            DIP("vmovupd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
23458            delta += 1;
23459         } else {
23460            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23461            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
23462            DIP("vmovupd %s,%s\n", dis_buf, nameXMMReg(rG));
23463            delta += alen;
23464         }
23465         goto decode_success;
23466      }
23467      /* VMOVUPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 10 /r */
23468      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23469         UChar modrm = getUChar(delta);
23470         UInt  rG    = gregOfRexRM(pfx, modrm);
23471         if (epartIsReg(modrm)) {
23472            UInt rE = eregOfRexRM(pfx,modrm);
23473            putYMMReg( rG, getYMMReg( rE ));
23474            DIP("vmovupd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
23475            delta += 1;
23476         } else {
23477            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23478            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
23479            DIP("vmovupd %s,%s\n", dis_buf, nameYMMReg(rG));
23480            delta += alen;
23481         }
23482         goto decode_success;
23483      }
23484      /* VMOVUPS xmm2/m128, xmm1 = VEX.128.0F.WIG 10 /r */
23485      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23486         UChar modrm = getUChar(delta);
23487         UInt  rG    = gregOfRexRM(pfx, modrm);
23488         if (epartIsReg(modrm)) {
23489            UInt rE = eregOfRexRM(pfx,modrm);
23490            putYMMRegLoAndZU( rG, getXMMReg( rE ));
23491            DIP("vmovups %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
23492            delta += 1;
23493         } else {
23494            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23495            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
23496            DIP("vmovups %s,%s\n", dis_buf, nameXMMReg(rG));
23497            delta += alen;
23498         }
23499         goto decode_success;
23500      }
23501      /* VMOVUPS ymm2/m256, ymm1 = VEX.256.0F.WIG 10 /r */
23502      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23503         UChar modrm = getUChar(delta);
23504         UInt  rG    = gregOfRexRM(pfx, modrm);
23505         if (epartIsReg(modrm)) {
23506            UInt rE = eregOfRexRM(pfx,modrm);
23507            putYMMReg( rG, getYMMReg( rE ));
23508            DIP("vmovups %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
23509            delta += 1;
23510         } else {
23511            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23512            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
23513            DIP("vmovups %s,%s\n", dis_buf, nameYMMReg(rG));
23514            delta += alen;
23515         }
23516         goto decode_success;
23517      }
23518      break;
23519
23520   case 0x11:
23521      /* VMOVSD xmm1, m64 = VEX.LIG.F2.0F.WIG 11 /r */
23522      /* Move 64 bits from G (low half xmm) to mem only. */
23523      if (haveF2no66noF3(pfx) && !epartIsReg(getUChar(delta))) {
23524         UChar modrm = getUChar(delta);
23525         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23526         UInt   rG   = gregOfRexRM(pfx,modrm);
23527         /* FIXME: ALIGNMENT CHECK? */
23528         storeLE( mkexpr(addr), getXMMRegLane64(rG, 0));
23529         DIP("vmovsd %s,%s\n", nameXMMReg(rG), dis_buf);
23530         delta += alen;
23531         goto decode_success;
23532      }
23533      /* VMOVSD xmm3, xmm2, xmm1 = VEX.LIG.F2.0F.WIG 11 /r */
23534      /* Reg form. */
23535      if (haveF2no66noF3(pfx) && epartIsReg(getUChar(delta))) {
23536         UChar modrm = getUChar(delta);
23537         UInt  rG    = gregOfRexRM(pfx, modrm);
23538         UInt  rE    = eregOfRexRM(pfx, modrm);
23539         UInt  rV    = getVexNvvvv(pfx);
23540         delta++;
23541         DIP("vmovsd %s,%s,%s\n",
23542             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23543         IRTemp res = newTemp(Ity_V128);
23544         assign(res, binop(Iop_64HLtoV128,
23545                           getXMMRegLane64(rV, 1),
23546                           getXMMRegLane64(rE, 0)));
23547         putYMMRegLoAndZU(rG, mkexpr(res));
23548         *uses_vvvv = True;
23549         goto decode_success;
23550      }
      /* VMOVSS xmm1, m32 = VEX.LIG.F3.0F.WIG 11 /r */
23552      /* Move 32 bits from G (low 1/4 xmm) to mem only. */
23553      if (haveF3no66noF2(pfx) && !epartIsReg(getUChar(delta))) {
23554         UChar modrm = getUChar(delta);
23555         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23556         UInt   rG   = gregOfRexRM(pfx,modrm);
23557         /* FIXME: ALIGNMENT CHECK? */
23558         storeLE( mkexpr(addr), getXMMRegLane32(rG, 0));
23559         DIP("vmovss %s,%s\n", nameXMMReg(rG), dis_buf);
23560         delta += alen;
23561         goto decode_success;
23562      }
23563      /* VMOVSS xmm3, xmm2, xmm1 = VEX.LIG.F3.0F.WIG 11 /r */
23564      /* Reg form. */
23565      if (haveF3no66noF2(pfx) && epartIsReg(getUChar(delta))) {
23566         UChar modrm = getUChar(delta);
23567         UInt  rG    = gregOfRexRM(pfx, modrm);
23568         UInt  rE    = eregOfRexRM(pfx, modrm);
23569         UInt  rV    = getVexNvvvv(pfx);
23570         delta++;
23571         DIP("vmovss %s,%s,%s\n",
23572             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23573         IRTemp res = newTemp(Ity_V128);
23574         assign( res, binop( Iop_64HLtoV128,
23575                             getXMMRegLane64(rV, 1),
23576                             binop(Iop_32HLto64,
23577                                   getXMMRegLane32(rV, 1),
23578                                   getXMMRegLane32(rE, 0)) ) );
23579         putYMMRegLoAndZU(rG, mkexpr(res));
23580         *uses_vvvv = True;
23581         goto decode_success;
23582      }
23583      /* VMOVUPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 11 /r */
23584      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23585         UChar modrm = getUChar(delta);
23586         UInt  rG    = gregOfRexRM(pfx,modrm);
23587         if (epartIsReg(modrm)) {
23588            UInt rE = eregOfRexRM(pfx,modrm);
23589            putYMMRegLoAndZU( rE, getXMMReg(rG) );
23590            DIP("vmovupd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
23591            delta += 1;
23592         } else {
23593            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23594            storeLE( mkexpr(addr), getXMMReg(rG) );
23595            DIP("vmovupd %s,%s\n", nameXMMReg(rG), dis_buf);
23596            delta += alen;
23597         }
23598         goto decode_success;
23599      }
23600      /* VMOVUPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 11 /r */
23601      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23602         UChar modrm = getUChar(delta);
23603         UInt  rG    = gregOfRexRM(pfx,modrm);
23604         if (epartIsReg(modrm)) {
23605            UInt rE = eregOfRexRM(pfx,modrm);
23606            putYMMReg( rE, getYMMReg(rG) );
23607            DIP("vmovupd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
23608            delta += 1;
23609         } else {
23610            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23611            storeLE( mkexpr(addr), getYMMReg(rG) );
23612            DIP("vmovupd %s,%s\n", nameYMMReg(rG), dis_buf);
23613            delta += alen;
23614         }
23615         goto decode_success;
23616      }
23617      /* VMOVUPS xmm1, xmm2/m128 = VEX.128.0F.WIG 11 /r */
23618      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23619         UChar modrm = getUChar(delta);
23620         UInt  rG    = gregOfRexRM(pfx,modrm);
23621         if (epartIsReg(modrm)) {
23622            UInt rE = eregOfRexRM(pfx,modrm);
23623            putYMMRegLoAndZU( rE, getXMMReg(rG) );
23624            DIP("vmovups %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
23625            delta += 1;
23626         } else {
23627            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23628            storeLE( mkexpr(addr), getXMMReg(rG) );
23629            DIP("vmovups %s,%s\n", nameXMMReg(rG), dis_buf);
23630            delta += alen;
23631         }
23632         goto decode_success;
23633      }
23634      /* VMOVUPS ymm1, ymm2/m256 = VEX.256.0F.WIG 11 /r */
23635      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23636         UChar modrm = getUChar(delta);
23637         UInt  rG    = gregOfRexRM(pfx,modrm);
23638         if (epartIsReg(modrm)) {
23639            UInt rE = eregOfRexRM(pfx,modrm);
23640            putYMMReg( rE, getYMMReg(rG) );
23641            DIP("vmovups %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
23642            delta += 1;
23643         } else {
23644            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23645            storeLE( mkexpr(addr), getYMMReg(rG) );
23646            DIP("vmovups %s,%s\n", nameYMMReg(rG), dis_buf);
23647            delta += alen;
23648         }
23649         goto decode_success;
23650      }
23651      break;
23652
23653   case 0x12:
      /* VMOVDDUP xmm2/m64, xmm1 = VEX.128.F2.0F.WIG 12 /r */
23655      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23656         delta = dis_MOVDDUP_128( vbi, pfx, delta, True/*isAvx*/ );
23657         goto decode_success;
23658      }
      /* VMOVDDUP ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 12 /r */
23660      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23661         delta = dis_MOVDDUP_256( vbi, pfx, delta );
23662         goto decode_success;
23663      }
23664      /* VMOVHLPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 12 /r */
23665      /* Insn only exists in reg form */
23666      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
23667          && epartIsReg(getUChar(delta))) {
23668         UChar modrm = getUChar(delta);
23669         UInt  rG    = gregOfRexRM(pfx, modrm);
23670         UInt  rE    = eregOfRexRM(pfx, modrm);
23671         UInt  rV    = getVexNvvvv(pfx);
23672         delta++;
23673         DIP("vmovhlps %s,%s,%s\n",
23674             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23675         IRTemp res = newTemp(Ity_V128);
23676         assign(res, binop(Iop_64HLtoV128,
23677                           getXMMRegLane64(rV, 1),
23678                           getXMMRegLane64(rE, 1)));
23679         putYMMRegLoAndZU(rG, mkexpr(res));
23680         *uses_vvvv = True;
23681         goto decode_success;
23682      }
23683      /* VMOVLPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 12 /r */
23684      /* Insn exists only in mem form, it appears. */
23685      /* VMOVLPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 12 /r */
23686      /* Insn exists only in mem form, it appears. */
23687      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
23688          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
23689         UChar modrm = getUChar(delta);
23690         UInt  rG    = gregOfRexRM(pfx, modrm);
23691         UInt  rV    = getVexNvvvv(pfx);
23692         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23693         delta += alen;
23694         DIP("vmovlpd %s,%s,%s\n",
23695             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23696         IRTemp res = newTemp(Ity_V128);
23697         assign(res, binop(Iop_64HLtoV128,
23698                           getXMMRegLane64(rV, 1),
23699                           loadLE(Ity_I64, mkexpr(addr))));
23700         putYMMRegLoAndZU(rG, mkexpr(res));
23701         *uses_vvvv = True;
23702         goto decode_success;
23703      }
      /* VMOVSLDUP xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 12 /r */
23705      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
23706         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
23707                                   True/*isL*/ );
23708         goto decode_success;
23709      }
      /* VMOVSLDUP ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 12 /r */
23711      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
23712         delta = dis_MOVSxDUP_256( vbi, pfx, delta, True/*isL*/ );
23713         goto decode_success;
23714      }
23715      break;
23716
23717   case 0x13:
23718      /* VMOVLPS xmm1, m64 = VEX.128.0F.WIG 13 /r */
23719      /* Insn exists only in mem form, it appears. */
23720      /* VMOVLPD xmm1, m64 = VEX.128.66.0F.WIG 13 /r */
23721      /* Insn exists only in mem form, it appears. */
23722      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
23723          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
23724         UChar modrm = getUChar(delta);
23725         UInt  rG    = gregOfRexRM(pfx, modrm);
23726         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23727         delta += alen;
23728         storeLE( mkexpr(addr), getXMMRegLane64( rG, 0));
23729         DIP("vmovlpd %s,%s\n", nameXMMReg(rG), dis_buf);
23730         goto decode_success;
23731      }
23732      break;
23733
23734   case 0x14:
23735   case 0x15:
23736      /* VUNPCKLPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 14 /r */
23737      /* VUNPCKHPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 15 /r */
23738      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23739         Bool   hi    = opc == 0x15;
23740         UChar  modrm = getUChar(delta);
23741         UInt   rG    = gregOfRexRM(pfx,modrm);
23742         UInt   rV    = getVexNvvvv(pfx);
23743         IRTemp eV    = newTemp(Ity_V128);
23744         IRTemp vV    = newTemp(Ity_V128);
23745         assign( vV, getXMMReg(rV) );
23746         if (epartIsReg(modrm)) {
23747            UInt rE = eregOfRexRM(pfx,modrm);
23748            assign( eV, getXMMReg(rE) );
23749            delta += 1;
23750            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
23751                nameXMMReg(rE), nameXMMReg(rG));
23752         } else {
23753            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23754            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
23755            delta += alen;
23756            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
23757                dis_buf, nameXMMReg(rG));
23758         }
23759         IRTemp res = math_UNPCKxPS_128( eV, vV, hi );
23760         putYMMRegLoAndZU( rG, mkexpr(res) );
23761         *uses_vvvv = True;
23762         goto decode_success;
23763      }
23764      /* VUNPCKLPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 14 /r */
23765      /* VUNPCKHPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 15 /r */
23766      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23767         Bool   hi    = opc == 0x15;
23768         UChar  modrm = getUChar(delta);
23769         UInt   rG    = gregOfRexRM(pfx,modrm);
23770         UInt   rV    = getVexNvvvv(pfx);
23771         IRTemp eV    = newTemp(Ity_V256);
23772         IRTemp vV    = newTemp(Ity_V256);
23773         assign( vV, getYMMReg(rV) );
23774         if (epartIsReg(modrm)) {
23775            UInt rE = eregOfRexRM(pfx,modrm);
23776            assign( eV, getYMMReg(rE) );
23777            delta += 1;
23778            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
23779                nameYMMReg(rE), nameYMMReg(rG));
23780         } else {
23781            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23782            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
23783            delta += alen;
23784            DIP("vunpck%sps %s,%s\n", hi ? "h" : "l",
23785                dis_buf, nameYMMReg(rG));
23786         }
23787         IRTemp res = math_UNPCKxPS_256( eV, vV, hi );
23788         putYMMReg( rG, mkexpr(res) );
23789         *uses_vvvv = True;
23790         goto decode_success;
23791      }
23792      /* VUNPCKLPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 14 /r */
23793      /* VUNPCKHPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 15 /r */
23794      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23795         Bool   hi    = opc == 0x15;
23796         UChar  modrm = getUChar(delta);
23797         UInt   rG    = gregOfRexRM(pfx,modrm);
23798         UInt   rV    = getVexNvvvv(pfx);
23799         IRTemp eV    = newTemp(Ity_V128);
23800         IRTemp vV    = newTemp(Ity_V128);
23801         assign( vV, getXMMReg(rV) );
23802         if (epartIsReg(modrm)) {
23803            UInt rE = eregOfRexRM(pfx,modrm);
23804            assign( eV, getXMMReg(rE) );
23805            delta += 1;
23806            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
23807                nameXMMReg(rE), nameXMMReg(rG));
23808         } else {
23809            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23810            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
23811            delta += alen;
23812            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
23813                dis_buf, nameXMMReg(rG));
23814         }
23815         IRTemp res = math_UNPCKxPD_128( eV, vV, hi );
23816         putYMMRegLoAndZU( rG, mkexpr(res) );
23817         *uses_vvvv = True;
23818         goto decode_success;
23819      }
23820      /* VUNPCKLPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 14 /r */
23821      /* VUNPCKHPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 15 /r */
23822      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23823         Bool   hi    = opc == 0x15;
23824         UChar  modrm = getUChar(delta);
23825         UInt   rG    = gregOfRexRM(pfx,modrm);
23826         UInt   rV    = getVexNvvvv(pfx);
23827         IRTemp eV    = newTemp(Ity_V256);
23828         IRTemp vV    = newTemp(Ity_V256);
23829         assign( vV, getYMMReg(rV) );
23830         if (epartIsReg(modrm)) {
23831            UInt rE = eregOfRexRM(pfx,modrm);
23832            assign( eV, getYMMReg(rE) );
23833            delta += 1;
23834            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
23835                nameYMMReg(rE), nameYMMReg(rG));
23836         } else {
23837            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23838            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
23839            delta += alen;
23840            DIP("vunpck%spd %s,%s\n", hi ? "h" : "l",
23841                dis_buf, nameYMMReg(rG));
23842         }
23843         IRTemp res = math_UNPCKxPD_256( eV, vV, hi );
23844         putYMMReg( rG, mkexpr(res) );
23845         *uses_vvvv = True;
23846         goto decode_success;
23847      }
23848      break;
23849
23850   case 0x16:
23851      /* VMOVLHPS xmm3, xmm2, xmm1 = VEX.NDS.128.0F.WIG 16 /r */
23852      /* Insn only exists in reg form */
23853      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
23854          && epartIsReg(getUChar(delta))) {
23855         UChar modrm = getUChar(delta);
23856         UInt  rG    = gregOfRexRM(pfx, modrm);
23857         UInt  rE    = eregOfRexRM(pfx, modrm);
23858         UInt  rV    = getVexNvvvv(pfx);
23859         delta++;
23860         DIP("vmovlhps %s,%s,%s\n",
23861             nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
23862         IRTemp res = newTemp(Ity_V128);
23863         assign(res, binop(Iop_64HLtoV128,
23864                           getXMMRegLane64(rE, 0),
23865                           getXMMRegLane64(rV, 0)));
23866         putYMMRegLoAndZU(rG, mkexpr(res));
23867         *uses_vvvv = True;
23868         goto decode_success;
23869      }
23870      /* VMOVHPS m64, xmm1, xmm2 = VEX.NDS.128.0F.WIG 16 /r */
23871      /* Insn exists only in mem form, it appears. */
23872      /* VMOVHPD m64, xmm1, xmm2 = VEX.NDS.128.66.0F.WIG 16 /r */
23873      /* Insn exists only in mem form, it appears. */
23874      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
23875          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
23876         UChar modrm = getUChar(delta);
23877         UInt  rG    = gregOfRexRM(pfx, modrm);
23878         UInt  rV    = getVexNvvvv(pfx);
23879         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23880         delta += alen;
23881         DIP("vmovhp%c %s,%s,%s\n", have66(pfx) ? 'd' : 's',
23882             dis_buf, nameXMMReg(rV), nameXMMReg(rG));
23883         IRTemp res = newTemp(Ity_V128);
23884         assign(res, binop(Iop_64HLtoV128,
23885                           loadLE(Ity_I64, mkexpr(addr)),
23886                           getXMMRegLane64(rV, 0)));
23887         putYMMRegLoAndZU(rG, mkexpr(res));
23888         *uses_vvvv = True;
23889         goto decode_success;
23890      }
      /* VMOVSHDUP xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 16 /r */
23892      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
23893         delta = dis_MOVSxDUP_128( vbi, pfx, delta, True/*isAvx*/,
23894                                   False/*!isL*/ );
23895         goto decode_success;
23896      }
      /* VMOVSHDUP ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 16 /r */
23898      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
23899         delta = dis_MOVSxDUP_256( vbi, pfx, delta, False/*!isL*/ );
23900         goto decode_success;
23901      }
23902      break;
23903
23904   case 0x17:
23905      /* VMOVHPS xmm1, m64 = VEX.128.0F.WIG 17 /r */
23906      /* Insn exists only in mem form, it appears. */
23907      /* VMOVHPD xmm1, m64 = VEX.128.66.0F.WIG 17 /r */
23908      /* Insn exists only in mem form, it appears. */
23909      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
23910          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
23911         UChar modrm = getUChar(delta);
23912         UInt  rG    = gregOfRexRM(pfx, modrm);
23913         addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23914         delta += alen;
23915         storeLE( mkexpr(addr), getXMMRegLane64( rG, 1));
23916         DIP("vmovhp%c %s,%s\n", have66(pfx) ? 'd' : 's',
23917             nameXMMReg(rG), dis_buf);
23918         goto decode_success;
23919      }
23920      break;
23921
23922   case 0x28:
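      /* The aligned VMOVAP{D,S} forms require 16-/32-byte aligned
         memory operands, enforced below via
         gen_SEGV_if_not_{16,32}_aligned; the unaligned VMOVUP{D,S}
         forms above carry no such check. */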
23923      /* VMOVAPD xmm2/m128, xmm1 = VEX.128.66.0F.WIG 28 /r */
23924      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23925         UChar modrm = getUChar(delta);
23926         UInt  rG    = gregOfRexRM(pfx, modrm);
23927         if (epartIsReg(modrm)) {
23928            UInt rE = eregOfRexRM(pfx,modrm);
23929            putYMMRegLoAndZU( rG, getXMMReg( rE ));
23930            DIP("vmovapd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
23931            delta += 1;
23932         } else {
23933            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23934            gen_SEGV_if_not_16_aligned( addr );
23935            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
23936            DIP("vmovapd %s,%s\n", dis_buf, nameXMMReg(rG));
23937            delta += alen;
23938         }
23939         goto decode_success;
23940      }
23941      /* VMOVAPD ymm2/m256, ymm1 = VEX.256.66.0F.WIG 28 /r */
23942      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23943         UChar modrm = getUChar(delta);
23944         UInt  rG    = gregOfRexRM(pfx, modrm);
23945         if (epartIsReg(modrm)) {
23946            UInt rE = eregOfRexRM(pfx,modrm);
23947            putYMMReg( rG, getYMMReg( rE ));
23948            DIP("vmovapd %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
23949            delta += 1;
23950         } else {
23951            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23952            gen_SEGV_if_not_32_aligned( addr );
23953            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
23954            DIP("vmovapd %s,%s\n", dis_buf, nameYMMReg(rG));
23955            delta += alen;
23956         }
23957         goto decode_success;
23958      }
23959      /* VMOVAPS xmm2/m128, xmm1 = VEX.128.0F.WIG 28 /r */
23960      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
23961         UChar modrm = getUChar(delta);
23962         UInt  rG    = gregOfRexRM(pfx, modrm);
23963         if (epartIsReg(modrm)) {
23964            UInt rE = eregOfRexRM(pfx,modrm);
23965            putYMMRegLoAndZU( rG, getXMMReg( rE ));
23966            DIP("vmovaps %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
23967            delta += 1;
23968         } else {
23969            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23970            gen_SEGV_if_not_16_aligned( addr );
23971            putYMMRegLoAndZU( rG, loadLE(Ity_V128, mkexpr(addr)) );
23972            DIP("vmovaps %s,%s\n", dis_buf, nameXMMReg(rG));
23973            delta += alen;
23974         }
23975         goto decode_success;
23976      }
23977      /* VMOVAPS ymm2/m256, ymm1 = VEX.256.0F.WIG 28 /r */
23978      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
23979         UChar modrm = getUChar(delta);
23980         UInt  rG    = gregOfRexRM(pfx, modrm);
23981         if (epartIsReg(modrm)) {
23982            UInt rE = eregOfRexRM(pfx,modrm);
23983            putYMMReg( rG, getYMMReg( rE ));
23984            DIP("vmovaps %s,%s\n", nameYMMReg(rE), nameYMMReg(rG));
23985            delta += 1;
23986         } else {
23987            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
23988            gen_SEGV_if_not_32_aligned( addr );
23989            putYMMReg( rG, loadLE(Ity_V256, mkexpr(addr)) );
23990            DIP("vmovaps %s,%s\n", dis_buf, nameYMMReg(rG));
23991            delta += alen;
23992         }
23993         goto decode_success;
23994      }
23995      break;
23996
23997   case 0x29:
23998      /* VMOVAPD xmm1, xmm2/m128 = VEX.128.66.0F.WIG 29 /r */
23999      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24000         UChar modrm = getUChar(delta);
24001         UInt  rG    = gregOfRexRM(pfx,modrm);
24002         if (epartIsReg(modrm)) {
24003            UInt rE = eregOfRexRM(pfx,modrm);
24004            putYMMRegLoAndZU( rE, getXMMReg(rG) );
24005            DIP("vmovapd %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24006            delta += 1;
24007         } else {
24008            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24009            gen_SEGV_if_not_16_aligned( addr );
24010            storeLE( mkexpr(addr), getXMMReg(rG) );
24011            DIP("vmovapd %s,%s\n", nameXMMReg(rG), dis_buf );
24012            delta += alen;
24013         }
24014         goto decode_success;
24015      }
24016      /* VMOVAPD ymm1, ymm2/m256 = VEX.256.66.0F.WIG 29 /r */
24017      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24018         UChar modrm = getUChar(delta);
24019         UInt  rG    = gregOfRexRM(pfx,modrm);
24020         if (epartIsReg(modrm)) {
24021            UInt rE = eregOfRexRM(pfx,modrm);
24022            putYMMReg( rE, getYMMReg(rG) );
24023            DIP("vmovapd %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24024            delta += 1;
24025         } else {
24026            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24027            gen_SEGV_if_not_32_aligned( addr );
24028            storeLE( mkexpr(addr), getYMMReg(rG) );
24029            DIP("vmovapd %s,%s\n", nameYMMReg(rG), dis_buf );
24030            delta += alen;
24031         }
24032         goto decode_success;
24033      }
24034      /* VMOVAPS xmm1, xmm2/m128 = VEX.128.0F.WIG 29 /r */
24035      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24036         UChar modrm = getUChar(delta);
24037         UInt  rG    = gregOfRexRM(pfx,modrm);
24038         if (epartIsReg(modrm)) {
24039            UInt rE = eregOfRexRM(pfx,modrm);
24040            putYMMRegLoAndZU( rE, getXMMReg(rG) );
24041            DIP("vmovaps %s,%s\n", nameXMMReg(rG), nameXMMReg(rE));
24042            delta += 1;
24044         } else {
24045            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24046            gen_SEGV_if_not_16_aligned( addr );
24047            storeLE( mkexpr(addr), getXMMReg(rG) );
24048            DIP("vmovaps %s,%s\n", nameXMMReg(rG), dis_buf );
24049            delta += alen;
         }
         goto decode_success;
24052      }
24053      /* VMOVAPS ymm1, ymm2/m256 = VEX.256.0F.WIG 29 /r */
24054      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24055         UChar modrm = getUChar(delta);
24056         UInt  rG    = gregOfRexRM(pfx,modrm);
24057         if (epartIsReg(modrm)) {
24058            UInt rE = eregOfRexRM(pfx,modrm);
24059            putYMMReg( rE, getYMMReg(rG) );
24060            DIP("vmovaps %s,%s\n", nameYMMReg(rG), nameYMMReg(rE));
24061            delta += 1;
24063         } else {
24064            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24065            gen_SEGV_if_not_32_aligned( addr );
24066            storeLE( mkexpr(addr), getYMMReg(rG) );
24067            DIP("vmovaps %s,%s\n", nameYMMReg(rG), dis_buf );
24068            delta += alen;
         }
         goto decode_success;
24071      }
24072      break;
24073
24074   case 0x2A: {
24075      IRTemp rmode = newTemp(Ity_I32);
24076      assign( rmode, get_sse_roundingmode() );
24077      /* VCVTSI2SD r/m32, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W0 2A /r */
24078      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
24079         UChar  modrm = getUChar(delta);
24080         UInt   rV    = getVexNvvvv(pfx);
24081         UInt   rD    = gregOfRexRM(pfx, modrm);
24082         IRTemp arg32 = newTemp(Ity_I32);
24083         if (epartIsReg(modrm)) {
24084            UInt rS = eregOfRexRM(pfx,modrm);
24085            assign( arg32, getIReg32(rS) );
24086            delta += 1;
24087            DIP("vcvtsi2sdl %s,%s,%s\n",
24088                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
24089         } else {
24090            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24091            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
24092            delta += alen;
24093            DIP("vcvtsi2sdl %s,%s,%s\n",
24094                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24095         }
24096         putXMMRegLane64F( rD, 0,
24097                           unop(Iop_I32StoF64, mkexpr(arg32)));
24098         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24099         putYMMRegLane128( rD, 1, mkV128(0) );
24100         *uses_vvvv = True;
24101         goto decode_success;
24102      }
24103      /* VCVTSI2SD r/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.W1 2A /r */
24104      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
24105         UChar  modrm = getUChar(delta);
24106         UInt   rV    = getVexNvvvv(pfx);
24107         UInt   rD    = gregOfRexRM(pfx, modrm);
24108         IRTemp arg64 = newTemp(Ity_I64);
24109         if (epartIsReg(modrm)) {
24110            UInt rS = eregOfRexRM(pfx,modrm);
24111            assign( arg64, getIReg64(rS) );
24112            delta += 1;
24113            DIP("vcvtsi2sdq %s,%s,%s\n",
24114                nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
24115         } else {
24116            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24117            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
24118            delta += alen;
24119            DIP("vcvtsi2sdq %s,%s,%s\n",
24120                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24121         }
24122         putXMMRegLane64F( rD, 0,
24123                           binop( Iop_I64StoF64,
                                  mkexpr(rmode),
24125                                  mkexpr(arg64)) );
24126         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24127         putYMMRegLane128( rD, 1, mkV128(0) );
24128         *uses_vvvv = True;
24129         goto decode_success;
24130      }
24131      /* VCVTSI2SS r/m64, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W1 2A /r */
24132      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
24133         UChar  modrm = getUChar(delta);
24134         UInt   rV    = getVexNvvvv(pfx);
24135         UInt   rD    = gregOfRexRM(pfx, modrm);
24136         IRTemp arg64 = newTemp(Ity_I64);
24137         if (epartIsReg(modrm)) {
24138            UInt rS = eregOfRexRM(pfx,modrm);
24139            assign( arg64, getIReg64(rS) );
24140            delta += 1;
24141            DIP("vcvtsi2ssq %s,%s,%s\n",
24142                nameIReg64(rS), nameXMMReg(rV), nameXMMReg(rD));
24143         } else {
24144            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24145            assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
24146            delta += alen;
24147            DIP("vcvtsi2ssq %s,%s,%s\n",
24148                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24149         }
24150         putXMMRegLane32F( rD, 0,
24151                           binop(Iop_F64toF32,
24152                                 mkexpr(rmode),
24153                                 binop(Iop_I64StoF64, mkexpr(rmode),
24154                                                      mkexpr(arg64)) ) );
24155         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
24156         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24157         putYMMRegLane128( rD, 1, mkV128(0) );
24158         *uses_vvvv = True;
24159         goto decode_success;
24160      }
24161      /* VCVTSI2SS r/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.W0 2A /r */
24162      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
24163         UChar  modrm = getUChar(delta);
24164         UInt   rV    = getVexNvvvv(pfx);
24165         UInt   rD    = gregOfRexRM(pfx, modrm);
24166         IRTemp arg32 = newTemp(Ity_I32);
24167         if (epartIsReg(modrm)) {
24168            UInt rS = eregOfRexRM(pfx,modrm);
24169            assign( arg32, getIReg32(rS) );
24170            delta += 1;
24171            DIP("vcvtsi2ssl %s,%s,%s\n",
24172                nameIReg32(rS), nameXMMReg(rV), nameXMMReg(rD));
24173         } else {
24174            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
24175            assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
24176            delta += alen;
24177            DIP("vcvtsi2ssl %s,%s,%s\n",
24178                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
24179         }
24180         putXMMRegLane32F( rD, 0,
24181                           binop(Iop_F64toF32,
24182                                 mkexpr(rmode),
24183                                 unop(Iop_I32StoF64, mkexpr(arg32)) ) );
24184         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
24185         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
24186         putYMMRegLane128( rD, 1, mkV128(0) );
24187         *uses_vvvv = True;
24188         goto decode_success;
24189      }
24190      break;
24191   }
24192
24193   case 0x2B:
24194      /* VMOVNTPD xmm1, m128 = VEX.128.66.0F.WIG 2B /r */
24195      /* VMOVNTPS xmm1, m128 = VEX.128.0F.WIG 2B /r */
24196      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24197          && 0==getVexL(pfx)/*128*/ && !epartIsReg(getUChar(delta))) {
24198         UChar  modrm = getUChar(delta);
24199         UInt   rS    = gregOfRexRM(pfx, modrm);
24200         IRTemp tS    = newTemp(Ity_V128);
24201         assign(tS, getXMMReg(rS));
24202         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24203         delta += alen;
24204         gen_SEGV_if_not_16_aligned(addr);
24205         storeLE(mkexpr(addr), mkexpr(tS));
24206         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
24207             nameXMMReg(rS), dis_buf);
24208         goto decode_success;
24209      }
24210      /* VMOVNTPD ymm1, m256 = VEX.256.66.0F.WIG 2B /r */
24211      /* VMOVNTPS ymm1, m256 = VEX.256.0F.WIG 2B /r */
24212      if ((have66noF2noF3(pfx) || haveNo66noF2noF3(pfx))
24213          && 1==getVexL(pfx)/*256*/ && !epartIsReg(getUChar(delta))) {
24214         UChar  modrm = getUChar(delta);
24215         UInt   rS    = gregOfRexRM(pfx, modrm);
24216         IRTemp tS    = newTemp(Ity_V256);
24217         assign(tS, getYMMReg(rS));
24218         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
24219         delta += alen;
24220         gen_SEGV_if_not_32_aligned(addr);
24221         storeLE(mkexpr(addr), mkexpr(tS));
24222         DIP("vmovntp%c %s,%s\n", have66(pfx) ? 'd' : 's',
24223             nameYMMReg(rS), dis_buf);
24224         goto decode_success;
24225      }
24226      break;
24227
24228   case 0x2C:
      /* VCVTTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2C /r */
24230      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
24231         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
24232         goto decode_success;
24233      }
24234      /* VCVTTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2C /r */
24235      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
24236         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
24237         goto decode_success;
24238      }
24239      /* VCVTTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2C /r */
24240      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
24241         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
24242         goto decode_success;
24243      }
      /* VCVTTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2C /r */
24245      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
24246         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
24247         goto decode_success;
24248      }
24249      break;
24250
24251   case 0x2D:
      /* VCVTSD2SI xmm1/m64, r32 = VEX.LIG.F2.0F.W0 2D /r */
24253      if (haveF2no66noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
24254         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
24255         goto decode_success;
24256      }
24257      /* VCVTSD2SI xmm1/m64, r64 = VEX.LIG.F2.0F.W1 2D /r */
24258      if (haveF2no66noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
24259         delta = dis_CVTxSD2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
24260         goto decode_success;
24261      }
24262      /* VCVTSS2SI xmm1/m32, r32 = VEX.LIG.F3.0F.W0 2D /r */
24263      if (haveF3no66noF2(pfx) && 0==getRexW(pfx)/*W0*/) {
24264         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 4);
24265         goto decode_success;
24266      }
      /* VCVTSS2SI xmm1/m32, r64 = VEX.LIG.F3.0F.W1 2D /r */
24268      if (haveF3no66noF2(pfx) && 1==getRexW(pfx)/*W1*/) {
24269         delta = dis_CVTxSS2SI( vbi, pfx, delta, True/*isAvx*/, opc, 8);
24270         goto decode_success;
24271      }
24272      break;
24273
24274   case 0x2E:
24275   case 0x2F:
24276      /* VUCOMISD xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2E /r */
24277      /* VCOMISD  xmm2/m64, xmm1 = VEX.LIG.66.0F.WIG 2F /r */
24278      if (have66noF2noF3(pfx)) {
24279         delta = dis_COMISD( vbi, pfx, delta, True/*isAvx*/, opc );
24280         goto decode_success;
24281      }
24282      /* VUCOMISS xmm2/m32, xmm1 = VEX.LIG.0F.WIG 2E /r */
24283      /* VCOMISS xmm2/m32, xmm1  = VEX.LIG.0F.WIG 2F /r */
24284      if (haveNo66noF2noF3(pfx)) {
24285         delta = dis_COMISS( vbi, pfx, delta, True/*isAvx*/, opc );
24286         goto decode_success;
24287      }
24288      break;
24289
24290   case 0x50:
24291      /* VMOVMSKPD xmm2, r32 = VEX.128.66.0F.WIG 50 /r */
24292      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24293         delta = dis_MOVMSKPD_128( vbi, pfx, delta, True/*isAvx*/ );
24294         goto decode_success;
24295      }
24296      /* VMOVMSKPD ymm2, r32 = VEX.256.66.0F.WIG 50 /r */
24297      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24298         delta = dis_MOVMSKPD_256( vbi, pfx, delta );
24299         goto decode_success;
24300      }
24301      /* VMOVMSKPS xmm2, r32 = VEX.128.0F.WIG 50 /r */
24302      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24303         delta = dis_MOVMSKPS_128( vbi, pfx, delta, True/*isAvx*/ );
24304         goto decode_success;
24305      }
24306      /* VMOVMSKPS ymm2, r32 = VEX.256.0F.WIG 50 /r */
24307      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24308         delta = dis_MOVMSKPS_256( vbi, pfx, delta );
24309         goto decode_success;
24310      }
24311      break;
24312
24313   case 0x51:
      /* VSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 51 /r */
24315      if (haveF3no66noF2(pfx)) {
24316         delta = dis_AVX128_E_V_to_G_lo32_unary(
24317                    uses_vvvv, vbi, pfx, delta, "vsqrtss", Iop_Sqrt32F0x4 );
24318         goto decode_success;
24319      }
      /* VSQRTPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 51 /r */
24321      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24322         delta = dis_AVX128_E_to_G_unary_all(
24323                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx4 );
24324         goto decode_success;
24325      }
      /* VSQRTPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 51 /r */
24327      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24328         delta = dis_AVX256_E_to_G_unary_all(
24329                    uses_vvvv, vbi, pfx, delta, "vsqrtps", Iop_Sqrt32Fx8 );
24330         goto decode_success;
24331      }
24332      /* VSQRTSD xmm3/m64(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F2.0F.WIG 51 /r */
24333      if (haveF2no66noF3(pfx)) {
24334         delta = dis_AVX128_E_V_to_G_lo64_unary(
24335                    uses_vvvv, vbi, pfx, delta, "vsqrtsd", Iop_Sqrt64F0x2 );
24336         goto decode_success;
24337      }
      /* VSQRTPD xmm2/m128(E), xmm1(G) = VEX.128.66.0F.WIG 51 /r */
24339      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24340         delta = dis_AVX128_E_to_G_unary_all(
24341                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx2 );
24342         goto decode_success;
24343      }
      /* VSQRTPD ymm2/m256(E), ymm1(G) = VEX.256.66.0F.WIG 51 /r */
24345      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24346         delta = dis_AVX256_E_to_G_unary_all(
24347                    uses_vvvv, vbi, pfx, delta, "vsqrtpd", Iop_Sqrt64Fx4 );
24348         goto decode_success;
24349      }
24350      break;
24351
24352   case 0x52:
      /* VRSQRTSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 52 /r */
24354      if (haveF3no66noF2(pfx)) {
24355         delta = dis_AVX128_E_V_to_G_lo32_unary(
24356                    uses_vvvv, vbi, pfx, delta, "vrsqrtss", Iop_RSqrt32F0x4 );
24357         goto decode_success;
24358      }
      /* VRSQRTPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 52 /r */
24360      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24361         delta = dis_AVX128_E_to_G_unary_all(
24362                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrt32Fx4 );
24363         goto decode_success;
24364      }
      /* VRSQRTPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 52 /r */
24366      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24367         delta = dis_AVX256_E_to_G_unary_all(
24368                    uses_vvvv, vbi, pfx, delta, "vrsqrtps", Iop_RSqrt32Fx8 );
24369         goto decode_success;
24370      }
24371      break;
24372
24373   case 0x53:
      /* VRCPSS xmm3/m32(E), xmm2(V), xmm1(G) = VEX.NDS.LIG.F3.0F.WIG 53 /r */
24375      if (haveF3no66noF2(pfx)) {
24376         delta = dis_AVX128_E_V_to_G_lo32_unary(
24377                    uses_vvvv, vbi, pfx, delta, "vrcpss", Iop_Recip32F0x4 );
24378         goto decode_success;
24379      }
      /* VRCPPS xmm2/m128(E), xmm1(G) = VEX.128.0F.WIG 53 /r */
24381      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24382         delta = dis_AVX128_E_to_G_unary_all(
24383                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_Recip32Fx4 );
24384         goto decode_success;
24385      }
      /* VRCPPS ymm2/m256(E), ymm1(G) = VEX.256.0F.WIG 53 /r */
24387      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24388         delta = dis_AVX256_E_to_G_unary_all(
24389                    uses_vvvv, vbi, pfx, delta, "vrcpps", Iop_Recip32Fx8 );
24390         goto decode_success;
24391      }
24392      break;
24393
24394   case 0x54:
24395      /* VANDPD r/m, rV, r ::: r = rV & r/m */
24396      /* VANDPD = VEX.NDS.128.66.0F.WIG 54 /r */
24397      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24398         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24399                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128 );
24400         goto decode_success;
24401      }
24402      /* VANDPD r/m, rV, r ::: r = rV & r/m */
24403      /* VANDPD = VEX.NDS.256.66.0F.WIG 54 /r */
24404      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24405         delta = dis_AVX256_E_V_to_G(
24406                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256 );
24407         goto decode_success;
24408      }
24409      /* VANDPS = VEX.NDS.128.0F.WIG 54 /r */
24410      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24411         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
24412                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128 );
24413         goto decode_success;
24414      }
24415      /* VANDPS = VEX.NDS.256.0F.WIG 54 /r */
24416      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24417         delta = dis_AVX256_E_V_to_G(
24418                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256 );
24419         goto decode_success;
24420      }
24421      break;
24422
24423   case 0x55:
24424      /* VANDNPD r/m, rV, r ::: r = (not rV) & r/m */
24425      /* VANDNPD = VEX.NDS.128.66.0F.WIG 55 /r */
24426      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24427         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
24428                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV128,
24429                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
24430         goto decode_success;
24431      }
24432      /* VANDNPD = VEX.NDS.256.66.0F.WIG 55 /r */
24433      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24434         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
24435                    uses_vvvv, vbi, pfx, delta, "vandpd", Iop_AndV256,
24436                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
24437         goto decode_success;
24438      }
24439      /* VANDNPS = VEX.NDS.128.0F.WIG 55 /r */
24440      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
24441         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
24442                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV128,
24443                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
24444         goto decode_success;
24445      }
24446      /* VANDNPS = VEX.NDS.256.0F.WIG 55 /r */
24447      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
24448         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
24449                    uses_vvvv, vbi, pfx, delta, "vandps", Iop_AndV256,
24450                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
24451         goto decode_success;
24452      }
24453      break;

   case 0x56:
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.128.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPD r/m, rV, r ::: r = rV | r/m */
      /* VORPD = VEX.NDS.256.66.0F.WIG 56 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorpd", Iop_OrV256 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.128.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV128 );
         goto decode_success;
      }
      /* VORPS r/m, rV, r ::: r = rV | r/m */
      /* VORPS = VEX.NDS.256.0F.WIG 56 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vorps", Iop_OrV256 );
         goto decode_success;
      }
      break;

   case 0x57:
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.128.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPD r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPD = VEX.NDS.256.66.0F.WIG 57 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorpd", Iop_XorV256 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.128.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV128 );
         goto decode_success;
      }
      /* VXORPS r/m, rV, r ::: r = rV ^ r/m */
      /* VXORPS = VEX.NDS.256.0F.WIG 57 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vxorps", Iop_XorV256 );
         goto decode_success;
      }
      break;

   case 0x58:
      /* VADDSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 58 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vaddsd", Iop_Add64F0x2 );
         goto decode_success;
      }
      /* VADDSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 58 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vaddss", Iop_Add32F0x4 );
         goto decode_success;
      }
      /* VADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx4 );
         goto decode_success;
      }
      /* VADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 58 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddps", Iop_Add32Fx8 );
         goto decode_success;
      }
      /* VADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx2 );
         goto decode_success;
      }
      /* VADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 58 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vaddpd", Iop_Add64Fx4 );
         goto decode_success;
      }
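      /* Re the scalar-vs-packed split above, assuming the usual
         semantics of the _lo64/_lo32 helpers: the scalar forms
         compute only the low lane, copy the remaining lanes of the
         destination from the V register, and zero bits 255:128.  As a
         lane sketch, for VADDSD:
            rG[63:0]    = rV[63:0] + E[63:0]
            rG[127:64]  = rV[127:64]
            rG[255:128] = 0
         The packed forms operate on all lanes instead. */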
      break;

   case 0x59:
      /* VMULSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 59 /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmulsd", Iop_Mul64F0x2 );
         goto decode_success;
      }
      /* VMULSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 59 /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmulss", Iop_Mul32F0x4 );
         goto decode_success;
      }
      /* VMULPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx4 );
         goto decode_success;
      }
      /* VMULPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 59 /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulps", Iop_Mul32Fx8 );
         goto decode_success;
      }
      /* VMULPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx2 );
         goto decode_success;
      }
      /* VMULPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 59 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmulpd", Iop_Mul64Fx4 );
         goto decode_success;
      }
      break;

   case 0x5A:
      /* VCVTPS2PD xmm2/m64, xmm1 = VEX.128.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPS2PD_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPS2PD xmm2/m128, ymm1 = VEX.256.0F.WIG 5A /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPS2PD_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTPD2PS xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTPD2PS_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTPD2PS ymm2/m256, xmm1 = VEX.256.66.0F.WIG 5A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTPD2PS_256( vbi, pfx, delta );
         goto decode_success;
      }
      /* VCVTSD2SS xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5A /r */
      if (haveF2no66noF3(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f64lo = newTemp(Ity_F64);
         IRTemp rmode = newTemp(Ity_I32);
         assign( rmode, get_sse_roundingmode() );
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f64lo, getXMMRegLane64F(rS, 0));
            delta += 1;
            DIP("vcvtsd2ss %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f64lo, loadLE(Ity_F64, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtsd2ss %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         putXMMRegLane32F( rD, 0,
                           binop( Iop_F64toF32, mkexpr(rmode),
                                                mkexpr(f64lo)) );
         putXMMRegLane32( rD, 1, getXMMRegLane32( rV, 1 ));
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
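      /* Net effect of the VCVTSD2SS case above, as a lane sketch:
            rD[31:0]    = F64toF32(rmode, f64lo)
            rD[127:32]  = rV[127:32]
            rD[255:128] = 0
         i.e. only the low float is computed; the other lanes are
         merged in from the V register and the top half is zeroed, as
         is usual for VEX.128 destinations. */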
      /* VCVTSS2SD xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5A /r */
      if (haveF3no66noF2(pfx)) {
         UChar  modrm = getUChar(delta);
         UInt   rV    = getVexNvvvv(pfx);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp f32lo = newTemp(Ity_F32);
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx,modrm);
            assign(f32lo, getXMMRegLane32F(rS, 0));
            delta += 1;
            DIP("vcvtss2sd %s,%s,%s\n",
                nameXMMReg(rS), nameXMMReg(rV), nameXMMReg(rD));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign(f32lo, loadLE(Ity_F32, mkexpr(addr)) );
            delta += alen;
            DIP("vcvtss2sd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rD));
         }
         putXMMRegLane64F( rD, 0,
                           unop( Iop_F32toF64, mkexpr(f32lo)) );
         putXMMRegLane64( rD, 1, getXMMRegLane64( rV, 1 ));
         putYMMRegLane128( rD, 1, mkV128(0) );
         *uses_vvvv = True;
         goto decode_success;
      }
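      /* Unlike VCVTSD2SS above, this conversion needs no rounding
         mode: widening F32 -> F64 is always exact, hence the plain
         unop. */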
      break;

   case 0x5B:
      /* VCVTPS2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTPS2DQ ymm2/m256, ymm1 = VEX.256.66.0F.WIG 5B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    False/*!r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPS2DQ_128( vbi, pfx, delta,
                                    True/*isAvx*/, True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTTPS2DQ ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 5B /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPS2DQ_256( vbi, pfx, delta,
                                    True/*r2zero*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS xmm2/m128, xmm1 = VEX.128.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PS_128 ( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VCVTDQ2PS ymm2/m256, ymm1 = VEX.256.0F.WIG 5B /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PS_256 ( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x5C:
      /* VSUBSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5C /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vsubsd", Iop_Sub64F0x2 );
         goto decode_success;
      }
      /* VSUBSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5C /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vsubss", Iop_Sub32F0x4 );
         goto decode_success;
      }
      /* VSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx4 );
         goto decode_success;
      }
      /* VSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5C /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubps", Iop_Sub32Fx8 );
         goto decode_success;
      }
      /* VSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx2 );
         goto decode_success;
      }
      /* VSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vsubpd", Iop_Sub64Fx4 );
         goto decode_success;
      }
      break;

   case 0x5D:
      /* VMINSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5D /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vminsd", Iop_Min64F0x2 );
         goto decode_success;
      }
      /* VMINSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5D /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vminss", Iop_Min32F0x4 );
         goto decode_success;
      }
      /* VMINPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx4 );
         goto decode_success;
      }
      /* VMINPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5D /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminps", Iop_Min32Fx8 );
         goto decode_success;
      }
      /* VMINPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx2 );
         goto decode_success;
      }
      /* VMINPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vminpd", Iop_Min64Fx4 );
         goto decode_success;
      }
      break;

   case 0x5E:
      /* VDIVSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5E /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vdivsd", Iop_Div64F0x2 );
         goto decode_success;
      }
      /* VDIVSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5E /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vdivss", Iop_Div32F0x4 );
         goto decode_success;
      }
      /* VDIVPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx4 );
         goto decode_success;
      }
      /* VDIVPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5E /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivps", Iop_Div32Fx8 );
         goto decode_success;
      }
      /* VDIVPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx2 );
         goto decode_success;
      }
      /* VDIVPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vdivpd", Iop_Div64Fx4 );
         goto decode_success;
      }
      break;

   case 0x5F:
      /* VMAXSD xmm3/m64, xmm2, xmm1 = VEX.NDS.LIG.F2.0F.WIG 5F /r */
      if (haveF2no66noF3(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo64(
                    uses_vvvv, vbi, pfx, delta, "vmaxsd", Iop_Max64F0x2 );
         goto decode_success;
      }
      /* VMAXSS xmm3/m32, xmm2, xmm1 = VEX.NDS.LIG.F3.0F.WIG 5F /r */
      if (haveF3no66noF2(pfx)) {
         delta = dis_AVX128_E_V_to_G_lo32(
                    uses_vvvv, vbi, pfx, delta, "vmaxss", Iop_Max32F0x4 );
         goto decode_success;
      }
      /* VMAXPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx4 );
         goto decode_success;
      }
      /* VMAXPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.0F.WIG 5F /r */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxps", Iop_Max32Fx8 );
         goto decode_success;
      }
      /* VMAXPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx2 );
         goto decode_success;
      }
      /* VMAXPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 5F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vmaxpd", Iop_Max64Fx4 );
         goto decode_success;
      }
      break;

   case 0x60:
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.128.66.0F.WIG 60 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    Iop_InterleaveLO8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLBW r/m, rV, r ::: r = interleave-lo-bytes(rV, r/m) */
      /* VPUNPCKLBW = VEX.NDS.256.66.0F.WIG 60 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklbw",
                    math_VPUNPCKLBW_YMM );
         goto decode_success;
      }
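      /* The swapArgs=True in the 128-bit case appears to compensate
         for operand order: the Iop_Interleave* primitives expect
         their arguments in the opposite order from the (E, V) pair
         that the generic helper would otherwise pass.  The same
         pattern recurs in the other VPUNPCK and VPACK cases below. */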
      break;

   case 0x61:
      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
      /* VPUNPCKLWD = VEX.NDS.128.66.0F.WIG 61 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
                    Iop_InterleaveLO16x8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLWD r/m, rV, r ::: r = interleave-lo-words(rV, r/m) */
      /* VPUNPCKLWD = VEX.NDS.256.66.0F.WIG 61 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklwd",
                    math_VPUNPCKLWD_YMM );
         goto decode_success;
      }
      break;

   case 0x62:
      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
      /* VPUNPCKLDQ = VEX.NDS.128.66.0F.WIG 62 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
                    Iop_InterleaveLO32x4, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLDQ r/m, rV, r ::: r = interleave-lo-dwords(rV, r/m) */
      /* VPUNPCKLDQ = VEX.NDS.256.66.0F.WIG 62 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckldq",
                    math_VPUNPCKLDQ_YMM );
         goto decode_success;
      }
      break;

   case 0x63:
      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
      /* VPACKSSWB = VEX.NDS.128.66.0F.WIG 63 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
                    Iop_QNarrowBin16Sto8Sx16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKSSWB r/m, rV, r ::: r = QNarrowBin16Sto8Sx16(rV, r/m) */
      /* VPACKSSWB = VEX.NDS.256.66.0F.WIG 63 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpacksswb",
                    math_VPACKSSWB_YMM );
         goto decode_success;
      }
      break;

   case 0x64:
      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
      /* VPCMPGTB = VEX.NDS.128.66.0F.WIG 64 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx16 );
         goto decode_success;
      }
      /* VPCMPGTB r/m, rV, r ::: r = rV `>s-by-8s` r/m */
      /* VPCMPGTB = VEX.NDS.256.66.0F.WIG 64 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtb", Iop_CmpGT8Sx32 );
         goto decode_success;
      }
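      /* As with the SSE originals, these comparisons set no flags:
         each lane of the result is all ones where the signed
         greater-than holds and all zeroes where it does not, which is
         exactly what Iop_CmpGT8Sx16/x32 compute. */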
      break;

   case 0x65:
      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
      /* VPCMPGTW = VEX.NDS.128.66.0F.WIG 65 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx8 );
         goto decode_success;
      }
      /* VPCMPGTW r/m, rV, r ::: r = rV `>s-by-16s` r/m */
      /* VPCMPGTW = VEX.NDS.256.66.0F.WIG 65 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtw", Iop_CmpGT16Sx16 );
         goto decode_success;
      }
      break;

   case 0x66:
      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
      /* VPCMPGTD = VEX.NDS.128.66.0F.WIG 66 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx4 );
         goto decode_success;
      }
      /* VPCMPGTD r/m, rV, r ::: r = rV `>s-by-32s` r/m */
      /* VPCMPGTD = VEX.NDS.256.66.0F.WIG 66 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtd", Iop_CmpGT32Sx8 );
         goto decode_success;
      }
      break;

   case 0x67:
      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
      /* VPACKUSWB = VEX.NDS.128.66.0F.WIG 67 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
                    Iop_QNarrowBin16Sto8Ux16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKUSWB r/m, rV, r ::: r = QNarrowBin16Sto8Ux16(rV, r/m) */
      /* VPACKUSWB = VEX.NDS.256.66.0F.WIG 67 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackuswb",
                    math_VPACKUSWB_YMM );
         goto decode_success;
      }
      break;

   case 0x68:
      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.128.66.0F.WIG 68 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
                    Iop_InterleaveHI8x16, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHBW r/m, rV, r ::: r = interleave-hi-bytes(rV, r/m) */
      /* VPUNPCKHBW = VEX.NDS.256.66.0F.WIG 68 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhbw",
                    math_VPUNPCKHBW_YMM );
         goto decode_success;
      }
      break;

   case 0x69:
      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.128.66.0F.WIG 69 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
                    Iop_InterleaveHI16x8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHWD r/m, rV, r ::: r = interleave-hi-words(rV, r/m) */
      /* VPUNPCKHWD = VEX.NDS.256.66.0F.WIG 69 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhwd",
                    math_VPUNPCKHWD_YMM );
         goto decode_success;
      }
      break;

   case 0x6A:
      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
      /* VPUNPCKHDQ = VEX.NDS.128.66.0F.WIG 6A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
                    Iop_InterleaveHI32x4, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHDQ r/m, rV, r ::: r = interleave-hi-dwords(rV, r/m) */
      /* VPUNPCKHDQ = VEX.NDS.256.66.0F.WIG 6A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhdq",
                    math_VPUNPCKHDQ_YMM );
         goto decode_success;
      }
      break;

   case 0x6B:
      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
      /* VPACKSSDW = VEX.NDS.128.66.0F.WIG 6B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
                    Iop_QNarrowBin32Sto16Sx8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKSSDW r/m, rV, r ::: r = QNarrowBin32Sto16Sx8(rV, r/m) */
      /* VPACKSSDW = VEX.NDS.256.66.0F.WIG 6B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackssdw",
                    math_VPACKSSDW_YMM );
         goto decode_success;
      }
      break;

   case 0x6C:
      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-qwords(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.128.66.0F.WIG 6C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
                    Iop_InterleaveLO64x2, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKLQDQ r/m, rV, r ::: r = interleave-lo-qwords(rV, r/m) */
      /* VPUNPCKLQDQ = VEX.NDS.256.66.0F.WIG 6C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpcklqdq",
                    math_VPUNPCKLQDQ_YMM );
         goto decode_success;
      }
      break;

   case 0x6D:
      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-qwords(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.128.66.0F.WIG 6D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
                    Iop_InterleaveHI64x2, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPUNPCKHQDQ r/m, rV, r ::: r = interleave-hi-qwords(rV, r/m) */
      /* VPUNPCKHQDQ = VEX.NDS.256.66.0F.WIG 6D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpunpckhqdq",
                    math_VPUNPCKHQDQ_YMM );
         goto decode_success;
      }
      break;

   case 0x6E:
      /* VMOVD r32/m32, xmm1 = VEX.128.66.0F.W0 6E /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         vassert(sz == 2); /* even though we are transferring 4, not 2. */
         UChar modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, getIReg32(eregOfRexRM(pfx,modrm)) )
            );
            DIP("vmovd %s, %s\n", nameIReg32(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)) )
            );
            DIP("vmovd %s, %s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
      /* VMOVQ r64/m64, xmm1 = VEX.128.66.0F.W1 6E /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
         vassert(sz == 2); /* even though we are transferring 8, not 2. */
         UChar modrm = getUChar(delta);
         if (epartIsReg(modrm)) {
            delta += 1;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, getIReg64(eregOfRexRM(pfx,modrm)) )
            );
            DIP("vmovq %s, %s\n", nameIReg64(eregOfRexRM(pfx,modrm)),
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            putYMMRegLoAndZU(
               gregOfRexRM(pfx,modrm),
               unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)) )
            );
            DIP("vmovq %s, %s\n", dis_buf,
                                  nameXMMReg(gregOfRexRM(pfx,modrm)));
         }
         goto decode_success;
      }
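      /* In both forms above, putYMMRegLoAndZU writes the 128-bit
         result to the low half of the YMM register and zeroes bits
         255:128, giving the zero-extension which VEX-encoded moves
         require (the legacy SSE forms, by contrast, leave the upper
         half unchanged). */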
      break;

   case 0x6F:
      /* VMOVDQA ymm2/m256, ymm1 = VEX.256.66.0F.WIG 6F */
      /* VMOVDQU ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 6F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         Bool   isA   = have66noF2noF3(pfx);
         HChar  ch    = isA ? 'a' : 'u';
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(tD, getYMMReg(rS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_32_aligned(addr);
            assign(tD, loadLE(Ity_V256, mkexpr(addr)));
            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameYMMReg(rD));
         }
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VMOVDQA xmm2/m128, xmm1 = VEX.128.66.0F.WIG 6F */
      /* VMOVDQU xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 6F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         Bool   isA   = have66noF2noF3(pfx);
         HChar  ch    = isA ? 'a' : 'u';
         if (epartIsReg(modrm)) {
            UInt rS = eregOfRexRM(pfx, modrm);
            delta += 1;
            assign(tD, getXMMReg(rS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_16_aligned(addr);
            assign(tD, loadLE(Ity_V128, mkexpr(addr)));
            DIP("vmovdq%c %s,%s\n", ch, dis_buf, nameXMMReg(rD));
         }
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
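      /* Only the 'a' (aligned) forms fault on a misaligned memory
         operand, hence the conditional gen_SEGV_if_not_16/32_aligned
         calls above; VMOVDQU tolerates any alignment. */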
      break;

   case 0x70:
      /* VPSHUFD imm8, xmm2/m128, xmm1 = VEX.128.66.0F.WIG 70 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFD_32x4( vbi, pfx, delta, True/*writesYmm*/);
         goto decode_success;
      }
      /* VPSHUFD imm8, ymm2/m256, ymm1 = VEX.256.66.0F.WIG 70 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFD_32x8( vbi, pfx, delta);
         goto decode_success;
      }
      /* VPSHUFLW imm8, xmm2/m128, xmm1 = VEX.128.F2.0F.WIG 70 /r ib */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  True/*isAvx*/, False/*!xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFLW imm8, ymm2/m256, ymm1 = VEX.256.F2.0F.WIG 70 /r ib */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFxW_256( vbi, pfx, delta, False/*!xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFHW imm8, xmm2/m128, xmm1 = VEX.128.F3.0F.WIG 70 /r ib */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PSHUFxW_128( vbi, pfx, delta,
                                  True/*isAvx*/, True/*xIsH*/ );
         goto decode_success;
      }
      /* VPSHUFHW imm8, ymm2/m256, ymm1 = VEX.256.F3.0F.WIG 70 /r ib */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PSHUFxW_256( vbi, pfx, delta, True/*xIsH*/ );
         goto decode_success;
      }
      break;

   case 0x71:
      /* VPSRLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /2 ib */
      /* VPSRAW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /4 ib */
      /* VPSLLW imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 71 /6 ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlw", Iop_ShrN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsraw", Iop_SarN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllw", Iop_ShlN16x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /2 ib */
      /* VPSRAW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /4 ib */
      /* VPSLLW imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 71 /6 ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlw", Iop_ShrN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsraw", Iop_SarN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsllw", Iop_ShlN16x16 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
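      /* In these NDD-encoded shifts the ModRM reg field is an opcode
         extension (/2 = shift right logical, /4 = shift right
         arithmetic, /6 = shift left) rather than a register number,
         and VEX.vvvv names the destination -- hence the explicit
         *uses_vvvv = True after each sub-case.  Cases 0x72 and 0x73
         below follow the same scheme for dword and qword shifts. */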
      break;

   case 0x72:
      /* VPSRLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /2 ib */
      /* VPSRAD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /4 ib */
      /* VPSLLD imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 72 /6 ib */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrld", Iop_ShrN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrad", Iop_SarN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpslld", Iop_ShlN32x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /2 ib */
      /* VPSRAD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /4 ib */
      /* VPSLLD imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 72 /6 ib */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         if (gregLO3ofRM(getUChar(delta)) == 2/*SRL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrld", Iop_ShrN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 4/*SRA*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrad", Iop_SarN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6/*SLL*/) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpslld", Iop_ShlN32x8 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      break;

   case 0x73:
      /* VPSRLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, xmm2, xmm1 = VEX.NDD.128.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);
         IRTemp vecS = newTemp(Ity_V128);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSRLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameXMMReg(rS), nameXMMReg(rD));
            delta += 2;
            assign( vecS, getXMMReg(rS) );
            putYMMRegLoAndZU(rD, mkexpr(math_PSLLDQ( vecS, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            delta = dis_AVX128_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x2 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VPSRLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /3 ib */
      /* VPSLLDQ imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /7 ib */
      /* VPSRLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /2 ib */
      /* VPSLLQ  imm8, ymm2, ymm1 = VEX.NDD.256.66.0F.WIG 73 /6 ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         Int    rS   = eregOfRexRM(pfx,getUChar(delta));
         Int    rD   = getVexNvvvv(pfx);
         if (gregLO3ofRM(getUChar(delta)) == 3) {
            IRTemp vecS0 = newTemp(Ity_V128);
            IRTemp vecS1 = newTemp(Ity_V128);
            Int imm = (Int)getUChar(delta+1);
            DIP("vpsrldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
            delta += 2;
            assign( vecS0, getYMMRegLane128(rS, 0));
            assign( vecS1, getYMMRegLane128(rS, 1));
            putYMMRegLane128(rD, 0, mkexpr(math_PSRLDQ( vecS0, imm )));
            putYMMRegLane128(rD, 1, mkexpr(math_PSRLDQ( vecS1, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 7) {
            IRTemp vecS0 = newTemp(Ity_V128);
            IRTemp vecS1 = newTemp(Ity_V128);
            Int imm = (Int)getUChar(delta+1);
            DIP("vpslldq $%d,%s,%s\n", imm, nameYMMReg(rS), nameYMMReg(rD));
            delta += 2;
            assign( vecS0, getYMMRegLane128(rS, 0));
            assign( vecS1, getYMMRegLane128(rS, 1));
            putYMMRegLane128(rD, 0, mkexpr(math_PSLLDQ( vecS0, imm )));
            putYMMRegLane128(rD, 1, mkexpr(math_PSLLDQ( vecS1, imm )));
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 2) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsrlq", Iop_ShrN64x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         if (gregLO3ofRM(getUChar(delta)) == 6) {
            delta = dis_AVX256_shiftE_to_V_imm( pfx, delta,
                                                "vpsllq", Iop_ShlN64x4 );
            *uses_vvvv = True;
            goto decode_success;
         }
         /* else fall through */
      }
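      /* Note that the 256-bit VPSRLDQ/VPSLLDQ shift each 128-bit lane
         independently -- no bytes move across the middle of the YMM
         register -- which is why the code above splits the source
         into two V128 halves and applies math_PSRLDQ/math_PSLLDQ to
         each half separately. */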
      break;

   case 0x74:
      /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
      /* VPCMPEQB = VEX.NDS.128.66.0F.WIG 74 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x16 );
         goto decode_success;
      }
      /* VPCMPEQB r/m, rV, r ::: r = rV `eq-by-8s` r/m */
      /* VPCMPEQB = VEX.NDS.256.66.0F.WIG 74 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqb", Iop_CmpEQ8x32 );
         goto decode_success;
      }
      break;

   case 0x75:
      /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
      /* VPCMPEQW = VEX.NDS.128.66.0F.WIG 75 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x8 );
         goto decode_success;
      }
      /* VPCMPEQW r/m, rV, r ::: r = rV `eq-by-16s` r/m */
      /* VPCMPEQW = VEX.NDS.256.66.0F.WIG 75 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqw", Iop_CmpEQ16x16 );
         goto decode_success;
      }
      break;

   case 0x76:
      /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
      /* VPCMPEQD = VEX.NDS.128.66.0F.WIG 76 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x4 );
         goto decode_success;
      }
      /* VPCMPEQD r/m, rV, r ::: r = rV `eq-by-32s` r/m */
      /* VPCMPEQD = VEX.NDS.256.66.0F.WIG 76 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqd", Iop_CmpEQ32x8 );
         goto decode_success;
      }
      break;

   case 0x77:
      /* VZEROUPPER = VEX.128.0F.WIG 77 */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int i;
         IRTemp zero128 = newTemp(Ity_V128);
         assign(zero128, mkV128(0));
         for (i = 0; i < 16; i++) {
            putYMMRegLane128(i, 1, mkexpr(zero128));
         }
         DIP("vzeroupper\n");
         goto decode_success;
      }
      /* VZEROALL = VEX.256.0F.WIG 77 */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int i;
         IRTemp zero128 = newTemp(Ity_V128);
         assign(zero128, mkV128(0));
         for (i = 0; i < 16; i++) {
            putYMMRegLoAndZU(i, mkexpr(zero128));
         }
         DIP("vzeroall\n");
         goto decode_success;
      }
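      /* VZEROUPPER and VZEROALL share an opcode and are distinguished
         only by VEX.L, as the two guards above show.  Both are
         modelled as straightforward writes of zeroes to the guest YMM
         state; no lazy upper-state tracking is attempted. */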
      break;

   case 0x7C:
   case 0x7D:
      /* VHADDPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7C /r */
      /* VHSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG 7D /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         IRTemp sV     = newTemp(Ity_V128);
         IRTemp dV     = newTemp(Ity_V128);
         Bool   isAdd  = opc == 0x7C;
         const HChar* str = isAdd ? "add" : "sub";
         UChar modrm   = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx,modrm);
         UInt   rV     = getVexNvvvv(pfx);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            DIP("vh%sps %s,%s,%s\n", str, nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vh%sps %s,%s,%s\n", str, dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
            delta += alen;
         }
         assign( dV, getXMMReg(rV) );
         putYMMRegLoAndZU( rG, mkexpr( math_HADDPS_128 ( dV, sV, isAdd ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VHADDPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7C /r */
      /* VHSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG 7D /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV     = newTemp(Ity_V256);
         IRTemp dV     = newTemp(Ity_V256);
         IRTemp s1, s0, d1, d0;
         Bool   isAdd  = opc == 0x7C;
         const HChar* str = isAdd ? "add" : "sub";
         UChar modrm   = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx,modrm);
         UInt   rV     = getVexNvvvv(pfx);
         s1 = s0 = d1 = d0 = IRTemp_INVALID;
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            DIP("vh%sps %s,%s,%s\n", str, nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vh%sps %s,%s,%s\n", str, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
            delta += alen;
         }
         assign( dV, getYMMReg(rV) );
         breakupV256toV128s( dV, &d1, &d0 );
         breakupV256toV128s( sV, &s1, &s0 );
         putYMMReg( rG, binop(Iop_V128HLtoV256,
                              mkexpr( math_HADDPS_128 ( d1, s1, isAdd ) ),
                              mkexpr( math_HADDPS_128 ( d0, s0, isAdd ) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VHADDPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7C /r */
      /* VHSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG 7D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         IRTemp sV     = newTemp(Ity_V128);
         IRTemp dV     = newTemp(Ity_V128);
         Bool   isAdd  = opc == 0x7C;
         const HChar* str = isAdd ? "add" : "sub";
         UChar modrm   = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx,modrm);
         UInt   rV     = getVexNvvvv(pfx);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            DIP("vh%spd %s,%s,%s\n", str, nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
            delta += alen;
         }
         assign( dV, getXMMReg(rV) );
         putYMMRegLoAndZU( rG, mkexpr( math_HADDPD_128 ( dV, sV, isAdd ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VHADDPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7C /r */
      /* VHSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG 7D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV     = newTemp(Ity_V256);
         IRTemp dV     = newTemp(Ity_V256);
         IRTemp s1, s0, d1, d0;
         Bool   isAdd  = opc == 0x7C;
         const HChar* str = isAdd ? "add" : "sub";
         UChar modrm   = getUChar(delta);
         UInt   rG     = gregOfRexRM(pfx,modrm);
         UInt   rV     = getVexNvvvv(pfx);
         s1 = s0 = d1 = d0 = IRTemp_INVALID;
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            DIP("vh%spd %s,%s,%s\n", str, nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
            delta += 1;
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            DIP("vh%spd %s,%s,%s\n", str, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
            delta += alen;
         }
         assign( dV, getYMMReg(rV) );
         breakupV256toV128s( dV, &d1, &d0 );
         breakupV256toV128s( sV, &s1, &s0 );
         putYMMReg( rG, binop(Iop_V128HLtoV256,
                              mkexpr( math_HADDPD_128 ( d1, s1, isAdd ) ),
                              mkexpr( math_HADDPD_128 ( d0, s0, isAdd ) ) ) );
         *uses_vvvv = True;
         goto decode_success;
      }
25683      break;
25684
25685   case 0x7E:
25686      /* Note the Intel docs don't make sense for this.  I think they
25687         are wrong.  They seem to imply it is a store when in fact I
25688         think it is a load.  Also it's unclear whether this is W0, W1
25689         or WIG. */
25690      /* VMOVQ xmm2/m64, xmm1 = VEX.128.F3.0F.W0 7E /r */
25691      if (haveF3no66noF2(pfx)
25692          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
25693         vassert(sz == 4); /* even tho we are transferring 8, not 4. */
25694         UChar modrm = getUChar(delta);
25695         UInt  rG    = gregOfRexRM(pfx,modrm);
25696         if (epartIsReg(modrm)) {
25697            UInt rE = eregOfRexRM(pfx,modrm);
25698            putXMMRegLane64( rG, 0, getXMMRegLane64( rE, 0 ));
25699            DIP("vmovq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
25700            delta += 1;
25701         } else {
25702            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
25703            putXMMRegLane64( rG, 0, loadLE(Ity_I64, mkexpr(addr)) );
25704            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
25705            delta += alen;
25706         }
25707         /* zero bits 255:64 */
25708         putXMMRegLane64( rG, 1, mkU64(0) );
25709         putYMMRegLane128( rG, 1, mkV128(0) );
25710         goto decode_success;
25711      }
25712      /* VMOVQ xmm1, r64 = VEX.128.66.0F.W1 7E /r (reg case only) */
25713      /* Moves from G to E, so is a store-form insn */
25714      /* Intel docs list this in the VMOVD entry for some reason. */
25715      if (have66noF2noF3(pfx)
25716          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
25717         UChar modrm = getUChar(delta);
25718         UInt  rG    = gregOfRexRM(pfx,modrm);
25719         if (epartIsReg(modrm)) {
25720            UInt rE = eregOfRexRM(pfx,modrm);
25721            DIP("vmovq %s,%s\n", nameXMMReg(rG), nameIReg64(rE));
25722            putIReg64(rE, getXMMRegLane64(rG, 0));
25723            delta += 1;
25724         } else {
25725            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
25726            storeLE( mkexpr(addr), getXMMRegLane64(rG, 0) );
25727            DIP("vmovq %s,%s\n", dis_buf, nameXMMReg(rG));
25728            delta += alen;
25729         }
25730         goto decode_success;
25731      }
25732      /* VMOVD xmm1, m32/r32 = VEX.128.66.0F.W0 7E /r (reg case only) */
25733      /* Moves from G to E, so is a store-form insn */
25734      if (have66noF2noF3(pfx)
25735          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
25736         UChar modrm = getUChar(delta);
25737         UInt  rG    = gregOfRexRM(pfx,modrm);
25738         if (epartIsReg(modrm)) {
25739            UInt rE = eregOfRexRM(pfx,modrm);
25740            DIP("vmovd %s,%s\n", nameXMMReg(rG), nameIReg32(rE));
25741            putIReg32(rE, getXMMRegLane32(rG, 0));
25742            delta += 1;
25743         } else {
25744            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
25745            storeLE( mkexpr(addr), getXMMRegLane32(rG, 0) );
25746            DIP("vmovd %s,%s\n", dis_buf, nameXMMReg(rG));
25747            delta += alen;
25748         }
25749         goto decode_success;
25750      }
25751      break;
25752
25753   case 0x7F:
25754      /* VMOVDQA ymm1, ymm2/m256 = VEX.256.66.0F.WIG 7F */
25755      /* VMOVDQU ymm1, ymm2/m256 = VEX.256.F3.0F.WIG 7F */
25756      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
25757          && 1==getVexL(pfx)/*256*/) {
25758         UChar  modrm = getUChar(delta);
25759         UInt   rS    = gregOfRexRM(pfx, modrm);
25760         IRTemp tS    = newTemp(Ity_V256);
25761         Bool   isA   = have66noF2noF3(pfx);
25762         HChar  ch    = isA ? 'a' : 'u';
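         /* Only the aligned form (VMOVDQA) may fault on a misaligned
            address; VMOVDQU accepts any alignment. */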
         assign(tS, getYMMReg(rS));
         if (epartIsReg(modrm)) {
            UInt rD = eregOfRexRM(pfx, modrm);
            delta += 1;
            putYMMReg(rD, mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), nameYMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_32_aligned(addr);
            storeLE(mkexpr(addr), mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameYMMReg(rS), dis_buf);
         }
         goto decode_success;
      }
      /* VMOVDQA xmm1, xmm2/m128 = VEX.128.66.0F.WIG 7F */
      /* VMOVDQU xmm1, xmm2/m128 = VEX.128.F3.0F.WIG 7F */
      if ((have66noF2noF3(pfx) || haveF3no66noF2(pfx))
          && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rS    = gregOfRexRM(pfx, modrm);
         IRTemp tS    = newTemp(Ity_V128);
         Bool   isA   = have66noF2noF3(pfx);
         HChar  ch    = isA ? 'a' : 'u';
         assign(tS, getXMMReg(rS));
         if (epartIsReg(modrm)) {
            UInt rD = eregOfRexRM(pfx, modrm);
            delta += 1;
            putYMMRegLoAndZU(rD, mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), nameXMMReg(rD));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            if (isA)
               gen_SEGV_if_not_16_aligned(addr);
            storeLE(mkexpr(addr), mkexpr(tS));
            DIP("vmovdq%c %s,%s\n", ch, nameXMMReg(rS), dis_buf);
         }
         goto decode_success;
      }
      break;

   case 0xAE:
      /* VSTMXCSR m32 = VEX.LZ.0F.WIG AE /3 */
      if (haveNo66noF2noF3(pfx)
          && 0==getVexL(pfx)/*LZ*/
          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 3
          && sz == 4) {
         delta = dis_STMXCSR(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      /* VLDMXCSR m32 = VEX.LZ.0F.WIG AE /2 */
      if (haveNo66noF2noF3(pfx)
          && 0==getVexL(pfx)/*LZ*/
          && 0==getRexW(pfx) /* be paranoid -- Intel docs don't require this */
          && !epartIsReg(getUChar(delta)) && gregLO3ofRM(getUChar(delta)) == 2
          && sz == 4) {
         delta = dis_LDMXCSR(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      break;

   case 0xC2:
      /* VCMPSD xmm3/m64(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.LIG.F2.0F.WIG C2 /r ib */
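      /* The cmp helpers return the original delta if they fail to
         decode (e.g. an unsupported imm8 predicate, presumably), so
         check whether any bytes were consumed and fall through to the
         next candidate if not. */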
      if (haveF2no66noF3(pfx)) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpsd", False/*!all_lanes*/,
                                          8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPSS xmm3/m32(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.LIG.F3.0F.WIG C2 /r ib */
      if (haveF3no66noF2(pfx)) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpss", False/*!all_lanes*/,
                                          4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPD xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.128.66.0F.WIG C2 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmppd", True/*all_lanes*/,
                                          8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPD ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
      /* = VEX.NDS.256.66.0F.WIG C2 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Long delta0 = delta;
         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmppd", 8/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPS xmm3/m128(E=argL), xmm2(V=argR), xmm1(G) */
      /* = VEX.NDS.128.0F.WIG C2 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Long delta0 = delta;
         delta = dis_AVX128_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpps", True/*all_lanes*/,
                                          4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      /* VCMPPS ymm3/m256(E=argL), ymm2(V=argR), ymm1(G) */
      /* = VEX.NDS.256.0F.WIG C2 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Long delta0 = delta;
         delta = dis_AVX256_cmp_V_E_to_G( uses_vvvv, vbi, pfx, delta,
                                          "vcmpps", 4/*sz*/);
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;

   case 0xC4:
      /* VPINSRW r32/m16, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG C4 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         Int    imm8;
         IRTemp new16 = newTemp(Ity_I16);

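         /* The imm8 selects which of the eight 16-bit lanes to
            replace; only its low three bits are significant. */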
         if ( epartIsReg( modrm ) ) {
            imm8 = (Int)(getUChar(delta+1) & 7);
            assign( new16, unop(Iop_32to16,
                                getIReg32(eregOfRexRM(pfx,modrm))) );
            delta += 1+1;
            DIP( "vpinsrw $%d,%s,%s\n", imm8,
                 nameIReg32( eregOfRexRM(pfx, modrm) ), nameXMMReg(rG) );
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
            imm8 = (Int)(getUChar(delta+alen) & 7);
            assign( new16, loadLE( Ity_I16, mkexpr(addr) ));
            delta += alen+1;
            DIP( "vpinsrw $%d,%s,%s\n",
                 imm8, dis_buf, nameXMMReg(rG) );
         }

         IRTemp src_vec = newTemp(Ity_V128);
         assign(src_vec, getXMMReg( rV ));
         IRTemp res_vec = math_PINSRW_128( src_vec, new16, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xC5:
      /* VPEXTRW imm8, xmm1, reg32 = VEX.128.66.0F.W0 C5 /r ib */
      if (have66noF2noF3(pfx)
         && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         Long delta0 = delta;
         delta = dis_PEXTRW_128_EregOnly_toG( vbi, pfx, delta,
                                              True/*isAvx*/ );
         if (delta > delta0) goto decode_success;
         /* else fall through -- decoding has failed */
      }
      break;

   case 0xC6:
      /* VSHUFPS imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPS_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPS imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.0F.WIG C6 /r ib */
      if (haveNo66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufps $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPS_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, xmm3/m128, xmm2, xmm1 */
      /* = VEX.NDS.128.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V128);
         IRTemp vV   = newTemp(Ity_V128);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getXMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getXMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
         }
         IRTemp res = math_SHUFPD_128( eV, vV, imm8 );
         putYMMRegLoAndZU( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VSHUFPD imm8, ymm3/m256, ymm2, ymm1 */
      /* = VEX.NDS.256.66.0F.WIG C6 /r ib */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         Int    imm8 = 0;
         IRTemp eV   = newTemp(Ity_V256);
         IRTemp vV   = newTemp(Ity_V256);
         UInt  modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         UInt  rV    = getVexNvvvv(pfx);
         assign( vV, getYMMReg(rV) );
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( eV, getYMMReg(rE) );
            imm8 = (Int)getUChar(delta+1);
            delta += 1+1;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 1 );
            assign( eV, loadLE(Ity_V256, mkexpr(addr)) );
            imm8 = (Int)getUChar(delta+alen);
            delta += 1+alen;
            DIP("vshufpd $%d,%s,%s,%s\n",
                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
         }
         IRTemp res = math_SHUFPD_256( eV, vV, imm8 );
         putYMMReg( rG, mkexpr(res) );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xD0:
      /* VADDSUBPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D0 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubpd", math_ADDSUBPD_128 );
         goto decode_success;
      }
      /* VADDSUBPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D0 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubpd", math_ADDSUBPD_256 );
         goto decode_success;
      }
      /* VADDSUBPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.F2.0F.WIG D0 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubps", math_ADDSUBPS_128 );
         goto decode_success;
      }
      /* VADDSUBPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.F2.0F.WIG D0 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vaddsubps", math_ADDSUBPS_256 );
         goto decode_success;
      }
      break;

   case 0xD1:
      /* VPSRLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlw", Iop_ShrN16x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D1 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlw", Iop_ShrN16x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xD2:
      /* VPSRLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrld", Iop_ShrN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xD3:
      /* VPSRLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrlq", Iop_ShrN64x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xD4:
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.128.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x2 );
         goto decode_success;
      }
      /* VPADDQ r/m, rV, r ::: r = rV + r/m */
      /* VPADDQ = VEX.NDS.256.66.0F.WIG D4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddq", Iop_Add64x4 );
         goto decode_success;
      }
      break;

   case 0xD5:
      /* VPMULLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x8 );
         goto decode_success;
      }
      /* VPMULLW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmullw", Iop_Mul16x16 );
         goto decode_success;
      }
      break;

   case 0xD6:
      /* I can't even find any Intel docs for this one. */
      /* Basically: 66 0F D6 = MOVQ -- move 64 bits from G (lo half
         xmm) to E (mem or lo half xmm).  Looks like L==0(128), W==0
         (WIG, maybe?) */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && 0==getRexW(pfx)/*this might be redundant, dunno*/) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx,modrm);
         if (epartIsReg(modrm)) {
            /* fall through, awaiting test case */
            /* dst: lo half copied, hi half zeroed */
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            storeLE( mkexpr(addr), getXMMRegLane64( rG, 0 ));
            DIP("vmovq %s,%s\n", nameXMMReg(rG), dis_buf );
            delta += alen;
            goto decode_success;
         }
      }
      break;

   case 0xD7:
      /* VEX.128.66.0F.WIG D7 /r = VPMOVMSKB xmm1, r32 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVMSKB_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VEX.256.66.0F.WIG D7 /r = VPMOVMSKB ymm1, r32 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVMSKB_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0xD8:
      /* VPSUBUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux16 );
         goto decode_success;
      }
      /* VPSUBUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusb", Iop_QSub8Ux32 );
         goto decode_success;
      }
      break;

   case 0xD9:
      /* VPSUBUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG D9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux8 );
         goto decode_success;
      }
      /* VPSUBUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG D9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubusw", Iop_QSub16Ux16 );
         goto decode_success;
      }
      break;

   case 0xDA:
      /* VPMINUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux16 );
         goto decode_success;
      }
      /* VPMINUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpminub", Iop_Min8Ux32 );
         goto decode_success;
      }
      break;

   case 0xDB:
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.128.66.0F.WIG DB /r = VPAND xmm3/m128, xmm2, xmm1 */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV128 );
         goto decode_success;
      }
      /* VPAND r/m, rV, r ::: r = rV & r/m */
      /* VEX.NDS.256.66.0F.WIG DB /r = VPAND ymm3/m256, ymm2, ymm1 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpand", Iop_AndV256 );
         goto decode_success;
      }
      break;

   case 0xDC:
      /* VPADDUSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DC /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux16 );
         goto decode_success;
      }
      /* VPADDUSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DC /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusb", Iop_QAdd8Ux32 );
         goto decode_success;
      }
      break;

   case 0xDD:
      /* VPADDUSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DD /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux8 );
         goto decode_success;
      }
      /* VPADDUSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DD /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddusw", Iop_QAdd16Ux16 );
         goto decode_success;
      }
      break;

   case 0xDE:
      /* VPMAXUB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG DE /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux16 );
         goto decode_success;
      }
      /* VPMAXUB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG DE /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmaxub", Iop_Max8Ux32 );
         goto decode_success;
      }
      break;

   case 0xDF:
      /* VPANDN r/m, rV, r ::: r = ~rV & r/m */
      /* VEX.NDS.128.66.0F.WIG DF /r = VPANDN xmm3/m128, xmm2, xmm1 */
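      /* Worked example: if a lane of rV holds 0b1100 and the matching
         lane of r/m holds 0b1010, the result lane is
         (~0b1100) & 0b1010 = 0b0010. */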
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV128,
                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
         goto decode_success;
      }
      /* VPANDN r/m, rV, r ::: r = ~rV & r/m */
      /* VEX.NDS.256.66.0F.WIG DF /r = VPANDN ymm3/m256, ymm2, ymm1 */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpandn", Iop_AndV256,
                    NULL, True/*invertLeftArg*/, False/*swapArgs*/ );
         goto decode_success;
      }
      break;

   case 0xE0:
      /* VPAVGB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E0 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux16 );
         goto decode_success;
      }
      /* VPAVGB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E0 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgb", Iop_Avg8Ux32 );
         goto decode_success;
      }
      break;

   case 0xE1:
      /* VPSRAW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsraw", Iop_SarN16x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRAW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E1 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsraw", Iop_SarN16x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xE2:
      /* VPSRAD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsrad", Iop_SarN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSRAD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsrad", Iop_SarN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xE3:
      /* VPAVGW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux8 );
         goto decode_success;
      }
      /* VPAVGW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpavgw", Iop_Avg16Ux16 );
         goto decode_success;
      }
      break;

   case 0xE4:
      /* VPMULHUW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux8 );
         goto decode_success;
      }
      /* VPMULHUW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhuw", Iop_MulHi16Ux16 );
         goto decode_success;
      }
      break;

   case 0xE5:
      /* VPMULHW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx8 );
         goto decode_success;
      }
      /* VPMULHW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpmulhw", Iop_MulHi16Sx16 );
         goto decode_success;
      }
      break;

   case 0xE6:
      /* VCVTDQ2PD xmm2/m64, xmm1 = VEX.128.F3.0F.WIG E6 /r */
      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTDQ2PD_128(vbi, pfx, delta, True/*isAvx*/);
         goto decode_success;
      }
      /* VCVTDQ2PD xmm2/m128, ymm1 = VEX.256.F3.0F.WIG E6 /r */
      if (haveF3no66noF2(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTDQ2PD_256(vbi, pfx, delta);
         goto decode_success;
      }
      /* VCVTTPD2DQ xmm2/m128, xmm1 = VEX.128.66.0F.WIG E6 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
                                   True/*r2zero*/);
         goto decode_success;
      }
      /* VCVTTPD2DQ ymm2/m256, xmm1 = VEX.256.66.0F.WIG E6 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, True/*r2zero*/);
         goto decode_success;
      }
      /* VCVTPD2DQ xmm2/m128, xmm1 = VEX.128.F2.0F.WIG E6 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_CVTxPD2DQ_128(vbi, pfx, delta, True/*isAvx*/,
                                   False/*!r2zero*/);
         goto decode_success;
      }
      /* VCVTPD2DQ ymm2/m256, xmm1 = VEX.256.F2.0F.WIG E6 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_CVTxPD2DQ_256(vbi, pfx, delta, False/*!r2zero*/);
         goto decode_success;
      }
      break;

   case 0xE7:
      /* VMOVNTDQ xmm1, m128 = VEX.128.66.0F.WIG E7 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar modrm = getUChar(delta);
         UInt rG     = gregOfRexRM(pfx,modrm);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_16_aligned( addr );
            storeLE( mkexpr(addr), getXMMReg(rG) );
            DIP("vmovntdq %s,%s\n", nameXMMReg(rG), dis_buf);
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      /* VMOVNTDQ ymm1, m256 = VEX.256.66.0F.WIG E7 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar modrm = getUChar(delta);
         UInt rG     = gregOfRexRM(pfx,modrm);
         if (!epartIsReg(modrm)) {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            gen_SEGV_if_not_32_aligned( addr );
            storeLE( mkexpr(addr), getYMMReg(rG) );
            DIP("vmovntdq %s,%s\n", nameYMMReg(rG), dis_buf);
            delta += alen;
            goto decode_success;
         }
         /* else fall through */
      }
      break;

   case 0xE8:
      /* VPSUBSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx16 );
         goto decode_success;
      }
      /* VPSUBSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsb", Iop_QSub8Sx32 );
         goto decode_success;
      }
      break;

   case 0xE9:
      /* VPSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG E9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx8 );
         goto decode_success;
      }
      /* VPSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG E9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpsubsw", Iop_QSub16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEA:
      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
      /* VPMINSW = VEX.NDS.128.66.0F.WIG EA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx8 );
         goto decode_success;
      }
      /* VPMINSW r/m, rV, r ::: r = min-signed16s(rV, r/m) */
      /* VPMINSW = VEX.NDS.256.66.0F.WIG EA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsw", Iop_Min16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEB:
      /* VPOR r/m, rV, r ::: r = rV | r/m */
      /* VPOR = VEX.NDS.128.66.0F.WIG EB /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV128 );
         goto decode_success;
      }
      /* VPOR r/m, rV, r ::: r = rV | r/m */
      /* VPOR = VEX.NDS.256.66.0F.WIG EB /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpor", Iop_OrV256 );
         goto decode_success;
      }
      break;

   case 0xEC:
      /* VPADDSB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG EC /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx16 );
         goto decode_success;
      }
      /* VPADDSB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG EC /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsb", Iop_QAdd8Sx32 );
         goto decode_success;
      }
      break;

   case 0xED:
      /* VPADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG ED /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx8 );
         goto decode_success;
      }
      /* VPADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG ED /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_V_to_G(
                    uses_vvvv, vbi, pfx, delta, "vpaddsw", Iop_QAdd16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEE:
      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
      /* VPMAXSW = VEX.NDS.128.66.0F.WIG EE /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx8 );
         goto decode_success;
      }
      /* VPMAXSW r/m, rV, r ::: r = max-signed16s(rV, r/m) */
      /* VPMAXSW = VEX.NDS.256.66.0F.WIG EE /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpmaxsw", Iop_Max16Sx16 );
         goto decode_success;
      }
      break;

   case 0xEF:
      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
      /* VPXOR = VEX.NDS.128.66.0F.WIG EF /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV128 );
         goto decode_success;
      }
      /* VPXOR r/m, rV, r ::: r = rV ^ r/m */
      /* VPXOR = VEX.NDS.256.66.0F.WIG EF /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpxor", Iop_XorV256 );
         goto decode_success;
      }
      break;

   case 0xF0:
      /* VLDDQU m256, ymm1 = VEX.256.F2.0F.WIG F0 /r */
      if (haveF2no66noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
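         /* VLDDQU has no register form; a register E operand is
            invalid, so give up on the decode. */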
         if (epartIsReg(modrm)) break;
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
         DIP("vlddqu %s,%s\n", dis_buf, nameYMMReg(rD));
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VLDDQU m128, xmm1 = VEX.128.F2.0F.WIG F0 /r */
      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         if (epartIsReg(modrm)) break;
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
         DIP("vlddqu %s,%s\n", dis_buf, nameXMMReg(rD));
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      break;

   case 0xF1:
      /* VPSLLW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F1 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsllw", Iop_ShlN16x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLW xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F1 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsllw", Iop_ShlN16x16 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xF2:
      /* VPSLLD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F2 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpslld", Iop_ShlN32x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLD xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F2 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpslld", Iop_ShlN32x8 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xF3:
      /* VPSLLQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F3 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_shiftV_byE( vbi, pfx, delta,
                                        "vpsllq", Iop_ShlN64x2 );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSLLQ xmm3/m128, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F3 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_shiftV_byE( vbi, pfx, delta,
                                        "vpsllq", Iop_ShlN64x4 );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0xF4:
      /* VPMULUDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F4 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuludq", math_PMULUDQ_128 );
         goto decode_success;
      }
      /* VPMULUDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F4 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuludq", math_PMULUDQ_256 );
         goto decode_success;
      }
      break;

   case 0xF5:
      /* VPMADDWD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F5 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmaddwd", math_PMADDWD_128 );
         goto decode_success;
      }
      /* VPMADDWD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F5 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmaddwd", math_PMADDWD_256 );
         goto decode_success;
      }
      break;

   case 0xF6:
      /* VPSADBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F.WIG F6 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpsadbw", math_PSADBW_128 );
         goto decode_success;
      }
      /* VPSADBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F.WIG F6 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpsadbw", math_PSADBW_256 );
         goto decode_success;
      }
      break;

   case 0xF7:
      /* VMASKMOVDQU xmm2, xmm1 = VEX.128.66.0F.WIG F7 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         delta = dis_MASKMOVDQU( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      break;

   case 0xF8:
      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
      /* VPSUBB = VEX.NDS.128.66.0F.WIG F8 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x16 );
         goto decode_success;
      }
      /* VPSUBB r/m, rV, r ::: r = rV - r/m */
      /* VPSUBB = VEX.NDS.256.66.0F.WIG F8 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubb", Iop_Sub8x32 );
         goto decode_success;
      }
      break;

   case 0xF9:
      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
      /* VPSUBW = VEX.NDS.128.66.0F.WIG F9 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x8 );
         goto decode_success;
      }
      /* VPSUBW r/m, rV, r ::: r = rV - r/m */
      /* VPSUBW = VEX.NDS.256.66.0F.WIG F9 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubw", Iop_Sub16x16 );
         goto decode_success;
      }
      break;

   case 0xFA:
      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
      /* VPSUBD = VEX.NDS.128.66.0F.WIG FA /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x4 );
         goto decode_success;
      }
      /* VPSUBD r/m, rV, r ::: r = rV - r/m */
      /* VPSUBD = VEX.NDS.256.66.0F.WIG FA /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubd", Iop_Sub32x8 );
         goto decode_success;
      }
      break;

   case 0xFB:
      /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
      /* VPSUBQ = VEX.NDS.128.66.0F.WIG FB /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x2 );
         goto decode_success;
      }
      /* VPSUBQ r/m, rV, r ::: r = rV - r/m */
      /* VPSUBQ = VEX.NDS.256.66.0F.WIG FB /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpsubq", Iop_Sub64x4 );
         goto decode_success;
      }
      break;

   case 0xFC:
      /* VPADDB r/m, rV, r ::: r = rV + r/m */
      /* VPADDB = VEX.NDS.128.66.0F.WIG FC /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x16 );
         goto decode_success;
      }
      /* VPADDB r/m, rV, r ::: r = rV + r/m */
      /* VPADDB = VEX.NDS.256.66.0F.WIG FC /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddb", Iop_Add8x32 );
         goto decode_success;
      }
      break;

   case 0xFD:
      /* VPADDW r/m, rV, r ::: r = rV + r/m */
      /* VPADDW = VEX.NDS.128.66.0F.WIG FD /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x8 );
         goto decode_success;
      }
      /* VPADDW r/m, rV, r ::: r = rV + r/m */
      /* VPADDW = VEX.NDS.256.66.0F.WIG FD /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddw", Iop_Add16x16 );
         goto decode_success;
      }
      break;

   case 0xFE:
      /* VPADDD r/m, rV, r ::: r = rV + r/m */
      /* VPADDD = VEX.NDS.128.66.0F.WIG FE /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x4 );
         goto decode_success;
      }
      /* VPADDD r/m, rV, r ::: r = rV + r/m */
      /* VPADDD = VEX.NDS.256.66.0F.WIG FE /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpaddd", Iop_Add32x8 );
         goto decode_success;
      }
      break;

   default:
      break;

   }

  //decode_failure:
   return deltaIN;

  decode_success:
   return delta;
}


/*------------------------------------------------------------*/
/*---                                                      ---*/
/*--- Top-level post-escape decoders: dis_ESC_0F38__VEX    ---*/
/*---                                                      ---*/
/*------------------------------------------------------------*/

static IRTemp math_PERMILPS_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
{
   /* In the control vector, zero out all but the bottom two bits of
      each 32-bit lane. */
   IRExpr* cv1 = binop(Iop_ShrN32x4,
                       binop(Iop_ShlN32x4, mkexpr(ctrlV), mkU8(30)),
                       mkU8(30));
26909   /* And use the resulting cleaned-up control vector as steering
26910      in a Perm operation. */
26911   IRTemp res = newTemp(Ity_V128);
26912   assign(res, binop(Iop_Perm32x4, mkexpr(dataV), cv1));
26913   return res;
26914}
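
/* Worked example for the masking trick above (a sketch, not decoder
   code): with 32-bit lanes, (x << 30) >> 30 discards all but the
   bottom two bits, so a control lane holding 0x0000001B steers as if
   it held 3; in effect each lane is computed as (lane & 3).
   Presumably the shift pair is preferred to an AND so as not to have
   to materialise a 128-bit constant mask. */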

static IRTemp math_PERMILPS_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
{
   IRTemp dHi, dLo, cHi, cLo;
   dHi = dLo = cHi = cLo = IRTemp_INVALID;
   breakupV256toV128s( dataV, &dHi, &dLo );
   breakupV256toV128s( ctrlV, &cHi, &cLo );
   IRTemp rHi = math_PERMILPS_VAR_128( dHi, cHi );
   IRTemp rLo = math_PERMILPS_VAR_128( dLo, cLo );
   IRTemp res = newTemp(Ity_V256);
   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   return res;
}

static IRTemp math_PERMILPD_VAR_128 ( IRTemp dataV, IRTemp ctrlV )
{
   /* No shift trickery needed here: bit 1 of each 64-bit control
      lane directly selects which data element supplies the
      corresponding result lane. */
   IRTemp dHi, dLo, cHi, cLo;
   dHi = dLo = cHi = cLo = IRTemp_INVALID;
   breakupV128to64s( dataV, &dHi, &dLo );
   breakupV128to64s( ctrlV, &cHi, &cLo );
   IRExpr* rHi
      = IRExpr_ITE( unop(Iop_64to1,
                         binop(Iop_Shr64, mkexpr(cHi), mkU8(1))),
                    mkexpr(dHi), mkexpr(dLo) );
   IRExpr* rLo
      = IRExpr_ITE( unop(Iop_64to1,
                         binop(Iop_Shr64, mkexpr(cLo), mkU8(1))),
                    mkexpr(dHi), mkexpr(dLo) );
   IRTemp res = newTemp(Ity_V128);
   assign(res, binop(Iop_64HLtoV128, rHi, rLo));
   return res;
}
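
/* For example, per the selection logic above: a control lane of 2
   has bit 1 set and so picks dHi for that result lane, whereas a
   control lane of 0 or 1 picks dLo. */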

static IRTemp math_PERMILPD_VAR_256 ( IRTemp dataV, IRTemp ctrlV )
{
   IRTemp dHi, dLo, cHi, cLo;
   dHi = dLo = cHi = cLo = IRTemp_INVALID;
   breakupV256toV128s( dataV, &dHi, &dLo );
   breakupV256toV128s( ctrlV, &cHi, &cLo );
   IRTemp rHi = math_PERMILPD_VAR_128( dHi, cHi );
   IRTemp rLo = math_PERMILPD_VAR_128( dLo, cLo );
   IRTemp res = newTemp(Ity_V256);
   assign(res, binop(Iop_V128HLtoV256, mkexpr(rHi), mkexpr(rLo)));
   return res;
}

static IRTemp math_VPERMD ( IRTemp ctrlV, IRTemp dataV )
{
   /* In the control vector, zero out all but the bottom three bits of
      each 32-bit lane. */
   IRExpr* cv1 = binop(Iop_ShrN32x8,
                       binop(Iop_ShlN32x8, mkexpr(ctrlV), mkU8(29)),
                       mkU8(29));
   /* And use the resulting cleaned-up control vector as steering
      in a Perm operation. */
   IRTemp res = newTemp(Ity_V256);
   assign(res, binop(Iop_Perm32x8, mkexpr(dataV), cv1));
   return res;
}
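
/* This is the same masking idiom as in math_PERMILPS_VAR_128 above,
   but keeping three bits per lane (shift by 29 rather than 30),
   since there are eight 32-bit lanes to steer between; each control
   lane acts as (lane & 7). */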

static Long dis_SHIFTX ( /*OUT*/Bool* uses_vvvv,
                         VexAbiInfo* vbi, Prefix pfx, Long delta,
                         const HChar* opname, IROp op8 )
{
   HChar   dis_buf[50];
   Int     alen;
   Int     size = getRexW(pfx) ? 8 : 4;
   IRType  ty   = szToITy(size);
   IRTemp  src  = newTemp(ty);
   IRTemp  amt  = newTemp(ty);
   UChar   rm   = getUChar(delta);

   assign( amt, getIRegV(size,pfx) );
   if (epartIsReg(rm)) {
      assign( src, getIRegE(size,pfx,rm) );
      DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx),
                           nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
      delta++;
   } else {
      IRTemp addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
      assign( src, loadLE(ty, mkexpr(addr)) );
      DIP("%s %s,%s,%s\n", opname, nameIRegV(size,pfx), dis_buf,
                           nameIRegG(size,pfx,rm));
      delta += alen;
   }

   putIRegG( size, pfx, rm,
             binop(mkSizedOp(ty,op8), mkexpr(src),
                   narrowTo(Ity_I8, binop(mkSizedOp(ty,Iop_And8), mkexpr(amt),
                                          mkU(ty,8*size-1)))) );
   /* Flags aren't modified.  */
   *uses_vvvv = True;
   return delta;
}
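
/* dis_SHIFTX handles the BMI2 SHLX/SHRX/SARX group.  As a sketch of
   the semantics implemented above, for a 32-bit operand size:

      sarx %ecx,%eax,%edx   // edx = (Int)eax >> (ecx & 31)

   The shift amount comes from the vvvv register and is masked to
   8*size-1, and, unlike the classic shift instructions, rflags is
   left untouched -- hence no flag-thunk update above. */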


static Long dis_FMA ( VexAbiInfo* vbi, Prefix pfx, Long delta, UChar opc )
{
   UChar  modrm   = getUChar(delta);
   UInt   rG      = gregOfRexRM(pfx, modrm);
   UInt   rV      = getVexNvvvv(pfx);
   Bool   scalar  = (opc & 0xF) > 7 && (opc & 1);
   IRType ty      = getRexW(pfx) ? Ity_F64 : Ity_F32;
   IRType vty     = scalar ? ty : getVexL(pfx) ? Ity_V256 : Ity_V128;
   IRTemp vX      = newTemp(vty);
   IRTemp vY      = newTemp(vty);
   IRTemp vZ      = newTemp(vty);
   IRExpr *x[8], *y[8], *z[8];
   IRTemp addr    = IRTemp_INVALID;
   HChar  dis_buf[50];
   Int    alen    = 0;
   const HChar *name;
   const HChar *suffix;
   const HChar *order;
   Bool   negateRes   = False;
   Bool   negateZeven = False;
   Bool   negateZodd  = False;
   Int    i, j;
   Int    count;
   static IROp ops[] = { Iop_V256to64_0, Iop_V256to64_1,
                         Iop_V256to64_2, Iop_V256to64_3,
                         Iop_V128to64, Iop_V128HIto64 };
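   /* ops[0..3] extract the four 64-bit lanes of a V256; ops[4..5]
      extract the two halves of a V128.  The lane loops below index
      into this with an offset j of 0 (ymm) or 4 (xmm). */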

   switch (opc & 0xF) {
   case 0x6:
      name = "addsub";
      negateZeven = True;
      break;
   case 0x7:
      name = "subadd";
      negateZodd = True;
      break;
   case 0x8:
   case 0x9:
      name = "add";
      break;
   case 0xA:
   case 0xB:
      name = "sub";
      negateZeven = True;
      negateZodd = True;
      break;
   case 0xC:
   case 0xD:
      name = "add";
      negateRes = True;
      negateZeven = True;
      negateZodd = True;
      break;
   case 0xE:
   case 0xF:
      name = "sub";
      negateRes = True;
      break;
   default:
      vpanic("dis_FMA(amd64)");
      break;
   }
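   /* In summary: the addsub/subadd forms negate z in the even/odd
      lanes respectively, "sub" negates z in all lanes, and the
      negateRes cases additionally negate the final result, giving
      -(x*y)+z for the fnmadd forms and -(x*y)-z for fnmsub. */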
   switch (opc & 0xF0) {
   case 0x90: order = "132"; break;
   case 0xA0: order = "213"; break;
   case 0xB0: order = "231"; break;
   default: vpanic("dis_FMA(amd64)"); break;
   }
   if (scalar)
      suffix = ty == Ity_F64 ? "sd" : "ss";
   else
      suffix = ty == Ity_F64 ? "pd" : "ps";

   if (scalar) {
      assign( vX, ty == Ity_F64
                  ? getXMMRegLane64F(rG, 0) : getXMMRegLane32F(rG, 0) );
      assign( vZ, ty == Ity_F64
                  ? getXMMRegLane64F(rV, 0) : getXMMRegLane32F(rV, 0) );
   } else {
      assign( vX, vty == Ity_V256 ? getYMMReg(rG) : getXMMReg(rG) );
      assign( vZ, vty == Ity_V256 ? getYMMReg(rV) : getXMMReg(rV) );
   }

   if (epartIsReg(modrm)) {
      UInt rE = eregOfRexRM(pfx, modrm);
      delta += 1;
      if (scalar)
         assign( vY, ty == Ity_F64
                     ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
      else
         assign( vY, vty == Ity_V256 ? getYMMReg(rE) : getXMMReg(rE) );
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameYMMReg(rE), nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, nameXMMReg(rE), nameXMMReg(rV),
             nameXMMReg(rG));
      }
   } else {
      addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
      delta += alen;
      assign(vY, loadLE(vty, mkexpr(addr)));
      if (vty == Ity_V256) {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameYMMReg(rV),
             nameYMMReg(rG));
      } else {
         DIP("vf%sm%s%s%s %s,%s,%s\n", negateRes ? "n" : "",
             name, order, suffix, dis_buf, nameXMMReg(rV),
             nameXMMReg(rG));
      }
   }

   /* vX/vY/vZ are currently in 132 order.  If the encoded order is
      different, swap the arguments so that the computation below is
      always vX*vY + vZ.  */
   if ((opc & 0xF0) != 0x90) {
      IRTemp tem = vX;
      if ((opc & 0xF0) == 0xA0) {
         vX = vZ;
         vZ = vY;
         vY = tem;
      } else {
         vX = vZ;
         vZ = tem;
      }
   }
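   /* Concretely, with dst = rG, src2 = rV (vvvv) and src3 = r/m:
      after this swap, vX*vY + vZ yields dst*src3 + src2 for the 132
      forms, src2*dst + src3 for 213, and src2*src3 + dst for 231,
      matching the Intel operand-order suffixes. */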

   if (scalar) {
      count = 1;
      x[0] = mkexpr(vX);
      y[0] = mkexpr(vY);
      z[0] = mkexpr(vZ);
   } else if (ty == Ity_F32) {
      count = vty == Ity_V256 ? 8 : 4;
      j = vty == Ity_V256 ? 0 : 4;
      for (i = 0; i < count; i += 2) {
         IRTemp tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vX)));
         x[i] = unop(Iop_64to32, mkexpr(tem));
         x[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
         tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vY)));
         y[i] = unop(Iop_64to32, mkexpr(tem));
         y[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
         tem = newTemp(Ity_I64);
         assign(tem, unop(ops[i / 2 + j], mkexpr(vZ)));
         z[i] = unop(Iop_64to32, mkexpr(tem));
         z[i + 1] = unop(Iop_64HIto32, mkexpr(tem));
      }
   } else {
      count = vty == Ity_V256 ? 4 : 2;
      j = vty == Ity_V256 ? 0 : 4;
      for (i = 0; i < count; i++) {
         x[i] = unop(ops[i + j], mkexpr(vX));
         y[i] = unop(ops[i + j], mkexpr(vY));
         z[i] = unop(ops[i + j], mkexpr(vZ));
      }
   }
   if (!scalar)
      for (i = 0; i < count; i++) {
         IROp op = ty == Ity_F64
                   ? Iop_ReinterpI64asF64 : Iop_ReinterpI32asF32;
         x[i] = unop(op, x[i]);
         y[i] = unop(op, y[i]);
         z[i] = unop(op, z[i]);
      }
   for (i = 0; i < count; i++) {
      if ((i & 1) ? negateZodd : negateZeven)
         z[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, z[i]);
      x[i] = IRExpr_Qop(ty == Ity_F64 ? Iop_MAddF64 : Iop_MAddF32,
                        get_FAKE_roundingmode(), x[i], y[i], z[i]);
      if (negateRes)
         x[i] = unop(ty == Ity_F64 ? Iop_NegF64 : Iop_NegF32, x[i]);
      if (ty == Ity_F64)
         putYMMRegLane64F( rG, i, x[i] );
      else
         putYMMRegLane32F( rG, i, x[i] );
   }
   if (vty != Ity_V256)
      putYMMRegLane128( rG, 1, mkV128(0) );

   return delta;
}


/* Masked load: the VMASKMOVPS/VMASKMOVPD load forms.  Destination
   lanes whose mask sign bit is clear are set to zero.  */
static ULong dis_VMASKMOV_load ( Bool *uses_vvvv, VexAbiInfo* vbi,
                                 Prefix pfx, Long delta,
                                 const HChar* opname, Bool isYMM, IRType ty )
{
   HChar   dis_buf[50];
   Int     alen, i;
   IRTemp  addr;
   UChar   modrm = getUChar(delta);
   UInt    rG    = gregOfRexRM(pfx,modrm);
   UInt    rV    = getVexNvvvv(pfx);
   IRTemp  res[8], cond;
   addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
   if (isYMM) {
      DIP("%s %s,%s,%s\n", opname, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
   } else {
      DIP("%s %s,%s,%s\n", opname, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
   }
   delta += alen;

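   /* For each lane: if the sign bit of the corresponding mask lane
      in rV is set, load the element, otherwise produce zero.  The
      inner ITE diverts masked-out lanes to a dummy load from the
      guest RSP, an address that is expected to be mapped, so a
      suppressed element can't cause a memory fault. */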
   for (i = 0; i < 2 * (isYMM ? 2 : 1) * (ty == Ity_I32 ? 2 : 1); i++) {
      res[i] = newTemp(ty);
      cond = newTemp(Ity_I1);
      assign( cond,
              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
                                  : getYMMRegLane64( rV, i ),
                    mkU(ty, 0) ));
      assign( res[i],
              IRExpr_ITE(
                 mkexpr(cond),
                 loadLE(ty, IRExpr_ITE(
                               mkexpr(cond),
                               binop(Iop_Add64, mkexpr(addr),
                                     mkU64(i*(ty == Ity_I32 ? 4 : 8))),
                               getIReg64(R_RSP)
                            )
                       ),
                 mkU(ty, 0)
              )
            );
   }
   switch (ty) {
      case Ity_I32:
         for (i = 0; i < 8; i++)
            putYMMRegLane32( rG, i, (i < 4 || isYMM)
                                    ? mkexpr(res[i]) : mkU32(0) );
         break;
      case Ity_I64:
         for (i = 0; i < 4; i++)
            putYMMRegLane64( rG, i, (i < 2 || isYMM)
                                    ? mkexpr(res[i]) : mkU64(0) );
         break;
      default: vassert(0);
   }

   *uses_vvvv = True;
   return delta;
}


/* Gather: the AVX2 VGATHER/VPGATHER family.  Elements are loaded
   from addresses derived from a vector of indices, under control of
   a mask register, which is cleared as each element arrives.  */
static ULong dis_VGATHER ( Bool *uses_vvvv, VexAbiInfo* vbi,
                           Prefix pfx, Long delta,
                           const HChar* opname, Bool isYMM,
                           Bool isVM64x, IRType ty )
{
   HChar  dis_buf[50];
   Int    alen, i, vscale, count1, count2;
   IRTemp addr;
   UChar  modrm = getUChar(delta);
   UInt   rG    = gregOfRexRM(pfx,modrm);
   UInt   rV    = getVexNvvvv(pfx);
   UInt   rI;
   IRType dstTy = (isYMM && (ty == Ity_I64 || !isVM64x)) ? Ity_V256 : Ity_V128;
   IRType idxTy = (isYMM && (ty == Ity_I32 || isVM64x)) ? Ity_V256 : Ity_V128;
   IRTemp cond;
   addr = disAVSIBMode ( &alen, vbi, pfx, delta, dis_buf, &rI,
                         idxTy, &vscale );
   if (addr == IRTemp_INVALID || rI == rG || rI == rV || rG == rV)
      return delta;
   if (dstTy == Ity_V256) {
      DIP("%s %s,%s,%s\n", opname, nameYMMReg(rV), dis_buf, nameYMMReg(rG) );
   } else {
      DIP("%s %s,%s,%s\n", opname, nameXMMReg(rV), dis_buf, nameXMMReg(rG) );
   }
   delta += alen;

   if (ty == Ity_I32) {
      count1 = isYMM ? 8 : 4;
      count2 = isVM64x ? count1 / 2 : count1;
   } else {
      count1 = count2 = isYMM ? 4 : 2;
   }

   /* First update the mask register to copies of the sign bit.  */
   if (ty == Ity_I32) {
      if (isYMM)
         putYMMReg( rV, binop(Iop_SarN32x8, getYMMReg( rV ), mkU8(31)) );
      else
         putYMMRegLoAndZU( rV, binop(Iop_SarN32x4, getXMMReg( rV ), mkU8(31)) );
   } else {
      for (i = 0; i < count1; i++) {
         putYMMRegLane64( rV, i, binop(Iop_Sar64, getYMMRegLane64( rV, i ),
                                       mkU8(63)) );
      }
   }

   /* Next gather the individual elements.  If any fault occurs, the
      mask elements for the not-yet-gathered elements (including the
      faulting one) are still set, so the instruction can simply be
      restarted once the fault has been handled.  */
   for (i = 0; i < count2; i++) {
      IRExpr *expr, *addr_expr;
      cond = newTemp(Ity_I1);
      assign( cond,
              binop(ty == Ity_I32 ? Iop_CmpLT32S : Iop_CmpLT64S,
                    ty == Ity_I32 ? getYMMRegLane32( rV, i )
                                  : getYMMRegLane64( rV, i ),
                    mkU(ty, 0)) );
      expr = ty == Ity_I32 ? getYMMRegLane32( rG, i )
                           : getYMMRegLane64( rG, i );
      addr_expr = isVM64x ? getYMMRegLane64( rI, i )
                          : unop(Iop_32Sto64, getYMMRegLane32( rI, i ));
      switch (vscale) {
         case 2: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(1)); break;
         case 4: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(2)); break;
         case 8: addr_expr = binop(Iop_Shl64, addr_expr, mkU8(3)); break;
         default: break;
      }
      addr_expr = binop(Iop_Add64, mkexpr(addr), addr_expr);
      addr_expr = handleAddrOverrides(vbi, pfx, addr_expr);
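      /* As in dis_VMASKMOV_load above, masked-out lanes are diverted
         to a dummy load from the guest RSP, which is expected to be
         mapped, so a suppressed element can't fault. */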
      addr_expr = IRExpr_ITE(mkexpr(cond), addr_expr, getIReg64(R_RSP));
      expr = IRExpr_ITE(mkexpr(cond), loadLE(ty, addr_expr), expr);
      if (ty == Ity_I32) {
         putYMMRegLane32( rG, i, expr );
         putYMMRegLane32( rV, i, mkU32(0) );
      } else {
         putYMMRegLane64( rG, i, expr );
         putYMMRegLane64( rV, i, mkU64(0) );
      }
   }

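   /* Finally zero out any destination/mask lanes beyond those
      actually written: the upper 128 bits for the xmm forms, and
      also the upper half of the low 128 bits in the 32-bit-data,
      64-bit-index case where only two elements are gathered. */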
   if (!isYMM || (ty == Ity_I32 && isVM64x)) {
      if (ty == Ity_I64 || isYMM)
         putYMMRegLane128( rV, 1, mkV128(0) );
      else if (ty == Ity_I32 && count2 == 2) {
         putYMMRegLane64( rV, 1, mkU64(0) );
         putYMMRegLane64( rG, 1, mkU64(0) );
      }
      putYMMRegLane128( rG, 1, mkV128(0) );
   }

   *uses_vvvv = True;
   return delta;
}


__attribute__((noinline))
static
Long dis_ESC_0F38__VEX (
        /*MB_OUT*/DisResult* dres,
        /*OUT*/   Bool*      uses_vvvv,
        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
        Bool         resteerCisOk,
        void*        callback_opaque,
        VexArchInfo* archinfo,
        VexAbiInfo*  vbi,
        Prefix pfx, Int sz, Long deltaIN
     )
{
   IRTemp addr  = IRTemp_INVALID;
   Int    alen  = 0;
   HChar  dis_buf[50];
   Long   delta = deltaIN;
   UChar  opc   = getUChar(delta);
   delta++;
   *uses_vvvv = False;

   switch (opc) {

   case 0x00:
      /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
      /* VPSHUFB = VEX.NDS.128.66.0F38.WIG 00 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_XMM );
         goto decode_success;
      }
      /* VPSHUFB r/m, rV, r ::: r = shuf(rV, r/m) */
      /* VPSHUFB = VEX.NDS.256.66.0F38.WIG 00 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpshufb", math_PSHUFB_YMM );
         goto decode_success;
      }
      break;

   case 0x01:
   case 0x02:
   case 0x03:
      /* VPHADDW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 01 /r */
      /* VPHADDD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 02 /r */
      /* VPHADDSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 03 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPHADDW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 01 /r */
      /* VPHADDD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 02 /r */
      /* VPHADDSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 03 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PHADD_256( vbi, pfx, delta, opc );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x04:
      /* VPMADDUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 04 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
                    math_PMADDUBSW_128 );
         goto decode_success;
      }
      /* VPMADDUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 04 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpmaddubsw",
                    math_PMADDUBSW_256 );
         goto decode_success;
      }
      break;

   case 0x05:
   case 0x06:
   case 0x07:
      /* VPHSUBW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 05 /r */
      /* VPHSUBD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 06 /r */
      /* VPHSUBSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 07 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PHADD_128( vbi, pfx, delta, True/*isAvx*/, opc );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPHSUBW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 05 /r */
      /* VPHSUBD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 06 /r */
      /* VPHSUBSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 07 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PHADD_256( vbi, pfx, delta, opc );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x08:
   case 0x09:
   case 0x0A:
      /* VPSIGNB xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 08 /r */
      /* VPSIGNW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 09 /r */
      /* VPSIGND xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         IRTemp sV      = newTemp(Ity_V128);
         IRTemp dV      = newTemp(Ity_V128);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;
         HChar  ch      = '?';
         Int    laneszB = 0;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         switch (opc) {
            case 0x08: laneszB = 1; ch = 'b'; break;
            case 0x09: laneszB = 2; ch = 'w'; break;
            case 0x0A: laneszB = 4; ch = 'd'; break;
            default: vassert(0);
         }

         assign( dV, getXMMReg(rV) );

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("vpsign%c %s,%s,%s\n", ch, nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
         }

         breakupV128to64s( dV, &dHi, &dLo );
         breakupV128to64s( sV, &sHi, &sLo );

         putYMMRegLoAndZU(
            rG,
            binop(Iop_64HLtoV128,
                  dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
                  dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPSIGNB ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 08 /r */
      /* VPSIGNW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 09 /r */
      /* VPSIGND ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV      = newTemp(Ity_V256);
         IRTemp dV      = newTemp(Ity_V256);
         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
         s3 = s2 = s1 = s0 = IRTemp_INVALID;
         d3 = d2 = d1 = d0 = IRTemp_INVALID;
         UChar  ch      = '?';
         Int    laneszB = 0;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         switch (opc) {
            case 0x08: laneszB = 1; ch = 'b'; break;
            case 0x09: laneszB = 2; ch = 'w'; break;
            case 0x0A: laneszB = 4; ch = 'd'; break;
            default: vassert(0);
         }

         assign( dV, getYMMReg(rV) );

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            delta += 1;
            DIP("vpsign%c %s,%s,%s\n", ch, nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vpsign%c %s,%s,%s\n", ch, dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
         }

         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

         putYMMReg(
            rG,
            binop( Iop_V128HLtoV256,
                   binop(Iop_64HLtoV128,
                         dis_PSIGN_helper( mkexpr(s3), mkexpr(d3), laneszB ),
                         dis_PSIGN_helper( mkexpr(s2), mkexpr(d2), laneszB )
                   ),
                   binop(Iop_64HLtoV128,
                         dis_PSIGN_helper( mkexpr(s1), mkexpr(d1), laneszB ),
                         dis_PSIGN_helper( mkexpr(s0), mkexpr(d0), laneszB )
                   )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x0B:
      /* VPMULHRSW xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 0B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         IRTemp sV      = newTemp(Ity_V128);
         IRTemp dV      = newTemp(Ity_V128);
         IRTemp sHi, sLo, dHi, dLo;
         sHi = sLo = dHi = dLo = IRTemp_INVALID;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         assign( dV, getXMMReg(rV) );

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getXMMReg(rE) );
            delta += 1;
            DIP("vpmulhrsw %s,%s,%s\n", nameXMMReg(rE),
                nameXMMReg(rV), nameXMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
            delta += alen;
            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
                nameXMMReg(rV), nameXMMReg(rG));
         }

         breakupV128to64s( dV, &dHi, &dLo );
         breakupV128to64s( sV, &sHi, &sLo );

         putYMMRegLoAndZU(
            rG,
            binop(Iop_64HLtoV128,
                  dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
                  dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPMULHRSW ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 0B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         IRTemp sV      = newTemp(Ity_V256);
         IRTemp dV      = newTemp(Ity_V256);
         IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
         s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
         UChar  modrm   = getUChar(delta);
         UInt   rG      = gregOfRexRM(pfx,modrm);
         UInt   rV      = getVexNvvvv(pfx);

         assign( dV, getYMMReg(rV) );

         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx,modrm);
            assign( sV, getYMMReg(rE) );
            delta += 1;
            DIP("vpmulhrsw %s,%s,%s\n", nameYMMReg(rE),
                nameYMMReg(rV), nameYMMReg(rG));
         } else {
            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
            delta += alen;
            DIP("vpmulhrsw %s,%s,%s\n", dis_buf,
                nameYMMReg(rV), nameYMMReg(rG));
         }

         breakupV256to64s( dV, &d3, &d2, &d1, &d0 );
         breakupV256to64s( sV, &s3, &s2, &s1, &s0 );

         putYMMReg(
            rG,
            binop(Iop_V128HLtoV256,
                  binop(Iop_64HLtoV128,
                        dis_PMULHRSW_helper( mkexpr(s3), mkexpr(d3) ),
                        dis_PMULHRSW_helper( mkexpr(s2), mkexpr(d2) ) ),
                  binop(Iop_64HLtoV128,
                        dis_PMULHRSW_helper( mkexpr(s1), mkexpr(d1) ),
                        dis_PMULHRSW_helper( mkexpr(s0), mkexpr(d0) ) )
            )
         );
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x0C:
      /* VPERMILPS xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0C /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_128(dataV, ctrlV);
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPERMILPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0C /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilps %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilps %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPS_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x0D:
      /* VPERMILPD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 0D /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V128);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, getXMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameXMMReg(rV), nameXMMReg(rG));
            assign(ctrlV, loadLE(Ity_V128, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V128);
         assign(dataV, getXMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_128(dataV, ctrlV);
         putYMMRegLoAndZU(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      /* VPERMILPD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 0D /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         UChar  modrm = getUChar(delta);
         UInt   rG    = gregOfRexRM(pfx, modrm);
         UInt   rV    = getVexNvvvv(pfx);
         IRTemp ctrlV = newTemp(Ity_V256);
         if (epartIsReg(modrm)) {
            UInt rE = eregOfRexRM(pfx, modrm);
            delta += 1;
            DIP("vpermilpd %s,%s,%s\n",
                nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, getYMMReg(rE));
         } else {
            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
            delta += alen;
            DIP("vpermilpd %s,%s,%s\n",
                dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(ctrlV, loadLE(Ity_V256, mkexpr(addr)));
         }
         IRTemp dataV = newTemp(Ity_V256);
         assign(dataV, getYMMReg(rV));
         IRTemp resV = math_PERMILPD_VAR_256(dataV, ctrlV);
         putYMMReg(rG, mkexpr(resV));
         *uses_vvvv = True;
         goto decode_success;
      }
      break;

   case 0x0E:
      /* VTESTPS xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 32 );
         goto decode_success;
      }
      /* VTESTPS ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 32 );
         goto decode_success;
      }
      break;

   case 0x0F:
      /* VTESTPD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 64 );
         goto decode_success;
      }
      /* VTESTPD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 0F /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 64 );
         goto decode_success;
      }
      break;

   case 0x16:
      /* VPERMPS ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 16 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpermps", math_VPERMD );
         goto decode_success;
      }
      break;

   case 0x17:
      /* VPTEST xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 17 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_xTESTy_128( vbi, pfx, delta, True/*isAvx*/, 0 );
         goto decode_success;
      }
      /* VPTEST ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 17 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_xTESTy_256( vbi, pfx, delta, 0 );
         goto decode_success;
      }
      break;

   case 0x18:
      /* VBROADCASTSS m32, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameXMMReg(rG));
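         /* Replicate the loaded 32-bit value across the register:
            pair it into an I64, then pair that into a V128.  The
            256-bit and register-source cases below use the same
            doubling scheme. */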
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSS m32, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastss %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, loadLE(Ity_I32, mkexpr(addr)));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, xmm1 = VEX.128.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 0==getVexL(pfx)/*128*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
         putYMMRegLoAndZU(rG, res);
         delta++;
         goto decode_success;
      }
      /* VBROADCASTSS xmm2, ymm1 = VEX.256.66.0F38.WIG 18 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastss %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t32 = newTemp(Ity_I32);
         assign(t32, getXMMRegLane32(rE, 0));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;
         goto decode_success;
      }
      break;

   case 0x19:
      /* VBROADCASTSD m64, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastsd %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, loadLE(Ity_I64, mkexpr(addr)));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         goto decode_success;
      }
      /* VBROADCASTSD xmm2, ymm1 = VEX.256.66.0F38.WIG 19 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         UInt  rE    = eregOfRexRM(pfx, modrm);
         DIP("vbroadcastsd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
         IRTemp t64 = newTemp(Ity_I64);
         assign(t64, getXMMRegLane64(rE, 0));
         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
                                                  mkexpr(t64), mkexpr(t64));
         putYMMReg(rG, res);
         delta++;
         goto decode_success;
      }
      break;

   case 0x1A:
      /* VBROADCASTF128 m128, ymm1 = VEX.256.66.0F38.WIG 1A /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar modrm = getUChar(delta);
         UInt  rG    = gregOfRexRM(pfx, modrm);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         DIP("vbroadcastf128 %s,%s\n", dis_buf, nameYMMReg(rG));
         IRTemp t128 = newTemp(Ity_V128);
         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
         goto decode_success;
      }
      break;

   case 0x1C:
      /* VPABSB xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_XMM_pap1 );
         goto decode_success;
      }
      /* VPABSB ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsb", math_PABS_YMM_pap1 );
         goto decode_success;
      }
      break;

   case 0x1D:
      /* VPABSW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_XMM_pap2 );
         goto decode_success;
      }
      /* VPABSW ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsw", math_PABS_YMM_pap2 );
         goto decode_success;
      }
      break;

   case 0x1E:
      /* VPABSD xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AVX128_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_XMM_pap4 );
         goto decode_success;
      }
      /* VPABSD ymm2/m256, ymm1 = VEX.256.66.0F38.WIG 1E /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_AVX256_E_to_G_unary(
                    uses_vvvv, vbi, pfx, delta,
                    "vpabsd", math_PABS_YMM_pap4 );
         goto decode_success;
      }
      break;

   case 0x20:
      /* VPMOVSXBW xmm2/m64, xmm1 */
      /* VPMOVSXBW = VEX.128.66.0F38.WIG 20 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBW xmm2/m128, ymm1 */
      /* VPMOVSXBW = VEX.256.66.0F38.WIG 20 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBW_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x21:
      /* VPMOVSXBD xmm2/m32, xmm1 */
      /* VPMOVSXBD = VEX.128.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXBD xmm2/m64, ymm1 */
      /* VPMOVSXBD = VEX.256.66.0F38.WIG 21 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x22:
      /* VPMOVSXBQ xmm2/m16, xmm1 */
      /* VPMOVSXBQ = VEX.128.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXBQ xmm2/m32, ymm1 */
      /* VPMOVSXBQ = VEX.256.66.0F38.WIG 22 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXBQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x23:
      /* VPMOVSXWD xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXWD xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 23 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXWD_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x24:
      /* VPMOVSXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVSXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVSXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 24 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVSXWQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x25:
      /* VPMOVSXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   True/*isAvx*/, False/*!xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVSXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 25 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, False/*!xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x28:
      /* VPMULDQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 28 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_128 );
         goto decode_success;
      }
      /* VPMULDQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 28 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta,
                    "vpmuldq", math_PMULDQ_256 );
         goto decode_success;
      }
      break;

   case 0x29:
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.128.66.0F38.WIG 29 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x2 );
         goto decode_success;
      }
      /* VPCMPEQQ r/m, rV, r ::: r = rV `eq-by-64s` r/m */
      /* VPCMPEQQ = VEX.NDS.256.66.0F38.WIG 29 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpeqq", Iop_CmpEQ64x4 );
         goto decode_success;
      }
      break;

   case 0x2A:
      /* VMOVNTDQA m128, xmm1 = VEX.128.66.0F38.WIG 2A /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V128);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_16_aligned(addr);
         assign(tD, loadLE(Ity_V128, mkexpr(addr)));
         DIP("vmovntdqa %s,%s\n", dis_buf, nameXMMReg(rD));
         putYMMRegLoAndZU(rD, mkexpr(tD));
         goto decode_success;
      }
      /* VMOVNTDQA m256, ymm1 = VEX.256.66.0F38.WIG 2A /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         UChar  modrm = getUChar(delta);
         UInt   rD    = gregOfRexRM(pfx, modrm);
         IRTemp tD    = newTemp(Ity_V256);
         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
         delta += alen;
         gen_SEGV_if_not_32_aligned(addr);
         assign(tD, loadLE(Ity_V256, mkexpr(addr)));
         DIP("vmovntdqa %s,%s\n", dis_buf, nameYMMReg(rD));
         putYMMReg(rD, mkexpr(tD));
         goto decode_success;
      }
      break;

   case 0x2B:
      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
      /* VPACKUSDW = VEX.NDS.128.66.0F38.WIG 2B /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG(
                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
                    Iop_QNarrowBin32Sto16Ux8, NULL,
                    False/*!invertLeftArg*/, True/*swapArgs*/ );
         goto decode_success;
      }
      /* VPACKUSDW r/m, rV, r ::: r = QNarrowBin32Sto16Ux8(rV, r/m) */
      /* VPACKUSDW = VEX.NDS.256.66.0F38.WIG 2B /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpackusdw",
                    math_VPACKUSDW_YMM );
         goto decode_success;
      }
      break;

   case 0x2C:
      /* VMASKMOVPS m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2C /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                                    /*!isYMM*/False, Ity_I32 );
         goto decode_success;
      }
      /* VMASKMOVPS m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2C /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovps",
                                    /*isYMM*/True, Ity_I32 );
         goto decode_success;
      }
      break;

   case 0x2D:
      /* VMASKMOVPD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.WIG 2D /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                                    /*!isYMM*/False, Ity_I64 );
         goto decode_success;
      }
      /* VMASKMOVPD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.WIG 2D /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
          && !epartIsReg(getUChar(delta))) {
         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vmaskmovpd",
                                    /*isYMM*/True, Ity_I64 );
         goto decode_success;
      }
      break;

   case 0x30:
      /* VPMOVZXBW xmm2/m64, xmm1 */
      /* VPMOVZXBW = VEX.128.66.0F38.WIG 30 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBW_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXBW xmm2/m128, ymm1 */
      /* VPMOVZXBW = VEX.256.66.0F38.WIG 30 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBW_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x31:
      /* VPMOVZXBD xmm2/m32, xmm1 */
      /* VPMOVZXBD = VEX.128.66.0F38.WIG 31 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXBD_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXBD xmm2/m64, ymm1 */
      /* VPMOVZXBD = VEX.256.66.0F38.WIG 31 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXBD_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x32:
      /* VPMOVZXBQ xmm2/m16, xmm1 */
      /* VPMOVZXBQ = VEX.128.66.0F38.WIG 32 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVZXBQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVZXBQ xmm2/m32, ymm1 */
      /* VPMOVZXBQ = VEX.256.66.0F38.WIG 32 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVZXBQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x33:
      /* VPMOVZXWD xmm2/m64, xmm1 */
      /* VPMOVZXWD = VEX.128.66.0F38.WIG 33 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXWD_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXWD xmm2/m128, ymm1 */
      /* VPMOVZXWD = VEX.256.66.0F38.WIG 33 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXWD_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x34:
      /* VPMOVZXWQ xmm2/m32, xmm1 = VEX.128.66.0F38.WIG 34 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVZXWQ_128( vbi, pfx, delta, True/*isAvx*/ );
         goto decode_success;
      }
      /* VPMOVZXWQ xmm2/m64, ymm1 = VEX.256.66.0F38.WIG 34 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVZXWQ_256( vbi, pfx, delta );
         goto decode_success;
      }
      break;

   case 0x35:
      /* VPMOVZXDQ xmm2/m64, xmm1 = VEX.128.66.0F38.WIG 35 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_PMOVxXDQ_128( vbi, pfx, delta,
                                   True/*isAvx*/, True/*xIsZ*/ );
         goto decode_success;
      }
      /* VPMOVZXDQ xmm2/m128, ymm1 = VEX.256.66.0F38.WIG 35 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_PMOVxXDQ_256( vbi, pfx, delta, True/*xIsZ*/ );
         goto decode_success;
      }
      break;

   case 0x36:
      /* VPERMD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 36 /r */
      if (have66noF2noF3(pfx)
          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_complex(
                    uses_vvvv, vbi, pfx, delta, "vpermd", math_VPERMD );
         goto decode_success;
      }
      break;

   case 0x37:
      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
      /* VPCMPGTQ = VEX.NDS.128.66.0F38.WIG 37 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx2 );
         goto decode_success;
      }
      /* VPCMPGTQ r/m, rV, r ::: r = rV `>s-by-64s` r/m */
      /* VPCMPGTQ = VEX.NDS.256.66.0F38.WIG 37 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpcmpgtq", Iop_CmpGT64Sx4 );
         goto decode_success;
      }
      break;

   case 0x38:
      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
      /* VPMINSB = VEX.NDS.128.66.0F38.WIG 38 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx16 );
         goto decode_success;
      }
      /* VPMINSB r/m, rV, r ::: r = min-signed-8s(rV, r/m) */
      /* VPMINSB = VEX.NDS.256.66.0F38.WIG 38 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsb", Iop_Min8Sx32 );
         goto decode_success;
      }
      break;

   case 0x39:
      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
      /* VPMINSD = VEX.NDS.128.66.0F38.WIG 39 /r */
      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx4 );
         goto decode_success;
      }
      /* VPMINSD r/m, rV, r ::: r = min-signed-32s(rV, r/m) */
      /* VPMINSD = VEX.NDS.256.66.0F38.WIG 39 /r */
      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28343                    uses_vvvv, vbi, pfx, delta, "vpminsd", Iop_Min32Sx8 );
28344         goto decode_success;
28345      }
28346      break;
28347
28348   case 0x3A:
28349      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
28350      /* VPMINUW = VEX.NDS.128.66.0F38.WIG 3A /r */
28351      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28352         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28353                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux8 );
28354         goto decode_success;
28355      }
28356      /* VPMINUW r/m, rV, r ::: r = min-unsigned-16s(rV, r/m) */
28357      /* VPMINUW = VEX.NDS.256.66.0F38.WIG 3A /r */
28358      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28359         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28360                    uses_vvvv, vbi, pfx, delta, "vpminuw", Iop_Min16Ux16 );
28361         goto decode_success;
28362      }
28363      break;
28364
28365   case 0x3B:
28366      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
28367      /* VPMINUD = VEX.NDS.128.66.0F38.WIG 3B /r */
28368      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28369         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28370                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux4 );
28371         goto decode_success;
28372      }
28373      /* VPMINUD r/m, rV, r ::: r = min-unsigned-32s(rV, r/m) */
28374      /* VPMINUD = VEX.NDS.256.66.0F38.WIG 3B /r */
28375      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28376         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28377                    uses_vvvv, vbi, pfx, delta, "vpminud", Iop_Min32Ux8 );
28378         goto decode_success;
28379      }
28380      break;
28381
28382   case 0x3C:
28383      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
28384      /* VPMAXSB = VEX.NDS.128.66.0F38.WIG 3C /r */
28385      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28386         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28387                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx16 );
28388         goto decode_success;
28389      }
28390      /* VPMAXSB r/m, rV, r ::: r = max-signed-8s(rV, r/m) */
28391      /* VPMAXSB = VEX.NDS.256.66.0F38.WIG 3C /r */
28392      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28393         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28394                    uses_vvvv, vbi, pfx, delta, "vpmaxsb", Iop_Max8Sx32 );
28395         goto decode_success;
28396      }
28397      break;
28398
28399   case 0x3D:
28400      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
28401      /* VPMAXSD = VEX.NDS.128.66.0F38.WIG 3D /r */
28402      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28403         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28404                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx4 );
28405         goto decode_success;
28406      }
28407      /* VPMAXSD r/m, rV, r ::: r = max-signed-32s(rV, r/m) */
28408      /* VPMAXSD = VEX.NDS.256.66.0F38.WIG 3D /r */
28409      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28410         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28411                    uses_vvvv, vbi, pfx, delta, "vpmaxsd", Iop_Max32Sx8 );
28412         goto decode_success;
28413      }
28414      break;
28415
28416   case 0x3E:
28417      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
28418      /* VPMAXUW = VEX.NDS.128.66.0F38.WIG 3E /r */
28419      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28420         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28421                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux8 );
28422         goto decode_success;
28423      }
28424      /* VPMAXUW r/m, rV, r ::: r = max-unsigned-16s(rV, r/m) */
28425      /* VPMAXUW = VEX.NDS.256.66.0F38.WIG 3E /r */
28426      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28427         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28428                    uses_vvvv, vbi, pfx, delta, "vpmaxuw", Iop_Max16Ux16 );
28429         goto decode_success;
28430      }
28431      break;
28432
28433   case 0x3F:
28434      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
28435      /* VPMAXUD = VEX.NDS.128.66.0F38.WIG 3F /r */
28436      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28437         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28438                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux4 );
28439         goto decode_success;
28440      }
28441      /* VPMAXUD r/m, rV, r ::: r = max-unsigned-32s(rV, r/m) */
28442      /* VPMAXUD = VEX.NDS.256.66.0F38.WIG 3F /r */
28443      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28444         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28445                    uses_vvvv, vbi, pfx, delta, "vpmaxud", Iop_Max32Ux8 );
28446         goto decode_success;
28447      }
28448      break;
28449
28450   case 0x40:
28451      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
28452      /* VPMULLD = VEX.NDS.128.66.0F38.WIG 40 /r */
28453      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28454         delta = dis_VEX_NDS_128_AnySimdPfx_0F_WIG_simple(
28455                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x4 );
28456         goto decode_success;
28457      }
28458      /* VPMULLD r/m, rV, r ::: r = mul-32s(rV, r/m) */
28459      /* VPMULLD = VEX.NDS.256.66.0F38.WIG 40 /r */
28460      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
28461         delta = dis_VEX_NDS_256_AnySimdPfx_0F_WIG_simple(
28462                    uses_vvvv, vbi, pfx, delta, "vpmulld", Iop_Mul32x8 );
28463         goto decode_success;
28464      }
28465      break;
28466
28467   case 0x41:
28468      /* VPHMINPOSUW xmm2/m128, xmm1 = VEX.128.66.0F38.WIG 41 /r */
28469      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
28470         delta = dis_PHMINPOSUW_128( vbi, pfx, delta, True/*isAvx*/ );
28471         goto decode_success;
28472      }
28473      break;
28474
28475   case 0x45:
28476      /* VPSRLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 45 /r */
28477      /* VPSRLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 45 /r */
28478      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
28479         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvd",
28480                                         Iop_Shr32, 1==getVexL(pfx) );
28481         *uses_vvvv = True;
28482         goto decode_success;
28483      }
28484      /* VPSRLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 45 /r */
28485      /* VPSRLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 45 /r */
28486      if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
28487         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsrlvq",
28488                                         Iop_Shr64, 1==getVexL(pfx) );
28489         *uses_vvvv = True;
28490         goto decode_success;
28491      }
28492      break;
28493
28494   case 0x46:
28495      /* VPSRAVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 46 /r */
28496      /* VPSRAVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 46 /r */
28497      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
28498         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsravd",
28499                                         Iop_Sar32, 1==getVexL(pfx) );
28500         *uses_vvvv = True;
28501         goto decode_success;
28502      }
28503      break;
28504
28505   case 0x47:
28506      /* VPSLLVD xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 47 /r */
28507      /* VPSLLVD ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 47 /r */
28508      if (have66noF2noF3(pfx) && 0==getRexW(pfx)/*W0*/) {
28509         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvd",
28510                                         Iop_Shl32, 1==getVexL(pfx) );
28511         *uses_vvvv = True;
28512         goto decode_success;
28513      }
28514      /* VPSLLVQ xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 47 /r */
28515      /* VPSLLVQ ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 47 /r */
28516      if (have66noF2noF3(pfx) && 1==getRexW(pfx)/*W1*/) {
28517         delta = dis_AVX_var_shiftV_byE( vbi, pfx, delta, "vpsllvq",
28518                                         Iop_Shl64, 1==getVexL(pfx) );
28519         *uses_vvvv = True;
28520         goto decode_success;
28521      }
28522      break;
28523
28524   case 0x58:
28525      /* VPBROADCASTD xmm2/m32, xmm1 = VEX.128.66.0F38.W0 58 /r */
28526      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28527          && 0==getRexW(pfx)/*W0*/) {
28528         UChar modrm = getUChar(delta);
28529         UInt  rG    = gregOfRexRM(pfx, modrm);
28530         IRTemp t32 = newTemp(Ity_I32);
28531         if (epartIsReg(modrm)) {
28532            UInt rE = eregOfRexRM(pfx, modrm);
28533            delta++;
28534            DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
28535            assign(t32, getXMMRegLane32(rE, 0));
28536         } else {
28537            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28538            delta += alen;
28539            DIP("vpbroadcastd %s,%s\n", dis_buf, nameXMMReg(rG));
28540            assign(t32, loadLE(Ity_I32, mkexpr(addr)));
28541         }
28542         IRTemp t64 = newTemp(Ity_I64);
28543         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28544         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
28545         putYMMRegLoAndZU(rG, res);
28546         goto decode_success;
28547      }
28548      /* VPBROADCASTD xmm2/m32, ymm1 = VEX.256.66.0F38.W0 58 /r */
28549      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28550          && 0==getRexW(pfx)/*W0*/) {
28551         UChar modrm = getUChar(delta);
28552         UInt  rG    = gregOfRexRM(pfx, modrm);
28553         IRTemp t32 = newTemp(Ity_I32);
28554         if (epartIsReg(modrm)) {
28555            UInt rE = eregOfRexRM(pfx, modrm);
28556            delta++;
28557            DIP("vpbroadcastd %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
28558            assign(t32, getXMMRegLane32(rE, 0));
28559         } else {
28560            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28561            delta += alen;
28562            DIP("vpbroadcastd %s,%s\n", dis_buf, nameYMMReg(rG));
28563            assign(t32, loadLE(Ity_I32, mkexpr(addr)));
28564         }
28565         IRTemp t64 = newTemp(Ity_I64);
28566         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28567         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28568                                                  mkexpr(t64), mkexpr(t64));
28569         putYMMReg(rG, res);
28570         goto decode_success;
28571      }
28572      break;
28573
28574   case 0x59:
28575      /* VPBROADCASTQ xmm2/m64, xmm1 = VEX.128.66.0F38.W0 59 /r */
28576      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28577          && 0==getRexW(pfx)/*W0*/) {
28578         UChar modrm = getUChar(delta);
28579         UInt  rG    = gregOfRexRM(pfx, modrm);
28580         IRTemp t64 = newTemp(Ity_I64);
28581         if (epartIsReg(modrm)) {
28582            UInt rE = eregOfRexRM(pfx, modrm);
28583            delta++;
28584            DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
28585            assign(t64, getXMMRegLane64(rE, 0));
28586         } else {
28587            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28588            delta += alen;
28589            DIP("vpbroadcastq %s,%s\n", dis_buf, nameXMMReg(rG));
28590            assign(t64, loadLE(Ity_I64, mkexpr(addr)));
28591         }
28592         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
28593         putYMMRegLoAndZU(rG, res);
28594         goto decode_success;
28595      }
28596      /* VPBROADCASTQ xmm2/m64, ymm1 = VEX.256.66.0F38.W0 59 /r */
28597      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28598          && 0==getRexW(pfx)/*W0*/) {
28599         UChar modrm = getUChar(delta);
28600         UInt  rG    = gregOfRexRM(pfx, modrm);
28601         IRTemp t64 = newTemp(Ity_I64);
28602         if (epartIsReg(modrm)) {
28603            UInt rE = eregOfRexRM(pfx, modrm);
28604            delta++;
28605            DIP("vpbroadcastq %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
28606            assign(t64, getXMMRegLane64(rE, 0));
28607         } else {
28608            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28609            delta += alen;
28610            DIP("vpbroadcastq %s,%s\n", dis_buf, nameYMMReg(rG));
28611            assign(t64, loadLE(Ity_I64, mkexpr(addr)));
28612         }
28613         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28614                                                  mkexpr(t64), mkexpr(t64));
28615         putYMMReg(rG, res);
28616         goto decode_success;
28617      }
28618      break;
28619
28620   case 0x5A:
28621      /* VBROADCASTI128 m128, ymm1 = VEX.256.66.0F38.WIG 5A /r */
28622      if (have66noF2noF3(pfx)
28623          && 1==getVexL(pfx)/*256*/
28624          && !epartIsReg(getUChar(delta))) {
28625         UChar modrm = getUChar(delta);
28626         UInt  rG    = gregOfRexRM(pfx, modrm);
28627         addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28628         delta += alen;
28629         DIP("vbroadcasti128 %s,%s\n", dis_buf, nameYMMReg(rG));
28630         IRTemp t128 = newTemp(Ity_V128);
28631         assign(t128, loadLE(Ity_V128, mkexpr(addr)));
28632         putYMMReg( rG, binop(Iop_V128HLtoV256, mkexpr(t128), mkexpr(t128)) );
28633         goto decode_success;
28634      }
28635      break;
28636
28637   case 0x78:
28638      /* VPBROADCASTB xmm2/m8, xmm1 = VEX.128.66.0F38.W0 78 /r */
28639      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28640          && 0==getRexW(pfx)/*W0*/) {
28641         UChar modrm = getUChar(delta);
28642         UInt  rG    = gregOfRexRM(pfx, modrm);
28643         IRTemp t8   = newTemp(Ity_I8);
28644         if (epartIsReg(modrm)) {
28645            UInt rE = eregOfRexRM(pfx, modrm);
28646            DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
28647            assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
28648         } else {
28649            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28650            delta += alen;
28651            DIP("vpbroadcastb %s,%s\n", dis_buf, nameXMMReg(rG));
28652            assign(t8, loadLE(Ity_I8, mkexpr(addr)));
28653         }
28654         IRTemp t16 = newTemp(Ity_I16);
28655         assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
28656         IRTemp t32 = newTemp(Ity_I32);
28657         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
28658         IRTemp t64 = newTemp(Ity_I64);
28659         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28660         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
28661         putYMMRegLoAndZU(rG, res);
28662         goto decode_success;
28663      }
28664      /* VPBROADCASTB xmm2/m8, ymm1 = VEX.256.66.0F38.W0 78 /r */
28665      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28666          && 0==getRexW(pfx)/*W0*/) {
28667         UChar modrm = getUChar(delta);
28668         UInt  rG    = gregOfRexRM(pfx, modrm);
28669         IRTemp t8   = newTemp(Ity_I8);
28670         if (epartIsReg(modrm)) {
28671            UInt rE = eregOfRexRM(pfx, modrm);
28672            DIP("vpbroadcastb %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
28673            assign(t8, unop(Iop_32to8, getXMMRegLane32(rE, 0)));
28674         } else {
28675            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28676            delta += alen;
28677            DIP("vpbroadcastb %s,%s\n", dis_buf, nameYMMReg(rG));
28678            assign(t8, loadLE(Ity_I8, mkexpr(addr)));
28679         }
28680         IRTemp t16 = newTemp(Ity_I16);
28681         assign(t16, binop(Iop_8HLto16, mkexpr(t8), mkexpr(t8)));
28682         IRTemp t32 = newTemp(Ity_I32);
28683         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
28684         IRTemp t64 = newTemp(Ity_I64);
28685         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28686         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28687                                                  mkexpr(t64), mkexpr(t64));
28688         putYMMReg(rG, res);
28689         goto decode_success;
28690      }
28691      break;
28692
28693   case 0x79:
28694      /* VPBROADCASTW xmm2/m16, xmm1 = VEX.128.66.0F38.W0 79 /r */
28695      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28696          && 0==getRexW(pfx)/*W0*/) {
28697         UChar modrm = getUChar(delta);
28698         UInt  rG    = gregOfRexRM(pfx, modrm);
28699         IRTemp t16  = newTemp(Ity_I16);
28700         if (epartIsReg(modrm)) {
28701            UInt rE = eregOfRexRM(pfx, modrm);
28702            DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameXMMReg(rG));
28703            assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
28704         } else {
28705            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28706            delta += alen;
28707            DIP("vpbroadcastw %s,%s\n", dis_buf, nameXMMReg(rG));
28708            assign(t16, loadLE(Ity_I16, mkexpr(addr)));
28709         }
28710         IRTemp t32 = newTemp(Ity_I32);
28711         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
28712         IRTemp t64 = newTemp(Ity_I64);
28713         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28714         IRExpr* res = binop(Iop_64HLtoV128, mkexpr(t64), mkexpr(t64));
28715         putYMMRegLoAndZU(rG, res);
28716         goto decode_success;
28717      }
28718      /* VPBROADCASTW xmm2/m16, ymm1 = VEX.256.66.0F38.W0 79 /r */
28719      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28720          && 0==getRexW(pfx)/*W0*/) {
28721         UChar modrm = getUChar(delta);
28722         UInt  rG    = gregOfRexRM(pfx, modrm);
28723         IRTemp t16  = newTemp(Ity_I16);
28724         if (epartIsReg(modrm)) {
28725            UInt rE = eregOfRexRM(pfx, modrm);
28726            DIP("vpbroadcastw %s,%s\n", nameXMMReg(rE), nameYMMReg(rG));
28727            assign(t16, unop(Iop_32to16, getXMMRegLane32(rE, 0)));
28728         } else {
28729            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 0 );
28730            delta += alen;
28731            DIP("vpbroadcastw %s,%s\n", dis_buf, nameYMMReg(rG));
28732            assign(t16, loadLE(Ity_I16, mkexpr(addr)));
28733         }
28734         IRTemp t32 = newTemp(Ity_I32);
28735         assign(t32, binop(Iop_16HLto32, mkexpr(t16), mkexpr(t16)));
28736         IRTemp t64 = newTemp(Ity_I64);
28737         assign(t64, binop(Iop_32HLto64, mkexpr(t32), mkexpr(t32)));
28738         IRExpr* res = IRExpr_Qop(Iop_64x4toV256, mkexpr(t64), mkexpr(t64),
28739                                                  mkexpr(t64), mkexpr(t64));
28740         putYMMReg(rG, res);
28741         goto decode_success;
28742      }
28743      break;
28744
28745   case 0x8C:
28746      /* VPMASKMOVD m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W0 8C /r */
28747      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28748          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28749         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
28750                                    /*!isYMM*/False, Ity_I32 );
28751         goto decode_success;
28752      }
28753      /* VPMASKMOVD m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W0 8C /r */
28754      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28755          && 0==getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28756         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovd",
28757                                    /*isYMM*/True, Ity_I32 );
28758         goto decode_success;
28759      }
28760      /* VPMASKMOVQ m128, xmm2, xmm1 = VEX.NDS.128.66.0F38.W1 8C /r */
28761      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28762          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28763         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
28764                                    /*!isYMM*/False, Ity_I64 );
28765         goto decode_success;
28766      }
28767      /* VPMASKMOVQ m256, ymm2, ymm1 = VEX.NDS.256.66.0F38.W1 8C /r */
28768      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28769          && 1==getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28770         delta = dis_VMASKMOV_load( uses_vvvv, vbi, pfx, delta, "vpmaskmovq",
28771                                    /*isYMM*/True, Ity_I64 );
28772         goto decode_success;
28773      }
28774      break;
28775
28776   case 0x90:
28777      /* VPGATHERDD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 90 /r */
28778      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28779          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
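         /* dis_VGATHER returns with delta unchanged if it declines
            to decode, so only claim success if delta has moved on. */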
28780         Long delta0 = delta;
28781         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
28782                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
28783         if (delta != delta0)
28784            goto decode_success;
28785      }
28786      /* VPGATHERDD ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 90 /r */
28787      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28788          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28789         Long delta0 = delta;
28790         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdd",
28791                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
28792         if (delta != delta0)
28793            goto decode_success;
28794      }
28795      /* VPGATHERDQ xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 90 /r */
28796      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28797          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28798         Long delta0 = delta;
28799         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
28800                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
28801         if (delta != delta0)
28802            goto decode_success;
28803      }
28804      /* VPGATHERDQ ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 90 /r */
28805      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28806          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28807         Long delta0 = delta;
28808         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherdq",
28809                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
28810         if (delta != delta0)
28811            goto decode_success;
28812      }
28813      break;
28814
28815   case 0x91:
28816      /* VPGATHERQD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 91 /r */
28817      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28818          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28819         Long delta0 = delta;
28820         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
28821                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
28822         if (delta != delta0)
28823            goto decode_success;
28824      }
28825      /* VPGATHERQD xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 91 /r */
28826      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28827          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28828         Long delta0 = delta;
28829         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqd",
28830                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
28831         if (delta != delta0)
28832            goto decode_success;
28833      }
28834      /* VPGATHERQQ xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 91 /r */
28835      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28836          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28837         Long delta0 = delta;
28838         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
28839                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
28840         if (delta != delta0)
28841            goto decode_success;
28842      }
28843      /* VPGATHERQQ ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 91 /r */
28844      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28845          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28846         Long delta0 = delta;
28847         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vpgatherqq",
28848                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
28849         if (delta != delta0)
28850            goto decode_success;
28851      }
28852      break;
28853
28854   case 0x92:
28855      /* VGATHERDPS xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W0 92 /r */
28856      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28857          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28858         Long delta0 = delta;
28859         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
28860                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I32 );
28861         if (delta != delta0)
28862            goto decode_success;
28863      }
28864      /* VGATHERDPS ymm2, vm32y, ymm1 = VEX.DDS.256.66.0F38.W0 92 /r */
28865      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28866          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28867         Long delta0 = delta;
28868         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdps",
28869                              /*isYMM*/True, /*!isVM64x*/False, Ity_I32 );
28870         if (delta != delta0)
28871            goto decode_success;
28872      }
28873      /* VGATHERDPD xmm2, vm32x, xmm1 = VEX.DDS.128.66.0F38.W1 92 /r */
28874      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28875          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28876         Long delta0 = delta;
28877         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
28878                              /*!isYMM*/False, /*!isVM64x*/False, Ity_I64 );
28879         if (delta != delta0)
28880            goto decode_success;
28881      }
28882      /* VGATHERDPD ymm2, vm32x, ymm1 = VEX.DDS.256.66.0F38.W1 92 /r */
28883      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28884          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28885         Long delta0 = delta;
28886         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherdpd",
28887                              /*isYMM*/True, /*!isVM64x*/False, Ity_I64 );
28888         if (delta != delta0)
28889            goto decode_success;
28890      }
28891      break;
28892
28893   case 0x93:
28894      /* VGATHERQPS xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W0 93 /r */
28895      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28896          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28897         Long delta0 = delta;
28898         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
28899                              /*!isYMM*/False, /*isVM64x*/True, Ity_I32 );
28900         if (delta != delta0)
28901            goto decode_success;
28902      }
28903      /* VGATHERQPS xmm2, vm64y, xmm1 = VEX.DDS.256.66.0F38.W0 93 /r */
28904      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28905          && 0 == getRexW(pfx)/*W0*/ && !epartIsReg(getUChar(delta))) {
28906         Long delta0 = delta;
28907         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqps",
28908                              /*isYMM*/True, /*isVM64x*/True, Ity_I32 );
28909         if (delta != delta0)
28910            goto decode_success;
28911      }
28912      /* VGATHERQPD xmm2, vm64x, xmm1 = VEX.DDS.128.66.0F38.W1 93 /r */
28913      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/
28914          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28915         Long delta0 = delta;
28916         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
28917                              /*!isYMM*/False, /*isVM64x*/True, Ity_I64 );
28918         if (delta != delta0)
28919            goto decode_success;
28920      }
28921      /* VGATHERQPD ymm2, vm64y, ymm1 = VEX.DDS.256.66.0F38.W1 93 /r */
28922      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
28923          && 1 == getRexW(pfx)/*W1*/ && !epartIsReg(getUChar(delta))) {
28924         Long delta0 = delta;
28925         delta = dis_VGATHER( uses_vvvv, vbi, pfx, delta, "vgatherqpd",
28926                              /*isYMM*/True, /*isVM64x*/True, Ity_I64 );
28927         if (delta != delta0)
28928            goto decode_success;
28929      }
28930      break;
28931
28932   case 0x96 ... 0x9F:
28933   case 0xA6 ... 0xAF:
28934   case 0xB6 ... 0xBF:
28935      /* VFMADDSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 96 /r */
28936      /* VFMADDSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 96 /r */
28937      /* VFMADDSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 96 /r */
28938      /* VFMADDSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 96 /r */
28939      /* VFMSUBADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 97 /r */
28940      /* VFMSUBADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 97 /r */
28941      /* VFMSUBADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 97 /r */
28942      /* VFMSUBADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 97 /r */
28943      /* VFMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 98 /r */
28944      /* VFMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 98 /r */
28945      /* VFMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 98 /r */
28946      /* VFMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 98 /r */
28947      /* VFMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 99 /r */
28948      /* VFMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 99 /r */
28949      /* VFMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9A /r */
28950      /* VFMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9A /r */
28951      /* VFMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9A /r */
28952      /* VFMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9A /r */
28953      /* VFMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9B /r */
28954      /* VFMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9B /r */
28955      /* VFNMADD132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9C /r */
28956      /* VFNMADD132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9C /r */
28957      /* VFNMADD132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9C /r */
28958      /* VFNMADD132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9C /r */
28959      /* VFNMADD132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9D /r */
28960      /* VFNMADD132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9D /r */
28961      /* VFNMSUB132PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 9E /r */
28962      /* VFNMSUB132PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 9E /r */
28963      /* VFNMSUB132PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 9E /r */
28964      /* VFNMSUB132PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 9E /r */
28965      /* VFNMSUB132SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 9F /r */
28966      /* VFNMSUB132SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 9F /r */
28967      /* VFMADDSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A6 /r */
28968      /* VFMADDSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A6 /r */
28969      /* VFMADDSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A6 /r */
28970      /* VFMADDSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A6 /r */
28971      /* VFMSUBADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A7 /r */
28972      /* VFMSUBADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A7 /r */
28973      /* VFMSUBADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A7 /r */
28974      /* VFMSUBADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A7 /r */
28975      /* VFMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 A8 /r */
28976      /* VFMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 A8 /r */
28977      /* VFMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 A8 /r */
28978      /* VFMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 A8 /r */
28979      /* VFMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 A9 /r */
28980      /* VFMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 A9 /r */
28981      /* VFMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AA /r */
28982      /* VFMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AA /r */
28983      /* VFMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AA /r */
28984      /* VFMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AA /r */
28985      /* VFMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AB /r */
28986      /* VFMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AB /r */
28987      /* VFNMADD213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AC /r */
28988      /* VFNMADD213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AC /r */
28989      /* VFNMADD213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AC /r */
28990      /* VFNMADD213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AC /r */
28991      /* VFNMADD213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AD /r */
28992      /* VFNMADD213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AD /r */
28993      /* VFNMSUB213PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 AE /r */
28994      /* VFNMSUB213PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 AE /r */
28995      /* VFNMSUB213PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 AE /r */
28996      /* VFNMSUB213PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 AE /r */
28997      /* VFNMSUB213SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 AF /r */
28998      /* VFNMSUB213SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 AF /r */
28999      /* VFMADDSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B6 /r */
29000      /* VFMADDSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B6 /r */
29001      /* VFMADDSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B6 /r */
29002      /* VFMADDSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B6 /r */
29003      /* VFMSUBADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B7 /r */
29004      /* VFMSUBADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B7 /r */
29005      /* VFMSUBADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B7 /r */
29006      /* VFMSUBADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B7 /r */
29007      /* VFMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 B8 /r */
29008      /* VFMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 B8 /r */
29009      /* VFMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 B8 /r */
29010      /* VFMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 B8 /r */
29011      /* VFMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 B9 /r */
29012      /* VFMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 B9 /r */
29013      /* VFMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BA /r */
29014      /* VFMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BA /r */
29015      /* VFMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BA /r */
29016      /* VFMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BA /r */
29017      /* VFMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BB /r */
29018      /* VFMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BB /r */
29019      /* VFNMADD231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BC /r */
29020      /* VFNMADD231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BC /r */
29021      /* VFNMADD231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BC /r */
29022      /* VFNMADD231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BC /r */
29023      /* VFNMADD231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BD /r */
29024      /* VFNMADD231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BD /r */
29025      /* VFNMSUB231PS xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W0 BE /r */
29026      /* VFNMSUB231PS ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W0 BE /r */
29027      /* VFNMSUB231PD xmm3/m128, xmm2, xmm1 = VEX.DDS.128.66.0F38.W1 BE /r */
29028      /* VFNMSUB231PD ymm3/m256, ymm2, ymm1 = VEX.DDS.256.66.0F38.W1 BE /r */
29029      /* VFNMSUB231SS xmm3/m32, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W0 BF /r */
29030      /* VFNMSUB231SD xmm3/m64, xmm2, xmm1 = VEX.DDS.LIG.66.0F38.W1 BF /r */
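      /* All of the above funnel into a single helper; dis_FMA
         selects the exact operation from opc and the VEX.W and
         VEX.L bits. */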
29031      if (have66noF2noF3(pfx)) {
29032         delta = dis_FMA( vbi, pfx, delta, opc );
29033         *uses_vvvv = True;
29034         goto decode_success;
29035      }
29036      break;
29037
29038   case 0xDB:
29039   case 0xDC:
29040   case 0xDD:
29041   case 0xDE:
29042   case 0xDF:
29043      /* VAESIMC xmm2/m128, xmm1 = VEX.128.66.0F38.WIG DB /r */
29044      /* VAESENC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DC /r */
29045      /* VAESENCLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DD /r */
29046      /* VAESDEC xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DE /r */
29047      /* VAESDECLAST xmm3/m128, xmm2, xmm1 = VEX.128.66.0F38.WIG DF /r */
29048      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AESx( vbi, pfx, delta, True/*isAvx*/, opc );
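         /* VAESIMC (DB) is the only unary form here, so it is the
            only one that does not use vvvv. */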
29050         if (opc != 0xDB) *uses_vvvv = True;
29051         goto decode_success;
29052      }
29053      break;
29054
29055   case 0xF2:
29056      /* ANDN r/m32, r32b, r32a = VEX.NDS.LZ.0F38.W0 F2 /r */
29057      /* ANDN r/m64, r64b, r64a = VEX.NDS.LZ.0F38.W1 F2 /r */
29058      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29059         Int     size = getRexW(pfx) ? 8 : 4;
29060         IRType  ty   = szToITy(size);
29061         IRTemp  dst  = newTemp(ty);
29062         IRTemp  src1 = newTemp(ty);
29063         IRTemp  src2 = newTemp(ty);
29064         UChar   rm   = getUChar(delta);
29065
29066         assign( src1, getIRegV(size,pfx) );
29067         if (epartIsReg(rm)) {
29068            assign( src2, getIRegE(size,pfx,rm) );
29069            DIP("andn %s,%s,%s\n", nameIRegE(size,pfx,rm),
29070                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
29071            delta++;
29072         } else {
29073            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29074            assign( src2, loadLE(ty, mkexpr(addr)) );
29075            DIP("andn %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
29076                nameIRegG(size,pfx,rm));
29077            delta += alen;
29078         }
29079
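         /* ANDN computes dst = ~src1 & src2, with src1 taken from
            vvvv and src2 from r/m. */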
29080         assign( dst, binop( mkSizedOp(ty,Iop_And8),
29081                             unop( mkSizedOp(ty,Iop_Not8), mkexpr(src1) ),
29082                             mkexpr(src2) ) );
29083         putIRegG( size, pfx, rm, mkexpr(dst) );
29084         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29085                                               ? AMD64G_CC_OP_ANDN64
29086                                               : AMD64G_CC_OP_ANDN32)) );
29087         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29088         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
29089         *uses_vvvv = True;
29090         goto decode_success;
29091      }
29092      break;
29093
29094   case 0xF3:
29095      /* BLSI r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /3 */
29096      /* BLSI r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /3 */
29097      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
29098          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 3) {
29099         Int     size = getRexW(pfx) ? 8 : 4;
29100         IRType  ty   = szToITy(size);
29101         IRTemp  src  = newTemp(ty);
29102         IRTemp  dst  = newTemp(ty);
29103         UChar   rm   = getUChar(delta);
29104
29105         if (epartIsReg(rm)) {
29106            assign( src, getIRegE(size,pfx,rm) );
29107            DIP("blsi %s,%s\n", nameIRegE(size,pfx,rm),
29108                nameIRegV(size,pfx));
29109            delta++;
29110         } else {
29111            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29112            assign( src, loadLE(ty, mkexpr(addr)) );
29113            DIP("blsi %s,%s\n", dis_buf, nameIRegV(size,pfx));
29114            delta += alen;
29115         }
29116
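         /* BLSI isolates the lowest set bit: dst = src & -src.
            E.g. src = 0b10110000 gives dst = 0b00010000. */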
29117         assign( dst, binop(mkSizedOp(ty,Iop_And8),
29118                            binop(mkSizedOp(ty,Iop_Sub8), mkU(ty, 0),
29119                                  mkexpr(src)), mkexpr(src)) );
29120         putIRegV( size, pfx, mkexpr(dst) );
29121         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29122                                               ? AMD64G_CC_OP_BLSI64
29123                                               : AMD64G_CC_OP_BLSI32)) );
29124         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29125         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
29126         *uses_vvvv = True;
29127         goto decode_success;
29128      }
29129      /* BLSMSK r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /2 */
29130      /* BLSMSK r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /2 */
29131      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
29132          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 2) {
29133         Int     size = getRexW(pfx) ? 8 : 4;
29134         IRType  ty   = szToITy(size);
29135         IRTemp  src  = newTemp(ty);
29136         IRTemp  dst  = newTemp(ty);
29137         UChar   rm   = getUChar(delta);
29138
29139         if (epartIsReg(rm)) {
29140            assign( src, getIRegE(size,pfx,rm) );
29141            DIP("blsmsk %s,%s\n", nameIRegE(size,pfx,rm),
29142                nameIRegV(size,pfx));
29143            delta++;
29144         } else {
29145            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29146            assign( src, loadLE(ty, mkexpr(addr)) );
29147            DIP("blsmsk %s,%s\n", dis_buf, nameIRegV(size,pfx));
29148            delta += alen;
29149         }
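         /* BLSMSK sets all bits up to and including the lowest set
            bit: dst = src ^ (src - 1).  E.g. src = 0b10100 gives
            dst = 0b00111. */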
29150
29151         assign( dst, binop(mkSizedOp(ty,Iop_Xor8),
29152                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
29153                                  mkU(ty, 1)), mkexpr(src)) );
29154         putIRegV( size, pfx, mkexpr(dst) );
29155         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29156                                               ? AMD64G_CC_OP_BLSMSK64
29157                                               : AMD64G_CC_OP_BLSMSK32)) );
29158         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29159         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
29160         *uses_vvvv = True;
29161         goto decode_success;
29162      }
29163      /* BLSR r/m32, r32 = VEX.NDD.LZ.0F38.W0 F3 /1 */
29164      /* BLSR r/m64, r64 = VEX.NDD.LZ.0F38.W1 F3 /1 */
29165      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/
29166          && !haveREX(pfx) && gregLO3ofRM(getUChar(delta)) == 1) {
29167         Int     size = getRexW(pfx) ? 8 : 4;
29168         IRType  ty   = szToITy(size);
29169         IRTemp  src  = newTemp(ty);
29170         IRTemp  dst  = newTemp(ty);
29171         UChar   rm   = getUChar(delta);
29172
29173         if (epartIsReg(rm)) {
29174            assign( src, getIRegE(size,pfx,rm) );
29175            DIP("blsr %s,%s\n", nameIRegE(size,pfx,rm),
29176                nameIRegV(size,pfx));
29177            delta++;
29178         } else {
29179            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29180            assign( src, loadLE(ty, mkexpr(addr)) );
29181            DIP("blsr %s,%s\n", dis_buf, nameIRegV(size,pfx));
29182            delta += alen;
29183         }
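         /* BLSR clears the lowest set bit: dst = src & (src - 1).
            E.g. src = 0b10100 gives dst = 0b10000. */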
29184
29185         assign( dst, binop(mkSizedOp(ty,Iop_And8),
29186                            binop(mkSizedOp(ty,Iop_Sub8), mkexpr(src),
29187                                  mkU(ty, 1)), mkexpr(src)) );
29188         putIRegV( size, pfx, mkexpr(dst) );
29189         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29190                                               ? AMD64G_CC_OP_BLSR64
29191                                               : AMD64G_CC_OP_BLSR32)) );
29192         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29193         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(src))) );
29194         *uses_vvvv = True;
29195         goto decode_success;
29196      }
29197      break;
29198
29199   case 0xF5:
29200      /* BZHI r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F5 /r */
29201      /* BZHI r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F5 /r */
29202      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29203         Int     size  = getRexW(pfx) ? 8 : 4;
29204         IRType  ty    = szToITy(size);
29205         IRTemp  dst   = newTemp(ty);
29206         IRTemp  src1  = newTemp(ty);
29207         IRTemp  src2  = newTemp(ty);
29208         IRTemp  start = newTemp(Ity_I8);
29209         IRTemp  cond  = newTemp(Ity_I1);
29210         UChar   rm    = getUChar(delta);
29211
29212         assign( src2, getIRegV(size,pfx) );
29213         if (epartIsReg(rm)) {
29214            assign( src1, getIRegE(size,pfx,rm) );
29215            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx),
29216                nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
29217            delta++;
29218         } else {
29219            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29220            assign( src1, loadLE(ty, mkexpr(addr)) );
29221            DIP("bzhi %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
29222                nameIRegG(size,pfx,rm));
29223            delta += alen;
29224         }
29225
29226         assign( start, narrowTo( Ity_I8, mkexpr(src2) ) );
29227         assign( cond, binop(Iop_CmpLT32U,
29228                             unop(Iop_8Uto32, mkexpr(start)),
29229                             mkU32(8*size)) );
29230         /* if (start < opsize) {
29231               if (start == 0)
29232                  dst = 0;
29233               else
29234                  dst = (src1 << (opsize-start)) u>> (opsize-start);
29235            } else {
29236               dst = src1;
29237            } */
29238         assign( dst,
29239                 IRExpr_ITE(
29240                    mkexpr(cond),
29241                    IRExpr_ITE(
29242                       binop(Iop_CmpEQ8, mkexpr(start), mkU8(0)),
29243                       mkU(ty, 0),
29244                       binop(
29245                          mkSizedOp(ty,Iop_Shr8),
29246                          binop(
29247                             mkSizedOp(ty,Iop_Shl8),
29248                             mkexpr(src1),
29249                             binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
29250                          ),
29251                          binop(Iop_Sub8, mkU8(8*size), mkexpr(start))
29252                       )
29253                    ),
29254                    mkexpr(src1)
29255                 )
29256               );
29257         putIRegG( size, pfx, rm, mkexpr(dst) );
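         /* Note: the BLSR thunk is reused here; there is no separate
            BZHI thunk.  DEP1 carries the result for ZF/SF, and DEP2
            carries the "start is in range" condition, from which the
            helper derives CF (set iff start >= operand size). */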
29258         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29259                                               ? AMD64G_CC_OP_BLSR64
29260                                               : AMD64G_CC_OP_BLSR32)) );
29261         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29262         stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto64(mkexpr(cond))) );
29263         *uses_vvvv = True;
29264         goto decode_success;
29265      }
29266      /* PDEP r/m32, r32b, r32a = VEX.NDS.LZ.F2.0F38.W0 F5 /r */
29267      /* PDEP r/m64, r64b, r64a = VEX.NDS.LZ.F2.0F38.W1 F5 /r */
29268      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29269         Int     size = getRexW(pfx) ? 8 : 4;
29270         IRType  ty   = szToITy(size);
29271         IRTemp  src  = newTemp(ty);
29272         IRTemp  mask = newTemp(ty);
29273         UChar   rm   = getUChar(delta);
29274
29275         assign( src, getIRegV(size,pfx) );
29276         if (epartIsReg(rm)) {
29277            assign( mask, getIRegE(size,pfx,rm) );
29278            DIP("pdep %s,%s,%s\n", nameIRegE(size,pfx,rm),
29279                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
29280            delta++;
29281         } else {
29282            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29283            assign( mask, loadLE(ty, mkexpr(addr)) );
29284            DIP("pdep %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
29285                nameIRegG(size,pfx,rm));
29286            delta += alen;
29287         }
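         /* PDEP has no concise IR encoding, so the bit scatter is
            done by a clean C helper on 64-bit-widened operands and
            the result narrowed back. */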
29288
29289         IRExpr** args = mkIRExprVec_2( widenUto64(mkexpr(src)),
29290                                        widenUto64(mkexpr(mask)) );
29291         putIRegG( size, pfx, rm,
29292                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
29293                                              "amd64g_calculate_pdep",
29294                                              &amd64g_calculate_pdep, args)) );
29295         *uses_vvvv = True;
29296         /* Flags aren't modified.  */
29297         goto decode_success;
29298      }
29299      /* PEXT r/m32, r32b, r32a = VEX.NDS.LZ.F3.0F38.W0 F5 /r */
29300      /* PEXT r/m64, r64b, r64a = VEX.NDS.LZ.F3.0F38.W1 F5 /r */
29301      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29302         Int     size = getRexW(pfx) ? 8 : 4;
29303         IRType  ty   = szToITy(size);
29304         IRTemp  src  = newTemp(ty);
29305         IRTemp  mask = newTemp(ty);
29306         UChar   rm   = getUChar(delta);
29307
29308         assign( src, getIRegV(size,pfx) );
29309         if (epartIsReg(rm)) {
29310            assign( mask, getIRegE(size,pfx,rm) );
29311            DIP("pext %s,%s,%s\n", nameIRegE(size,pfx,rm),
29312                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
29313            delta++;
29314         } else {
29315            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29316            assign( mask, loadLE(ty, mkexpr(addr)) );
29317            DIP("pext %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
29318                nameIRegG(size,pfx,rm));
29319            delta += alen;
29320         }
29321
         /* First mask off the src bits whose mask bits are clear;
            PEXT ignores them, so it is fine if they contain
            undefined values.  */
29324         IRExpr* masked = binop(mkSizedOp(ty,Iop_And8),
29325                                mkexpr(src), mkexpr(mask));
29326         IRExpr** args = mkIRExprVec_2( widenUto64(masked),
29327                                        widenUto64(mkexpr(mask)) );
29328         putIRegG( size, pfx, rm,
29329                   narrowTo(ty, mkIRExprCCall(Ity_I64, 0/*regparms*/,
29330                                              "amd64g_calculate_pext",
29331                                              &amd64g_calculate_pext, args)) );
29332         *uses_vvvv = True;
29333         /* Flags aren't modified.  */
29334         goto decode_success;
29335      }
29336      break;
29337
29338   case 0xF6:
29339      /* MULX r/m32, r32b, r32a = VEX.NDD.LZ.F2.0F38.W0 F6 /r */
29340      /* MULX r/m64, r64b, r64a = VEX.NDD.LZ.F2.0F38.W1 F6 /r */
29341      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29342         Int     size = getRexW(pfx) ? 8 : 4;
29343         IRType  ty   = szToITy(size);
29344         IRTemp  src1 = newTemp(ty);
29345         IRTemp  src2 = newTemp(ty);
29346         IRTemp  res  = newTemp(size == 8 ? Ity_I128 : Ity_I64);
29347         UChar   rm   = getUChar(delta);
29348
29349         assign( src1, getIRegRDX(size) );
29350         if (epartIsReg(rm)) {
29351            assign( src2, getIRegE(size,pfx,rm) );
29352            DIP("mulx %s,%s,%s\n", nameIRegE(size,pfx,rm),
29353                nameIRegV(size,pfx), nameIRegG(size,pfx,rm));
29354            delta++;
29355         } else {
29356            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29357            assign( src2, loadLE(ty, mkexpr(addr)) );
29358            DIP("mulx %s,%s,%s\n", dis_buf, nameIRegV(size,pfx),
29359                nameIRegG(size,pfx,rm));
29360            delta += alen;
29361         }
29362
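         /* MULX is an unsigned widening multiply of RDX/EDX by the
            r/m operand: the low half goes to the vvvv register, the
            high half to the reg field, and no flags are written. */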
29363         assign( res, binop(size == 8 ? Iop_MullU64 : Iop_MullU32,
29364                            mkexpr(src1), mkexpr(src2)) );
29365         putIRegV( size, pfx,
29366                   unop(size == 8 ? Iop_128to64 : Iop_64to32, mkexpr(res)) );
29367         putIRegG( size, pfx, rm,
29368                   unop(size == 8 ? Iop_128HIto64 : Iop_64HIto32,
29369                        mkexpr(res)) );
29370         *uses_vvvv = True;
29371         /* Flags aren't modified.  */
29372         goto decode_success;
29373      }
29374      break;
29375
29376   case 0xF7:
29377      /* SARX r32b, r/m32, r32a = VEX.NDS.LZ.F3.0F38.W0 F7 /r */
29378      /* SARX r64b, r/m64, r64a = VEX.NDS.LZ.F3.0F38.W1 F7 /r */
29379      if (haveF3no66noF2(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29380         delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "sarx", Iop_Sar8 );
29381         goto decode_success;
29382      }
29383      /* SHLX r32b, r/m32, r32a = VEX.NDS.LZ.66.0F38.W0 F7 /r */
29384      /* SHLX r64b, r/m64, r64a = VEX.NDS.LZ.66.0F38.W1 F7 /r */
29385      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29386         delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shlx", Iop_Shl8 );
29387         goto decode_success;
29388      }
29389      /* SHRX r32b, r/m32, r32a = VEX.NDS.LZ.F2.0F38.W0 F7 /r */
29390      /* SHRX r64b, r/m64, r64a = VEX.NDS.LZ.F2.0F38.W1 F7 /r */
29391      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29392         delta = dis_SHIFTX( uses_vvvv, vbi, pfx, delta, "shrx", Iop_Shr8 );
29393         goto decode_success;
29394      }
29395      /* BEXTR r32b, r/m32, r32a = VEX.NDS.LZ.0F38.W0 F7 /r */
29396      /* BEXTR r64b, r/m64, r64a = VEX.NDS.LZ.0F38.W1 F7 /r */
29397      if (haveNo66noF2noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
29398         Int     size  = getRexW(pfx) ? 8 : 4;
29399         IRType  ty    = szToITy(size);
29400         IRTemp  dst   = newTemp(ty);
29401         IRTemp  src1  = newTemp(ty);
29402         IRTemp  src2  = newTemp(ty);
29403         IRTemp  stle  = newTemp(Ity_I16);
29404         IRTemp  start = newTemp(Ity_I8);
29405         IRTemp  len   = newTemp(Ity_I8);
29406         UChar   rm    = getUChar(delta);
29407
29408         assign( src2, getIRegV(size,pfx) );
29409         if (epartIsReg(rm)) {
29410            assign( src1, getIRegE(size,pfx,rm) );
29411            DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx),
29412                nameIRegE(size,pfx,rm), nameIRegG(size,pfx,rm));
29413            delta++;
29414         } else {
29415            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
29416            assign( src1, loadLE(ty, mkexpr(addr)) );
29417            DIP("bextr %s,%s,%s\n", nameIRegV(size,pfx), dis_buf,
29418                nameIRegG(size,pfx,rm));
29419            delta += alen;
29420         }
29421
29422         assign( stle, narrowTo( Ity_I16, mkexpr(src2) ) );
29423         assign( start, unop( Iop_16to8, mkexpr(stle) ) );
29424         assign( len, unop( Iop_16HIto8, mkexpr(stle) ) );
29425         /* if (start+len < opsize) {
29426               if (len != 0)
29427                  dst = (src1 << (opsize-start-len)) u>> (opsize-len);
29428               else
29429                  dst = 0;
29430            } else {
29431               if (start < opsize)
29432                  dst = src1 u>> start;
29433               else
29434                  dst = 0;
29435            } */
29436         assign( dst,
29437                 IRExpr_ITE(
29438                    binop(Iop_CmpLT32U,
29439                          binop(Iop_Add32,
29440                                unop(Iop_8Uto32, mkexpr(start)),
29441                                unop(Iop_8Uto32, mkexpr(len))),
29442                          mkU32(8*size)),
29443                    IRExpr_ITE(
29444                       binop(Iop_CmpEQ8, mkexpr(len), mkU8(0)),
29445                       mkU(ty, 0),
29446                       binop(mkSizedOp(ty,Iop_Shr8),
29447                             binop(mkSizedOp(ty,Iop_Shl8), mkexpr(src1),
29448                                   binop(Iop_Sub8,
29449                                         binop(Iop_Sub8, mkU8(8*size),
29450                                               mkexpr(start)),
29451                                         mkexpr(len))),
29452                             binop(Iop_Sub8, mkU8(8*size),
29453                                   mkexpr(len)))
29454                    ),
29455                    IRExpr_ITE(
29456                       binop(Iop_CmpLT32U,
29457                             unop(Iop_8Uto32, mkexpr(start)),
29458                             mkU32(8*size)),
29459                       binop(mkSizedOp(ty,Iop_Shr8), mkexpr(src1),
29460                             mkexpr(start)),
29461                       mkU(ty, 0)
29462                    )
29463                 )
29464               );
29465         putIRegG( size, pfx, rm, mkexpr(dst) );
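         /* Approximate the flag effects by reusing the ANDN thunk:
            both BEXTR and ANDN set ZF from the result and clear CF and
            OF, which covers the architecturally defined bits. */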
29466         stmt( IRStmt_Put( OFFB_CC_OP,   mkU64(size == 8
29467                                               ? AMD64G_CC_OP_ANDN64
29468                                               : AMD64G_CC_OP_ANDN32)) );
29469         stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto64(mkexpr(dst))) );
29470         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU64(0)) );
29471         *uses_vvvv = True;
29472         goto decode_success;
29473      }
29474      break;
29475
29476   default:
29477      break;
29478
29479   }
29480
29481  //decode_failure:
29482   return deltaIN;
29483
29484  decode_success:
29485   return delta;
29486}
29487
29488
29489/*------------------------------------------------------------*/
29490/*---                                                      ---*/
29491/*--- Top-level post-escape decoders: dis_ESC_0F3A__VEX    ---*/
29492/*---                                                      ---*/
29493/*------------------------------------------------------------*/
29494
29495static IRTemp math_VPERMILPS_128 ( IRTemp sV, UInt imm8 )
29496{
29497   vassert(imm8 < 256);
29498   IRTemp s3, s2, s1, s0;
29499   s3 = s2 = s1 = s0 = IRTemp_INVALID;
29500   breakupV128to32s( sV, &s3, &s2, &s1, &s0 );
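   /* Each 2-bit field of imm8 selects one of the four 32-bit source
      lanes for the corresponding result lane. */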
29501#  define SEL(_nn) (((_nn)==0) ? s0 : ((_nn)==1) ? s1 \
29502                                    : ((_nn)==2) ? s2 : s3)
29503   IRTemp res = newTemp(Ity_V128);
29504   assign(res, mkV128from32s( SEL((imm8 >> 6) & 3),
29505                              SEL((imm8 >> 4) & 3),
29506                              SEL((imm8 >> 2) & 3),
29507                              SEL((imm8 >> 0) & 3) ));
29508#  undef SEL
29509   return res;
29510}
29511
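/* Decode VEX-encoded instructions in the 0F 3A escape space.  Returns
   the updated delta on success, or deltaIN unchanged if nothing was
   decoded. */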
29512__attribute__((noinline))
29513static
29514Long dis_ESC_0F3A__VEX (
29515        /*MB_OUT*/DisResult* dres,
29516        /*OUT*/   Bool*      uses_vvvv,
29517        Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
29518        Bool         resteerCisOk,
29519        void*        callback_opaque,
29520        VexArchInfo* archinfo,
29521        VexAbiInfo*  vbi,
29522        Prefix pfx, Int sz, Long deltaIN
29523     )
29524{
29525   IRTemp addr  = IRTemp_INVALID;
29526   Int    alen  = 0;
29527   HChar  dis_buf[50];
29528   Long   delta = deltaIN;
29529   UChar  opc   = getUChar(delta);
29530   delta++;
29531   *uses_vvvv = False;
29532
29533   switch (opc) {
29534
29535   case 0x00:
29536   case 0x01:
29537      /* VPERMQ imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 00 /r ib */
29538      /* VPERMPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.W1 01 /r ib */
29539      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/
29540          && 1==getRexW(pfx)/*W1*/) {
29541         UChar  modrm = getUChar(delta);
29542         UInt   imm8  = 0;
29543         UInt   rG    = gregOfRexRM(pfx, modrm);
29544         IRTemp sV    = newTemp(Ity_V256);
29545         const HChar *name  = opc == 0 ? "vpermq" : "vpermpd";
29546         if (epartIsReg(modrm)) {
29547            UInt rE = eregOfRexRM(pfx, modrm);
29548            delta += 1;
29549            imm8 = getUChar(delta);
29550            DIP("%s $%u,%s,%s\n",
29551                name, imm8, nameYMMReg(rE), nameYMMReg(rG));
29552            assign(sV, getYMMReg(rE));
29553         } else {
29554            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29555            delta += alen;
29556            imm8 = getUChar(delta);
29557            DIP("%s $%u,%s,%s\n",
29558                name, imm8, dis_buf, nameYMMReg(rG));
29559            assign(sV, loadLE(Ity_V256, mkexpr(addr)));
29560         }
29561         delta++;
29562         IRTemp s[4];
29563         s[3] = s[2] = s[1] = s[0] = IRTemp_INVALID;
29564         breakupV256to64s(sV, &s[3], &s[2], &s[1], &s[0]);
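         /* As in VPERMILPS, each 2-bit field of imm8 picks the source
            64-bit lane for the corresponding destination lane. */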
29565         IRTemp dV = newTemp(Ity_V256);
29566         assign(dV, IRExpr_Qop(Iop_64x4toV256,
29567                               mkexpr(s[(imm8 >> 6) & 3]),
29568                               mkexpr(s[(imm8 >> 4) & 3]),
29569                               mkexpr(s[(imm8 >> 2) & 3]),
29570                               mkexpr(s[(imm8 >> 0) & 3])));
29571         putYMMReg(rG, mkexpr(dV));
29572         goto decode_success;
29573      }
29574      break;
29575
29576   case 0x02:
29577      /* VPBLENDD imm8, xmm3/m128, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 02 /r ib */
29578      if (have66noF2noF3(pfx)
29579          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
29580         UChar  modrm = getUChar(delta);
29581         UInt   imm8  = 0;
29582         UInt   rG    = gregOfRexRM(pfx, modrm);
29583         UInt   rV    = getVexNvvvv(pfx);
29584         IRTemp sV    = newTemp(Ity_V128);
29585         IRTemp dV    = newTemp(Ity_V128);
29586         UInt   i;
29587         IRTemp s[4], d[4];
29588         assign(sV, getXMMReg(rV));
29589         if (epartIsReg(modrm)) {
29590            UInt rE = eregOfRexRM(pfx, modrm);
29591            delta += 1;
29592            imm8 = getUChar(delta);
29593            DIP("vpblendd $%u,%s,%s,%s\n",
29594                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
29595            assign(dV, getXMMReg(rE));
29596         } else {
29597            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29598            delta += alen;
29599            imm8 = getUChar(delta);
29600            DIP("vpblendd $%u,%s,%s,%s\n",
29601                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
29602            assign(dV, loadLE(Ity_V128, mkexpr(addr)));
29603         }
29604         delta++;
29605         for (i = 0; i < 4; i++) {
29606            s[i] = IRTemp_INVALID;
29607            d[i] = IRTemp_INVALID;
29608         }
29609         breakupV128to32s( sV, &s[3], &s[2], &s[1], &s[0] );
29610         breakupV128to32s( dV, &d[3], &d[2], &d[1], &d[0] );
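         /* Bit i of imm8 selects lane i of the second source (the E
            part) when set, and lane i of the first source (vvvv)
            otherwise. */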
29611         for (i = 0; i < 4; i++)
29612            putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
29613         putYMMRegLane128(rG, 1, mkV128(0));
29614         *uses_vvvv = True;
29615         goto decode_success;
29616      }
29617      /* VPBLENDD imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 02 /r ib */
29618      if (have66noF2noF3(pfx)
29619          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
29620         UChar  modrm = getUChar(delta);
29621         UInt   imm8  = 0;
29622         UInt   rG    = gregOfRexRM(pfx, modrm);
29623         UInt   rV    = getVexNvvvv(pfx);
29624         IRTemp sV    = newTemp(Ity_V256);
29625         IRTemp dV    = newTemp(Ity_V256);
29626         UInt   i;
29627         IRTemp s[8], d[8];
29628         assign(sV, getYMMReg(rV));
29629         if (epartIsReg(modrm)) {
29630            UInt rE = eregOfRexRM(pfx, modrm);
29631            delta += 1;
29632            imm8 = getUChar(delta);
29633            DIP("vpblendd $%u,%s,%s,%s\n",
29634                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
29635            assign(dV, getYMMReg(rE));
29636         } else {
29637            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29638            delta += alen;
29639            imm8 = getUChar(delta);
29640            DIP("vpblendd $%u,%s,%s,%s\n",
29641                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
29642            assign(dV, loadLE(Ity_V256, mkexpr(addr)));
29643         }
29644         delta++;
29645         for (i = 0; i < 8; i++) {
29646            s[i] = IRTemp_INVALID;
29647            d[i] = IRTemp_INVALID;
29648         }
29649         breakupV256to32s( sV, &s[7], &s[6], &s[5], &s[4],
29650                               &s[3], &s[2], &s[1], &s[0] );
29651         breakupV256to32s( dV, &d[7], &d[6], &d[5], &d[4],
29652                               &d[3], &d[2], &d[1], &d[0] );
29653         for (i = 0; i < 8; i++)
29654            putYMMRegLane32(rG, i, mkexpr((imm8 & (1<<i)) ? d[i] : s[i]));
29655         *uses_vvvv = True;
29656         goto decode_success;
29657      }
29658      break;
29659
29660   case 0x04:
29661      /* VPERMILPS imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 04 /r ib */
29662      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29663         UChar  modrm = getUChar(delta);
29664         UInt   imm8  = 0;
29665         UInt   rG    = gregOfRexRM(pfx, modrm);
29666         IRTemp sV    = newTemp(Ity_V256);
29667         if (epartIsReg(modrm)) {
29668            UInt rE = eregOfRexRM(pfx, modrm);
29669            delta += 1;
29670            imm8 = getUChar(delta);
29671            DIP("vpermilps $%u,%s,%s\n",
29672                imm8, nameYMMReg(rE), nameYMMReg(rG));
29673            assign(sV, getYMMReg(rE));
29674         } else {
29675            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29676            delta += alen;
29677            imm8 = getUChar(delta);
29678            DIP("vpermilps $%u,%s,%s\n",
29679                imm8, dis_buf, nameYMMReg(rG));
29680            assign(sV, loadLE(Ity_V256, mkexpr(addr)));
29681         }
29682         delta++;
29683         IRTemp  sVhi = IRTemp_INVALID, sVlo = IRTemp_INVALID;
29684         breakupV256toV128s( sV, &sVhi, &sVlo );
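         /* The same imm8 selector is applied independently to each
            128-bit lane. */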
29685         IRTemp  dVhi = math_VPERMILPS_128( sVhi, imm8 );
29686         IRTemp  dVlo = math_VPERMILPS_128( sVlo, imm8 );
29687         IRExpr* res  = binop(Iop_V128HLtoV256, mkexpr(dVhi), mkexpr(dVlo));
29688         putYMMReg(rG, res);
29689         goto decode_success;
29690      }
29691      /* VPERMILPS imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 04 /r ib */
29692      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29693         UChar  modrm = getUChar(delta);
29694         UInt   imm8  = 0;
29695         UInt   rG    = gregOfRexRM(pfx, modrm);
29696         IRTemp sV    = newTemp(Ity_V128);
29697         if (epartIsReg(modrm)) {
29698            UInt rE = eregOfRexRM(pfx, modrm);
29699            delta += 1;
29700            imm8 = getUChar(delta);
29701            DIP("vpermilps $%u,%s,%s\n",
29702                imm8, nameXMMReg(rE), nameXMMReg(rG));
29703            assign(sV, getXMMReg(rE));
29704         } else {
29705            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29706            delta += alen;
29707            imm8 = getUChar(delta);
29708            DIP("vpermilps $%u,%s,%s\n",
29709                imm8, dis_buf, nameXMMReg(rG));
29710            assign(sV, loadLE(Ity_V128, mkexpr(addr)));
29711         }
29712         delta++;
29713         putYMMRegLoAndZU(rG, mkexpr ( math_VPERMILPS_128 ( sV, imm8 ) ) );
29714         goto decode_success;
29715      }
29716      break;
29717
29718   case 0x05:
29719      /* VPERMILPD imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG 05 /r ib */
29720      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29721         UChar  modrm = getUChar(delta);
29722         UInt   imm8  = 0;
29723         UInt   rG    = gregOfRexRM(pfx, modrm);
29724         IRTemp sV    = newTemp(Ity_V128);
29725         if (epartIsReg(modrm)) {
29726            UInt rE = eregOfRexRM(pfx, modrm);
29727            delta += 1;
29728            imm8 = getUChar(delta);
29729            DIP("vpermilpd $%u,%s,%s\n",
29730                imm8, nameXMMReg(rE), nameXMMReg(rG));
29731            assign(sV, getXMMReg(rE));
29732         } else {
29733            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29734            delta += alen;
29735            imm8 = getUChar(delta);
29736            DIP("vpermilpd $%u,%s,%s\n",
29737                imm8, dis_buf, nameXMMReg(rG));
29738            assign(sV, loadLE(Ity_V128, mkexpr(addr)));
29739         }
29740         delta++;
29741         IRTemp s1 = newTemp(Ity_I64);
29742         IRTemp s0 = newTemp(Ity_I64);
29743         assign(s1, unop(Iop_V128HIto64, mkexpr(sV)));
29744         assign(s0, unop(Iop_V128to64,   mkexpr(sV)));
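         /* imm8 bit 0 selects the source qword for the low result lane,
            bit 1 the qword for the high lane. */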
29745         IRTemp dV = newTemp(Ity_V128);
29746         assign(dV, binop(Iop_64HLtoV128,
29747                               mkexpr((imm8 & (1<<1)) ? s1 : s0),
29748                               mkexpr((imm8 & (1<<0)) ? s1 : s0)));
29749         putYMMRegLoAndZU(rG, mkexpr(dV));
29750         goto decode_success;
29751      }
29752      /* VPERMILPD imm8, ymm2/m256, ymm1 = VEX.256.66.0F3A.WIG 05 /r ib */
29753      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29754         UChar  modrm = getUChar(delta);
29755         UInt   imm8  = 0;
29756         UInt   rG    = gregOfRexRM(pfx, modrm);
29757         IRTemp sV    = newTemp(Ity_V256);
29758         if (epartIsReg(modrm)) {
29759            UInt rE = eregOfRexRM(pfx, modrm);
29760            delta += 1;
29761            imm8 = getUChar(delta);
29762            DIP("vpermilpd $%u,%s,%s\n",
29763                imm8, nameYMMReg(rE), nameYMMReg(rG));
29764            assign(sV, getYMMReg(rE));
29765         } else {
29766            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29767            delta += alen;
29768            imm8 = getUChar(delta);
29769            DIP("vpermilpd $%u,%s,%s\n",
29770                imm8, dis_buf, nameYMMReg(rG));
29771            assign(sV, loadLE(Ity_V256, mkexpr(addr)));
29772         }
29773         delta++;
29774         IRTemp s3, s2, s1, s0;
29775         s3 = s2 = s1 = s0 = IRTemp_INVALID;
29776         breakupV256to64s(sV, &s3, &s2, &s1, &s0);
29777         IRTemp dV = newTemp(Ity_V256);
29778         assign(dV, IRExpr_Qop(Iop_64x4toV256,
29779                               mkexpr((imm8 & (1<<3)) ? s3 : s2),
29780                               mkexpr((imm8 & (1<<2)) ? s3 : s2),
29781                               mkexpr((imm8 & (1<<1)) ? s1 : s0),
29782                               mkexpr((imm8 & (1<<0)) ? s1 : s0)));
29783         putYMMReg(rG, mkexpr(dV));
29784         goto decode_success;
29785      }
29786      break;
29787
29788   case 0x06:
29789      /* VPERM2F128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.66.0F3A.W0 06 /r ib */
29790      if (have66noF2noF3(pfx)
29791          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
29792         UChar  modrm = getUChar(delta);
29793         UInt   imm8  = 0;
29794         UInt   rG    = gregOfRexRM(pfx, modrm);
29795         UInt   rV    = getVexNvvvv(pfx);
29796         IRTemp s00   = newTemp(Ity_V128);
29797         IRTemp s01   = newTemp(Ity_V128);
29798         IRTemp s10   = newTemp(Ity_V128);
29799         IRTemp s11   = newTemp(Ity_V128);
29800         assign(s00, getYMMRegLane128(rV, 0));
29801         assign(s01, getYMMRegLane128(rV, 1));
29802         if (epartIsReg(modrm)) {
29803            UInt rE = eregOfRexRM(pfx, modrm);
29804            delta += 1;
29805            imm8 = getUChar(delta);
29806            DIP("vperm2f128 $%u,%s,%s,%s\n",
29807                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
29808            assign(s10, getYMMRegLane128(rE, 0));
29809            assign(s11, getYMMRegLane128(rE, 1));
29810         } else {
29811            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29812            delta += alen;
29813            imm8 = getUChar(delta);
29814            DIP("vperm2f128 $%u,%s,%s,%s\n",
29815                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
29816            assign(s10, loadLE(Ity_V128, binop(Iop_Add64,
29817                                               mkexpr(addr), mkU64(0))));
29818            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
29819                                               mkexpr(addr), mkU64(16))));
29820         }
29821         delta++;
29822#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
29823                                           : ((_nn)==2) ? s10 : s11)
29824         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
29825         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
29826#        undef SEL
29827         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
29828         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
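         /* Example: imm8 == 0x20 puts V's low lane in the result's low
            lane and E's low lane in the high lane, i.e. it concatenates
            the low halves of the two sources. */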
29829         *uses_vvvv = True;
29830         goto decode_success;
29831      }
29832      break;
29833
29834   case 0x08:
29835      /* VROUNDPS imm8, xmm2/m128, xmm1 */
29836      /* VROUNDPS = VEX.NDS.128.66.0F3A.WIG 08 ib */
29837      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29838         UChar  modrm = getUChar(delta);
29839         UInt   rG    = gregOfRexRM(pfx, modrm);
29840         IRTemp src   = newTemp(Ity_V128);
29841         IRTemp s0    = IRTemp_INVALID;
29842         IRTemp s1    = IRTemp_INVALID;
29843         IRTemp s2    = IRTemp_INVALID;
29844         IRTemp s3    = IRTemp_INVALID;
29845         IRTemp rm    = newTemp(Ity_I32);
29846         Int    imm   = 0;
29847
29848         modrm = getUChar(delta);
29849
29850         if (epartIsReg(modrm)) {
29851            UInt rE = eregOfRexRM(pfx, modrm);
29852            assign( src, getXMMReg( rE ) );
29853            imm = getUChar(delta+1);
29854            if (imm & ~15) break;
29855            delta += 1+1;
29856            DIP( "vroundps $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
29857         } else {
29858            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29859            assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
29860            imm = getUChar(delta+alen);
29861            if (imm & ~15) break;
29862            delta += alen+1;
29863            DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
29864         }
29865
29866         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
29867            that encoding is the same as the encoding for IRRoundingMode,
29868            we can use that value directly in the IR as a rounding
29869            mode. */
29870         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
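         /* That encoding is: 0 = to nearest (even), 1 = towards -inf,
            2 = towards +inf, 3 = towards zero; bit 2 of imm selects the
            dynamic (MXCSR) rounding mode instead. */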
29871
29872         breakupV128to32s( src, &s3, &s2, &s1, &s0 );
29873         putYMMRegLane128( rG, 1, mkV128(0) );
29874#        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
29875                             unop(Iop_ReinterpI32asF32, mkexpr(s)))
29876         putYMMRegLane32F( rG, 3, CVT(s3) );
29877         putYMMRegLane32F( rG, 2, CVT(s2) );
29878         putYMMRegLane32F( rG, 1, CVT(s1) );
29879         putYMMRegLane32F( rG, 0, CVT(s0) );
29880#        undef CVT
29881         goto decode_success;
29882      }
29883      /* VROUNDPS imm8, ymm2/m256, ymm1 */
29884      /* VROUNDPS = VEX.NDS.256.66.0F3A.WIG 08 ib */
29885      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29886         UChar  modrm = getUChar(delta);
29887         UInt   rG    = gregOfRexRM(pfx, modrm);
29888         IRTemp src   = newTemp(Ity_V256);
29889         IRTemp s0    = IRTemp_INVALID;
29890         IRTemp s1    = IRTemp_INVALID;
29891         IRTemp s2    = IRTemp_INVALID;
29892         IRTemp s3    = IRTemp_INVALID;
29893         IRTemp s4    = IRTemp_INVALID;
29894         IRTemp s5    = IRTemp_INVALID;
29895         IRTemp s6    = IRTemp_INVALID;
29896         IRTemp s7    = IRTemp_INVALID;
29897         IRTemp rm    = newTemp(Ity_I32);
29898         Int    imm   = 0;
29899
29900         modrm = getUChar(delta);
29901
29902         if (epartIsReg(modrm)) {
29903            UInt rE = eregOfRexRM(pfx, modrm);
29904            assign( src, getYMMReg( rE ) );
29905            imm = getUChar(delta+1);
29906            if (imm & ~15) break;
29907            delta += 1+1;
29908            DIP( "vroundps $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
29909         } else {
29910            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29911            assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
29912            imm = getUChar(delta+alen);
29913            if (imm & ~15) break;
29914            delta += alen+1;
29915            DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
29916         }
29917
29918         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
29919            that encoding is the same as the encoding for IRRoundingMode,
29920            we can use that value directly in the IR as a rounding
29921            mode. */
29922         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
29923
29924         breakupV256to32s( src, &s7, &s6, &s5, &s4, &s3, &s2, &s1, &s0 );
29925#        define CVT(s) binop(Iop_RoundF32toInt, mkexpr(rm), \
29926                             unop(Iop_ReinterpI32asF32, mkexpr(s)))
29927         putYMMRegLane32F( rG, 7, CVT(s7) );
29928         putYMMRegLane32F( rG, 6, CVT(s6) );
29929         putYMMRegLane32F( rG, 5, CVT(s5) );
29930         putYMMRegLane32F( rG, 4, CVT(s4) );
29931         putYMMRegLane32F( rG, 3, CVT(s3) );
29932         putYMMRegLane32F( rG, 2, CVT(s2) );
29933         putYMMRegLane32F( rG, 1, CVT(s1) );
29934         putYMMRegLane32F( rG, 0, CVT(s0) );
29935#        undef CVT
29936         goto decode_success;
29937      }
29938
29939   case 0x09:
29940      /* VROUNDPD imm8, xmm2/m128, xmm1 */
29941      /* VROUNDPD = VEX.NDS.128.66.0F3A.WIG 09 ib */
29942      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
29943         UChar  modrm = getUChar(delta);
29944         UInt   rG    = gregOfRexRM(pfx, modrm);
29945         IRTemp src   = newTemp(Ity_V128);
29946         IRTemp s0    = IRTemp_INVALID;
29947         IRTemp s1    = IRTemp_INVALID;
29948         IRTemp rm    = newTemp(Ity_I32);
29949         Int    imm   = 0;
29950
29951         modrm = getUChar(delta);
29952
29953         if (epartIsReg(modrm)) {
29954            UInt rE = eregOfRexRM(pfx, modrm);
29955            assign( src, getXMMReg( rE ) );
29956            imm = getUChar(delta+1);
29957            if (imm & ~15) break;
29958            delta += 1+1;
29959            DIP( "vroundpd $%d,%s,%s\n", imm, nameXMMReg(rE), nameXMMReg(rG) );
29960         } else {
29961            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
29962            assign( src, loadLE(Ity_V128, mkexpr(addr) ) );
29963            imm = getUChar(delta+alen);
29964            if (imm & ~15) break;
29965            delta += alen+1;
29966            DIP( "vroundpd $%d,%s,%s\n", imm, dis_buf, nameXMMReg(rG) );
29967         }
29968
29969         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
29970            that encoding is the same as the encoding for IRRoundingMode,
29971            we can use that value directly in the IR as a rounding
29972            mode. */
29973         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
29974
29975         breakupV128to64s( src, &s1, &s0 );
29976         putYMMRegLane128( rG, 1, mkV128(0) );
29977#        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
29978                             unop(Iop_ReinterpI64asF64, mkexpr(s)))
29979         putYMMRegLane64F( rG, 1, CVT(s1) );
29980         putYMMRegLane64F( rG, 0, CVT(s0) );
29981#        undef CVT
29982         goto decode_success;
29983      }
29984      /* VROUNDPD imm8, ymm2/m256, ymm1 */
29985      /* VROUNDPD = VEX.NDS.256.66.0F3A.WIG 09 ib */
29986      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
29987         UChar  modrm = getUChar(delta);
29988         UInt   rG    = gregOfRexRM(pfx, modrm);
29989         IRTemp src   = newTemp(Ity_V256);
29990         IRTemp s0    = IRTemp_INVALID;
29991         IRTemp s1    = IRTemp_INVALID;
29992         IRTemp s2    = IRTemp_INVALID;
29993         IRTemp s3    = IRTemp_INVALID;
29994         IRTemp rm    = newTemp(Ity_I32);
29995         Int    imm   = 0;
29996
29997         modrm = getUChar(delta);
29998
29999         if (epartIsReg(modrm)) {
30000            UInt rE = eregOfRexRM(pfx, modrm);
30001            assign( src, getYMMReg( rE ) );
30002            imm = getUChar(delta+1);
30003            if (imm & ~15) break;
30004            delta += 1+1;
30005            DIP( "vroundpd $%d,%s,%s\n", imm, nameYMMReg(rE), nameYMMReg(rG) );
30006         } else {
30007            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30008            assign( src, loadLE(Ity_V256, mkexpr(addr) ) );
30009            imm = getUChar(delta+alen);
30010            if (imm & ~15) break;
30011            delta += alen+1;
30012            DIP( "vroundps $%d,%s,%s\n", imm, dis_buf, nameYMMReg(rG) );
30013         }
30014
30015         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
30016            that encoding is the same as the encoding for IRRoundingMode,
30017            we can use that value directly in the IR as a rounding
30018            mode. */
30019         assign(rm, (imm & 4) ? get_sse_roundingmode() : mkU32(imm & 3));
30020
30021         breakupV256to64s( src, &s3, &s2, &s1, &s0 );
30022#        define CVT(s) binop(Iop_RoundF64toInt, mkexpr(rm), \
30023                             unop(Iop_ReinterpI64asF64, mkexpr(s)))
30024         putYMMRegLane64F( rG, 3, CVT(s3) );
30025         putYMMRegLane64F( rG, 2, CVT(s2) );
30026         putYMMRegLane64F( rG, 1, CVT(s1) );
30027         putYMMRegLane64F( rG, 0, CVT(s0) );
30028#        undef CVT
30029         goto decode_success;
30030      }
30031
30032   case 0x0A:
30033   case 0x0B:
30034      /* VROUNDSS imm8, xmm3/m32, xmm2, xmm1 */
30035      /* VROUNDSS = VEX.NDS.128.66.0F3A.WIG 0A ib */
30036      /* VROUNDSD imm8, xmm3/m64, xmm2, xmm1 */
30037      /* VROUNDSD = VEX.NDS.128.66.0F3A.WIG 0B ib */
30038      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30039         UChar  modrm = getUChar(delta);
30040         UInt   rG    = gregOfRexRM(pfx, modrm);
30041         UInt   rV    = getVexNvvvv(pfx);
30042         Bool   isD   = opc == 0x0B;
30043         IRTemp src   = newTemp(isD ? Ity_F64 : Ity_F32);
30044         IRTemp res   = newTemp(isD ? Ity_F64 : Ity_F32);
30045         Int    imm   = 0;
30046
30047         if (epartIsReg(modrm)) {
30048            UInt rE = eregOfRexRM(pfx, modrm);
30049            assign( src,
30050                    isD ? getXMMRegLane64F(rE, 0) : getXMMRegLane32F(rE, 0) );
30051            imm = getUChar(delta+1);
30052            if (imm & ~15) break;
30053            delta += 1+1;
30054            DIP( "vrounds%c $%d,%s,%s,%s\n",
30055                 isD ? 'd' : 's',
30056                 imm, nameXMMReg( rE ), nameXMMReg( rV ), nameXMMReg( rG ) );
30057         } else {
30058            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30059            assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
30060            imm = getUChar(delta+alen);
30061            if (imm & ~15) break;
30062            delta += alen+1;
30063            DIP( "vrounds%c $%d,%s,%s,%s\n",
30064                 isD ? 'd' : 's',
30065                 imm, dis_buf, nameXMMReg( rV ), nameXMMReg( rG ) );
30066         }
30067
30068         /* (imm & 3) contains an Intel-encoded rounding mode.  Because
30069            that encoding is the same as the encoding for IRRoundingMode,
30070            we can use that value directly in the IR as a rounding
30071            mode. */
30072         assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
30073                           (imm & 4) ? get_sse_roundingmode()
30074                                     : mkU32(imm & 3),
30075                           mkexpr(src)) );
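         /* The untouched lanes of the low 128 bits come from the first
            source (vvvv); the upper half of the YMM register is zeroed,
            as for any VEX.128-encoded operation. */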
30076
30077         if (isD)
30078            putXMMRegLane64F( rG, 0, mkexpr(res) );
30079         else {
30080            putXMMRegLane32F( rG, 0, mkexpr(res) );
30081            putXMMRegLane32F( rG, 1, getXMMRegLane32F( rV, 1 ) );
30082         }
30083         putXMMRegLane64F( rG, 1, getXMMRegLane64F( rV, 1 ) );
30084         putYMMRegLane128( rG, 1, mkV128(0) );
30085         *uses_vvvv = True;
30086         goto decode_success;
30087      }
30088      break;
30089
30090   case 0x0C:
30091      /* VBLENDPS imm8, ymm3/m256, ymm2, ymm1 */
30092      /* VBLENDPS = VEX.NDS.256.66.0F3A.WIG 0C /r ib */
30093      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30094         UChar  modrm = getUChar(delta);
30095         UInt   imm8;
30096         UInt   rG    = gregOfRexRM(pfx, modrm);
30097         UInt   rV    = getVexNvvvv(pfx);
30098         IRTemp sV    = newTemp(Ity_V256);
30099         IRTemp sE    = newTemp(Ity_V256);
30100         assign ( sV, getYMMReg(rV) );
30101         if (epartIsReg(modrm)) {
30102            UInt rE = eregOfRexRM(pfx, modrm);
30103            delta += 1;
30104            imm8 = getUChar(delta);
30105            DIP("vblendps $%u,%s,%s,%s\n",
30106                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30107            assign(sE, getYMMReg(rE));
30108         } else {
30109            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30110            delta += alen;
30111            imm8 = getUChar(delta);
30112            DIP("vblendps $%u,%s,%s,%s\n",
30113                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30114            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
30115         }
30116         delta++;
30117         putYMMReg( rG,
30118                    mkexpr( math_BLENDPS_256( sE, sV, imm8) ) );
30119         *uses_vvvv = True;
30120         goto decode_success;
30121      }
30122      /* VBLENDPS imm8, xmm3/m128, xmm2, xmm1 */
30123      /* VBLENDPS = VEX.NDS.128.66.0F3A.WIG 0C /r ib */
30124      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30125         UChar  modrm = getUChar(delta);
30126         UInt   imm8;
30127         UInt   rG    = gregOfRexRM(pfx, modrm);
30128         UInt   rV    = getVexNvvvv(pfx);
30129         IRTemp sV    = newTemp(Ity_V128);
30130         IRTemp sE    = newTemp(Ity_V128);
30131         assign ( sV, getXMMReg(rV) );
30132         if (epartIsReg(modrm)) {
30133            UInt rE = eregOfRexRM(pfx, modrm);
30134            delta += 1;
30135            imm8 = getUChar(delta);
30136            DIP("vblendps $%u,%s,%s,%s\n",
30137                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
30138            assign(sE, getXMMReg(rE));
30139         } else {
30140            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30141            delta += alen;
30142            imm8 = getUChar(delta);
30143            DIP("vblendps $%u,%s,%s,%s\n",
30144                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
30145            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
30146         }
30147         delta++;
30148         putYMMRegLoAndZU( rG,
30149                           mkexpr( math_BLENDPS_128( sE, sV, imm8) ) );
30150         *uses_vvvv = True;
30151         goto decode_success;
30152      }
30153      break;
30154
30155   case 0x0D:
30156      /* VBLENDPD imm8, ymm3/m256, ymm2, ymm1 */
30157      /* VBLENDPD = VEX.NDS.256.66.0F3A.WIG 0D /r ib */
30158      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30159         UChar  modrm = getUChar(delta);
30160         UInt   imm8;
30161         UInt   rG    = gregOfRexRM(pfx, modrm);
30162         UInt   rV    = getVexNvvvv(pfx);
30163         IRTemp sV    = newTemp(Ity_V256);
30164         IRTemp sE    = newTemp(Ity_V256);
30165         assign ( sV, getYMMReg(rV) );
30166         if (epartIsReg(modrm)) {
30167            UInt rE = eregOfRexRM(pfx, modrm);
30168            delta += 1;
30169            imm8 = getUChar(delta);
30170            DIP("vblendpd $%u,%s,%s,%s\n",
30171                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30172            assign(sE, getYMMReg(rE));
30173         } else {
30174            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30175            delta += alen;
30176            imm8 = getUChar(delta);
30177            DIP("vblendpd $%u,%s,%s,%s\n",
30178                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30179            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
30180         }
30181         delta++;
30182         putYMMReg( rG,
30183                    mkexpr( math_BLENDPD_256( sE, sV, imm8) ) );
30184         *uses_vvvv = True;
30185         goto decode_success;
30186      }
30187      /* VBLENDPD imm8, xmm3/m128, xmm2, xmm1 */
30188      /* VBLENDPD = VEX.NDS.128.66.0F3A.WIG 0D /r ib */
30189      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30190         UChar  modrm = getUChar(delta);
30191         UInt   imm8;
30192         UInt   rG    = gregOfRexRM(pfx, modrm);
30193         UInt   rV    = getVexNvvvv(pfx);
30194         IRTemp sV    = newTemp(Ity_V128);
30195         IRTemp sE    = newTemp(Ity_V128);
30196         assign ( sV, getXMMReg(rV) );
30197         if (epartIsReg(modrm)) {
30198            UInt rE = eregOfRexRM(pfx, modrm);
30199            delta += 1;
30200            imm8 = getUChar(delta);
30201            DIP("vblendpd $%u,%s,%s,%s\n",
30202                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
30203            assign(sE, getXMMReg(rE));
30204         } else {
30205            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30206            delta += alen;
30207            imm8 = getUChar(delta);
30208            DIP("vblendpd $%u,%s,%s,%s\n",
30209                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
30210            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
30211         }
30212         delta++;
30213         putYMMRegLoAndZU( rG,
30214                           mkexpr( math_BLENDPD_128( sE, sV, imm8) ) );
30215         *uses_vvvv = True;
30216         goto decode_success;
30217      }
30218      break;
30219
30220   case 0x0E:
30221      /* VPBLENDW imm8, xmm3/m128, xmm2, xmm1 */
30222      /* VPBLENDW = VEX.NDS.128.66.0F3A.WIG 0E /r ib */
30223      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30224         UChar  modrm = getUChar(delta);
30225         UInt   imm8;
30226         UInt   rG    = gregOfRexRM(pfx, modrm);
30227         UInt   rV    = getVexNvvvv(pfx);
30228         IRTemp sV    = newTemp(Ity_V128);
30229         IRTemp sE    = newTemp(Ity_V128);
30230         assign ( sV, getXMMReg(rV) );
30231         if (epartIsReg(modrm)) {
30232            UInt rE = eregOfRexRM(pfx, modrm);
30233            delta += 1;
30234            imm8 = getUChar(delta);
30235            DIP("vpblendw $%u,%s,%s,%s\n",
30236                imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG));
30237            assign(sE, getXMMReg(rE));
30238         } else {
30239            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30240            delta += alen;
30241            imm8 = getUChar(delta);
30242            DIP("vpblendw $%u,%s,%s,%s\n",
30243                imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG));
30244            assign(sE, loadLE(Ity_V128, mkexpr(addr)));
30245         }
30246         delta++;
30247         putYMMRegLoAndZU( rG,
30248                           mkexpr( math_PBLENDW_128( sE, sV, imm8) ) );
30249         *uses_vvvv = True;
30250         goto decode_success;
30251      }
30252      /* VPBLENDW imm8, ymm3/m256, ymm2, ymm1 */
30253      /* VPBLENDW = VEX.NDS.256.66.0F3A.WIG 0E /r ib */
30254      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30255         UChar  modrm = getUChar(delta);
30256         UInt   imm8;
30257         UInt   rG    = gregOfRexRM(pfx, modrm);
30258         UInt   rV    = getVexNvvvv(pfx);
30259         IRTemp sV    = newTemp(Ity_V256);
30260         IRTemp sE    = newTemp(Ity_V256);
30261         IRTemp sVhi, sVlo, sEhi, sElo;
30262         sVhi = sVlo = sEhi = sElo = IRTemp_INVALID;
30263         assign ( sV, getYMMReg(rV) );
30264         if (epartIsReg(modrm)) {
30265            UInt rE = eregOfRexRM(pfx, modrm);
30266            delta += 1;
30267            imm8 = getUChar(delta);
30268            DIP("vpblendw $%u,%s,%s,%s\n",
30269                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30270            assign(sE, getYMMReg(rE));
30271         } else {
30272            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30273            delta += alen;
30274            imm8 = getUChar(delta);
30275            DIP("vpblendw $%u,%s,%s,%s\n",
30276                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30277            assign(sE, loadLE(Ity_V256, mkexpr(addr)));
30278         }
30279         delta++;
30280         breakupV256toV128s( sV, &sVhi, &sVlo );
30281         breakupV256toV128s( sE, &sEhi, &sElo );
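         /* The same 8-bit mask drives both 128-bit lanes, since imm8
            only covers 8 of the 16 word positions. */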
30282         putYMMReg( rG, binop( Iop_V128HLtoV256,
30283                               mkexpr( math_PBLENDW_128( sEhi, sVhi, imm8) ),
30284                               mkexpr( math_PBLENDW_128( sElo, sVlo, imm8) ) ) );
30285         *uses_vvvv = True;
30286         goto decode_success;
30287      }
30288      break;
30289
30290   case 0x0F:
30291      /* VPALIGNR imm8, xmm3/m128, xmm2, xmm1 */
30292      /* VPALIGNR = VEX.NDS.128.66.0F3A.WIG 0F /r ib */
30293      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30294         UChar  modrm = getUChar(delta);
30295         UInt   rG    = gregOfRexRM(pfx, modrm);
30296         UInt   rV    = getVexNvvvv(pfx);
30297         IRTemp sV    = newTemp(Ity_V128);
30298         IRTemp dV    = newTemp(Ity_V128);
30299         UInt   imm8;
30300
30301         assign( dV, getXMMReg(rV) );
30302
30303         if ( epartIsReg( modrm ) ) {
30304            UInt   rE = eregOfRexRM(pfx, modrm);
30305            assign( sV, getXMMReg(rE) );
30306            imm8 = getUChar(delta+1);
30307            delta += 1+1;
30308            DIP("vpalignr $%d,%s,%s,%s\n", imm8, nameXMMReg(rE),
30309                                           nameXMMReg(rV), nameXMMReg(rG));
30310         } else {
30311            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30312            assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
30313            imm8 = getUChar(delta+alen);
30314            delta += alen+1;
30315            DIP("vpalignr $%d,%s,%s,%s\n", imm8, dis_buf,
30316                                           nameXMMReg(rV), nameXMMReg(rG));
30317         }
30318
30319         IRTemp res = math_PALIGNR_XMM( sV, dV, imm8 );
30320         putYMMRegLoAndZU( rG, mkexpr(res) );
30321         *uses_vvvv = True;
30322         goto decode_success;
30323      }
30324      /* VPALIGNR imm8, ymm3/m256, ymm2, ymm1 */
30325      /* VPALIGNR = VEX.NDS.256.66.0F3A.WIG 0F /r ib */
30326      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30327         UChar  modrm = getUChar(delta);
30328         UInt   rG    = gregOfRexRM(pfx, modrm);
30329         UInt   rV    = getVexNvvvv(pfx);
30330         IRTemp sV    = newTemp(Ity_V256);
30331         IRTemp dV    = newTemp(Ity_V256);
30332         IRTemp sHi, sLo, dHi, dLo;
30333         sHi = sLo = dHi = dLo = IRTemp_INVALID;
30334         UInt   imm8;
30335
30336         assign( dV, getYMMReg(rV) );
30337
30338         if ( epartIsReg( modrm ) ) {
30339            UInt   rE = eregOfRexRM(pfx, modrm);
30340            assign( sV, getYMMReg(rE) );
30341            imm8 = getUChar(delta+1);
30342            delta += 1+1;
30343            DIP("vpalignr $%d,%s,%s,%s\n", imm8, nameYMMReg(rE),
30344                                           nameYMMReg(rV), nameYMMReg(rG));
30345         } else {
30346            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30347            assign( sV, loadLE(Ity_V256, mkexpr(addr)) );
30348            imm8 = getUChar(delta+alen);
30349            delta += alen+1;
30350            DIP("vpalignr $%d,%s,%s,%s\n", imm8, dis_buf,
30351                                           nameYMMReg(rV), nameYMMReg(rG));
30352         }
30353
30354         breakupV256toV128s( dV, &dHi, &dLo );
30355         breakupV256toV128s( sV, &sHi, &sLo );
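         /* AVX2 VPALIGNR concatenates and shifts within each 128-bit
            lane independently, hence the two separate PALIGNR ops. */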
30356         putYMMReg( rG, binop( Iop_V128HLtoV256,
30357                               mkexpr( math_PALIGNR_XMM( sHi, dHi, imm8 ) ),
30358                               mkexpr( math_PALIGNR_XMM( sLo, dLo, imm8 ) ) )
30359                    );
30360         *uses_vvvv = True;
30361         goto decode_success;
30362      }
30363      break;
30364
30365   case 0x14:
30366      /* VPEXTRB imm8, xmm2, reg/m8 = VEX.128.66.0F3A.W0 14 /r ib */
30367      if (have66noF2noF3(pfx)
30368          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
30369         delta = dis_PEXTRB_128_GtoE( vbi, pfx, delta, False/*!isAvx*/ );
30370         goto decode_success;
30371      }
30372      break;
30373
30374   case 0x15:
30375      /* VPEXTRW imm8, reg/m16, xmm2 */
30376      /* VPEXTRW = VEX.128.66.0F3A.W0 15 /r ib */
30377      if (have66noF2noF3(pfx)
30378          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
30379         delta = dis_PEXTRW( vbi, pfx, delta, True/*isAvx*/ );
30380         goto decode_success;
30381      }
30382      break;
30383
30384   case 0x16:
30385      /* VPEXTRD imm8, r32/m32, xmm2 */
30386      /* VPEXTRD = VEX.128.66.0F3A.W0 16 /r ib */
30387      if (have66noF2noF3(pfx)
30388          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
30389         delta = dis_PEXTRD( vbi, pfx, delta, True/*isAvx*/ );
30390         goto decode_success;
30391      }
30392      /* VPEXTRQ = VEX.128.66.0F3A.W1 16 /r ib */
30393      if (have66noF2noF3(pfx)
30394          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
30395         delta = dis_PEXTRQ( vbi, pfx, delta, True/*isAvx*/ );
30396         goto decode_success;
30397      }
30398      break;
30399
30400   case 0x17:
30401      /* VEXTRACTPS imm8, xmm1, r32/m32 = VEX.128.66.0F3A.WIG 17 /r ib */
30402      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30403         delta = dis_EXTRACTPS( vbi, pfx, delta, True/*isAvx*/ );
30404         goto decode_success;
30405      }
30406      break;
30407
30408   case 0x18:
30409      /* VINSERTF128 r/m, rV, rD
30410         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
30411      /* VINSERTF128 = VEX.NDS.256.66.0F3A.W0 18 /r ib */
30412      if (have66noF2noF3(pfx)
30413          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30414         UChar  modrm = getUChar(delta);
30415         UInt   ib    = 0;
30416         UInt   rG    = gregOfRexRM(pfx, modrm);
30417         UInt   rV    = getVexNvvvv(pfx);
30418         IRTemp t128  = newTemp(Ity_V128);
30419         if (epartIsReg(modrm)) {
30420            UInt rE = eregOfRexRM(pfx, modrm);
30421            delta += 1;
30422            assign(t128, getXMMReg(rE));
30423            ib = getUChar(delta);
30424            DIP("vinsertf128 $%u,%s,%s,%s\n",
30425                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30426         } else {
30427            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30428            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
30429            delta += alen;
30430            ib = getUChar(delta);
30431            DIP("vinsertf128 $%u,%s,%s,%s\n",
30432                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30433         }
30434         delta++;
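         /* Copy both lanes of rV, then overwrite the lane selected by
            the low bit of the immediate with the new 128-bit value. */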
30435         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
30436         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
30437         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
30438         *uses_vvvv = True;
30439         goto decode_success;
30440      }
30441      break;
30442
30443   case 0x19:
      /* VEXTRACTF128 $lane_no, rS, r/m
         ::: r/m:V128 = a lane of rS:V256 (RM format) */
      /* VEXTRACTF128 = VEX.256.66.0F3A.W0 19 /r ib */
30447      if (have66noF2noF3(pfx)
30448          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30449         UChar  modrm = getUChar(delta);
30450         UInt   ib    = 0;
30451         UInt   rS    = gregOfRexRM(pfx, modrm);
30452         IRTemp t128  = newTemp(Ity_V128);
30453         if (epartIsReg(modrm)) {
30454            UInt rD = eregOfRexRM(pfx, modrm);
30455            delta += 1;
30456            ib = getUChar(delta);
30457            assign(t128, getYMMRegLane128(rS, ib & 1));
30458            putYMMRegLoAndZU(rD, mkexpr(t128));
30459            DIP("vextractf128 $%u,%s,%s\n",
30460                ib, nameXMMReg(rS), nameYMMReg(rD));
30461         } else {
30462            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30463            delta += alen;
30464            ib = getUChar(delta);
30465            assign(t128, getYMMRegLane128(rS, ib & 1));
30466            storeLE(mkexpr(addr), mkexpr(t128));
30467            DIP("vextractf128 $%u,%s,%s\n",
30468                ib, nameYMMReg(rS), dis_buf);
30469         }
30470         delta++;
30471         /* doesn't use vvvv */
30472         goto decode_success;
30473      }
30474      break;
30475
30476   case 0x20:
30477      /* VPINSRB r32/m8, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 20 /r ib */
30478      if (have66noF2noF3(pfx)
30479          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
30480         UChar  modrm  = getUChar(delta);
30481         UInt   rG     = gregOfRexRM(pfx, modrm);
30482         UInt   rV     = getVexNvvvv(pfx);
30483         Int    imm8;
30484         IRTemp src_u8 = newTemp(Ity_I8);
30485
30486         if ( epartIsReg( modrm ) ) {
30487            UInt rE = eregOfRexRM(pfx,modrm);
30488            imm8 = (Int)(getUChar(delta+1) & 15);
30489            assign( src_u8, unop(Iop_32to8, getIReg32( rE )) );
30490            delta += 1+1;
30491            DIP( "vpinsrb $%d,%s,%s,%s\n",
30492                 imm8, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
30493         } else {
30494            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30495            imm8 = (Int)(getUChar(delta+alen) & 15);
30496            assign( src_u8, loadLE( Ity_I8, mkexpr(addr) ) );
30497            delta += alen+1;
30498            DIP( "vpinsrb $%d,%s,%s,%s\n",
30499                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
30500         }
30501
30502         IRTemp src_vec = newTemp(Ity_V128);
30503         assign(src_vec, getXMMReg( rV ));
30504         IRTemp res_vec = math_PINSRB_128( src_vec, src_u8, imm8 );
30505         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
30506         *uses_vvvv = True;
30507         goto decode_success;
30508      }
30509      break;
30510
30511   case 0x21:
30512      /* VINSERTPS imm8, xmm3/m32, xmm2, xmm1
30513         = VEX.NDS.128.66.0F3A.WIG 21 /r ib */
30514      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30515         UChar  modrm = getUChar(delta);
30516         UInt   rG    = gregOfRexRM(pfx, modrm);
30517         UInt   rV    = getVexNvvvv(pfx);
30518         UInt   imm8;
30519         IRTemp d2ins = newTemp(Ity_I32); /* comes from the E part */
30520         const IRTemp inval = IRTemp_INVALID;
30521
30522         if ( epartIsReg( modrm ) ) {
30523            UInt   rE = eregOfRexRM(pfx, modrm);
30524            IRTemp vE = newTemp(Ity_V128);
30525            assign( vE, getXMMReg(rE) );
30526            IRTemp dsE[4] = { inval, inval, inval, inval };
30527            breakupV128to32s( vE, &dsE[3], &dsE[2], &dsE[1], &dsE[0] );
30528            imm8 = getUChar(delta+1);
30529            d2ins = dsE[(imm8 >> 6) & 3]; /* "imm8_count_s" */
30530            delta += 1+1;
30531            DIP( "insertps $%u, %s,%s\n",
30532                 imm8, nameXMMReg(rE), nameXMMReg(rG) );
30533         } else {
30534            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30535            assign( d2ins, loadLE( Ity_I32, mkexpr(addr) ) );
30536            imm8 = getUChar(delta+alen);
30537            delta += alen+1;
30538            DIP( "insertps $%u, %s,%s\n",
30539                 imm8, dis_buf, nameXMMReg(rG) );
30540         }
30541
30542         IRTemp vV = newTemp(Ity_V128);
30543         assign( vV, getXMMReg(rV) );
30544
30545         putYMMRegLoAndZU( rG, mkexpr(math_INSERTPS( vV, d2ins, imm8 )) );
30546         *uses_vvvv = True;
30547         goto decode_success;
30548      }
30549      break;
30550
30551   case 0x22:
30552      /* VPINSRD r32/m32, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W0 22 /r ib */
30553      if (have66noF2noF3(pfx)
30554          && 0==getVexL(pfx)/*128*/ && 0==getRexW(pfx)/*W0*/) {
30555         UChar  modrm = getUChar(delta);
30556         UInt   rG    = gregOfRexRM(pfx, modrm);
30557         UInt   rV    = getVexNvvvv(pfx);
30558         Int    imm8_10;
30559         IRTemp src_u32 = newTemp(Ity_I32);
30560
30561         if ( epartIsReg( modrm ) ) {
30562            UInt rE = eregOfRexRM(pfx,modrm);
30563            imm8_10 = (Int)(getUChar(delta+1) & 3);
30564            assign( src_u32, getIReg32( rE ) );
30565            delta += 1+1;
30566            DIP( "vpinsrd $%d,%s,%s,%s\n",
30567                 imm8_10, nameIReg32(rE), nameXMMReg(rV), nameXMMReg(rG) );
30568         } else {
30569            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30570            imm8_10 = (Int)(getUChar(delta+alen) & 3);
30571            assign( src_u32, loadLE( Ity_I32, mkexpr(addr) ) );
30572            delta += alen+1;
30573            DIP( "vpinsrd $%d,%s,%s,%s\n",
30574                 imm8_10, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
30575         }
30576
30577         IRTemp src_vec = newTemp(Ity_V128);
30578         assign(src_vec, getXMMReg( rV ));
30579         IRTemp res_vec = math_PINSRD_128( src_vec, src_u32, imm8_10 );
30580         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
30581         *uses_vvvv = True;
30582         goto decode_success;
30583      }
30584      /* VPINSRQ r64/m64, xmm2, xmm1 = VEX.NDS.128.66.0F3A.W1 22 /r ib */
30585      if (have66noF2noF3(pfx)
30586          && 0==getVexL(pfx)/*128*/ && 1==getRexW(pfx)/*W1*/) {
30587         UChar  modrm = getUChar(delta);
30588         UInt   rG    = gregOfRexRM(pfx, modrm);
30589         UInt   rV    = getVexNvvvv(pfx);
30590         Int    imm8_0;
30591         IRTemp src_u64 = newTemp(Ity_I64);
30592
30593         if ( epartIsReg( modrm ) ) {
30594            UInt rE = eregOfRexRM(pfx,modrm);
30595            imm8_0 = (Int)(getUChar(delta+1) & 1);
30596            assign( src_u64, getIReg64( rE ) );
30597            delta += 1+1;
30598            DIP( "vpinsrq $%d,%s,%s,%s\n",
30599                 imm8_0, nameIReg64(rE), nameXMMReg(rV), nameXMMReg(rG) );
30600         } else {
30601            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30602            imm8_0 = (Int)(getUChar(delta+alen) & 1);
30603            assign( src_u64, loadLE( Ity_I64, mkexpr(addr) ) );
30604            delta += alen+1;
30605            DIP( "vpinsrd $%d,%s,%s,%s\n",
30606                 imm8_0, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
30607         }
30608
30609         IRTemp src_vec = newTemp(Ity_V128);
30610         assign(src_vec, getXMMReg( rV ));
30611         IRTemp res_vec = math_PINSRQ_128( src_vec, src_u64, imm8_0 );
30612         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
30613         *uses_vvvv = True;
30614         goto decode_success;
30615      }
30616      break;
30617
30618   case 0x38:
30619      /* VINSERTI128 r/m, rV, rD
30620         ::: rD = insertinto(a lane in rV, 128 bits from r/m) */
30621      /* VINSERTI128 = VEX.NDS.256.66.0F3A.W0 38 /r ib */
30622      if (have66noF2noF3(pfx)
30623          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30624         UChar  modrm = getUChar(delta);
30625         UInt   ib    = 0;
30626         UInt   rG    = gregOfRexRM(pfx, modrm);
30627         UInt   rV    = getVexNvvvv(pfx);
30628         IRTemp t128  = newTemp(Ity_V128);
30629         if (epartIsReg(modrm)) {
30630            UInt rE = eregOfRexRM(pfx, modrm);
30631            delta += 1;
30632            assign(t128, getXMMReg(rE));
30633            ib = getUChar(delta);
30634            DIP("vinserti128 $%u,%s,%s,%s\n",
30635                ib, nameXMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30636         } else {
30637            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30638            assign(t128, loadLE(Ity_V128, mkexpr(addr)));
30639            delta += alen;
30640            ib = getUChar(delta);
30641            DIP("vinserti128 $%u,%s,%s,%s\n",
30642                ib, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
30643         }
30644         delta++;
30645         putYMMRegLane128(rG, 0,   getYMMRegLane128(rV, 0));
30646         putYMMRegLane128(rG, 1,   getYMMRegLane128(rV, 1));
30647         putYMMRegLane128(rG, ib & 1, mkexpr(t128));
30648         *uses_vvvv = True;
30649         goto decode_success;
30650      }
30651      break;
30652
30653   case 0x39:
30654      /* VEXTRACTI128 $lane_no, rS, r/m
30655         ::: r/m:V128 = a lane of rS:V256 (RM format) */
30656      /* VEXTRACTI128 = VEX.256.66.0F3A.W0 39 /r ib */
30657      if (have66noF2noF3(pfx)
30658          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30659         UChar  modrm = getUChar(delta);
30660         UInt   ib    = 0;
30661         UInt   rS    = gregOfRexRM(pfx, modrm);
30662         IRTemp t128  = newTemp(Ity_V128);
30663         if (epartIsReg(modrm)) {
30664            UInt rD = eregOfRexRM(pfx, modrm);
30665            delta += 1;
30666            ib = getUChar(delta);
30667            assign(t128, getYMMRegLane128(rS, ib & 1));
30668            putYMMRegLoAndZU(rD, mkexpr(t128));
30669            DIP("vextracti128 $%u,%s,%s\n",
                ib, nameYMMReg(rS), nameXMMReg(rD));
30671         } else {
30672            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30673            delta += alen;
30674            ib = getUChar(delta);
30675            assign(t128, getYMMRegLane128(rS, ib & 1));
30676            storeLE(mkexpr(addr), mkexpr(t128));
30677            DIP("vextracti128 $%u,%s,%s\n",
30678                ib, nameYMMReg(rS), dis_buf);
30679         }
30680         delta++;
30681         /* doesn't use vvvv */
30682         goto decode_success;
30683      }
30684      break;
30685
30686   case 0x40:
30687      /* VDPPS imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 40 /r ib */
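      /* A standalone reference sketch (illustrative only; 'dpps_ref'
         is not part of this file) of the semantics math_DPPS_128 is
         expected to implement: imm8 bits 7:4 select which float lanes
         enter the dot product, and bits 3:0 select which result lanes
         receive the sum, the rest being zeroed.

            static void dpps_ref ( float* res, const float* s,
                                   const float* d, UInt imm8 )
            {
               float sum = 0.0f;
               Int   i;
               for (i = 0; i < 4; i++)
                  if (imm8 & (1 << (4+i)))
                     sum += s[i] * d[i];
               for (i = 0; i < 4; i++)
                  res[i] = (imm8 & (1 << i)) ? sum : 0.0f;
            }
      */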
30688      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30689         UChar  modrm   = getUChar(delta);
30690         UInt   rG      = gregOfRexRM(pfx, modrm);
30691         UInt   rV      = getVexNvvvv(pfx);
30692         IRTemp dst_vec = newTemp(Ity_V128);
30693         Int    imm8;
30694         if (epartIsReg( modrm )) {
30695            UInt rE = eregOfRexRM(pfx,modrm);
30696            imm8 = (Int)getUChar(delta+1);
30697            assign( dst_vec, getXMMReg( rE ) );
30698            delta += 1+1;
30699            DIP( "vdpps $%d,%s,%s,%s\n",
30700                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
30701         } else {
30702            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30703            imm8 = (Int)getUChar(delta+alen);
30704            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
30705            delta += alen+1;
30706            DIP( "vdpps $%d,%s,%s,%s\n",
30707                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
30708         }
30709
30710         IRTemp src_vec = newTemp(Ity_V128);
30711         assign(src_vec, getXMMReg( rV ));
30712         IRTemp res_vec = math_DPPS_128( src_vec, dst_vec, imm8 );
30713         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
30714         *uses_vvvv = True;
30715         goto decode_success;
30716      }
      /* VDPPS imm8, ymm3/m256,ymm2,ymm1 = VEX.NDS.256.66.0F3A.WIG 40 /r ib */
30718      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30719         UChar  modrm   = getUChar(delta);
30720         UInt   rG      = gregOfRexRM(pfx, modrm);
30721         UInt   rV      = getVexNvvvv(pfx);
30722         IRTemp dst_vec = newTemp(Ity_V256);
30723         Int    imm8;
30724         if (epartIsReg( modrm )) {
30725            UInt rE = eregOfRexRM(pfx,modrm);
30726            imm8 = (Int)getUChar(delta+1);
30727            assign( dst_vec, getYMMReg( rE ) );
30728            delta += 1+1;
30729            DIP( "vdpps $%d,%s,%s,%s\n",
30730                 imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
30731         } else {
30732            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30733            imm8 = (Int)getUChar(delta+alen);
30734            assign( dst_vec, loadLE( Ity_V256, mkexpr(addr) ) );
30735            delta += alen+1;
30736            DIP( "vdpps $%d,%s,%s,%s\n",
30737                 imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
30738         }
30739
30740         IRTemp src_vec = newTemp(Ity_V256);
30741         assign(src_vec, getYMMReg( rV ));
30742         IRTemp s0, s1, d0, d1;
30743         s0 = s1 = d0 = d1 = IRTemp_INVALID;
30744         breakupV256toV128s( dst_vec, &d1, &d0 );
30745         breakupV256toV128s( src_vec, &s1, &s0 );
30746         putYMMReg( rG, binop( Iop_V128HLtoV256,
30747                               mkexpr( math_DPPS_128(s1, d1, imm8) ),
30748                               mkexpr( math_DPPS_128(s0, d0, imm8) ) ) );
30749         *uses_vvvv = True;
30750         goto decode_success;
30751      }
30752      break;
30753
30754   case 0x41:
30755      /* VDPPD imm8, xmm3/m128,xmm2,xmm1 = VEX.NDS.128.66.0F3A.WIG 41 /r ib */
30756      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30757         UChar  modrm   = getUChar(delta);
30758         UInt   rG      = gregOfRexRM(pfx, modrm);
30759         UInt   rV      = getVexNvvvv(pfx);
30760         IRTemp dst_vec = newTemp(Ity_V128);
30761         Int    imm8;
30762         if (epartIsReg( modrm )) {
30763            UInt rE = eregOfRexRM(pfx,modrm);
30764            imm8 = (Int)getUChar(delta+1);
30765            assign( dst_vec, getXMMReg( rE ) );
30766            delta += 1+1;
30767            DIP( "vdppd $%d,%s,%s,%s\n",
30768                 imm8, nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
30769         } else {
30770            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30771            imm8 = (Int)getUChar(delta+alen);
30772            assign( dst_vec, loadLE( Ity_V128, mkexpr(addr) ) );
30773            delta += alen+1;
30774            DIP( "vdppd $%d,%s,%s,%s\n",
30775                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
30776         }
30777
30778         IRTemp src_vec = newTemp(Ity_V128);
30779         assign(src_vec, getXMMReg( rV ));
30780         IRTemp res_vec = math_DPPD_128( src_vec, dst_vec, imm8 );
30781         putYMMRegLoAndZU( rG, mkexpr(res_vec) );
30782         *uses_vvvv = True;
30783         goto decode_success;
30784      }
30785      break;
30786
30787   case 0x42:
30788      /* VMPSADBW imm8, xmm3/m128,xmm2,xmm1 */
30789      /* VMPSADBW = VEX.NDS.128.66.0F3A.WIG 42 /r ib */
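      /* A standalone reference sketch (illustrative only; the helper
         name is not part of this file) of the semantics expected of
         math_MPSADBW_128: imm8 bits 1:0 pick a 4-byte block of the
         second source, bit 2 picks the dword offset in the first, and
         each result word is a sum of absolute byte differences over a
         sliding 4-byte window.

            static void mpsadbw_ref ( UShort* res, const UChar* d,
                                      const UChar* s, UInt imm8 )
            {
               const UChar* blk2 = s + 4 * (imm8 & 3);
               const UChar* blk1 = d + 4 * ((imm8 >> 2) & 1);
               Int j, k;
               for (j = 0; j < 8; j++) {
                  UShort sum = 0;
                  for (k = 0; k < 4; k++) {
                     Int diff = (Int)blk1[j+k] - (Int)blk2[k];
                     sum += (UShort)(diff < 0 ? -diff : diff);
                  }
                  res[j] = sum;
               }
            }
      */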
30790      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30791         UChar  modrm   = getUChar(delta);
30792         Int    imm8;
30793         IRTemp src_vec = newTemp(Ity_V128);
30794         IRTemp dst_vec = newTemp(Ity_V128);
30795         UInt   rG      = gregOfRexRM(pfx, modrm);
30796         UInt   rV      = getVexNvvvv(pfx);
30797
30798         assign( dst_vec, getXMMReg(rV) );
30799
30800         if ( epartIsReg( modrm ) ) {
30801            UInt rE = eregOfRexRM(pfx, modrm);
30802
30803            imm8 = (Int)getUChar(delta+1);
30804            assign( src_vec, getXMMReg(rE) );
30805            delta += 1+1;
30806            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
30807                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
30808         } else {
30809            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
30810                             1/* imm8 is 1 byte after the amode */ );
30811            assign( src_vec, loadLE( Ity_V128, mkexpr(addr) ) );
30812            imm8 = (Int)getUChar(delta+alen);
30813            delta += alen+1;
30814            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
30815                 dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
30816         }
30817
30818         putYMMRegLoAndZU( rG, mkexpr( math_MPSADBW_128(dst_vec,
30819                                                        src_vec, imm8) ) );
30820         *uses_vvvv = True;
30821         goto decode_success;
30822      }
30823      /* VMPSADBW imm8, ymm3/m256,ymm2,ymm1 */
30824      /* VMPSADBW = VEX.NDS.256.66.0F3A.WIG 42 /r ib */
30825      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30826         UChar  modrm   = getUChar(delta);
30827         Int    imm8;
30828         IRTemp src_vec = newTemp(Ity_V256);
30829         IRTemp dst_vec = newTemp(Ity_V256);
30830         UInt   rG      = gregOfRexRM(pfx, modrm);
30831         UInt   rV      = getVexNvvvv(pfx);
30832         IRTemp sHi, sLo, dHi, dLo;
30833         sHi = sLo = dHi = dLo = IRTemp_INVALID;
30834
30835         assign( dst_vec, getYMMReg(rV) );
30836
30837         if ( epartIsReg( modrm ) ) {
30838            UInt rE = eregOfRexRM(pfx, modrm);
30839
30840            imm8 = (Int)getUChar(delta+1);
30841            assign( src_vec, getYMMReg(rE) );
30842            delta += 1+1;
30843            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
30844                 nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG) );
30845         } else {
30846            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
30847                             1/* imm8 is 1 byte after the amode */ );
30848            assign( src_vec, loadLE( Ity_V256, mkexpr(addr) ) );
30849            imm8 = (Int)getUChar(delta+alen);
30850            delta += alen+1;
30851            DIP( "vmpsadbw $%d, %s,%s,%s\n", imm8,
30852                 dis_buf, nameYMMReg(rV), nameYMMReg(rG) );
30853         }
30854
30855         breakupV256toV128s( dst_vec, &dHi, &dLo );
30856         breakupV256toV128s( src_vec, &sHi, &sLo );
30857         putYMMReg( rG, binop( Iop_V128HLtoV256,
30858                               mkexpr( math_MPSADBW_128(dHi, sHi, imm8 >> 3) ),
30859                               mkexpr( math_MPSADBW_128(dLo, sLo, imm8) ) ) );
30860         *uses_vvvv = True;
30861         goto decode_success;
30862      }
30863      break;
30864
30865   case 0x44:
30866      /* VPCLMULQDQ imm8, xmm3/m128,xmm2,xmm1 */
30867      /* VPCLMULQDQ = VEX.NDS.128.66.0F3A.WIG 44 /r ib */
30868      /* 66 0F 3A 44 /r ib = PCLMULQDQ xmm1, xmm2/m128, imm8
30869       * Carry-less multiplication of selected XMM quadwords into XMM
       * registers (a.k.a. multiplication of polynomials over GF(2))
30871       */
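      /* A standalone reference sketch (illustrative only; 'clmul64'
         is not part of this file) of one 64x64 -> 128 bit carry-less
         multiply, the operation math_PCLMULQDQ models.  imm8 bit 0
         selects the low/high quadword of the first source, and bit 4
         that of the second.

            static void clmul64 ( ULong a, ULong b,
                                  ULong* hi, ULong* lo )
            {
               ULong h = 0, l = 0;
               Int   i;
               for (i = 0; i < 64; i++) {
                  if (b & (1ULL << i)) {
                     l ^= a << i;
                     if (i > 0) h ^= a >> (64-i);
                  }
               }
               *hi = h; *lo = l;
            }
      */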
30872      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30873         UChar  modrm = getUChar(delta);
30874         Int imm8;
30875         IRTemp sV    = newTemp(Ity_V128);
30876         IRTemp dV    = newTemp(Ity_V128);
30877         UInt   rG    = gregOfRexRM(pfx, modrm);
30878         UInt   rV    = getVexNvvvv(pfx);
30879
30880         assign( dV, getXMMReg(rV) );
30881
30882         if ( epartIsReg( modrm ) ) {
30883            UInt rE = eregOfRexRM(pfx, modrm);
30884            imm8 = (Int)getUChar(delta+1);
30885            assign( sV, getXMMReg(rE) );
30886            delta += 1+1;
30887            DIP( "vpclmulqdq $%d, %s,%s,%s\n", imm8,
30888                 nameXMMReg(rE), nameXMMReg(rV), nameXMMReg(rG) );
30889         } else {
30890            addr = disAMode( &alen, vbi, pfx, delta, dis_buf,
30891                             1/* imm8 is 1 byte after the amode */ );
30892            assign( sV, loadLE( Ity_V128, mkexpr(addr) ) );
30893            imm8 = (Int)getUChar(delta+alen);
30894            delta += alen+1;
30895            DIP( "vpclmulqdq $%d, %s,%s,%s\n",
30896                 imm8, dis_buf, nameXMMReg(rV), nameXMMReg(rG) );
30897         }
30898
30899         putYMMRegLoAndZU( rG, mkexpr( math_PCLMULQDQ(dV, sV, imm8) ) );
30900         *uses_vvvv = True;
30901         goto decode_success;
30902      }
30903      break;
30904
30905   case 0x46:
      /* VPERM2I128 imm8, ymm3/m256, ymm2, ymm1 = VEX.NDS.256.66.0F3A.W0 46 /r ib */
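      /* imm8 semantics, as implemented below: bits 1:0 select the
         source of the result's low 128-bit lane (0/1 = low/high lane
         of rV, 2/3 = low/high lane of the r/m operand), bits 5:4 do
         the same for the high result lane, and bits 3 and 7 zero the
         low/high result lane respectively.  E.g. imm8 = 0x20 puts
         rV's low lane in the result's low lane and the r/m operand's
         low lane in the high lane. */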
30907      if (have66noF2noF3(pfx)
30908          && 1==getVexL(pfx)/*256*/ && 0==getRexW(pfx)/*W0*/) {
30909         UChar  modrm = getUChar(delta);
30910         UInt   imm8  = 0;
30911         UInt   rG    = gregOfRexRM(pfx, modrm);
30912         UInt   rV    = getVexNvvvv(pfx);
30913         IRTemp s00   = newTemp(Ity_V128);
30914         IRTemp s01   = newTemp(Ity_V128);
30915         IRTemp s10   = newTemp(Ity_V128);
30916         IRTemp s11   = newTemp(Ity_V128);
30917         assign(s00, getYMMRegLane128(rV, 0));
30918         assign(s01, getYMMRegLane128(rV, 1));
30919         if (epartIsReg(modrm)) {
30920            UInt rE = eregOfRexRM(pfx, modrm);
30921            delta += 1;
30922            imm8 = getUChar(delta);
30923            DIP("vperm2i128 $%u,%s,%s,%s\n",
30924                imm8, nameYMMReg(rE), nameYMMReg(rV), nameYMMReg(rG));
30925            assign(s10, getYMMRegLane128(rE, 0));
30926            assign(s11, getYMMRegLane128(rE, 1));
30927         } else {
30928            addr = disAMode( &alen, vbi, pfx, delta, dis_buf, 1 );
30929            delta += alen;
30930            imm8 = getUChar(delta);
30931            DIP("vperm2i128 $%u,%s,%s,%s\n",
30932                imm8, dis_buf, nameYMMReg(rV), nameYMMReg(rG));
            assign(s10, loadLE(Ity_V128, mkexpr(addr)));
30935            assign(s11, loadLE(Ity_V128, binop(Iop_Add64,
30936                                               mkexpr(addr), mkU64(16))));
30937         }
30938         delta++;
30939#        define SEL(_nn) (((_nn)==0) ? s00 : ((_nn)==1) ? s01 \
30940                                           : ((_nn)==2) ? s10 : s11)
30941         putYMMRegLane128(rG, 0, mkexpr(SEL((imm8 >> 0) & 3)));
30942         putYMMRegLane128(rG, 1, mkexpr(SEL((imm8 >> 4) & 3)));
30943#        undef SEL
30944         if (imm8 & (1<<3)) putYMMRegLane128(rG, 0, mkV128(0));
30945         if (imm8 & (1<<7)) putYMMRegLane128(rG, 1, mkV128(0));
30946         *uses_vvvv = True;
30947         goto decode_success;
30948      }
30949      break;
30950
30951   case 0x4A:
30952      /* VBLENDVPS xmmG, xmmE/memE, xmmV, xmmIS4
30953         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
30954      /* VBLENDVPS = VEX.NDS.128.66.0F3A.WIG 4A /r /is4 */
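      /* Presumably (the helpers are defined elsewhere) dis_VBLENDV_*
         turn the IS4 register into a selection mask: the SarN op
         passed in shifts each element arithmetically so that its sign
         bit fills the whole element, after which the two sources can
         be merged bitwise.  Only the top bit of each element
         therefore affects the blend. */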
30955      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30956         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
30957                                   "vblendvps", 4, Iop_SarN32x4 );
30958         *uses_vvvv = True;
30959         goto decode_success;
30960      }
30961      /* VBLENDVPS ymmG, ymmE/memE, ymmV, ymmIS4
30962         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
30963      /* VBLENDVPS = VEX.NDS.256.66.0F3A.WIG 4A /r /is4 */
30964      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30965         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
30966                                   "vblendvps", 4, Iop_SarN32x4 );
30967         *uses_vvvv = True;
30968         goto decode_success;
30969      }
30970      break;
30971
30972   case 0x4B:
30973      /* VBLENDVPD xmmG, xmmE/memE, xmmV, xmmIS4
30974         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
30975      /* VBLENDVPD = VEX.NDS.128.66.0F3A.WIG 4B /r /is4 */
30976      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30977         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
30978                                   "vblendvpd", 8, Iop_SarN64x2 );
30979         *uses_vvvv = True;
30980         goto decode_success;
30981      }
30982      /* VBLENDVPD ymmG, ymmE/memE, ymmV, ymmIS4
30983         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
30984      /* VBLENDVPD = VEX.NDS.256.66.0F3A.WIG 4B /r /is4 */
30985      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
30986         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
30987                                   "vblendvpd", 8, Iop_SarN64x2 );
30988         *uses_vvvv = True;
30989         goto decode_success;
30990      }
30991      break;
30992
30993   case 0x4C:
30994      /* VPBLENDVB xmmG, xmmE/memE, xmmV, xmmIS4
30995         ::: xmmG:V128 = PBLEND(xmmE, xmmV, xmmIS4) (RMVR) */
30996      /* VPBLENDVB = VEX.NDS.128.66.0F3A.WIG 4C /r /is4 */
30997      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
30998         delta = dis_VBLENDV_128 ( vbi, pfx, delta,
30999                                   "vpblendvb", 1, Iop_SarN8x16 );
31000         *uses_vvvv = True;
31001         goto decode_success;
31002      }
31003      /* VPBLENDVB ymmG, ymmE/memE, ymmV, ymmIS4
31004         ::: ymmG:V256 = PBLEND(ymmE, ymmV, ymmIS4) (RMVR) */
31005      /* VPBLENDVB = VEX.NDS.256.66.0F3A.WIG 4C /r /is4 */
31006      if (have66noF2noF3(pfx) && 1==getVexL(pfx)/*256*/) {
31007         delta = dis_VBLENDV_256 ( vbi, pfx, delta,
31008                                   "vpblendvb", 1, Iop_SarN8x16 );
31009         *uses_vvvv = True;
31010         goto decode_success;
31011      }
31012      break;
31013
31014   case 0x60:
31015   case 0x61:
31016   case 0x62:
31017   case 0x63:
31018      /* VEX.128.66.0F3A.WIG 63 /r ib = VPCMPISTRI imm8, xmm2/m128, xmm1
31019         VEX.128.66.0F3A.WIG 62 /r ib = VPCMPISTRM imm8, xmm2/m128, xmm1
31020         VEX.128.66.0F3A.WIG 61 /r ib = VPCMPESTRI imm8, xmm2/m128, xmm1
31021         VEX.128.66.0F3A.WIG 60 /r ib = VPCMPESTRM imm8, xmm2/m128, xmm1
31022         (selected special cases that actually occur in glibc,
31023          not by any means a complete implementation.)
31024      */
31025      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
31026         Long delta0 = delta;
31027         delta = dis_PCMPxSTRx( vbi, pfx, delta, True/*isAvx*/, opc );
31028         if (delta > delta0) goto decode_success;
         /* else fall through; dis_PCMPxSTRx failed to decode it */
31030      }
31031      break;
31032
31033   case 0xDF:
31034      /* VAESKEYGENASSIST imm8, xmm2/m128, xmm1 = VEX.128.66.0F3A.WIG DF /r */
31035      if (have66noF2noF3(pfx) && 0==getVexL(pfx)/*128*/) {
         delta = dis_AESKEYGENASSIST( vbi, pfx, delta, True/*isAvx*/ );
31037         goto decode_success;
31038      }
31039      break;
31040
31041   case 0xF0:
      /* RORX imm8, r/m32, r32a = VEX.LZ.F2.0F3A.W0 F0 /r ib */
      /* RORX imm8, r/m64, r64a = VEX.LZ.F2.0F3A.W1 F0 /r ib */
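      /* A standalone model (illustrative only) of the computation the
         IR below builds, shown for the 32-bit case:

            static UInt rorx32 ( UInt src, UInt imm8 )
            {
               imm8 &= 31;
               if (imm8 == 0) return src;
               return (src >> imm8) | (src << (32 - imm8));
            }

         Unlike ROR, no flags are read or written. */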
31044      if (haveF2no66noF3(pfx) && 0==getVexL(pfx)/*LZ*/ && !haveREX(pfx)) {
31045         Int     size = getRexW(pfx) ? 8 : 4;
31046         IRType  ty   = szToITy(size);
31047         IRTemp  src  = newTemp(ty);
31048         UChar   rm   = getUChar(delta);
31049         UChar   imm8;
31050
31051         if (epartIsReg(rm)) {
31052            imm8 = getUChar(delta+1);
31053            assign( src, getIRegE(size,pfx,rm) );
31054            DIP("rorx %d,%s,%s\n", imm8, nameIRegE(size,pfx,rm),
31055                                   nameIRegG(size,pfx,rm));
31056            delta += 2;
31057         } else {
31058            addr = disAMode ( &alen, vbi, pfx, delta, dis_buf, 0 );
31059            imm8 = getUChar(delta+alen);
31060            assign( src, loadLE(ty, mkexpr(addr)) );
31061            DIP("rorx %d,%s,%s\n", imm8, dis_buf, nameIRegG(size,pfx,rm));
31062            delta += alen + 1;
31063         }
31064         imm8 &= 8*size-1;
31065
31066         /* dst = (src >>u imm8) | (src << (size-imm8)) */
31067         putIRegG( size, pfx, rm,
31068                   imm8 == 0 ? mkexpr(src)
31069                   : binop( mkSizedOp(ty,Iop_Or8),
31070                            binop( mkSizedOp(ty,Iop_Shr8), mkexpr(src),
31071                                   mkU8(imm8) ),
31072                            binop( mkSizedOp(ty,Iop_Shl8), mkexpr(src),
31073                                   mkU8(8*size-imm8) ) ) );
31074         /* Flags aren't modified.  */
31075         goto decode_success;
31076      }
31077      break;
31078
31079   default:
31080      break;
31081
31082   }
31083
31084  //decode_failure:
31085   return deltaIN;
31086
31087  decode_success:
31088   return delta;
31089}
31090
31091
31092/*------------------------------------------------------------*/
31093/*---                                                      ---*/
31094/*--- Disassemble a single instruction                     ---*/
31095/*---                                                      ---*/
31096/*------------------------------------------------------------*/
31097
31098/* Disassemble a single instruction into IR.  The instruction is
31099   located in host memory at &guest_code[delta]. */
31100
31101static
31102DisResult disInstr_AMD64_WRK (
31103             /*OUT*/Bool* expect_CAS,
31104             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
31105             Bool         resteerCisOk,
31106             void*        callback_opaque,
31107             Long         delta64,
31108             VexArchInfo* archinfo,
31109             VexAbiInfo*  vbi,
31110             Bool         sigill_diag
31111          )
31112{
31113   IRTemp    t1, t2, t3, t4, t5, t6;
31114   UChar     pre;
31115   Int       n, n_prefixes;
31116   DisResult dres;
31117
31118   /* The running delta */
31119   Long delta = delta64;
31120
   /* Holds the delta at the start of the insn, so that we can print
      consistent error messages for unimplemented insns. */
31123   Long delta_start = delta;
31124
31125   /* sz denotes the nominal data-op size of the insn; we change it to
31126      2 if an 0x66 prefix is seen and 8 if REX.W is 1.  In case of
31127      conflict REX.W takes precedence. */
31128   Int sz = 4;
31129
31130   /* pfx holds the summary of prefixes. */
31131   Prefix pfx = PFX_EMPTY;
31132
31133   /* Holds the computed opcode-escape indication. */
31134   Escape esc = ESC_NONE;
31135
31136   /* Set result defaults. */
31137   dres.whatNext    = Dis_Continue;
31138   dres.len         = 0;
31139   dres.continueAt  = 0;
31140   dres.jk_StopHere = Ijk_INVALID;
31141   *expect_CAS = False;
31142
31143   vassert(guest_RIP_next_assumed == 0);
31144   vassert(guest_RIP_next_mustcheck == False);
31145
31146   t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
31147
31148   DIP("\t0x%llx:  ", guest_RIP_bbstart+delta);
31149
31150   /* Spot "Special" instructions (see comment at top of file). */
31151   {
31152      UChar* code = (UChar*)(guest_code + delta);
31153      /* Spot the 16-byte preamble:
31154         48C1C703   rolq $3,  %rdi
31155         48C1C70D   rolq $13, %rdi
31156         48C1C73D   rolq $61, %rdi
31157         48C1C733   rolq $51, %rdi
31158      */
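      /* The rotation amounts sum to 3+13+61+51 == 128, a multiple of
         64, so executed natively the sequence leaves %rdi unchanged
         (only CF/OF are clobbered); in effect it is a no-op marker
         that ordinary compiled code is vanishingly unlikely to
         contain. */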
31159      if (code[ 0] == 0x48 && code[ 1] == 0xC1 && code[ 2] == 0xC7
31160                                               && code[ 3] == 0x03 &&
31161          code[ 4] == 0x48 && code[ 5] == 0xC1 && code[ 6] == 0xC7
31162                                               && code[ 7] == 0x0D &&
31163          code[ 8] == 0x48 && code[ 9] == 0xC1 && code[10] == 0xC7
31164                                               && code[11] == 0x3D &&
31165          code[12] == 0x48 && code[13] == 0xC1 && code[14] == 0xC7
31166                                               && code[15] == 0x33) {
31167         /* Got a "Special" instruction preamble.  Which one is it? */
31168         if (code[16] == 0x48 && code[17] == 0x87
31169                              && code[18] == 0xDB /* xchgq %rbx,%rbx */) {
31170            /* %RDX = client_request ( %RAX ) */
31171            DIP("%%rdx = client_request ( %%rax )\n");
31172            delta += 19;
31173            jmp_lit(&dres, Ijk_ClientReq, guest_RIP_bbstart+delta);
31174            vassert(dres.whatNext == Dis_StopHere);
31175            goto decode_success;
31176         }
31177         else
31178         if (code[16] == 0x48 && code[17] == 0x87
31179                              && code[18] == 0xC9 /* xchgq %rcx,%rcx */) {
31180            /* %RAX = guest_NRADDR */
31181            DIP("%%rax = guest_NRADDR\n");
31182            delta += 19;
31183            putIRegRAX(8, IRExpr_Get( OFFB_NRADDR, Ity_I64 ));
31184            goto decode_success;
31185         }
31186         else
31187         if (code[16] == 0x48 && code[17] == 0x87
31188                              && code[18] == 0xD2 /* xchgq %rdx,%rdx */) {
31189            /* call-noredir *%RAX */
31190            DIP("call-noredir *%%rax\n");
31191            delta += 19;
31192            t1 = newTemp(Ity_I64);
31193            assign(t1, getIRegRAX(8));
31194            t2 = newTemp(Ity_I64);
31195            assign(t2, binop(Iop_Sub64, getIReg64(R_RSP), mkU64(8)));
31196            putIReg64(R_RSP, mkexpr(t2));
31197            storeLE( mkexpr(t2), mkU64(guest_RIP_bbstart+delta));
31198            jmp_treg(&dres, Ijk_NoRedir, t1);
31199            vassert(dres.whatNext == Dis_StopHere);
31200            goto decode_success;
31201         }
31202         else
31203         if (code[16] == 0x48 && code[17] == 0x87
31204                              && code[18] == 0xff /* xchgq %rdi,%rdi */) {
            /* IR injection */
31206            DIP("IR injection\n");
31207            vex_inject_ir(irsb, Iend_LE);
31208
            // Invalidate the current insn.  The reason is that the IR
            // we're injecting here can change; in that case the
            // translation has to be redone.  For ease of handling, we
            // simply invalidate all the time.
31213            stmt(IRStmt_Put(OFFB_CMSTART, mkU64(guest_RIP_curr_instr)));
31214            stmt(IRStmt_Put(OFFB_CMLEN,   mkU64(19)));
31215
31216            delta += 19;
31217
31218            stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
31219            dres.whatNext    = Dis_StopHere;
31220            dres.jk_StopHere = Ijk_InvalICache;
31221            goto decode_success;
31222         }
31223         /* We don't know what it is. */
31224         goto decode_failure;
31225         /*NOTREACHED*/
31226      }
31227   }
31228
31229   /* Eat prefixes, summarising the result in pfx and sz, and rejecting
31230      as many invalid combinations as possible. */
31231   n_prefixes = 0;
31232   while (True) {
31233      if (n_prefixes > 7) goto decode_failure;
31234      pre = getUChar(delta);
31235      switch (pre) {
31236         case 0x66: pfx |= PFX_66; break;
31237         case 0x67: pfx |= PFX_ASO; break;
31238         case 0xF2: pfx |= PFX_F2; break;
31239         case 0xF3: pfx |= PFX_F3; break;
31240         case 0xF0: pfx |= PFX_LOCK; *expect_CAS = True; break;
31241         case 0x2E: pfx |= PFX_CS; break;
31242         case 0x3E: pfx |= PFX_DS; break;
31243         case 0x26: pfx |= PFX_ES; break;
31244         case 0x64: pfx |= PFX_FS; break;
31245         case 0x65: pfx |= PFX_GS; break;
31246         case 0x36: pfx |= PFX_SS; break;
31247         case 0x40 ... 0x4F:
31248            pfx |= PFX_REX;
31249            if (pre & (1<<3)) pfx |= PFX_REXW;
31250            if (pre & (1<<2)) pfx |= PFX_REXR;
31251            if (pre & (1<<1)) pfx |= PFX_REXX;
31252            if (pre & (1<<0)) pfx |= PFX_REXB;
31253            break;
31254         default:
31255            goto not_a_legacy_prefix;
31256      }
31257      n_prefixes++;
31258      delta++;
31259   }
31260
31261   not_a_legacy_prefix:
31262   /* We've used up all the non-VEX prefixes.  Parse and validate a
31263      VEX prefix if that's appropriate. */
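   /* For reference, the layouts parsed below (fields stored inverted
      in the encoding are marked ~):

         3-byte VEX:  C4  [~R ~X ~B m-mmmm]  [W ~vvvv L pp]
         2-byte VEX:  C5  [~R ~vvvv L pp]

      The 2-byte form implies the 0F escape, W=0 and X=B=0. */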
31264   if (archinfo->hwcaps & VEX_HWCAPS_AMD64_AVX) {
31265      /* Used temporarily for holding VEX prefixes. */
31266      UChar vex0 = getUChar(delta);
31267      if (vex0 == 0xC4) {
31268         /* 3-byte VEX */
31269         UChar vex1 = getUChar(delta+1);
31270         UChar vex2 = getUChar(delta+2);
31271         delta += 3;
31272         pfx |= PFX_VEX;
31273         /* Snarf contents of byte 1 */
31274         /* R */ pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
31275         /* X */ pfx |= (vex1 & (1<<6)) ? 0 : PFX_REXX;
31276         /* B */ pfx |= (vex1 & (1<<5)) ? 0 : PFX_REXB;
31277         /* m-mmmm */
31278         switch (vex1 & 0x1F) {
31279            case 1: esc = ESC_0F;   break;
31280            case 2: esc = ESC_0F38; break;
31281            case 3: esc = ESC_0F3A; break;
31282            /* Any other m-mmmm field will #UD */
31283            default: goto decode_failure;
31284         }
31285         /* Snarf contents of byte 2 */
31286         /* W */    pfx |= (vex2 & (1<<7)) ? PFX_REXW : 0;
31287         /* ~v3 */  pfx |= (vex2 & (1<<6)) ? 0 : PFX_VEXnV3;
31288         /* ~v2 */  pfx |= (vex2 & (1<<5)) ? 0 : PFX_VEXnV2;
31289         /* ~v1 */  pfx |= (vex2 & (1<<4)) ? 0 : PFX_VEXnV1;
31290         /* ~v0 */  pfx |= (vex2 & (1<<3)) ? 0 : PFX_VEXnV0;
31291         /* L */    pfx |= (vex2 & (1<<2)) ? PFX_VEXL : 0;
31292         /* pp */
31293         switch (vex2 & 3) {
31294            case 0: break;
31295            case 1: pfx |= PFX_66; break;
31296            case 2: pfx |= PFX_F3; break;
31297            case 3: pfx |= PFX_F2; break;
31298            default: vassert(0);
31299         }
31300      }
31301      else if (vex0 == 0xC5) {
31302         /* 2-byte VEX */
31303         UChar vex1 = getUChar(delta+1);
31304         delta += 2;
31305         pfx |= PFX_VEX;
31306         /* Snarf contents of byte 1 */
31307         /* R */    pfx |= (vex1 & (1<<7)) ? 0 : PFX_REXR;
31308         /* ~v3 */  pfx |= (vex1 & (1<<6)) ? 0 : PFX_VEXnV3;
31309         /* ~v2 */  pfx |= (vex1 & (1<<5)) ? 0 : PFX_VEXnV2;
31310         /* ~v1 */  pfx |= (vex1 & (1<<4)) ? 0 : PFX_VEXnV1;
31311         /* ~v0 */  pfx |= (vex1 & (1<<3)) ? 0 : PFX_VEXnV0;
31312         /* L */    pfx |= (vex1 & (1<<2)) ? PFX_VEXL : 0;
31313         /* pp */
31314         switch (vex1 & 3) {
31315            case 0: break;
31316            case 1: pfx |= PFX_66; break;
31317            case 2: pfx |= PFX_F3; break;
31318            case 3: pfx |= PFX_F2; break;
31319            default: vassert(0);
31320         }
31321         /* implied: */
31322         esc = ESC_0F;
31323      }
      /* Can't have both VEX and REX */
      if ((pfx & PFX_VEX) && (pfx & PFX_REX))
         goto decode_failure;
31327   }
31328
31329   /* Dump invalid combinations */
31330   n = 0;
31331   if (pfx & PFX_F2) n++;
31332   if (pfx & PFX_F3) n++;
31333   if (n > 1)
31334      goto decode_failure; /* can't have both */
31335
31336   n = 0;
31337   if (pfx & PFX_CS) n++;
31338   if (pfx & PFX_DS) n++;
31339   if (pfx & PFX_ES) n++;
31340   if (pfx & PFX_FS) n++;
31341   if (pfx & PFX_GS) n++;
31342   if (pfx & PFX_SS) n++;
31343   if (n > 1)
31344      goto decode_failure; /* multiple seg overrides == illegal */
31345
   /* If we have a %fs prefix, reject it unless there's evidence in
      'vbi' that we should accept it. */
31348   if ((pfx & PFX_FS) && !vbi->guest_amd64_assume_fs_is_zero)
31349      goto decode_failure;
31350
31351   /* Ditto for %gs prefixes. */
31352   if ((pfx & PFX_GS) && !vbi->guest_amd64_assume_gs_is_0x60)
31353      goto decode_failure;
31354
31355   /* Set up sz. */
31356   sz = 4;
31357   if (pfx & PFX_66) sz = 2;
31358   if ((pfx & PFX_REX) && (pfx & PFX_REXW)) sz = 8;
31359
31360   /* Now we should be looking at the primary opcode byte or the
31361      leading escapes.  Check that any LOCK prefix is actually
31362      allowed. */
31363   if (haveLOCK(pfx)) {
31364      if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
31365         DIP("lock ");
31366      } else {
31367         *expect_CAS = False;
31368         goto decode_failure;
31369      }
31370   }
31371
31372   /* Eat up opcode escape bytes, until we're really looking at the
31373      primary opcode byte.  But only if there's no VEX present. */
31374   if (!(pfx & PFX_VEX)) {
31375      vassert(esc == ESC_NONE);
31376      pre = getUChar(delta);
31377      if (pre == 0x0F) {
31378         delta++;
31379         pre = getUChar(delta);
31380         switch (pre) {
31381            case 0x38: esc = ESC_0F38; delta++; break;
31382            case 0x3A: esc = ESC_0F3A; delta++; break;
31383            default:   esc = ESC_0F; break;
31384         }
31385      }
31386   }
31387
31388   /* So now we're really really looking at the primary opcode
31389      byte. */
31390   Long delta_at_primary_opcode = delta;
31391
31392   if (!(pfx & PFX_VEX)) {
31393      /* Handle non-VEX prefixed instructions.  "Legacy" (non-VEX) SSE
31394         instructions preserve the upper 128 bits of YMM registers;
31395         iow we can simply ignore the presence of the upper halves of
31396         these registers. */
31397      switch (esc) {
31398         case ESC_NONE:
31399            delta = dis_ESC_NONE( &dres, expect_CAS,
31400                                  resteerOkFn, resteerCisOk, callback_opaque,
31401                                  archinfo, vbi, pfx, sz, delta );
31402            break;
31403         case ESC_0F:
31404            delta = dis_ESC_0F  ( &dres, expect_CAS,
31405                                  resteerOkFn, resteerCisOk, callback_opaque,
31406                                  archinfo, vbi, pfx, sz, delta );
31407            break;
31408         case ESC_0F38:
31409            delta = dis_ESC_0F38( &dres,
31410                                  resteerOkFn, resteerCisOk, callback_opaque,
31411                                  archinfo, vbi, pfx, sz, delta );
31412            break;
31413         case ESC_0F3A:
31414            delta = dis_ESC_0F3A( &dres,
31415                                  resteerOkFn, resteerCisOk, callback_opaque,
31416                                  archinfo, vbi, pfx, sz, delta );
31417            break;
31418         default:
31419            vassert(0);
31420      }
31421   } else {
31422      /* VEX prefixed instruction */
31423      /* Sloppy Intel wording: "An instruction encoded with a VEX.128
31424         prefix that loads a YMM register operand ..." zeroes out bits
31425         128 and above of the register. */
31426      Bool uses_vvvv = False;
31427      switch (esc) {
31428         case ESC_0F:
31429            delta = dis_ESC_0F__VEX ( &dres, &uses_vvvv,
31430                                      resteerOkFn, resteerCisOk,
31431                                      callback_opaque,
31432                                      archinfo, vbi, pfx, sz, delta );
31433            break;
31434         case ESC_0F38:
31435            delta = dis_ESC_0F38__VEX ( &dres, &uses_vvvv,
31436                                        resteerOkFn, resteerCisOk,
31437                                        callback_opaque,
31438                                        archinfo, vbi, pfx, sz, delta );
31439            break;
31440         case ESC_0F3A:
31441            delta = dis_ESC_0F3A__VEX ( &dres, &uses_vvvv,
31442                                        resteerOkFn, resteerCisOk,
31443                                        callback_opaque,
31444                                        archinfo, vbi, pfx, sz, delta );
31445            break;
31446         case ESC_NONE:
31447            /* The presence of a VEX prefix, by Intel definition,
31448               always implies at least an 0F escape. */
31449            goto decode_failure;
31450         default:
31451            vassert(0);
31452      }
31453      /* If the insn doesn't use VEX.vvvv then it must be all ones.
31454         Check this. */
31455      if (!uses_vvvv) {
31456         if (getVexNvvvv(pfx) != 0)
31457            goto decode_failure;
31458      }
31459   }
31460
31461   vassert(delta - delta_at_primary_opcode >= 0);
31462   vassert(delta - delta_at_primary_opcode < 16/*let's say*/);
31463
31464   /* Use delta == delta_at_primary_opcode to denote decode failure.
      This implies that any successful decode must use up at least
      one byte. */
31467   if (delta == delta_at_primary_opcode)
31468      goto decode_failure;
31469   else
31470      goto decode_success; /* \o/ */
31471
31472#if 0 /* XYZZY */
31473
31474   /* ---------------------------------------------------- */
31475   /* --- The SSE/SSE2 decoder.                        --- */
31476   /* ---------------------------------------------------- */
31477
   /* What did I do to deserve SSE?  Perhaps I was really bad in a
31479      previous life? */
31480
31481   /* Note, this doesn't handle SSE3 right now.  All amd64s support
31482      SSE2 as a minimum so there is no point distinguishing SSE1 vs
31483      SSE2. */
31484
31485   insn = (UChar*)&guest_code[delta];
31486
31487   /* FXSAVE is spuriously at the start here only because it is
31488      thusly placed in guest-x86/toIR.c. */
31489
31490   /* ------ SSE decoder main ------ */
31491
31492   /* ---------------------------------------------------- */
31493   /* --- end of the SSE decoder.                      --- */
31494   /* ---------------------------------------------------- */
31495
31496   /* ---------------------------------------------------- */
31497   /* --- start of the SSE2 decoder.                   --- */
31498   /* ---------------------------------------------------- */
31499
31500   /* ---------------------------------------------------- */
31501   /* --- end of the SSE/SSE2 decoder.                 --- */
31502   /* ---------------------------------------------------- */
31503
31504   /* ---------------------------------------------------- */
31505   /* --- start of the SSE3 decoder.                   --- */
31506   /* ---------------------------------------------------- */
31507
31508   /* ---------------------------------------------------- */
31509   /* --- end of the SSE3 decoder.                     --- */
31510   /* ---------------------------------------------------- */
31511
31512   /* ---------------------------------------------------- */
31513   /* --- start of the SSSE3 decoder.                  --- */
31514   /* ---------------------------------------------------- */
31515
31516   /* ---------------------------------------------------- */
31517   /* --- end of the SSSE3 decoder.                    --- */
31518   /* ---------------------------------------------------- */
31519
31520   /* ---------------------------------------------------- */
31521   /* --- start of the SSE4 decoder                    --- */
31522   /* ---------------------------------------------------- */
31523
31524   /* ---------------------------------------------------- */
31525   /* --- end of the SSE4 decoder                      --- */
31526   /* ---------------------------------------------------- */
31527
31528   /*after_sse_decoders:*/
31529
31530   /* Get the primary opcode. */
31531   opc = getUChar(delta); delta++;
31532
31533   /* We get here if the current insn isn't SSE, or this CPU doesn't
31534      support SSE. */
31535
31536   switch (opc) {
31537
31538   /* ------------------------ Control flow --------------- */
31539
31540   /* ------------------------ CWD/CDQ -------------------- */
31541
31542   /* ------------------------ FPU ops -------------------- */
31543
31544   /* ------------------------ INT ------------------------ */
31545
31546   case 0xCD: { /* INT imm8 */
31547      IRJumpKind jk = Ijk_Boring;
31548      if (have66orF2orF3(pfx)) goto decode_failure;
31549      d64 = getUChar(delta); delta++;
31550      switch (d64) {
31551         case 32: jk = Ijk_Sys_int32; break;
31552         default: goto decode_failure;
31553      }
31554      guest_RIP_next_mustcheck = True;
31555      guest_RIP_next_assumed = guest_RIP_bbstart + delta;
31556      jmp_lit(jk, guest_RIP_next_assumed);
31557      /* It's important that all ArchRegs carry their up-to-date value
31558         at this point.  So we declare an end-of-block here, which
31559         forces any TempRegs caching ArchRegs to be flushed. */
31560      vassert(dres.whatNext == Dis_StopHere);
31561      DIP("int $0x%02x\n", (UInt)d64);
31562      break;
31563   }
31564
31565   /* ------------------------ Jcond, byte offset --------- */
31566
31567   /* ------------------------ IMUL ----------------------- */
31568
31569   /* ------------------------ MOV ------------------------ */
31570
31571   /* ------------------------ MOVx ------------------------ */
31572
31573   /* ------------------------ opl imm, A ----------------- */
31574
31575   /* ------------------------ opl Ev, Gv ----------------- */
31576
31577   /* ------------------------ opl Gv, Ev ----------------- */
31578
31579   /* ------------------------ POP ------------------------ */
31580
31581   /* ------------------------ PUSH ----------------------- */
31582
31583   /* ------ AE: SCAS variants ------ */
31584
31585   /* ------ A6, A7: CMPS variants ------ */
31586
31587   /* ------ AA, AB: STOS variants ------ */
31588
31589   /* ------ A4, A5: MOVS variants ------ */
31590
31591   /* ------------------------ XCHG ----------------------- */
31592
31593   /* ------------------------ IN / OUT ----------------------- */
31594
31595   /* ------------------------ (Grp1 extensions) ---------- */
31596
31597   /* ------------------------ (Grp2 extensions) ---------- */
31598
31599   /* ------------------------ (Grp3 extensions) ---------- */
31600
31601   /* ------------------------ (Grp4 extensions) ---------- */
31602
31603   /* ------------------------ (Grp5 extensions) ---------- */
31604
31605   /* ------------------------ Escapes to 2-byte opcodes -- */
31606
31607   case 0x0F: {
31608      opc = getUChar(delta); delta++;
31609      switch (opc) {
31610
31611      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
31612
31613      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
31614
31615      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
31616
31617      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
31618
31619      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
31620
31621      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
31622
31623      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
31624
31625      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
31626
31627      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
31628
31629      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
31630
31631      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
31632
31633      /* =-=-=-=-=-=-=-=-=- PREFETCH =-=-=-=-=-=-=-=-=-= */
31634
31635      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
31636
31637      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
31638
31639      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
31640
31641      /* =-=-=-=-=-=-=-=-=- SYSCALL -=-=-=-=-=-=-=-=-=-= */
31642
31643      /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
31644
31645      /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
31646
31647      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
31648
31649      default:
31650         goto decode_failure;
31651   } /* switch (opc) for the 2-byte opcodes */
31652   goto decode_success;
31653   } /* case 0x0F: of primary opcode */
31654
31655   /* ------------------------ ??? ------------------------ */
31656#endif /* XYZZY */
31657
31658     //default:
31659  decode_failure:
31660   /* All decode failures end up here. */
31661   if (sigill_diag) {
31662      vex_printf("vex amd64->IR: unhandled instruction bytes: "
31663                 "0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x 0x%x\n",
31664                 (Int)getUChar(delta_start+0),
31665                 (Int)getUChar(delta_start+1),
31666                 (Int)getUChar(delta_start+2),
31667                 (Int)getUChar(delta_start+3),
31668                 (Int)getUChar(delta_start+4),
31669                 (Int)getUChar(delta_start+5),
31670                 (Int)getUChar(delta_start+6),
31671                 (Int)getUChar(delta_start+7) );
31672      vex_printf("vex amd64->IR:   REX=%d REX.W=%d REX.R=%d REX.X=%d REX.B=%d\n",
31673                 haveREX(pfx) ? 1 : 0, getRexW(pfx), getRexR(pfx),
31674                 getRexX(pfx), getRexB(pfx));
31675      vex_printf("vex amd64->IR:   VEX=%d VEX.L=%d VEX.nVVVV=0x%x ESC=%s\n",
31676                 haveVEX(pfx) ? 1 : 0, getVexL(pfx),
31677                 getVexNvvvv(pfx),
31678                 esc==ESC_NONE ? "NONE" :
31679                   esc==ESC_0F ? "0F" :
31680                   esc==ESC_0F38 ? "0F38" :
31681                   esc==ESC_0F3A ? "0F3A" : "???");
31682      vex_printf("vex amd64->IR:   PFX.66=%d PFX.F2=%d PFX.F3=%d\n",
31683                 have66(pfx) ? 1 : 0, haveF2(pfx) ? 1 : 0,
31684                 haveF3(pfx) ? 1 : 0);
31685   }
31686
31687   /* Tell the dispatcher that this insn cannot be decoded, and so has
31688      not been executed, and (is currently) the next to be executed.
      RIP should be up-to-date since it was made so at the start of each
31690      insn, but nevertheless be paranoid and update it again right
31691      now. */
31692   stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_curr_instr) ) );
31693   jmp_lit(&dres, Ijk_NoDecode, guest_RIP_curr_instr);
31694   vassert(dres.whatNext == Dis_StopHere);
31695   dres.len = 0;
31696   /* We also need to say that a CAS is not expected now, regardless
31697      of what it might have been set to at the start of the function,
      since the IR that we've emitted just above (to synthesise a
31699      SIGILL) does not involve any CAS, and presumably no other IR has
31700      been emitted for this (non-decoded) insn. */
31701   *expect_CAS = False;
31702   return dres;
31703
31704   //   } /* switch (opc) for the main (primary) opcode switch. */
31705
31706  decode_success:
31707   /* All decode successes end up here. */
31708   switch (dres.whatNext) {
31709      case Dis_Continue:
31710         stmt( IRStmt_Put( OFFB_RIP, mkU64(guest_RIP_bbstart + delta) ) );
31711         break;
31712      case Dis_ResteerU:
31713      case Dis_ResteerC:
31714         stmt( IRStmt_Put( OFFB_RIP, mkU64(dres.continueAt) ) );
31715         break;
31716      case Dis_StopHere:
31717         break;
31718      default:
31719         vassert(0);
31720   }
31721
31722   DIP("\n");
31723   dres.len = (Int)toUInt(delta - delta_start);
31724   return dres;
31725}
31726
31727#undef DIP
31728#undef DIS
31729
31730
31731/*------------------------------------------------------------*/
31732/*--- Top-level fn                                         ---*/
31733/*------------------------------------------------------------*/
31734
31735/* Disassemble a single instruction into IR.  The instruction
31736   is located in host memory at &guest_code[delta]. */
31737
31738DisResult disInstr_AMD64 ( IRSB*        irsb_IN,
31739                           Bool         (*resteerOkFn) ( void*, Addr64 ),
31740                           Bool         resteerCisOk,
31741                           void*        callback_opaque,
31742                           UChar*       guest_code_IN,
31743                           Long         delta,
31744                           Addr64       guest_IP,
31745                           VexArch      guest_arch,
31746                           VexArchInfo* archinfo,
31747                           VexAbiInfo*  abiinfo,
31748                           Bool         host_bigendian_IN,
31749                           Bool         sigill_diag_IN )
31750{
31751   Int       i, x1, x2;
31752   Bool      expect_CAS, has_CAS;
31753   DisResult dres;
31754
31755   /* Set globals (see top of this file) */
31756   vassert(guest_arch == VexArchAMD64);
31757   guest_code           = guest_code_IN;
31758   irsb                 = irsb_IN;
31759   host_is_bigendian    = host_bigendian_IN;
31760   guest_RIP_curr_instr = guest_IP;
31761   guest_RIP_bbstart    = guest_IP - delta;
31762
31763   /* We'll consult these after doing disInstr_AMD64_WRK. */
31764   guest_RIP_next_assumed   = 0;
31765   guest_RIP_next_mustcheck = False;
31766
31767   x1 = irsb_IN->stmts_used;
31768   expect_CAS = False;
31769   dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
31770                               resteerCisOk,
31771                               callback_opaque,
31772                               delta, archinfo, abiinfo, sigill_diag_IN );
31773   x2 = irsb_IN->stmts_used;
31774   vassert(x2 >= x1);
31775
31776   /* If disInstr_AMD64_WRK tried to figure out the next rip, check it
31777      got it right.  Failure of this assertion is serious and denotes
31778      a bug in disInstr. */
31779   if (guest_RIP_next_mustcheck
31780       && guest_RIP_next_assumed != guest_RIP_curr_instr + dres.len) {
31781      vex_printf("\n");
31782      vex_printf("assumed next %%rip = 0x%llx\n",
31783                 guest_RIP_next_assumed );
31784      vex_printf(" actual next %%rip = 0x%llx\n",
31785                 guest_RIP_curr_instr + dres.len );
31786      vpanic("disInstr_AMD64: disInstr miscalculated next %rip");
31787   }
31788
31789   /* See comment at the top of disInstr_AMD64_WRK for meaning of
31790      expect_CAS.  Here, we (sanity-)check for the presence/absence of
31791      IRCAS as directed by the returned expect_CAS value. */
31792   has_CAS = False;
31793   for (i = x1; i < x2; i++) {
31794      if (irsb_IN->stmts[i]->tag == Ist_CAS)
31795         has_CAS = True;
31796   }
31797
31798   if (expect_CAS != has_CAS) {
31799      /* inconsistency detected.  re-disassemble the instruction so as
31800         to generate a useful error message; then assert. */
31801      vex_traceflags |= VEX_TRACE_FE;
31802      dres = disInstr_AMD64_WRK ( &expect_CAS, resteerOkFn,
31803                                  resteerCisOk,
31804                                  callback_opaque,
31805                                  delta, archinfo, abiinfo, sigill_diag_IN );
31806      for (i = x1; i < x2; i++) {
31807         vex_printf("\t\t");
31808         ppIRStmt(irsb_IN->stmts[i]);
31809         vex_printf("\n");
31810      }
31811      /* Failure of this assertion is serious and denotes a bug in
31812         disInstr. */
31813      vpanic("disInstr_AMD64: inconsistency in LOCK prefix handling");
31814   }
31815
31816   return dres;
31817}
31818
31819
31820/*------------------------------------------------------------*/
31821/*--- Unused stuff                                         ---*/
31822/*------------------------------------------------------------*/
31823
31824// A potentially more Memcheck-friendly version of gen_LZCNT, if
31825// this should ever be needed.
31826//
31827//static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
31828//{
31829//   /* Scheme is simple: propagate the most significant 1-bit into all
31830//      lower positions in the word.  This gives a word of the form
31831//      0---01---1.  Now invert it, giving a word of the form
31832//      1---10---0, then do a population-count idiom (to count the 1s,
31833//      which is the number of leading zeroes, or the word size if the
//      original word was 0).
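//
//      A worked 16-bit example: for src = 0x0F00, propagation gives
//      0x0FFF, inversion gives 0xF000, and popcount(0xF000) = 4,
//      which is indeed the number of leading zeroes of 0x0F00.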
31835//   */
31836//   Int i;
31837//   IRTemp t[7];
31838//   for (i = 0; i < 7; i++) {
31839//      t[i] = newTemp(ty);
31840//   }
31841//   if (ty == Ity_I64) {
31842//      assign(t[0], binop(Iop_Or64, mkexpr(src),
31843//                                   binop(Iop_Shr64, mkexpr(src),  mkU8(1))));
31844//      assign(t[1], binop(Iop_Or64, mkexpr(t[0]),
31845//                                   binop(Iop_Shr64, mkexpr(t[0]), mkU8(2))));
31846//      assign(t[2], binop(Iop_Or64, mkexpr(t[1]),
31847//                                   binop(Iop_Shr64, mkexpr(t[1]), mkU8(4))));
31848//      assign(t[3], binop(Iop_Or64, mkexpr(t[2]),
31849//                                   binop(Iop_Shr64, mkexpr(t[2]), mkU8(8))));
31850//      assign(t[4], binop(Iop_Or64, mkexpr(t[3]),
31851//                                   binop(Iop_Shr64, mkexpr(t[3]), mkU8(16))));
31852//      assign(t[5], binop(Iop_Or64, mkexpr(t[4]),
31853//                                   binop(Iop_Shr64, mkexpr(t[4]), mkU8(32))));
31854//      assign(t[6], unop(Iop_Not64, mkexpr(t[5])));
31855//      return gen_POPCOUNT(ty, t[6]);
31856//   }
31857//   if (ty == Ity_I32) {
31858//      assign(t[0], binop(Iop_Or32, mkexpr(src),
31859//                                   binop(Iop_Shr32, mkexpr(src),  mkU8(1))));
31860//      assign(t[1], binop(Iop_Or32, mkexpr(t[0]),
31861//                                   binop(Iop_Shr32, mkexpr(t[0]), mkU8(2))));
31862//      assign(t[2], binop(Iop_Or32, mkexpr(t[1]),
31863//                                   binop(Iop_Shr32, mkexpr(t[1]), mkU8(4))));
31864//      assign(t[3], binop(Iop_Or32, mkexpr(t[2]),
31865//                                   binop(Iop_Shr32, mkexpr(t[2]), mkU8(8))));
31866//      assign(t[4], binop(Iop_Or32, mkexpr(t[3]),
31867//                                   binop(Iop_Shr32, mkexpr(t[3]), mkU8(16))));
31868//      assign(t[5], unop(Iop_Not32, mkexpr(t[4])));
31869//      return gen_POPCOUNT(ty, t[5]);
31870//   }
31871//   if (ty == Ity_I16) {
31872//      assign(t[0], binop(Iop_Or16, mkexpr(src),
31873//                                   binop(Iop_Shr16, mkexpr(src),  mkU8(1))));
31874//      assign(t[1], binop(Iop_Or16, mkexpr(t[0]),
31875//                                   binop(Iop_Shr16, mkexpr(t[0]), mkU8(2))));
31876//      assign(t[2], binop(Iop_Or16, mkexpr(t[1]),
31877//                                   binop(Iop_Shr16, mkexpr(t[1]), mkU8(4))));
31878//      assign(t[3], binop(Iop_Or16, mkexpr(t[2]),
31879//                                   binop(Iop_Shr16, mkexpr(t[2]), mkU8(8))));
31880//      assign(t[4], unop(Iop_Not16, mkexpr(t[3])));
31881//      return gen_POPCOUNT(ty, t[4]);
31882//   }
31883//   vassert(0);
31884//}
31885
31886
31887/*--------------------------------------------------------------------*/
31888/*--- end                                       guest_amd64_toIR.c ---*/
31889/*--------------------------------------------------------------------*/
31890