guest_x86_toIR.c revision 9bea4c13fca0e3bb4b719dcb3ed63d47d479294e
1
2/*--------------------------------------------------------------------*/
3/*--- begin                                       guest_x86_toIR.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2010 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36/* Translates x86 code to IR. */
37
38/* TODO:
39
40   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
41   to ensure a 32-bit value is being written.
42
43   FUCOMI(P): what happens to A and S flags?  Currently they are forced
44      to zero.
45
46   x87 FP Limitations:
47
48   * all arithmetic done at 64 bits
49
50   * no FP exceptions, except for handling stack over/underflow
51
52   * FP rounding mode observed only for float->int conversions
53     and int->float conversions which could lose accuracy, and
54     for float-to-float rounding.  For all other operations,
55     round-to-nearest is used, regardless.
56
57   * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
58     simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
59     even when it isn't.
60
61   * some of the FCOM cases could do with testing -- not convinced
62     that the args are the right way round.
63
64   * FSAVE does not re-initialise the FPU; it should do so
65
66   * FINIT not only initialises the FPU environment, it also
67     zeroes all the FP registers.  It should leave the registers
68     unchanged.
69
70   SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
71   per Intel docs this bit has no meaning anyway.  Since PUSHF is the
72   only way to observe eflags[1], a proper fix would be to make that
73   bit be set by PUSHF.
74
75   The state of %eflags.AC (alignment check, bit 18) is recorded by
76   the simulation (viz, if you set it with popf then a pushf produces
77   the value you set it to), but it is otherwise ignored.  In
78   particular, setting it to 1 does NOT cause alignment checking to
79   happen.  Programs that set it to 1 and then rely on the resulting
80   SIGBUSs to inform them of misaligned accesses will not work.
81
82   Implementation of sysenter is necessarily partial.  sysenter is a
83   kind of system call entry.  When doing a sysenter, the return
84   address is not known -- that is something that is beyond Vex's
85   knowledge.  So the generated IR forces a return to the scheduler,
86   which can do what it likes to simulate the sysenter, but it MUST
87   set this thread's guest_EIP field with the continuation address
88   before resuming execution.  If that doesn't happen, the thread will
89   jump to address zero, which is probably fatal.
90
91   This module uses global variables and so is not MT-safe (if that
92   should ever become relevant).
93
94   The delta values are 32-bit ints, not 64-bit ints.  That means
95   this module may not work right if run on a 64-bit host.  That should
96   be fixed properly, really -- if anyone ever wants to use Vex to
97   translate x86 code for execution on a 64-bit host.
98
99   casLE (implementation of lock-prefixed insns) and rep-prefixed
100   insns: the side-exit back to the start of the insn is done with
101   Ijk_Boring.  This is quite wrong, it should be done with
102   Ijk_NoRedir, since otherwise the side exit, which is intended to
103   restart the instruction for whatever reason, could go somewhere
104   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
105   no-redir jumps performance critical, at least for rep-prefixed
106   instructions, since all iterations thereof would involve such a
107   jump.  It's not such a big deal with casLE since the side exit is
108   only taken if the CAS fails, that is, the location is contended,
109   which is relatively unlikely.
110
111   XXXX: Nov 2009: handling of SWP on ARM suffers from the same
112   problem.
113
114   Note also, the test for CAS success vs failure is done using
115   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
116   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
117   shouldn't definedness-check these comparisons.  See
118   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
119   background/rationale.
120*/
121
122/* Performance holes:
123
124   - fcom ; fstsw %ax ; sahf
125     sahf does not update the O flag (sigh) and so O needs to
126     be computed.  This is done expensively; it would be better
127     to have a calculate_eflags_o helper.
128
129   - emwarns; some FP codes can generate huge numbers of these
130     if the fpucw is changed in an inner loop.  It would be
131     better for the guest state to have an emwarn-enable reg
132     which can be set zero or nonzero.  If it is zero, emwarns
133     are not flagged, and instead control just flows all the
134     way through bbs as usual.
135*/
136
137/* "Special" instructions.
138
139   This instruction decoder can decode three special instructions
140   which mean nothing natively (are no-ops as far as regs/mem are
141   concerned) but have meaning for supporting Valgrind.  A special
142   instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
143   C1C713 (in the standard interpretation, that means: roll $3, %edi;
144   roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
145   one of the following 3 is allowed (standard interpretation in
146   parentheses):
147
148      87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
149      87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
150      87D2 (xchgl %edx,%edx)   call-noredir *%EAX
151
152   Any other bytes following the 12-byte preamble are illegal and
153   constitute a failure in instruction decoding.  This all assumes
154   that the preamble will never occur except in specific code
155   fragments designed for Valgrind to catch.
156
157   No prefixes may precede a "Special" instruction.
158*/
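
/* For reference, the complete byte sequence for the client-request
   form (%EDX = client_request ( %EAX )) is therefore, as laid out
   above:

      C1 C7 03  C1 C7 0D  C1 C7 1D  C1 C7 13  87 DB

   -- the 12-byte preamble followed by the xchgl %ebx,%ebx marker.
   This is an illustrative restatement of the description above, not
   an additional encoding. */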
159
160/* LOCK prefixed instructions.  These are translated using IR-level
161   CAS statements (IRCAS) and are believed to preserve atomicity, even
162   from the point of view of some other process racing against a
163   simulated one (presumably they communicate via a shared memory
164   segment).
165
166   Handlers which are aware of LOCK prefixes are:
167      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
168      dis_cmpxchg_G_E  (cmpxchg)
169      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
170      dis_Grp3         (not, neg)
171      dis_Grp4         (inc, dec)
172      dis_Grp5         (inc, dec)
173      dis_Grp8_Imm     (bts, btc, btr)
174      dis_bt_G_E       (bts, btc, btr)
175      dis_xadd_G_E     (xadd)
176*/
177
178
179#include "libvex_basictypes.h"
180#include "libvex_ir.h"
181#include "libvex.h"
182#include "libvex_guest_x86.h"
183
184#include "main_util.h"
185#include "main_globals.h"
186#include "guest_generic_bb_to_IR.h"
187#include "guest_generic_x87.h"
188#include "guest_x86_defs.h"
189
190
191/*------------------------------------------------------------*/
192/*--- Globals                                              ---*/
193/*------------------------------------------------------------*/
194
195/* These are set at the start of the translation of an insn, right
196   down in disInstr_X86, so that we don't have to pass them around
197   endlessly.  They are all constant during the translation of any
198   given insn. */
199
200/* We need to know this to do sub-register accesses correctly. */
201static Bool host_is_bigendian;
202
203/* Pointer to the guest code area (points to start of BB, not to the
204   insn being processed). */
205static UChar* guest_code;
206
207/* The guest address corresponding to guest_code[0]. */
208static Addr32 guest_EIP_bbstart;
209
210/* The guest address for the instruction currently being
211   translated. */
212static Addr32 guest_EIP_curr_instr;
213
214/* The IRSB* into which we're generating code. */
215static IRSB* irsb;
216
217
218/*------------------------------------------------------------*/
219/*--- Debugging output                                     ---*/
220/*------------------------------------------------------------*/
221
222#define DIP(format, args...)           \
223   if (vex_traceflags & VEX_TRACE_FE)  \
224      vex_printf(format, ## args)
225
226#define DIS(buf, format, args...)      \
227   if (vex_traceflags & VEX_TRACE_FE)  \
228      vex_sprintf(buf, format, ## args)
229
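
/* Typical use, illustrative only (dis_buf names a local text buffer
   filled in by the addressing-mode decoder; the exact call sites
   appear later in this file):

      DIP("add%c %s,%s\n", nameISize(sz), dis_buf, nameIReg(sz,reg));

   Nothing is printed unless front-end tracing (VEX_TRACE_FE) is
   enabled. */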
230
231/*------------------------------------------------------------*/
232/*--- Offsets of various parts of the x86 guest state.     ---*/
233/*------------------------------------------------------------*/
234
235#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
236#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
237#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
238#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
239#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
240#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
241#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
242#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
243
244#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
245
246#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
247#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
248#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
249#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
250
251#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
252#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
253#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
254#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
255#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
256#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
257#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
258#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
259
260#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
261#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
262#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
263#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
264#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
265#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
266#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
267#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
268
269#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
270#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
271#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
272#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
273#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
274#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
275#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
276#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
277#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
278
279#define OFFB_EMWARN    offsetof(VexGuestX86State,guest_EMWARN)
280
281#define OFFB_TISTART   offsetof(VexGuestX86State,guest_TISTART)
282#define OFFB_TILEN     offsetof(VexGuestX86State,guest_TILEN)
283#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)
284
285#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
286
287
288/*------------------------------------------------------------*/
289/*--- Helper bits and pieces for deconstructing the        ---*/
290/*--- x86 insn stream.                                     ---*/
291/*------------------------------------------------------------*/
292
293/* This is the Intel register encoding -- integer regs. */
294#define R_EAX 0
295#define R_ECX 1
296#define R_EDX 2
297#define R_EBX 3
298#define R_ESP 4
299#define R_EBP 5
300#define R_ESI 6
301#define R_EDI 7
302
303#define R_AL (0+R_EAX)
304#define R_AH (4+R_EAX)
305
306/* This is the Intel register encoding -- segment regs. */
307#define R_ES 0
308#define R_CS 1
309#define R_SS 2
310#define R_DS 3
311#define R_FS 4
312#define R_GS 5
313
314
315/* Add a statement to the list held by "irsb". */
316static void stmt ( IRStmt* st )
317{
318   addStmtToIRSB( irsb, st );
319}
320
321/* Generate a new temporary of the given type. */
322static IRTemp newTemp ( IRType ty )
323{
324   vassert(isPlausibleIRType(ty));
325   return newIRTemp( irsb->tyenv, ty );
326}
327
328/* Various simple conversions */
329
330static UInt extend_s_8to32( UInt x )
331{
332   return (UInt)((((Int)x) << 24) >> 24);
333}
334
335static UInt extend_s_16to32 ( UInt x )
336{
337   return (UInt)((((Int)x) << 16) >> 16);
338}
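
/* For example, extend_s_8to32(0x80) gives 0xFFFFFF80, while
   extend_s_16to32(0x7FFF) gives 0x00007FFF -- the top bit of the
   narrow value is replicated into the upper bits. */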
339
340/* Fetch a byte from the guest insn stream. */
341static UChar getIByte ( Int delta )
342{
343   return guest_code[delta];
344}
345
346/* Extract the reg field from a modRM byte. */
347static Int gregOfRM ( UChar mod_reg_rm )
348{
349   return (Int)( (mod_reg_rm >> 3) & 7 );
350}
351
352/* Figure out whether the mod and rm parts of a modRM byte refer to a
353   register rather than to memory.  If so, the byte will have the form
354   11XXXYYY, where YYY is the register number. */
355static Bool epartIsReg ( UChar mod_reg_rm )
356{
357   return toBool(0xC0 == (mod_reg_rm & 0xC0));
358}
359
360/* ... and extract the register number ... */
361static Int eregOfRM ( UChar mod_reg_rm )
362{
363   return (Int)(mod_reg_rm & 0x7);
364}
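
/* Worked example (purely illustrative): for the modRM byte 0xD8
   (binary 11 011 000), epartIsReg is True, gregOfRM gives 3 (%ebx)
   and eregOfRM gives 0 (%eax). */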
365
366/* Get an 8/16/32-bit unsigned value out of the insn stream. */
367
368static UChar getUChar ( Int delta )
369{
370   UChar v = guest_code[delta+0];
371   return toUChar(v);
372}
373
374static UInt getUDisp16 ( Int delta )
375{
376   UInt v = guest_code[delta+1]; v <<= 8;
377   v |= guest_code[delta+0];
378   return v & 0xFFFF;
379}
380
381static UInt getUDisp32 ( Int delta )
382{
383   UInt v = guest_code[delta+3]; v <<= 8;
384   v |= guest_code[delta+2]; v <<= 8;
385   v |= guest_code[delta+1]; v <<= 8;
386   v |= guest_code[delta+0];
387   return v;
388}
389
390static UInt getUDisp ( Int size, Int delta )
391{
392   switch (size) {
393      case 4: return getUDisp32(delta);
394      case 2: return getUDisp16(delta);
395      case 1: return (UInt)getUChar(delta);
396      default: vpanic("getUDisp(x86)");
397   }
398   return 0; /*notreached*/
399}
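
/* Example: if the next four guest bytes are 78 56 34 12, getUDisp32
   returns 0x12345678 -- displacements and immediates are stored
   little-endian in the insn stream. */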
400
401
402/* Get a byte value out of the insn stream and sign-extend to 32
403   bits. */
404static UInt getSDisp8 ( Int delta )
405{
406   return extend_s_8to32( (UInt) (guest_code[delta]) );
407}
408
409static UInt getSDisp16 ( Int delta0 )
410{
411   UChar* eip = (UChar*)(&guest_code[delta0]);
412   UInt d = *eip++;
413   d |= ((*eip++) << 8);
414   return extend_s_16to32(d);
415}
416
417static UInt getSDisp ( Int size, Int delta )
418{
419   switch (size) {
420      case 4: return getUDisp32(delta);
421      case 2: return getSDisp16(delta);
422      case 1: return getSDisp8(delta);
423      default: vpanic("getSDisp(x86)");
424  }
425  return 0; /*notreached*/
426}
427
428
429/*------------------------------------------------------------*/
430/*--- Helpers for constructing IR.                         ---*/
431/*------------------------------------------------------------*/
432
433/* Create a 1/2/4 byte read of an x86 integer register.  For 16/8 bit
434   register references, we need to take the host endianness into
435   account.  Supplied value is 0 .. 7 and in the Intel instruction
436   encoding. */
437
438static IRType szToITy ( Int n )
439{
440   switch (n) {
441      case 1: return Ity_I8;
442      case 2: return Ity_I16;
443      case 4: return Ity_I32;
444      default: vpanic("szToITy(x86)");
445   }
446}
447
448/* On a little-endian host, less significant bits of the guest
449   registers are at lower addresses.  Therefore, a reference to a
450   register's low half has the same guest state offset as a reference to
451   the full register.
452*/
453static Int integerGuestRegOffset ( Int sz, UInt archreg )
454{
455   vassert(archreg < 8);
456
457   /* Correct for little-endian host only. */
458   vassert(!host_is_bigendian);
459
460   if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
461      switch (archreg) {
462         case R_EAX: return OFFB_EAX;
463         case R_EBX: return OFFB_EBX;
464         case R_ECX: return OFFB_ECX;
465         case R_EDX: return OFFB_EDX;
466         case R_ESI: return OFFB_ESI;
467         case R_EDI: return OFFB_EDI;
468         case R_ESP: return OFFB_ESP;
469         case R_EBP: return OFFB_EBP;
470         default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
471      }
472   }
473
474   vassert(archreg >= 4 && archreg < 8 && sz == 1);
475   switch (archreg-4) {
476      case R_EAX: return 1+ OFFB_EAX;
477      case R_EBX: return 1+ OFFB_EBX;
478      case R_ECX: return 1+ OFFB_ECX;
479      case R_EDX: return 1+ OFFB_EDX;
480      default: vpanic("integerGuestRegOffset(x86,le)(1h)");
481   }
482
483   /* NOTREACHED */
484   vpanic("integerGuestRegOffset(x86,le)");
485}
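
/* Example: integerGuestRegOffset(1, 4) -- an 8-bit access to archreg
   4, i.e. %ah -- yields 1 + OFFB_EAX, the second-lowest byte of the
   guest %eax slot on a little-endian host. */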
486
487static Int segmentGuestRegOffset ( UInt sreg )
488{
489   switch (sreg) {
490      case R_ES: return OFFB_ES;
491      case R_CS: return OFFB_CS;
492      case R_SS: return OFFB_SS;
493      case R_DS: return OFFB_DS;
494      case R_FS: return OFFB_FS;
495      case R_GS: return OFFB_GS;
496      default: vpanic("segmentGuestRegOffset(x86)");
497   }
498}
499
500static Int xmmGuestRegOffset ( UInt xmmreg )
501{
502   switch (xmmreg) {
503      case 0: return OFFB_XMM0;
504      case 1: return OFFB_XMM1;
505      case 2: return OFFB_XMM2;
506      case 3: return OFFB_XMM3;
507      case 4: return OFFB_XMM4;
508      case 5: return OFFB_XMM5;
509      case 6: return OFFB_XMM6;
510      case 7: return OFFB_XMM7;
511      default: vpanic("xmmGuestRegOffset");
512   }
513}
514
515/* Lanes of vector registers are always numbered from zero, lane zero
516   being the least significant (rightmost in the register).  */
517
518static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
519{
520   /* Correct for little-endian host only. */
521   vassert(!host_is_bigendian);
522   vassert(laneno >= 0 && laneno < 8);
523   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
524}
525
526static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
527{
528   /* Correct for little-endian host only. */
529   vassert(!host_is_bigendian);
530   vassert(laneno >= 0 && laneno < 4);
531   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
532}
533
534static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
535{
536   /* Correct for little-endian host only. */
537   vassert(!host_is_bigendian);
538   vassert(laneno >= 0 && laneno < 2);
539   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
540}
541
542static IRExpr* getIReg ( Int sz, UInt archreg )
543{
544   vassert(sz == 1 || sz == 2 || sz == 4);
545   vassert(archreg < 8);
546   return IRExpr_Get( integerGuestRegOffset(sz,archreg),
547                      szToITy(sz) );
548}
549
550/* Ditto, but write to a reg instead. */
551static void putIReg ( Int sz, UInt archreg, IRExpr* e )
552{
553   IRType ty = typeOfIRExpr(irsb->tyenv, e);
554   switch (sz) {
555      case 1: vassert(ty == Ity_I8); break;
556      case 2: vassert(ty == Ity_I16); break;
557      case 4: vassert(ty == Ity_I32); break;
558      default: vpanic("putIReg(x86)");
559   }
560   vassert(archreg < 8);
561   stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
562}
563
564static IRExpr* getSReg ( UInt sreg )
565{
566   return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
567}
568
569static void putSReg ( UInt sreg, IRExpr* e )
570{
571   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
572   stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
573}
574
575static IRExpr* getXMMReg ( UInt xmmreg )
576{
577   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
578}
579
580static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
581{
582   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
583}
584
585static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
586{
587   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
588}
589
590static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
591{
592   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
593}
594
595static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
596{
597   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
598}
599
600static void putXMMReg ( UInt xmmreg, IRExpr* e )
601{
602   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
603   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
604}
605
606static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
607{
608   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
609   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
610}
611
612static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
613{
614   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
615   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
616}
617
618static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
619{
620   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
621   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
622}
623
624static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
625{
626   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
627   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
628}
629
630static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
631{
632   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
633   stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
634}
635
636static void assign ( IRTemp dst, IRExpr* e )
637{
638   stmt( IRStmt_WrTmp(dst, e) );
639}
640
641static void storeLE ( IRExpr* addr, IRExpr* data )
642{
643   stmt( IRStmt_Store(Iend_LE, addr, data) );
644}
645
646static IRExpr* unop ( IROp op, IRExpr* a )
647{
648   return IRExpr_Unop(op, a);
649}
650
651static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
652{
653   return IRExpr_Binop(op, a1, a2);
654}
655
656static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
657{
658   return IRExpr_Triop(op, a1, a2, a3);
659}
660
661static IRExpr* mkexpr ( IRTemp tmp )
662{
663   return IRExpr_RdTmp(tmp);
664}
665
666static IRExpr* mkU8 ( UInt i )
667{
668   vassert(i < 256);
669   return IRExpr_Const(IRConst_U8( (UChar)i ));
670}
671
672static IRExpr* mkU16 ( UInt i )
673{
674   vassert(i < 65536);
675   return IRExpr_Const(IRConst_U16( (UShort)i ));
676}
677
678static IRExpr* mkU32 ( UInt i )
679{
680   return IRExpr_Const(IRConst_U32(i));
681}
682
683static IRExpr* mkU64 ( ULong i )
684{
685   return IRExpr_Const(IRConst_U64(i));
686}
687
688static IRExpr* mkU ( IRType ty, UInt i )
689{
690   if (ty == Ity_I8)  return mkU8(i);
691   if (ty == Ity_I16) return mkU16(i);
692   if (ty == Ity_I32) return mkU32(i);
693   /* If this panics, it usually means you passed a size (1,2,4)
694      value as the IRType, rather than a real IRType. */
695   vpanic("mkU(x86)");
696}
697
698static IRExpr* mkV128 ( UShort mask )
699{
700   return IRExpr_Const(IRConst_V128(mask));
701}
702
703static IRExpr* loadLE ( IRType ty, IRExpr* addr )
704{
705   return IRExpr_Load(Iend_LE, ty, addr);
706}
707
708static IROp mkSizedOp ( IRType ty, IROp op8 )
709{
710   Int adj;
711   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
712   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
713           || op8 == Iop_Mul8
714           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
715           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
716           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
717           || op8 == Iop_CasCmpNE8
718           || op8 == Iop_Not8);
719   adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
720   return adj + op8;
721}
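
/* This relies on the IROp encoding placing the 8/16/32(/64)-bit
   variants of each operation consecutively, so that, for example,
   mkSizedOp(Ity_I32, Iop_Add8) == Iop_Add8 + 2 == Iop_Add32. */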
722
723static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
724{
725   if (szSmall == 1 && szBig == 4) {
726      return signd ? Iop_8Sto32 : Iop_8Uto32;
727   }
728   if (szSmall == 1 && szBig == 2) {
729      return signd ? Iop_8Sto16 : Iop_8Uto16;
730   }
731   if (szSmall == 2 && szBig == 4) {
732      return signd ? Iop_16Sto32 : Iop_16Uto32;
733   }
734   vpanic("mkWidenOp(x86,guest)");
735}
736
737static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
738{
739   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
740   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
741   return unop(Iop_32to1,
742               binop(Iop_And32,
743                     unop(Iop_1Uto32,x),
744                     unop(Iop_1Uto32,y)));
745}
746
747/* Generate a compare-and-swap operation, operating on memory at
748   'addr'.  The expected value is 'expVal' and the new value is
749   'newVal'.  If the operation fails, then transfer control (with a
750   no-redir jump (XXX no -- see comment at top of this file)) to
751   'restart_point', which is presumably the address of the guest
752   instruction again -- retrying, essentially. */
753static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
754                    Addr32 restart_point )
755{
756   IRCAS* cas;
757   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
758   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
759   IRTemp oldTmp = newTemp(tyE);
760   IRTemp expTmp = newTemp(tyE);
761   vassert(tyE == tyN);
762   vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
763   assign(expTmp, expVal);
764   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
765                  NULL, mkexpr(expTmp), NULL, newVal );
766   stmt( IRStmt_CAS(cas) );
767   stmt( IRStmt_Exit(
768            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
769                   mkexpr(oldTmp), mkexpr(expTmp) ),
770            Ijk_Boring, /*Ijk_NoRedir*/
771            IRConst_U32( restart_point )
772         ));
773}
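
/* Sketch of how a LOCK-aware handler uses casLE (illustrative, not
   code from this file): having computed the effective address 'addr',
   the expected old value 'expd' and the new value 'newv', it emits

      casLE( mkexpr(addr), mkexpr(expd), mkexpr(newv),
             guest_EIP_curr_instr );

   so that a failed CAS side-exits back to the start of the insn and
   the whole instruction is retried. */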
774
775
776/*------------------------------------------------------------*/
777/*--- Helpers for %eflags.                                 ---*/
778/*------------------------------------------------------------*/
779
780/* -------------- Evaluating the flags-thunk. -------------- */
781
782/* Build IR to calculate all the eflags from stored
783   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
784   Ity_I32. */
785static IRExpr* mk_x86g_calculate_eflags_all ( void )
786{
787   IRExpr** args
788      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
789                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
790                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
791                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
792   IRExpr* call
793      = mkIRExprCCall(
794           Ity_I32,
795           0/*regparm*/,
796           "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
797           args
798        );
799   /* Exclude OP and NDEP from definedness checking.  We're only
800      interested in DEP1 and DEP2. */
801   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
802   return call;
803}
804
805/* Build IR to calculate some particular condition from stored
806   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
807   Ity_I1. */
808static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
809{
810   IRExpr** args
811      = mkIRExprVec_5( mkU32(cond),
812                       IRExpr_Get(OFFB_CC_OP,  Ity_I32),
813                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
814                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
815                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
816   IRExpr* call
817      = mkIRExprCCall(
818           Ity_I32,
819           0/*regparm*/,
820           "x86g_calculate_condition", &x86g_calculate_condition,
821           args
822        );
823   /* Exclude the requested condition, OP and NDEP from definedness
824      checking.  We're only interested in DEP1 and DEP2. */
825   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
826   return unop(Iop_32to1, call);
827}
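
/* Note on mcx_mask: bit i being set tells Memcheck not to apply
   definedness checking to argument i of the helper call.  So the mask
   (1<<0)|(1<<1)|(1<<4) above excludes the condition number, CC_OP and
   CC_NDEP, leaving only CC_DEP1 and CC_DEP2 checked. */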
828
829/* Build IR to calculate just the carry flag from stored
830   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
831static IRExpr* mk_x86g_calculate_eflags_c ( void )
832{
833   IRExpr** args
834      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
835                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
836                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
837                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
838   IRExpr* call
839      = mkIRExprCCall(
840           Ity_I32,
841           3/*regparm*/,
842           "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
843           args
844        );
845   /* Exclude OP and NDEP from definedness checking.  We're only
846      interested in DEP1 and DEP2. */
847   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
848   return call;
849}
850
851
852/* -------------- Building the flags-thunk. -------------- */
853
854/* The machinery in this section builds the flag-thunk following a
855   flag-setting operation.  Hence the various setFlags_* functions.
856*/
857
858static Bool isAddSub ( IROp op8 )
859{
860   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
861}
862
863static Bool isLogic ( IROp op8 )
864{
865   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
866}
867
868/* U-widen 8/16/32 bit int expr to 32. */
869static IRExpr* widenUto32 ( IRExpr* e )
870{
871   switch (typeOfIRExpr(irsb->tyenv,e)) {
872      case Ity_I32: return e;
873      case Ity_I16: return unop(Iop_16Uto32,e);
874      case Ity_I8:  return unop(Iop_8Uto32,e);
875      default: vpanic("widenUto32");
876   }
877}
878
879/* S-widen 8/16/32 bit int expr to 32. */
880static IRExpr* widenSto32 ( IRExpr* e )
881{
882   switch (typeOfIRExpr(irsb->tyenv,e)) {
883      case Ity_I32: return e;
884      case Ity_I16: return unop(Iop_16Sto32,e);
885      case Ity_I8:  return unop(Iop_8Sto32,e);
886      default: vpanic("widenSto32");
887   }
888}
889
890/* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
891   of these combinations make sense. */
892static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
893{
894   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
895   if (src_ty == dst_ty)
896      return e;
897   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
898      return unop(Iop_32to16, e);
899   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
900      return unop(Iop_32to8, e);
901
902   vex_printf("\nsrc, dst tys are: ");
903   ppIRType(src_ty);
904   vex_printf(", ");
905   ppIRType(dst_ty);
906   vex_printf("\n");
907   vpanic("narrowTo(x86)");
908}
909
910
911/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
912   auto-sized up to the real op. */
913
914static
915void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
916{
917   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
918
919   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
920
921   switch (op8) {
922      case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
923      case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
924      default:       ppIROp(op8);
925                     vpanic("setFlags_DEP1_DEP2(x86)");
926   }
927   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
928   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
929   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
930   /* Set NDEP even though it isn't used.  This makes redundant-PUT
931      elimination of previous stores to this field work better. */
932   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
933}
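
/* Example (illustrative): for a 32-bit "addl %ebx,%eax" the thunk
   ends up as CC_OP = X86G_CC_OP_ADDL, CC_DEP1 = old value of %eax,
   CC_DEP2 = value of %ebx, CC_NDEP = 0; all of %eflags can later be
   recomputed from those fields by the helpers above. */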
934
935
936/* Set the OP and DEP1 fields only, and write zero to DEP2. */
937
938static
939void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
940{
941   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
942
943   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
944
945   switch (op8) {
946      case Iop_Or8:
947      case Iop_And8:
948      case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
949      default:       ppIROp(op8);
950                     vpanic("setFlags_DEP1(x86)");
951   }
952   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
953   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
954   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
955   /* Set NDEP even though it isn't used.  This makes redundant-PUT
956      elimination of previous stores to this field work better. */
957   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
958}
959
960
961/* For shift operations, we put in the result and the undershifted
962   result, except when the shift amount is zero, in which case the
963   thunk is left unchanged. */
964
965static void setFlags_DEP1_DEP2_shift ( IROp    op32,
966                                       IRTemp  res,
967                                       IRTemp  resUS,
968                                       IRType  ty,
969                                       IRTemp  guard )
970{
971   Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
972
973   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
974   vassert(guard);
975
976   /* Both kinds of right shifts are handled by the same thunk
977      operation. */
978   switch (op32) {
979      case Iop_Shr32:
980      case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
981      case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
982      default:        ppIROp(op32);
983                      vpanic("setFlags_DEP1_DEP2_shift(x86)");
984   }
985
986   /* DEP1 contains the result, DEP2 contains the undershifted value. */
987   stmt( IRStmt_Put( OFFB_CC_OP,
988                     IRExpr_Mux0X( mkexpr(guard),
989                                   IRExpr_Get(OFFB_CC_OP,Ity_I32),
990                                   mkU32(ccOp))) );
991   stmt( IRStmt_Put( OFFB_CC_DEP1,
992                     IRExpr_Mux0X( mkexpr(guard),
993                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
994                                   widenUto32(mkexpr(res)))) );
995   stmt( IRStmt_Put( OFFB_CC_DEP2,
996                     IRExpr_Mux0X( mkexpr(guard),
997                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
998                                   widenUto32(mkexpr(resUS)))) );
999   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1000      elimination of previous stores to this field work better. */
1001   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
1002}
1003
1004
1005/* For the inc/dec case, we store in DEP1 the result value and in NDEP
1006   the former value of the carry flag, which unfortunately we have to
1007   compute. */
1008
1009static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
1010{
1011   Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
1012
1013   ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
1014   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
1015
1016   /* This has to come first, because calculating the C flag
1017      may require reading all four thunk fields. */
1018   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
1019   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
1020   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
1021   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
1022}
1023
1024
1025/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
1026   two arguments. */
1027
1028static
1029void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
1030{
1031   switch (ty) {
1032      case Ity_I8:
1033         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
1034         break;
1035      case Ity_I16:
1036         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
1037         break;
1038      case Ity_I32:
1039         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
1040         break;
1041      default:
1042         vpanic("setFlags_MUL(x86)");
1043   }
1044   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
1045   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
1046   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1047      elimination of previous stores to this field work better. */
1048   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
1049}
1050
1051
1052/* -------------- Condition codes. -------------- */
1053
1054/* Condition codes, using the Intel encoding.  */
1055
1056static HChar* name_X86Condcode ( X86Condcode cond )
1057{
1058   switch (cond) {
1059      case X86CondO:      return "o";
1060      case X86CondNO:     return "no";
1061      case X86CondB:      return "b";
1062      case X86CondNB:     return "nb";
1063      case X86CondZ:      return "z";
1064      case X86CondNZ:     return "nz";
1065      case X86CondBE:     return "be";
1066      case X86CondNBE:    return "nbe";
1067      case X86CondS:      return "s";
1068      case X86CondNS:     return "ns";
1069      case X86CondP:      return "p";
1070      case X86CondNP:     return "np";
1071      case X86CondL:      return "l";
1072      case X86CondNL:     return "nl";
1073      case X86CondLE:     return "le";
1074      case X86CondNLE:    return "nle";
1075      case X86CondAlways: return "ALWAYS";
1076      default: vpanic("name_X86Condcode");
1077   }
1078}
1079
1080static
1081X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
1082                                      Bool*        needInvert )
1083{
1084   vassert(cond >= X86CondO && cond <= X86CondNLE);
1085   if (cond & 1) {
1086      *needInvert = True;
1087      return cond-1;
1088   } else {
1089      *needInvert = False;
1090      return cond;
1091   }
1092}
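
/* Example: the negated conditions have odd encodings, so
   positiveIse_X86Condcode(X86CondNZ, &inv) returns X86CondZ and sets
   inv to True; the caller then evaluates the positive form and
   inverts the result. */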
1093
1094
1095/* -------------- Helpers for ADD/SUB with carry. -------------- */
1096
1097/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
1098   appropriately.
1099
1100   Optionally, generate a store for the 'tres' value.  This can either
1101   be a normal store, or it can be a cas-with-possible-failure style
1102   store:
1103
1104   if taddr is IRTemp_INVALID, then no store is generated.
1105
1106   if taddr is not IRTemp_INVALID, then a store (using taddr as
1107   the address) is generated:
1108
1109     if texpVal is IRTemp_INVALID then a normal store is
1110     generated, and restart_point must be zero (it is irrelevant).
1111
1112     if texpVal is not IRTemp_INVALID then a cas-style store is
1113     generated.  texpVal is the expected value, restart_point
1114     is the restart point if the store fails, and texpVal must
1115     have the same type as tres.
1116*/
1117static void helper_ADC ( Int sz,
1118                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1119                         /* info about optional store: */
1120                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1121{
1122   UInt    thunkOp;
1123   IRType  ty    = szToITy(sz);
1124   IRTemp  oldc  = newTemp(Ity_I32);
1125   IRTemp  oldcn = newTemp(ty);
1126   IROp    plus  = mkSizedOp(ty, Iop_Add8);
1127   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1128
1129   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1130   vassert(sz == 1 || sz == 2 || sz == 4);
1131   thunkOp = sz==4 ? X86G_CC_OP_ADCL
1132                   : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
1133
1134   /* oldc = old carry flag, 0 or 1 */
1135   assign( oldc,  binop(Iop_And32,
1136                        mk_x86g_calculate_eflags_c(),
1137                        mkU32(1)) );
1138
1139   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1140
1141   assign( tres, binop(plus,
1142                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
1143                       mkexpr(oldcn)) );
1144
1145   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1146      start of this function. */
1147   if (taddr != IRTemp_INVALID) {
1148      if (texpVal == IRTemp_INVALID) {
1149         vassert(restart_point == 0);
1150         storeLE( mkexpr(taddr), mkexpr(tres) );
1151      } else {
1152         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1153         /* .. and hence 'texpVal' has the same type as 'tres'. */
1154         casLE( mkexpr(taddr),
1155                mkexpr(texpVal), mkexpr(tres), restart_point );
1156      }
1157   }
1158
1159   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
1160   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
1161   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
1162                                                         mkexpr(oldcn)) )) );
1163   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1164}
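
/* Note that CC_DEP2 holds ta2 XOR oldcn rather than ta2 itself; the
   eflags helper for the ADC ops XORs the old carry (kept in CC_NDEP)
   back out to recover the second argument.  This describes the
   existing thunk convention, not new behaviour. */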
1165
1166
1167/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
1168   appropriately.  As with helper_ADC, possibly generate a store of
1169   the result -- see comments on helper_ADC for details.
1170*/
1171static void helper_SBB ( Int sz,
1172                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1173                         /* info about optional store: */
1174                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1175{
1176   UInt    thunkOp;
1177   IRType  ty    = szToITy(sz);
1178   IRTemp  oldc  = newTemp(Ity_I32);
1179   IRTemp  oldcn = newTemp(ty);
1180   IROp    minus = mkSizedOp(ty, Iop_Sub8);
1181   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1182
1183   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1184   vassert(sz == 1 || sz == 2 || sz == 4);
1185   thunkOp = sz==4 ? X86G_CC_OP_SBBL
1186                   : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
1187
1188   /* oldc = old carry flag, 0 or 1 */
1189   assign( oldc, binop(Iop_And32,
1190                       mk_x86g_calculate_eflags_c(),
1191                       mkU32(1)) );
1192
1193   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1194
1195   assign( tres, binop(minus,
1196                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
1197                       mkexpr(oldcn)) );
1198
1199   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1200      start of this function. */
1201   if (taddr != IRTemp_INVALID) {
1202      if (texpVal == IRTemp_INVALID) {
1203         vassert(restart_point == 0);
1204         storeLE( mkexpr(taddr), mkexpr(tres) );
1205      } else {
1206         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1207         /* .. and hence 'texpVal' has the same type as 'tres'. */
1208         casLE( mkexpr(taddr),
1209                mkexpr(texpVal), mkexpr(tres), restart_point );
1210      }
1211   }
1212
1213   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
1214   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
1215   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
1216                                                         mkexpr(oldcn)) )) );
1217   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1218}
1219
1220
1221/* -------------- Helpers for disassembly printing. -------------- */
1222
1223static HChar* nameGrp1 ( Int opc_aux )
1224{
1225   static HChar* grp1_names[8]
1226     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
1227   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
1228   return grp1_names[opc_aux];
1229}
1230
1231static HChar* nameGrp2 ( Int opc_aux )
1232{
1233   static HChar* grp2_names[8]
1234     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
1235   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
1236   return grp2_names[opc_aux];
1237}
1238
1239static HChar* nameGrp4 ( Int opc_aux )
1240{
1241   static HChar* grp4_names[8]
1242     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
1243   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
1244   return grp4_names[opc_aux];
1245}
1246
1247static HChar* nameGrp5 ( Int opc_aux )
1248{
1249   static HChar* grp5_names[8]
1250     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
1251   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
1252   return grp5_names[opc_aux];
1253}
1254
1255static HChar* nameGrp8 ( Int opc_aux )
1256{
1257   static HChar* grp8_names[8]
1258     = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
1259   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
1260   return grp8_names[opc_aux];
1261}
1262
1263static HChar* nameIReg ( Int size, Int reg )
1264{
1265   static HChar* ireg32_names[8]
1266     = { "%eax", "%ecx", "%edx", "%ebx",
1267         "%esp", "%ebp", "%esi", "%edi" };
1268   static HChar* ireg16_names[8]
1269     = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
1270   static HChar* ireg8_names[8]
1271     = { "%al", "%cl", "%dl", "%bl",
1272         "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
1273   if (reg < 0 || reg > 7) goto bad;
1274   switch (size) {
1275      case 4: return ireg32_names[reg];
1276      case 2: return ireg16_names[reg];
1277      case 1: return ireg8_names[reg];
1278   }
1279  bad:
1280   vpanic("nameIReg(X86)");
1281   return NULL; /*notreached*/
1282}
1283
1284static HChar* nameSReg ( UInt sreg )
1285{
1286   switch (sreg) {
1287      case R_ES: return "%es";
1288      case R_CS: return "%cs";
1289      case R_SS: return "%ss";
1290      case R_DS: return "%ds";
1291      case R_FS: return "%fs";
1292      case R_GS: return "%gs";
1293      default: vpanic("nameSReg(x86)");
1294   }
1295}
1296
1297static HChar* nameMMXReg ( Int mmxreg )
1298{
1299   static HChar* mmx_names[8]
1300     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
1301   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
1302   return mmx_names[mmxreg];
1303}
1304
1305static HChar* nameXMMReg ( Int xmmreg )
1306{
1307   static HChar* xmm_names[8]
1308     = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
1309         "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
1310   if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
1311   return xmm_names[xmmreg];
1312}
1313
1314static HChar* nameMMXGran ( Int gran )
1315{
1316   switch (gran) {
1317      case 0: return "b";
1318      case 1: return "w";
1319      case 2: return "d";
1320      case 3: return "q";
1321      default: vpanic("nameMMXGran(x86,guest)");
1322   }
1323}
1324
1325static HChar nameISize ( Int size )
1326{
1327   switch (size) {
1328      case 4: return 'l';
1329      case 2: return 'w';
1330      case 1: return 'b';
1331      default: vpanic("nameISize(x86)");
1332   }
1333}
1334
1335
1336/*------------------------------------------------------------*/
1337/*--- JMP helpers                                          ---*/
1338/*------------------------------------------------------------*/
1339
1340static void jmp_lit( IRJumpKind kind, Addr32 d32 )
1341{
1342   irsb->next     = mkU32(d32);
1343   irsb->jumpkind = kind;
1344}
1345
1346static void jmp_treg( IRJumpKind kind, IRTemp t )
1347{
1348   irsb->next = mkexpr(t);
1349   irsb->jumpkind = kind;
1350}
1351
1352static
1353void jcc_01( X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
1354{
1355   Bool        invert;
1356   X86Condcode condPos;
1357   condPos = positiveIse_X86Condcode ( cond, &invert );
1358   if (invert) {
1359      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1360                         Ijk_Boring,
1361                         IRConst_U32(d32_false) ) );
1362      irsb->next     = mkU32(d32_true);
1363      irsb->jumpkind = Ijk_Boring;
1364   } else {
1365      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1366                         Ijk_Boring,
1367                         IRConst_U32(d32_true) ) );
1368      irsb->next     = mkU32(d32_false);
1369      irsb->jumpkind = Ijk_Boring;
1370   }
1371}
1372
1373
1374/*------------------------------------------------------------*/
1375/*--- Disassembling addressing modes                       ---*/
1376/*------------------------------------------------------------*/
1377
1378static
1379HChar* sorbTxt ( UChar sorb )
1380{
1381   switch (sorb) {
1382      case 0:    return ""; /* no override */
1383      case 0x3E: return "%ds:";
1384      case 0x26: return "%es:";
1385      case 0x64: return "%fs:";
1386      case 0x65: return "%gs:";
1387      default: vpanic("sorbTxt(x86,guest)");
1388   }
1389}
1390
1391
1392/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
1393   linear address by adding any required segment override as indicated
1394   by sorb. */
1395static
1396IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
1397{
1398   Int    sreg;
1399   IRType hWordTy;
1400   IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
1401
1402   if (sorb == 0)
1403      /* the common case - no override */
1404      return virtual;
1405
1406   switch (sorb) {
1407      case 0x3E: sreg = R_DS; break;
1408      case 0x26: sreg = R_ES; break;
1409      case 0x64: sreg = R_FS; break;
1410      case 0x65: sreg = R_GS; break;
1411      default: vpanic("handleSegOverride(x86,guest)");
1412   }
1413
1414   hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
1415
1416   seg_selector = newTemp(Ity_I32);
1417   ldt_ptr      = newTemp(hWordTy);
1418   gdt_ptr      = newTemp(hWordTy);
1419   r64          = newTemp(Ity_I64);
1420
1421   assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
1422   assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
1423   assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
1424
1425   /*
1426   Call this to do the translation and limit checks:
1427   ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
1428                                 UInt seg_selector, UInt virtual_addr )
1429   */
1430   assign(
1431      r64,
1432      mkIRExprCCall(
1433         Ity_I64,
1434         0/*regparms*/,
1435         "x86g_use_seg_selector",
1436         &x86g_use_seg_selector,
1437         mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
1438                        mkexpr(seg_selector), virtual)
1439      )
1440   );
1441
1442   /* If the high 32 of the result are non-zero, there was a
1443      failure in address translation.  In which case, make a
1444      quick exit.
1445   */
1446   stmt(
1447      IRStmt_Exit(
1448         binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
1449         Ijk_MapFail,
1450         IRConst_U32( guest_EIP_curr_instr )
1451      )
1452   );
1453
1454   /* otherwise, here's the translated result. */
1455   return unop(Iop_64to32, mkexpr(r64));
1456}
1457
1458
1459/* Generate IR to calculate an address indicated by a ModRM and
1460   following SIB bytes.  The expression, and the number of bytes in
1461   the address mode, are returned.  Note that this fn should not be
1462   called if the R/M part of the address denotes a register instead of
1463   memory.  Text of the addressing mode is placed in buf, for use by
1464   the tracing (DIS) output.
1465
1466   The computed address is stored in a new tempreg, and the
1467   identity of the tempreg is returned.  */
1468
1469static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
1470{
1471   IRTemp tmp = newTemp(Ity_I32);
1472   assign( tmp, addr32 );
1473   return tmp;
1474}
1475
1476static
1477IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
1478{
1479   UChar mod_reg_rm = getIByte(delta);
1480   delta++;
1481
1482   buf[0] = (UChar)0;
1483
1484   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1485      jump table seems a bit excessive.
1486   */
1487   mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
1488   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1489                                            /* is now XX0XXYYY */
1490   mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
1491   switch (mod_reg_rm) {
1492
1493      /* (%eax) .. (%edi), not including (%esp) or (%ebp).
1494         --> GET %reg, t
1495      */
1496      case 0x00: case 0x01: case 0x02: case 0x03:
1497      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1498         { UChar rm = mod_reg_rm;
1499           DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
1500           *len = 1;
1501           return disAMode_copy2tmp(
1502                  handleSegOverride(sorb, getIReg(4,rm)));
1503         }
1504
1505      /* d8(%eax) ... d8(%edi), not including d8(%esp)
1506         --> GET %reg, t ; ADDL d8, t
1507      */
1508      case 0x08: case 0x09: case 0x0A: case 0x0B:
1509      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1510         { UChar rm = toUChar(mod_reg_rm & 7);
1511           UInt  d  = getSDisp8(delta);
1512           DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
1513           *len = 2;
1514           return disAMode_copy2tmp(
1515                  handleSegOverride(sorb,
1516                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1517         }
1518
1519      /* d32(%eax) ... d32(%edi), not including d32(%esp)
1520         --> GET %reg, t ; ADDL d32, t
1521      */
1522      case 0x10: case 0x11: case 0x12: case 0x13:
1523      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1524         { UChar rm = toUChar(mod_reg_rm & 7);
1525           UInt  d  = getUDisp32(delta);
1526           DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
1527           *len = 5;
1528           return disAMode_copy2tmp(
1529                  handleSegOverride(sorb,
1530                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1531         }
1532
1533      /* a register, %eax .. %edi.  This shouldn't happen. */
1534      case 0x18: case 0x19: case 0x1A: case 0x1B:
1535      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1536         vpanic("disAMode(x86): not an addr!");
1537
1538      /* a 32-bit literal address
1539         --> MOV d32, tmp
1540      */
1541      case 0x05:
1542         { UInt d = getUDisp32(delta);
1543           *len = 5;
1544           DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
1545           return disAMode_copy2tmp(
1546                     handleSegOverride(sorb, mkU32(d)));
1547         }
1548
1549      case 0x04: {
1550         /* SIB, with no displacement.  Special cases:
1551            -- %esp cannot act as an index value.
1552               If index_r indicates %esp, zero is used for the index.
1553            -- when mod is zero and base indicates EBP, base is instead
1554               a 32-bit literal.
1555            It's all madness, I tell you.  Extract %index, %base and
1556            scale from the SIB byte.  The value denoted is then:
1557               | %index == %ESP && %base == %EBP
1558               = d32 following SIB byte
1559               | %index == %ESP && %base != %EBP
1560               = %base
1561               | %index != %ESP && %base == %EBP
1562               = d32 following SIB byte + (%index << scale)
1563               | %index != %ESP && %base != %EBP
1564               = %base + (%index << scale)
1565
1566            What happens to the souls of CPU architects who dream up such
1567            horrendous schemes, do you suppose?
1568         */
1569         UChar sib     = getIByte(delta);
1570         UChar scale   = toUChar((sib >> 6) & 3);
1571         UChar index_r = toUChar((sib >> 3) & 7);
1572         UChar base_r  = toUChar(sib & 7);
1573         delta++;
1574
1575         if (index_r != R_ESP && base_r != R_EBP) {
1576            DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
1577                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1578            *len = 2;
1579            return
1580               disAMode_copy2tmp(
1581               handleSegOverride(sorb,
1582                  binop(Iop_Add32,
1583                        getIReg(4,base_r),
1584                        binop(Iop_Shl32, getIReg(4,index_r),
1585                              mkU8(scale)))));
1586         }
1587
1588         if (index_r != R_ESP && base_r == R_EBP) {
1589            UInt d = getUDisp32(delta);
1590            DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
1591                      nameIReg(4,index_r), 1<<scale);
1592            *len = 6;
1593            return
1594               disAMode_copy2tmp(
1595               handleSegOverride(sorb,
1596                  binop(Iop_Add32,
1597                        binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
1598                        mkU32(d))));
1599         }
1600
1601         if (index_r == R_ESP && base_r != R_EBP) {
1602            DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
1603            *len = 2;
1604            return disAMode_copy2tmp(
1605                   handleSegOverride(sorb, getIReg(4,base_r)));
1606         }
1607
1608         if (index_r == R_ESP && base_r == R_EBP) {
1609            UInt d = getUDisp32(delta);
1610            DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
1611            *len = 6;
1612            return disAMode_copy2tmp(
1613                   handleSegOverride(sorb, mkU32(d)));
1614         }
1615         /*NOTREACHED*/
1616         vassert(0);
1617      }
1618
1619      /* SIB, with 8-bit displacement.  Special cases:
1620         -- %esp cannot act as an index value.
1621            If index_r indicates %esp, zero is used for the index.
1622         Denoted value is:
1623            | %index == %ESP
1624            = d8 + %base
1625            | %index != %ESP
1626            = d8 + %base + (%index << scale)
1627      */
1628      case 0x0C: {
1629         UChar sib     = getIByte(delta);
1630         UChar scale   = toUChar((sib >> 6) & 3);
1631         UChar index_r = toUChar((sib >> 3) & 7);
1632         UChar base_r  = toUChar(sib & 7);
1633         UInt  d       = getSDisp8(delta+1);
1634
1635         if (index_r == R_ESP) {
1636            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1637                                   (Int)d, nameIReg(4,base_r));
1638            *len = 3;
1639            return disAMode_copy2tmp(
1640                   handleSegOverride(sorb,
1641                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1642         } else {
1643            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1644                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1645            *len = 3;
1646            return
1647                disAMode_copy2tmp(
1648                handleSegOverride(sorb,
1649                  binop(Iop_Add32,
1650                        binop(Iop_Add32,
1651                              getIReg(4,base_r),
1652                              binop(Iop_Shl32,
1653                                    getIReg(4,index_r), mkU8(scale))),
1654                        mkU32(d))));
1655         }
1656         /*NOTREACHED*/
1657         vassert(0);
1658      }
1659
1660      /* SIB, with 32-bit displacement.  Special cases:
1661         -- %esp cannot act as an index value.
1662            If index_r indicates %esp, zero is used for the index.
1663         Denoted value is:
1664            | %index == %ESP
1665            = d32 + %base
1666            | %index != %ESP
1667            = d32 + %base + (%index << scale)
1668      */
1669      case 0x14: {
1670         UChar sib     = getIByte(delta);
1671         UChar scale   = toUChar((sib >> 6) & 3);
1672         UChar index_r = toUChar((sib >> 3) & 7);
1673         UChar base_r  = toUChar(sib & 7);
1674         UInt d        = getUDisp32(delta+1);
1675
1676         if (index_r == R_ESP) {
1677            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1678                                   (Int)d, nameIReg(4,base_r));
1679            *len = 6;
1680            return disAMode_copy2tmp(
1681                   handleSegOverride(sorb,
1682                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1683         } else {
1684            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1685                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1686            *len = 6;
1687            return
1688                disAMode_copy2tmp(
1689                handleSegOverride(sorb,
1690                  binop(Iop_Add32,
1691                        binop(Iop_Add32,
1692                              getIReg(4,base_r),
1693                              binop(Iop_Shl32,
1694                                    getIReg(4,index_r), mkU8(scale))),
1695                        mkU32(d))));
1696         }
1697         /*NOTREACHED*/
1698         vassert(0);
1699      }
1700
1701      default:
1702         vpanic("disAMode(x86)");
1703         return 0; /*notreached*/
1704   }
1705}
1706
1707
1708/* Figure out the number of (insn-stream) bytes constituting the amode
1709   beginning at delta.  Is useful for getting hold of literals beyond
1710   the end of the amode before it has been disassembled.  */
1711
1712static UInt lengthAMode ( Int delta )
1713{
1714   UChar mod_reg_rm = getIByte(delta); delta++;
1715
1716   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1717      jump table seems a bit excessive.
1718   */
1719   mod_reg_rm &= 0xC7;               /* is now XX000YYY */
1720   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1721                                     /* is now XX0XXYYY */
1722   mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
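   /* Example: a modRM byte of 0x94 has mod = 10, reg = 010, rm = 100;
      the squeeze above turns it into 0x14, which selects the "SIB with
      32-bit displacement" case below, for a total amode length of 6. */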
1723   switch (mod_reg_rm) {
1724
1725      /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
1726      case 0x00: case 0x01: case 0x02: case 0x03:
1727      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1728         return 1;
1729
1730      /* d8(%eax) ... d8(%edi), not including d8(%esp). */
1731      case 0x08: case 0x09: case 0x0A: case 0x0B:
1732      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1733         return 2;
1734
1735      /* d32(%eax) ... d32(%edi), not including d32(%esp). */
1736      case 0x10: case 0x11: case 0x12: case 0x13:
1737      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1738         return 5;
1739
1740      /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
1741      case 0x18: case 0x19: case 0x1A: case 0x1B:
1742      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1743         return 1;
1744
1745      /* a 32-bit literal address. */
1746      case 0x05: return 5;
1747
1748      /* SIB, no displacement.  */
1749      case 0x04: {
1750         UChar sib    = getIByte(delta);
1751         UChar base_r = toUChar(sib & 7);
1752         if (base_r == R_EBP) return 6; else return 2;
1753      }
1754      /* SIB, with 8-bit displacement.  */
1755      case 0x0C: return 3;
1756
1757      /* SIB, with 32-bit displacement.  */
1758      case 0x14: return 6;
1759
1760      default:
1761         vpanic("lengthAMode");
1762         return 0; /*notreached*/
1763   }
1764}
1765
1766/*------------------------------------------------------------*/
1767/*--- Disassembling common idioms                          ---*/
1768/*------------------------------------------------------------*/
1769
1770/* Handle binary integer instructions of the form
1771      op E, G  meaning
1772      op reg-or-mem, reg
1773   Is passed a ptr to the modRM byte, the actual operation, and the
1774   data size.  Returns the address advanced completely over this
1775   instruction.
1776
1777   E(src) is reg-or-mem
1778   G(dst) is reg.
1779
1780   If E is reg, -->    GET %G,  tmp
1781                       OP %E,   tmp
1782                       PUT tmp, %G
1783
1784   If E is mem and OP is not reversible,
1785                -->    (getAddr E) -> tmpa
1786                       LD (tmpa), tmpa
1787                       GET %G, tmp2
1788                       OP tmpa, tmp2
1789                       PUT tmp2, %G
1790
1791   If E is mem and OP is reversible
1792                -->    (getAddr E) -> tmpa
1793                       LD (tmpa), tmpa
1794                       OP %G, tmpa
1795                       PUT tmpa, %G
1796*/
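/* For instance, the E,G form of 32-bit ADD (opcode 0x03) with a memory
   operand, "addl (%esi),%eax", takes the E-is-mem path below: the amode
   is decoded, the dword at (%esi) is loaded, added to the current value
   of %eax, and the result is written back to %eax, with the flags thunk
   set up for the addition. */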
1797static
1798UInt dis_op2_E_G ( UChar       sorb,
1799                   Bool        addSubCarry,
1800                   IROp        op8,
1801                   Bool        keep,
1802                   Int         size,
1803                   Int         delta0,
1804                   HChar*      t_x86opc )
1805{
1806   HChar   dis_buf[50];
1807   Int     len;
1808   IRType  ty   = szToITy(size);
1809   IRTemp  dst1 = newTemp(ty);
1810   IRTemp  src  = newTemp(ty);
1811   IRTemp  dst0 = newTemp(ty);
1812   UChar   rm   = getUChar(delta0);
1813   IRTemp  addr = IRTemp_INVALID;
1814
1815   /* addSubCarry == True indicates the intended operation is
1816      add-with-carry or subtract-with-borrow. */
1817   if (addSubCarry) {
1818      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
1819      vassert(keep);
1820   }
1821
1822   if (epartIsReg(rm)) {
1823      /* Specially handle XOR reg,reg, because that doesn't really
1824         depend on reg, and doing the obvious thing potentially
1825         generates a spurious value check failure due to the bogus
1826         dependency.  Ditto SBB reg,reg. */
1827      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
1828          && gregOfRM(rm) == eregOfRM(rm)) {
1829         putIReg(size, gregOfRM(rm), mkU(ty,0));
1830      }
1831      assign( dst0, getIReg(size,gregOfRM(rm)) );
1832      assign( src,  getIReg(size,eregOfRM(rm)) );
1833
1834      if (addSubCarry && op8 == Iop_Add8) {
1835         helper_ADC( size, dst1, dst0, src,
1836                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1837         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1838      } else
1839      if (addSubCarry && op8 == Iop_Sub8) {
1840         helper_SBB( size, dst1, dst0, src,
1841                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1842         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1843      } else {
1844         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
1845         if (isAddSub(op8))
1846            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1847         else
1848            setFlags_DEP1(op8, dst1, ty);
1849         if (keep)
1850            putIReg(size, gregOfRM(rm), mkexpr(dst1));
1851      }
1852
1853      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1854                          nameIReg(size,eregOfRM(rm)),
1855                          nameIReg(size,gregOfRM(rm)));
1856      return 1+delta0;
1857   } else {
1858      /* E refers to memory */
1859      addr = disAMode ( &len, sorb, delta0, dis_buf);
1860      assign( dst0, getIReg(size,gregOfRM(rm)) );
1861      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
1862
1863      if (addSubCarry && op8 == Iop_Add8) {
1864         helper_ADC( size, dst1, dst0, src,
1865                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1866         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1867      } else
1868      if (addSubCarry && op8 == Iop_Sub8) {
1869         helper_SBB( size, dst1, dst0, src,
1870                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1871         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1872      } else {
1873         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
1874         if (isAddSub(op8))
1875            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1876         else
1877            setFlags_DEP1(op8, dst1, ty);
1878         if (keep)
1879            putIReg(size, gregOfRM(rm), mkexpr(dst1));
1880      }
1881
1882      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1883                          dis_buf,nameIReg(size,gregOfRM(rm)));
1884      return len+delta0;
1885   }
1886}
1887
1888
1889
1890/* Handle binary integer instructions of the form
1891      op G, E  meaning
1892      op reg, reg-or-mem
1893   Is passed a ptr to the modRM byte, the actual operation, and the
1894   data size.  Returns the address advanced completely over this
1895   instruction.
1896
1897   G(src) is reg.
1898   E(dst) is reg-or-mem
1899
1900   If E is reg, -->    GET %E,  tmp
1901                       OP %G,   tmp
1902                       PUT tmp, %E
1903
1904   If E is mem, -->    (getAddr E) -> tmpa
1905                       LD (tmpa), tmpv
1906                       OP %G, tmpv
1907                       ST tmpv, (tmpa)
1908*/
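/* If the LOCK prefix is present (locked == True) and E is memory, the
   read-modify-write below is expressed with casLE rather than a plain
   storeLE, so the guest's atomic update is modelled faithfully. */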
1909static
1910UInt dis_op2_G_E ( UChar       sorb,
1911                   Bool        locked,
1912                   Bool        addSubCarry,
1913                   IROp        op8,
1914                   Bool        keep,
1915                   Int         size,
1916                   Int         delta0,
1917                   HChar*      t_x86opc )
1918{
1919   HChar   dis_buf[50];
1920   Int     len;
1921   IRType  ty   = szToITy(size);
1922   IRTemp  dst1 = newTemp(ty);
1923   IRTemp  src  = newTemp(ty);
1924   IRTemp  dst0 = newTemp(ty);
1925   UChar   rm   = getIByte(delta0);
1926   IRTemp  addr = IRTemp_INVALID;
1927
1928   /* addSubCarry == True indicates the intended operation is
1929      add-with-carry or subtract-with-borrow. */
1930   if (addSubCarry) {
1931      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
1932      vassert(keep);
1933   }
1934
1935   if (epartIsReg(rm)) {
1936      /* Specially handle XOR reg,reg, because that doesn't really
1937         depend on reg, and doing the obvious thing potentially
1938         generates a spurious value check failure due to the bogus
1939         dependency.  Ditto SBB reg,reg.*/
1940      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
1941          && gregOfRM(rm) == eregOfRM(rm)) {
1942         putIReg(size, eregOfRM(rm), mkU(ty,0));
1943      }
1944      assign(dst0, getIReg(size,eregOfRM(rm)));
1945      assign(src,  getIReg(size,gregOfRM(rm)));
1946
1947      if (addSubCarry && op8 == Iop_Add8) {
1948         helper_ADC( size, dst1, dst0, src,
1949                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1950         putIReg(size, eregOfRM(rm), mkexpr(dst1));
1951      } else
1952      if (addSubCarry && op8 == Iop_Sub8) {
1953         helper_SBB( size, dst1, dst0, src,
1954                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1955         putIReg(size, eregOfRM(rm), mkexpr(dst1));
1956      } else {
1957         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
1958         if (isAddSub(op8))
1959            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1960         else
1961            setFlags_DEP1(op8, dst1, ty);
1962         if (keep)
1963            putIReg(size, eregOfRM(rm), mkexpr(dst1));
1964      }
1965
1966      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1967                          nameIReg(size,gregOfRM(rm)),
1968                          nameIReg(size,eregOfRM(rm)));
1969      return 1+delta0;
1970   }
1971
1972   /* E refers to memory */
1973   {
1974      addr = disAMode ( &len, sorb, delta0, dis_buf);
1975      assign(dst0, loadLE(ty,mkexpr(addr)));
1976      assign(src,  getIReg(size,gregOfRM(rm)));
1977
1978      if (addSubCarry && op8 == Iop_Add8) {
1979         if (locked) {
1980            /* cas-style store */
1981            helper_ADC( size, dst1, dst0, src,
1982                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
1983         } else {
1984            /* normal store */
1985            helper_ADC( size, dst1, dst0, src,
1986                        /*store*/addr, IRTemp_INVALID, 0 );
1987         }
1988      } else
1989      if (addSubCarry && op8 == Iop_Sub8) {
1990         if (locked) {
1991            /* cas-style store */
1992            helper_SBB( size, dst1, dst0, src,
1993                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
1994         } else {
1995            /* normal store */
1996            helper_SBB( size, dst1, dst0, src,
1997                        /*store*/addr, IRTemp_INVALID, 0 );
1998         }
1999      } else {
2000         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2001         if (keep) {
2002            if (locked) {
2003               if (0) vex_printf("locked case\n" );
2004               casLE( mkexpr(addr),
2005                      mkexpr(dst0)/*expval*/,
2006                      mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
2007            } else {
2008               if (0) vex_printf("nonlocked case\n");
2009               storeLE(mkexpr(addr), mkexpr(dst1));
2010            }
2011         }
2012         if (isAddSub(op8))
2013            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2014         else
2015            setFlags_DEP1(op8, dst1, ty);
2016      }
2017
2018      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
2019                          nameIReg(size,gregOfRM(rm)), dis_buf);
2020      return len+delta0;
2021   }
2022}
2023
2024
2025/* Handle move instructions of the form
2026      mov E, G  meaning
2027      mov reg-or-mem, reg
2028   Is passed a ptr to the modRM byte, and the data size.  Returns
2029   the address advanced completely over this instruction.
2030
2031   E(src) is reg-or-mem
2032   G(dst) is reg.
2033
2034   If E is reg, -->    GET %E,  tmpv
2035                       PUT tmpv, %G
2036
2037   If E is mem  -->    (getAddr E) -> tmpa
2038                       LD (tmpa), tmpb
2039                       PUT tmpb, %G
2040*/
2041static
2042UInt dis_mov_E_G ( UChar       sorb,
2043                   Int         size,
2044                   Int         delta0 )
2045{
2046   Int len;
2047   UChar rm = getIByte(delta0);
2048   HChar dis_buf[50];
2049
2050   if (epartIsReg(rm)) {
2051      putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
2052      DIP("mov%c %s,%s\n", nameISize(size),
2053                           nameIReg(size,eregOfRM(rm)),
2054                           nameIReg(size,gregOfRM(rm)));
2055      return 1+delta0;
2056   }
2057
2058   /* E refers to memory */
2059   {
2060      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
2061      putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
2062      DIP("mov%c %s,%s\n", nameISize(size),
2063                           dis_buf,nameIReg(size,gregOfRM(rm)));
2064      return delta0+len;
2065   }
2066}
2067
2068
2069/* Handle move instructions of the form
2070      mov G, E  meaning
2071      mov reg, reg-or-mem
2072   Is passed a ptr to the modRM byte, and the data size.  Returns
2073   the address advanced completely over this instruction.
2074
2075   G(src) is reg.
2076   E(dst) is reg-or-mem
2077
2078   If E is reg, -->    GET %G,  tmp
2079                       PUT tmp, %E
2080
2081   If E is mem, -->    (getAddr E) -> tmpa
2082                       GET %G, tmpv
2083                       ST tmpv, (tmpa)
2084*/
2085static
2086UInt dis_mov_G_E ( UChar       sorb,
2087                   Int         size,
2088                   Int         delta0 )
2089{
2090   Int len;
2091   UChar rm = getIByte(delta0);
2092   HChar dis_buf[50];
2093
2094   if (epartIsReg(rm)) {
2095      putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
2096      DIP("mov%c %s,%s\n", nameISize(size),
2097                           nameIReg(size,gregOfRM(rm)),
2098                           nameIReg(size,eregOfRM(rm)));
2099      return 1+delta0;
2100   }
2101
2102   /* E refers to memory */
2103   {
2104      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
2105      storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
2106      DIP("mov%c %s,%s\n", nameISize(size),
2107                           nameIReg(size,gregOfRM(rm)), dis_buf);
2108      return len+delta0;
2109   }
2110}
2111
2112
2113/* op $immediate, AL/AX/EAX. */
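/* These are the short accumulator-form encodings, e.g. "addl $imm32,
   %eax" (opcode 0x05) and the analogous OR/ADC/SBB/AND/SUB/XOR forms;
   compare-style forms are handled with keep == False, so the flags are
   set but the result is discarded. */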
2114static
2115UInt dis_op_imm_A ( Int    size,
2116                    Bool   carrying,
2117                    IROp   op8,
2118                    Bool   keep,
2119                    Int    delta,
2120                    HChar* t_x86opc )
2121{
2122   IRType ty   = szToITy(size);
2123   IRTemp dst0 = newTemp(ty);
2124   IRTemp src  = newTemp(ty);
2125   IRTemp dst1 = newTemp(ty);
2126   UInt lit    = getUDisp(size,delta);
2127   assign(dst0, getIReg(size,R_EAX));
2128   assign(src,  mkU(ty,lit));
2129
2130   if (isAddSub(op8) && !carrying) {
2131      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2132      setFlags_DEP1_DEP2(op8, dst0, src, ty);
2133   }
2134   else
2135   if (isLogic(op8)) {
2136      vassert(!carrying);
2137      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2138      setFlags_DEP1(op8, dst1, ty);
2139   }
2140   else
2141   if (op8 == Iop_Add8 && carrying) {
2142      helper_ADC( size, dst1, dst0, src,
2143                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2144   }
2145   else
2146   if (op8 == Iop_Sub8 && carrying) {
2147      helper_SBB( size, dst1, dst0, src,
2148                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2149   }
2150   else
2151      vpanic("dis_op_imm_A(x86,guest)");
2152
2153   if (keep)
2154      putIReg(size, R_EAX, mkexpr(dst1));
2155
2156   DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
2157                           lit, nameIReg(size,R_EAX));
2158   return delta+size;
2159}
2160
2161
2162/* Sign- and Zero-extending moves. */
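/* For example "movzbl %cl,%edx" widens an 8-bit source to 32 bits with
   the unsigned widening op chosen by mkWidenOp(szs,szd,sign_extend),
   while "movsbl" gets the sign-extending variant. */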
2163static
2164UInt dis_movx_E_G ( UChar      sorb,
2165                    Int delta, Int szs, Int szd, Bool sign_extend )
2166{
2167   UChar rm = getIByte(delta);
2168   if (epartIsReg(rm)) {
2169      if (szd == szs) {
2170         // mutant case.  See #250799
2171         putIReg(szd, gregOfRM(rm),
2172                           getIReg(szs,eregOfRM(rm)));
2173      } else {
2174         // normal case
2175         putIReg(szd, gregOfRM(rm),
2176                      unop(mkWidenOp(szs,szd,sign_extend),
2177                           getIReg(szs,eregOfRM(rm))));
2178      }
2179      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2180                               nameISize(szs), nameISize(szd),
2181                               nameIReg(szs,eregOfRM(rm)),
2182                               nameIReg(szd,gregOfRM(rm)));
2183      return 1+delta;
2184   }
2185
2186   /* E refers to memory */
2187   {
2188      Int    len;
2189      HChar  dis_buf[50];
2190      IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
2191      if (szd == szs) {
2192         // mutant case.  See #250799
2193         putIReg(szd, gregOfRM(rm),
2194                           loadLE(szToITy(szs),mkexpr(addr)));
2195      } else {
2196         // normal case
2197         putIReg(szd, gregOfRM(rm),
2198                      unop(mkWidenOp(szs,szd,sign_extend),
2199                           loadLE(szToITy(szs),mkexpr(addr))));
2200      }
2201      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2202                               nameISize(szs), nameISize(szd),
2203                               dis_buf, nameIReg(szd,gregOfRM(rm)));
2204      return len+delta;
2205   }
2206}
2207
2208
2209/* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
2210   16 / 8 bit quantity in the given IRTemp.  */
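/* For the 32-bit case this uses Iop_DivModS64to32/Iop_DivModU64to32,
   whose 64-bit result carries the quotient in the low half and the
   remainder in the high half; hence the quotient lands in %eax and the
   remainder in %edx, as the architecture requires. */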
2211static
2212void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
2213{
2214   IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
2215   IRTemp src64 = newTemp(Ity_I64);
2216   IRTemp dst64 = newTemp(Ity_I64);
2217   switch (sz) {
2218      case 4:
2219         assign( src64, binop(Iop_32HLto64,
2220                              getIReg(4,R_EDX), getIReg(4,R_EAX)) );
2221         assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
2222         putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
2223         putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
2224         break;
2225      case 2: {
2226         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2227         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2228         assign( src64, unop(widen3264,
2229                             binop(Iop_16HLto32,
2230                                   getIReg(2,R_EDX), getIReg(2,R_EAX))) );
2231         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
2232         putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
2233         putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
2234         break;
2235      }
2236      case 1: {
2237         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2238         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2239         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
2240         assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
2241         assign( dst64,
2242                 binop(op, mkexpr(src64),
2243                           unop(widen1632, unop(widen816, mkexpr(t)))) );
2244         putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
2245                           unop(Iop_64to32,mkexpr(dst64)))) );
2246         putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
2247                           unop(Iop_64HIto32,mkexpr(dst64)))) );
2248         break;
2249      }
2250      default: vpanic("codegen_div(x86)");
2251   }
2252}
2253
2254
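/* Group 1 extended opcodes.  The reg field of the modRM byte selects
   the operation: 0=ADD, 1=OR, 2=ADC, 3=SBB, 4=AND, 5=SUB, 6=XOR, 7=CMP.
   CMP sets the flags but discards the result, hence the
   "gregOfRM(modrm) < 7" guards on the write-backs below. */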
2255static
2256UInt dis_Grp1 ( UChar sorb, Bool locked,
2257                Int delta, UChar modrm,
2258                Int am_sz, Int d_sz, Int sz, UInt d32 )
2259{
2260   Int     len;
2261   HChar   dis_buf[50];
2262   IRType  ty   = szToITy(sz);
2263   IRTemp  dst1 = newTemp(ty);
2264   IRTemp  src  = newTemp(ty);
2265   IRTemp  dst0 = newTemp(ty);
2266   IRTemp  addr = IRTemp_INVALID;
2267   IROp    op8  = Iop_INVALID;
2268   UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
2269
2270   switch (gregOfRM(modrm)) {
2271      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
2272      case 2: break;  // ADC
2273      case 3: break;  // SBB
2274      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
2275      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
2276      /*NOTREACHED*/
2277      default: vpanic("dis_Grp1: unhandled case");
2278   }
2279
2280   if (epartIsReg(modrm)) {
2281      vassert(am_sz == 1);
2282
2283      assign(dst0, getIReg(sz,eregOfRM(modrm)));
2284      assign(src,  mkU(ty,d32 & mask));
2285
2286      if (gregOfRM(modrm) == 2 /* ADC */) {
2287         helper_ADC( sz, dst1, dst0, src,
2288                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2289      } else
2290      if (gregOfRM(modrm) == 3 /* SBB */) {
2291         helper_SBB( sz, dst1, dst0, src,
2292                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2293      } else {
2294         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2295         if (isAddSub(op8))
2296            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2297         else
2298            setFlags_DEP1(op8, dst1, ty);
2299      }
2300
2301      if (gregOfRM(modrm) < 7)
2302         putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2303
2304      delta += (am_sz + d_sz);
2305      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
2306                              nameIReg(sz,eregOfRM(modrm)));
2307   } else {
2308      addr = disAMode ( &len, sorb, delta, dis_buf);
2309
2310      assign(dst0, loadLE(ty,mkexpr(addr)));
2311      assign(src, mkU(ty,d32 & mask));
2312
2313      if (gregOfRM(modrm) == 2 /* ADC */) {
2314         if (locked) {
2315            /* cas-style store */
2316            helper_ADC( sz, dst1, dst0, src,
2317                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2318         } else {
2319            /* normal store */
2320            helper_ADC( sz, dst1, dst0, src,
2321                        /*store*/addr, IRTemp_INVALID, 0 );
2322         }
2323      } else
2324      if (gregOfRM(modrm) == 3 /* SBB */) {
2325         if (locked) {
2326            /* cas-style store */
2327            helper_SBB( sz, dst1, dst0, src,
2328                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2329         } else {
2330            /* normal store */
2331            helper_SBB( sz, dst1, dst0, src,
2332                        /*store*/addr, IRTemp_INVALID, 0 );
2333         }
2334      } else {
2335         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2336         if (gregOfRM(modrm) < 7) {
2337            if (locked) {
2338               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
2339                                    mkexpr(dst1)/*newVal*/,
2340                                    guest_EIP_curr_instr );
2341            } else {
2342               storeLE(mkexpr(addr), mkexpr(dst1));
2343            }
2344         }
2345         if (isAddSub(op8))
2346            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2347         else
2348            setFlags_DEP1(op8, dst1, ty);
2349      }
2350
2351      delta += (len+d_sz);
2352      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
2353                              d32, dis_buf);
2354   }
2355   return delta;
2356}
2357
2358
2359/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
2360   expression. */
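/* The reg field of the modRM byte selects the operation: 0=ROL, 1=ROR,
   2=RCL, 3=RCR, 4=SHL/SAL, 5=SHR, 7=SAR; the value 6 is rejected as
   undecodable below. */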
2361
2362static
2363UInt dis_Grp2 ( UChar sorb,
2364                Int delta, UChar modrm,
2365                Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
2366                HChar* shift_expr_txt, Bool* decode_OK )
2367{
2368   /* delta on entry points at the modrm byte. */
2369   HChar  dis_buf[50];
2370   Int    len;
2371   Bool   isShift, isRotate, isRotateC;
2372   IRType ty    = szToITy(sz);
2373   IRTemp dst0  = newTemp(ty);
2374   IRTemp dst1  = newTemp(ty);
2375   IRTemp addr  = IRTemp_INVALID;
2376
2377   *decode_OK = True;
2378
2379   vassert(sz == 1 || sz == 2 || sz == 4);
2380
2381   /* Put value to shift/rotate in dst0. */
2382   if (epartIsReg(modrm)) {
2383      assign(dst0, getIReg(sz, eregOfRM(modrm)));
2384      delta += (am_sz + d_sz);
2385   } else {
2386      addr = disAMode ( &len, sorb, delta, dis_buf);
2387      assign(dst0, loadLE(ty,mkexpr(addr)));
2388      delta += len + d_sz;
2389   }
2390
2391   isShift = False;
2392   switch (gregOfRM(modrm)) { case 4: case 5: case 7: isShift = True; }
2393
2394   isRotate = False;
2395   switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
2396
2397   isRotateC = False;
2398   switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
2399
2400   if (gregOfRM(modrm) == 6) {
2401      *decode_OK = False;
2402      return delta;
2403   }
2404
2405   if (!isShift && !isRotate && !isRotateC) {
2406      /*NOTREACHED*/
2407      vpanic("dis_Grp2(Reg): unhandled case(x86)");
2408   }
2409
2410   if (isRotateC) {
2411      /* call a helper; these insns are so ridiculous they do not
2412         deserve better */
2413      Bool     left = toBool(gregOfRM(modrm) == 2);
2414      IRTemp   r64  = newTemp(Ity_I64);
2415      IRExpr** args
2416         = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
2417                          widenUto32(shift_expr),   /* rotate amount */
2418                          widenUto32(mk_x86g_calculate_eflags_all()),
2419                          mkU32(sz) );
2420      assign( r64, mkIRExprCCall(
2421                      Ity_I64,
2422                      0/*regparm*/,
2423                      left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
2424                      left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
2425                      args
2426                   )
2427            );
2428      /* new eflags in hi half r64; new value in lo half r64 */
2429      assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
2430      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2431      stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
2432      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2433      /* Set NDEP even though it isn't used.  This makes redundant-PUT
2434         elimination of previous stores to this field work better. */
2435      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2436   }
2437
2438   if (isShift) {
2439
2440      IRTemp pre32     = newTemp(Ity_I32);
2441      IRTemp res32     = newTemp(Ity_I32);
2442      IRTemp res32ss   = newTemp(Ity_I32);
2443      IRTemp shift_amt = newTemp(Ity_I8);
2444      IROp   op32;
2445
2446      switch (gregOfRM(modrm)) {
2447         case 4: op32 = Iop_Shl32; break;
2448         case 5: op32 = Iop_Shr32; break;
2449         case 7: op32 = Iop_Sar32; break;
2450         /*NOTREACHED*/
2451         default: vpanic("dis_Grp2:shift"); break;
2452      }
2453
2454      /* Widen the value to be shifted to 32 bits, do the shift, and
2455         narrow back down.  This seems surprisingly long-winded, but
2456         unfortunately the Intel semantics requires that 8/16-bit
2457         shifts give defined results for shift values all the way up
2458         to 31, and this seems the simplest way to do it.  It has the
2459         advantage that the only IR level shifts generated are of 32
2460         bit values, and the shift amount is guaranteed to be in the
2461         range 0 .. 31, thereby observing the IR semantics requiring
2462         all shift values to be in the range 0 .. 2^word_size-1. */
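      /* Example: for "shrb $3, %al" with %al == 0x80, pre32 is
         0x00000080 (zero-widened), res32 is 0x10 (narrowed back into
         %al), and res32ss is 0x20 -- the value after one fewer shift --
         from which the flags thunk can recover the bit shifted out
         last, i.e. the new carry flag. */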
2463
2464      /* shift_amt = shift_expr & 31, regardless of operation size */
2465      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
2466
2467      /* suitably widen the value to be shifted to 32 bits. */
2468      assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
2469                                     : widenUto32(mkexpr(dst0)) );
2470
2471      /* res32 = pre32 `shift` shift_amt */
2472      assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
2473
2474      /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
2475      assign( res32ss,
2476              binop(op32,
2477                    mkexpr(pre32),
2478                    binop(Iop_And8,
2479                          binop(Iop_Sub8,
2480                                mkexpr(shift_amt), mkU8(1)),
2481                          mkU8(31))) );
2482
2483      /* Build the flags thunk. */
2484      setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
2485
2486      /* Narrow the result back down. */
2487      assign( dst1, narrowTo(ty, mkexpr(res32)) );
2488
2489   } /* if (isShift) */
2490
2491   else
2492   if (isRotate) {
2493      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
2494      Bool   left      = toBool(gregOfRM(modrm) == 0);
2495      IRTemp rot_amt   = newTemp(Ity_I8);
2496      IRTemp rot_amt32 = newTemp(Ity_I8);
2497      IRTemp oldFlags  = newTemp(Ity_I32);
2498
2499      /* rot_amt = shift_expr & mask */
2500      /* By masking the rotate amount thusly, the IR-level Shl/Shr
2501         expressions never shift beyond the word size and thus remain
2502         well defined. */
2503      assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
2504
2505      if (ty == Ity_I32)
2506         assign(rot_amt, mkexpr(rot_amt32));
2507      else
2508         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
2509
2510      if (left) {
2511
2512         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
2513         assign(dst1,
2514            binop( mkSizedOp(ty,Iop_Or8),
2515                   binop( mkSizedOp(ty,Iop_Shl8),
2516                          mkexpr(dst0),
2517                          mkexpr(rot_amt)
2518                   ),
2519                   binop( mkSizedOp(ty,Iop_Shr8),
2520                          mkexpr(dst0),
2521                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2522                   )
2523            )
2524         );
2525         ccOp += X86G_CC_OP_ROLB;
2526
2527      } else { /* right */
2528
2529         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
2530         assign(dst1,
2531            binop( mkSizedOp(ty,Iop_Or8),
2532                   binop( mkSizedOp(ty,Iop_Shr8),
2533                          mkexpr(dst0),
2534                          mkexpr(rot_amt)
2535                   ),
2536                   binop( mkSizedOp(ty,Iop_Shl8),
2537                          mkexpr(dst0),
2538                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2539                   )
2540            )
2541         );
2542         ccOp += X86G_CC_OP_RORB;
2543
2544      }
2545
2546      /* dst1 now holds the rotated value.  Build flag thunk.  We
2547         need the resulting value for this, and the previous flags.
2548         Except don't set it if the rotate count is zero. */
2549
2550      assign(oldFlags, mk_x86g_calculate_eflags_all());
2551
2552      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
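      /* IRExpr_Mux0X(cond, expr0, exprX) yields expr0 when the 8-bit
         cond is zero and exprX otherwise, so a rotate count of zero
         leaves the existing thunk in place, matching the architectural
         rule that a zero count does not affect the flags. */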
2553      stmt( IRStmt_Put( OFFB_CC_OP,
2554                        IRExpr_Mux0X( mkexpr(rot_amt32),
2555                                      IRExpr_Get(OFFB_CC_OP,Ity_I32),
2556                                      mkU32(ccOp))) );
2557      stmt( IRStmt_Put( OFFB_CC_DEP1,
2558                        IRExpr_Mux0X( mkexpr(rot_amt32),
2559                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
2560                                      widenUto32(mkexpr(dst1)))) );
2561      stmt( IRStmt_Put( OFFB_CC_DEP2,
2562                        IRExpr_Mux0X( mkexpr(rot_amt32),
2563                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
2564                                      mkU32(0))) );
2565      stmt( IRStmt_Put( OFFB_CC_NDEP,
2566                        IRExpr_Mux0X( mkexpr(rot_amt32),
2567                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
2568                                      mkexpr(oldFlags))) );
2569   } /* if (isRotate) */
2570
2571   /* Save result, and finish up. */
2572   if (epartIsReg(modrm)) {
2573      putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2574      if (vex_traceflags & VEX_TRACE_FE) {
2575         vex_printf("%s%c ",
2576                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2577         if (shift_expr_txt)
2578            vex_printf("%s", shift_expr_txt);
2579         else
2580            ppIRExpr(shift_expr);
2581         vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
2582      }
2583   } else {
2584      storeLE(mkexpr(addr), mkexpr(dst1));
2585      if (vex_traceflags & VEX_TRACE_FE) {
2586         vex_printf("%s%c ",
2587                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2588         if (shift_expr_txt)
2589            vex_printf("%s", shift_expr_txt);
2590         else
2591            ppIRExpr(shift_expr);
2592         vex_printf(", %s\n", dis_buf);
2593      }
2594   }
2595   return delta;
2596}
2597
2598
2599/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
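/* The reg field selects 4=BT, 5=BTS, 6=BTR, 7=BTC; values 0 .. 3 are
   rejected as undecodable, as are byte-sized operands. */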
2600static
2601UInt dis_Grp8_Imm ( UChar sorb,
2602                    Bool locked,
2603                    Int delta, UChar modrm,
2604                    Int am_sz, Int sz, UInt src_val,
2605                    Bool* decode_OK )
2606{
2607   /* src_val denotes a d8.
2608      And delta on entry points at the modrm byte. */
2609
2610   IRType ty     = szToITy(sz);
2611   IRTemp t2     = newTemp(Ity_I32);
2612   IRTemp t2m    = newTemp(Ity_I32);
2613   IRTemp t_addr = IRTemp_INVALID;
2614   HChar  dis_buf[50];
2615   UInt   mask;
2616
2617   /* we're optimists :-) */
2618   *decode_OK = True;
2619
2620   /* Limit src_val -- the bit offset -- to something within a word.
2621      The Intel docs say that literal offsets larger than a word are
2622      masked in this way. */
2623   switch (sz) {
2624      case 2:  src_val &= 15; break;
2625      case 4:  src_val &= 31; break;
2626      default: *decode_OK = False; return delta;
2627   }
2628
2629   /* Invent a mask suitable for the operation. */
2630   switch (gregOfRM(modrm)) {
2631      case 4: /* BT */  mask = 0;               break;
2632      case 5: /* BTS */ mask = 1 << src_val;    break;
2633      case 6: /* BTR */ mask = ~(1 << src_val); break;
2634      case 7: /* BTC */ mask = 1 << src_val;    break;
2635         /* If this needs to be extended, probably simplest to make a
2636            new function to handle the other cases (0 .. 3).  The
2637            Intel docs do not, however, indicate any use for 0 .. 3,
2638            so we don't expect this to happen. */
2639      default: *decode_OK = False; return delta;
2640   }
2641
2642   /* Fetch the value to be tested and modified into t2, which is
2643      32-bits wide regardless of sz. */
2644   if (epartIsReg(modrm)) {
2645      vassert(am_sz == 1);
2646      assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
2647      delta += (am_sz + 1);
2648      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2649                              src_val, nameIReg(sz,eregOfRM(modrm)));
2650   } else {
2651      Int len;
2652      t_addr = disAMode ( &len, sorb, delta, dis_buf);
2653      delta  += (len+1);
2654      assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
2655      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2656                              src_val, dis_buf);
2657   }
2658
2659   /* Compute the new value into t2m, if non-BT. */
2660   switch (gregOfRM(modrm)) {
2661      case 4: /* BT */
2662         break;
2663      case 5: /* BTS */
2664         assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
2665         break;
2666      case 6: /* BTR */
2667         assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
2668         break;
2669      case 7: /* BTC */
2670         assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
2671         break;
2672      default:
2673         /*NOTREACHED*/ /*the previous switch guards this*/
2674         vassert(0);
2675   }
2676
2677   /* Write the result back, if non-BT.  If the CAS fails then we
2678      side-exit from the trace at this point, and so the flag state is
2679      not affected.  This is of course as required. */
2680   if (gregOfRM(modrm) != 4 /* BT */) {
2681      if (epartIsReg(modrm)) {
2682         putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
2683      } else {
2684         if (locked) {
2685            casLE( mkexpr(t_addr),
2686                   narrowTo(ty, mkexpr(t2))/*expd*/,
2687                   narrowTo(ty, mkexpr(t2m))/*new*/,
2688                   guest_EIP_curr_instr );
2689         } else {
2690            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
2691         }
2692      }
2693   }
2694
2695   /* Copy relevant bit from t2 into the carry flag. */
2696   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
2697   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2698   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2699   stmt( IRStmt_Put(
2700            OFFB_CC_DEP1,
2701            binop(Iop_And32,
2702                  binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
2703                  mkU32(1))
2704       ));
2705   /* Set NDEP even though it isn't used.  This makes redundant-PUT
2706      elimination of previous stores to this field work better. */
2707   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2708
2709   return delta;
2710}
2711
2712
2713/* Signed/unsigned widening multiply.  Generate IR to multiply the
2714   value in EAX/AX/AL by the given IRTemp, and park the result in
2715   EDX:EAX/DX:AX/AX.
2716*/
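/* For the 32-bit case the full 64-bit product is formed with
   Iop_MullU32/Iop_MullS32 and split so that the high half goes to %edx
   and the low half to %eax; for the 8-bit case the whole 16-bit product
   is written to %ax. */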
2717static void codegen_mulL_A_D ( Int sz, Bool syned,
2718                               IRTemp tmp, HChar* tmp_txt )
2719{
2720   IRType ty = szToITy(sz);
2721   IRTemp t1 = newTemp(ty);
2722
2723   assign( t1, getIReg(sz, R_EAX) );
2724
2725   switch (ty) {
2726      case Ity_I32: {
2727         IRTemp res64   = newTemp(Ity_I64);
2728         IRTemp resHi   = newTemp(Ity_I32);
2729         IRTemp resLo   = newTemp(Ity_I32);
2730         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
2731         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2732         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
2733         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2734         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
2735         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
2736         putIReg(4, R_EDX, mkexpr(resHi));
2737         putIReg(4, R_EAX, mkexpr(resLo));
2738         break;
2739      }
2740      case Ity_I16: {
2741         IRTemp res32   = newTemp(Ity_I32);
2742         IRTemp resHi   = newTemp(Ity_I16);
2743         IRTemp resLo   = newTemp(Ity_I16);
2744         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
2745         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2746         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
2747         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2748         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
2749         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
2750         putIReg(2, R_EDX, mkexpr(resHi));
2751         putIReg(2, R_EAX, mkexpr(resLo));
2752         break;
2753      }
2754      case Ity_I8: {
2755         IRTemp res16   = newTemp(Ity_I16);
2756         IRTemp resHi   = newTemp(Ity_I8);
2757         IRTemp resLo   = newTemp(Ity_I8);
2758         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
2759         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2760         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
2761         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2762         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
2763         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
2764         putIReg(2, R_EAX, mkexpr(res16));
2765         break;
2766      }
2767      default:
2768         vpanic("codegen_mulL_A_D(x86)");
2769   }
2770   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
2771}
2772
2773
2774/* Group 3 extended opcodes. */
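/* The reg field of the modRM byte selects the operation: 0=TEST imm,
   1=undefined, 2=NOT, 3=NEG, 4=MUL, 5=IMUL, 6=DIV, 7=IDIV. */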
2775static
2776UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
2777{
2778   UInt    d32;
2779   UChar   modrm;
2780   HChar   dis_buf[50];
2781   Int     len;
2782   IRTemp  addr;
2783   IRType  ty = szToITy(sz);
2784   IRTemp  t1 = newTemp(ty);
2785   IRTemp dst1, src, dst0;
2786
2787   *decode_OK = True; /* may change this later */
2788
2789   modrm = getIByte(delta);
2790
2791   if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
2792      /* LOCK prefix only allowed with not and neg subopcodes */
2793      *decode_OK = False;
2794      return delta;
2795   }
2796
2797   if (epartIsReg(modrm)) {
2798      switch (gregOfRM(modrm)) {
2799         case 0: { /* TEST */
2800            delta++; d32 = getUDisp(sz, delta); delta += sz;
2801            dst1 = newTemp(ty);
2802            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2803                               getIReg(sz,eregOfRM(modrm)),
2804                               mkU(ty,d32)));
2805            setFlags_DEP1( Iop_And8, dst1, ty );
2806            DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
2807                                      nameIReg(sz, eregOfRM(modrm)));
2808            break;
2809         }
2810         case 1: /* UNDEFINED */
2811           /* The Intel docs imply this insn is undefined and binutils
2812              agrees.  Unfortunately Core 2 will run it (with who
2813              knows what result?)  sandpile.org reckons it's an alias
2814              for case 0.  We play safe. */
2815           *decode_OK = False;
2816           break;
2817         case 2: /* NOT */
2818            delta++;
2819            putIReg(sz, eregOfRM(modrm),
2820                        unop(mkSizedOp(ty,Iop_Not8),
2821                             getIReg(sz, eregOfRM(modrm))));
2822            DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2823            break;
2824         case 3: /* NEG */
2825            delta++;
2826            dst0 = newTemp(ty);
2827            src  = newTemp(ty);
2828            dst1 = newTemp(ty);
2829            assign(dst0, mkU(ty,0));
2830            assign(src,  getIReg(sz,eregOfRM(modrm)));
2831            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
2832            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
2833            putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2834            DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2835            break;
2836         case 4: /* MUL (unsigned widening) */
2837            delta++;
2838            src = newTemp(ty);
2839            assign(src, getIReg(sz,eregOfRM(modrm)));
2840            codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
2841            break;
2842         case 5: /* IMUL (signed widening) */
2843            delta++;
2844            src = newTemp(ty);
2845            assign(src, getIReg(sz,eregOfRM(modrm)));
2846            codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
2847            break;
2848         case 6: /* DIV */
2849            delta++;
2850            assign( t1, getIReg(sz, eregOfRM(modrm)) );
2851            codegen_div ( sz, t1, False );
2852            DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2853            break;
2854         case 7: /* IDIV */
2855            delta++;
2856            assign( t1, getIReg(sz, eregOfRM(modrm)) );
2857            codegen_div ( sz, t1, True );
2858            DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2859            break;
2860         default:
2861            /* This can't happen - gregOfRM should return 0 .. 7 only */
2862            vpanic("Grp3(x86)");
2863      }
2864   } else {
2865      addr = disAMode ( &len, sorb, delta, dis_buf );
2866      t1   = newTemp(ty);
2867      delta += len;
2868      assign(t1, loadLE(ty,mkexpr(addr)));
2869      switch (gregOfRM(modrm)) {
2870         case 0: { /* TEST */
2871            d32 = getUDisp(sz, delta); delta += sz;
2872            dst1 = newTemp(ty);
2873            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2874                               mkexpr(t1), mkU(ty,d32)));
2875            setFlags_DEP1( Iop_And8, dst1, ty );
2876            DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
2877            break;
2878         }
2879         case 1: /* UNDEFINED */
2880           /* See comment above on R case */
2881           *decode_OK = False;
2882           break;
2883         case 2: /* NOT */
2884            dst1 = newTemp(ty);
2885            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
2886            if (locked) {
2887               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
2888                                    guest_EIP_curr_instr );
2889            } else {
2890               storeLE( mkexpr(addr), mkexpr(dst1) );
2891            }
2892            DIP("not%c %s\n", nameISize(sz), dis_buf);
2893            break;
2894         case 3: /* NEG */
2895            dst0 = newTemp(ty);
2896            src  = newTemp(ty);
2897            dst1 = newTemp(ty);
2898            assign(dst0, mkU(ty,0));
2899            assign(src,  mkexpr(t1));
2900            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
2901                               mkexpr(dst0), mkexpr(src)));
2902            if (locked) {
2903               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
2904                                    guest_EIP_curr_instr );
2905            } else {
2906               storeLE( mkexpr(addr), mkexpr(dst1) );
2907            }
2908            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
2909            DIP("neg%c %s\n", nameISize(sz), dis_buf);
2910            break;
2911         case 4: /* MUL */
2912            codegen_mulL_A_D ( sz, False, t1, dis_buf );
2913            break;
2914         case 5: /* IMUL */
2915            codegen_mulL_A_D ( sz, True, t1, dis_buf );
2916            break;
2917         case 6: /* DIV */
2918            codegen_div ( sz, t1, False );
2919            DIP("div%c %s\n", nameISize(sz), dis_buf);
2920            break;
2921         case 7: /* IDIV */
2922            codegen_div ( sz, t1, True );
2923            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
2924            break;
2925         default:
2926            /* This can't happen - gregOfRM should return 0 .. 7 only */
2927            vpanic("Grp3(x86)");
2928      }
2929   }
2930   return delta;
2931}
2932
2933
2934/* Group 4 extended opcodes. */
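/* Operates on a byte-sized E operand; the reg field selects 0=INC or
   1=DEC, and all other values are rejected as undecodable. */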
2935static
2936UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
2937{
2938   Int   alen;
2939   UChar modrm;
2940   HChar dis_buf[50];
2941   IRType ty = Ity_I8;
2942   IRTemp t1 = newTemp(ty);
2943   IRTemp t2 = newTemp(ty);
2944
2945   *decode_OK = True;
2946
2947   modrm = getIByte(delta);
2948
2949   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
2950      /* LOCK prefix only allowed with inc and dec subopcodes */
2951      *decode_OK = False;
2952      return delta;
2953   }
2954
2955   if (epartIsReg(modrm)) {
2956      assign(t1, getIReg(1, eregOfRM(modrm)));
2957      switch (gregOfRM(modrm)) {
2958         case 0: /* INC */
2959            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
2960            putIReg(1, eregOfRM(modrm), mkexpr(t2));
2961            setFlags_INC_DEC( True, t2, ty );
2962            break;
2963         case 1: /* DEC */
2964            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
2965            putIReg(1, eregOfRM(modrm), mkexpr(t2));
2966            setFlags_INC_DEC( False, t2, ty );
2967            break;
2968         default:
2969            *decode_OK = False;
2970            return delta;
2971      }
2972      delta++;
2973      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
2974                      nameIReg(1, eregOfRM(modrm)));
2975   } else {
2976      IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
2977      assign( t1, loadLE(ty, mkexpr(addr)) );
2978      switch (gregOfRM(modrm)) {
2979         case 0: /* INC */
2980            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
2981            if (locked) {
2982               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
2983                      guest_EIP_curr_instr );
2984            } else {
2985               storeLE( mkexpr(addr), mkexpr(t2) );
2986            }
2987            setFlags_INC_DEC( True, t2, ty );
2988            break;
2989         case 1: /* DEC */
2990            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
2991            if (locked) {
2992               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
2993                      guest_EIP_curr_instr );
2994            } else {
2995               storeLE( mkexpr(addr), mkexpr(t2) );
2996            }
2997            setFlags_INC_DEC( False, t2, ty );
2998            break;
2999         default:
3000            *decode_OK = False;
3001            return delta;
3002      }
3003      delta += alen;
3004      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
3005   }
3006   return delta;
3007}
3008
3009
3010/* Group 5 extended opcodes. */
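/* The reg field selects 0=INC Ev, 1=DEC Ev, 2=CALL Ev (near indirect),
   4=JMP Ev (near indirect), 6=PUSH Ev; the far forms (3 and 5) and the
   value 7 are not handled here and cause decode failure. */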
3011static
3012UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
3013                DisResult* dres, Bool* decode_OK )
3014{
3015   Int     len;
3016   UChar   modrm;
3017   HChar   dis_buf[50];
3018   IRTemp  addr = IRTemp_INVALID;
3019   IRType  ty = szToITy(sz);
3020   IRTemp  t1 = newTemp(ty);
3021   IRTemp  t2 = IRTemp_INVALID;
3022
3023   *decode_OK = True;
3024
3025   modrm = getIByte(delta);
3026
3027   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
3028      /* LOCK prefix only allowed with inc and dec subopcodes */
3029      *decode_OK = False;
3030      return delta;
3031   }
3032
3033   if (epartIsReg(modrm)) {
3034      assign(t1, getIReg(sz,eregOfRM(modrm)));
3035      switch (gregOfRM(modrm)) {
3036         case 0: /* INC */
3037            vassert(sz == 2 || sz == 4);
3038            t2 = newTemp(ty);
3039            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3040                             mkexpr(t1), mkU(ty,1)));
3041            setFlags_INC_DEC( True, t2, ty );
3042            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3043            break;
3044         case 1: /* DEC */
3045            vassert(sz == 2 || sz == 4);
3046            t2 = newTemp(ty);
3047            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3048                             mkexpr(t1), mkU(ty,1)));
3049            setFlags_INC_DEC( False, t2, ty );
3050            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3051            break;
3052         case 2: /* call Ev */
3053            vassert(sz == 4);
3054            t2 = newTemp(Ity_I32);
3055            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3056            putIReg(4, R_ESP, mkexpr(t2));
3057            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
3058            jmp_treg(Ijk_Call,t1);
3059            dres->whatNext = Dis_StopHere;
3060            break;
3061         case 4: /* jmp Ev */
3062            vassert(sz == 4);
3063            jmp_treg(Ijk_Boring,t1);
3064            dres->whatNext = Dis_StopHere;
3065            break;
3066         case 6: /* PUSH Ev */
3067            vassert(sz == 4 || sz == 2);
3068            t2 = newTemp(Ity_I32);
3069            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3070            putIReg(4, R_ESP, mkexpr(t2) );
3071            storeLE( mkexpr(t2), mkexpr(t1) );
3072            break;
3073         default:
3074            *decode_OK = False;
3075            return delta;
3076      }
3077      delta++;
3078      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3079                       nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3080   } else {
3081      addr = disAMode ( &len, sorb, delta, dis_buf );
3082      assign(t1, loadLE(ty,mkexpr(addr)));
3083      switch (gregOfRM(modrm)) {
3084         case 0: /* INC */
3085            t2 = newTemp(ty);
3086            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3087                             mkexpr(t1), mkU(ty,1)));
3088            if (locked) {
3089               casLE( mkexpr(addr),
3090                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3091            } else {
3092               storeLE(mkexpr(addr),mkexpr(t2));
3093            }
3094            setFlags_INC_DEC( True, t2, ty );
3095            break;
3096         case 1: /* DEC */
3097            t2 = newTemp(ty);
3098            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3099                             mkexpr(t1), mkU(ty,1)));
3100            if (locked) {
3101               casLE( mkexpr(addr),
3102                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3103            } else {
3104               storeLE(mkexpr(addr),mkexpr(t2));
3105            }
3106            setFlags_INC_DEC( False, t2, ty );
3107            break;
3108         case 2: /* call Ev */
3109            vassert(sz == 4);
3110            t2 = newTemp(Ity_I32);
3111            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3112            putIReg(4, R_ESP, mkexpr(t2));
3113            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
3114            jmp_treg(Ijk_Call,t1);
3115            dres->whatNext = Dis_StopHere;
3116            break;
3117         case 4: /* JMP Ev */
3118            vassert(sz == 4);
3119            jmp_treg(Ijk_Boring,t1);
3120            dres->whatNext = Dis_StopHere;
3121            break;
3122         case 6: /* PUSH Ev */
3123            vassert(sz == 4 || sz == 2);
3124            t2 = newTemp(Ity_I32);
3125            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3126            putIReg(4, R_ESP, mkexpr(t2) );
3127            storeLE( mkexpr(t2), mkexpr(t1) );
3128            break;
3129         default:
3130            *decode_OK = False;
3131            return delta;
3132      }
3133      delta += len;
3134      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3135                       nameISize(sz), dis_buf);
3136   }
3137   return delta;
3138}
3139
3140
3141/*------------------------------------------------------------*/
3142/*--- Disassembling string ops (including REP prefixes)    ---*/
3143/*------------------------------------------------------------*/
3144
3145/* Code shared by all the string ops */
3146static
3147void dis_string_op_increment(Int sz, IRTemp t_inc)
3148{
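   /* The guest D flag is kept in OFFB_DFLAG as +1 or -1.  Shifting it
      left by sz/2 (1 for word ops, 2 for dword ops) turns that into
      the per-element step of +/-2 or +/-4; byte ops use it as-is. */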
3149   if (sz == 4 || sz == 2) {
3150      assign( t_inc,
3151              binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
3152                               mkU8(sz/2) ) );
3153   } else {
3154      assign( t_inc,
3155              IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
3156   }
3157}
3158
3159static
3160void dis_string_op( void (*dis_OP)( Int, IRTemp ),
3161                    Int sz, HChar* name, UChar sorb )
3162{
3163   IRTemp t_inc = newTemp(Ity_I32);
3164   vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
3165   dis_string_op_increment(sz, t_inc);
3166   dis_OP( sz, t_inc );
3167   DIP("%s%c\n", name, nameISize(sz));
3168}
3169
3170static
3171void dis_MOVS ( Int sz, IRTemp t_inc )
3172{
3173   IRType ty = szToITy(sz);
3174   IRTemp td = newTemp(Ity_I32);   /* EDI */
3175   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3176
3177   assign( td, getIReg(4, R_EDI) );
3178   assign( ts, getIReg(4, R_ESI) );
3179
3180   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
3181
3182   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3183   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3184}
3185
3186static
3187void dis_LODS ( Int sz, IRTemp t_inc )
3188{
3189   IRType ty = szToITy(sz);
3190   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3191
3192   assign( ts, getIReg(4, R_ESI) );
3193
3194   putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
3195
3196   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3197}
3198
3199static
3200void dis_STOS ( Int sz, IRTemp t_inc )
3201{
3202   IRType ty = szToITy(sz);
3203   IRTemp ta = newTemp(ty);        /* EAX */
3204   IRTemp td = newTemp(Ity_I32);   /* EDI */
3205
3206   assign( ta, getIReg(sz, R_EAX) );
3207   assign( td, getIReg(4, R_EDI) );
3208
3209   storeLE( mkexpr(td), mkexpr(ta) );
3210
3211   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3212}
3213
3214static
3215void dis_CMPS ( Int sz, IRTemp t_inc )
3216{
3217   IRType ty  = szToITy(sz);
3218   IRTemp tdv = newTemp(ty);      /* (EDI) */
3219   IRTemp tsv = newTemp(ty);      /* (ESI) */
3220   IRTemp td  = newTemp(Ity_I32); /*  EDI  */
3221   IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
3222
3223   assign( td, getIReg(4, R_EDI) );
3224   assign( ts, getIReg(4, R_ESI) );
3225
3226   assign( tdv, loadLE(ty,mkexpr(td)) );
3227   assign( tsv, loadLE(ty,mkexpr(ts)) );
3228
3229   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
3230
3231   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3232   putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3233}
3234
3235static
3236void dis_SCAS ( Int sz, IRTemp t_inc )
3237{
3238   IRType ty  = szToITy(sz);
3239   IRTemp ta  = newTemp(ty);       /*  EAX  */
3240   IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
3241   IRTemp tdv = newTemp(ty);       /* (EDI) */
3242
3243   assign( ta, getIReg(sz, R_EAX) );
3244   assign( td, getIReg(4, R_EDI) );
3245
3246   assign( tdv, loadLE(ty,mkexpr(td)) );
3247   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
3248
3249   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3250}
3251
3252
3253/* Wrap the appropriate string op inside a REP/REPE/REPNE.
3254   We assume the insn is the last one in the basic block, and so emit a jump
3255   to the next insn, rather than just falling through. */
3256static
3257void dis_REP_op ( X86Condcode cond,
3258                  void (*dis_OP)(Int, IRTemp),
3259                  Int sz, Addr32 eip, Addr32 eip_next, HChar* name )
3260{
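   /* Shape of the generated IR: exit to eip_next if ECX is zero;
      otherwise decrement ECX and do one iteration of the string op.
      For plain REP we then jump back to eip; for REPE/REPNE we loop
      back to eip only while the condition holds, and otherwise fall
      through to eip_next. */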
3261   IRTemp t_inc = newTemp(Ity_I32);
3262   IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
3263
3264   assign( tc, getIReg(4,R_ECX) );
3265
3266   stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
3267                      Ijk_Boring,
3268                      IRConst_U32(eip_next) ) );
3269
3270   putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
3271
3272   dis_string_op_increment(sz, t_inc);
3273   dis_OP (sz, t_inc);
3274
3275   if (cond == X86CondAlways) {
3276      jmp_lit(Ijk_Boring,eip);
3277   } else {
3278      stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
3279                         Ijk_Boring,
3280                         IRConst_U32(eip) ) );
3281      jmp_lit(Ijk_Boring,eip_next);
3282   }
3283   DIP("%s%c\n", name, nameISize(sz));
3284}
3285
3286
3287/*------------------------------------------------------------*/
3288/*--- Arithmetic, etc.                                     ---*/
3289/*------------------------------------------------------------*/
3290
3291/* IMUL E, G.  Supplied eip points to the modR/M byte. */
3292static
3293UInt dis_mul_E_G ( UChar       sorb,
3294                   Int         size,
3295                   Int         delta0 )
3296{
3297   Int    alen;
3298   HChar  dis_buf[50];
3299   UChar  rm = getIByte(delta0);
3300   IRType ty = szToITy(size);
3301   IRTemp te = newTemp(ty);
3302   IRTemp tg = newTemp(ty);
3303   IRTemp resLo = newTemp(ty);
3304
3305   assign( tg, getIReg(size, gregOfRM(rm)) );
3306   if (epartIsReg(rm)) {
3307      assign( te, getIReg(size, eregOfRM(rm)) );
3308   } else {
3309      IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
3310      assign( te, loadLE(ty,mkexpr(addr)) );
3311   }
3312
3313   setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
3314
3315   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
3316
3317   putIReg(size, gregOfRM(rm), mkexpr(resLo) );
3318
3319   if (epartIsReg(rm)) {
3320      DIP("imul%c %s, %s\n", nameISize(size),
3321                             nameIReg(size,eregOfRM(rm)),
3322                             nameIReg(size,gregOfRM(rm)));
3323      return 1+delta0;
3324   } else {
3325      DIP("imul%c %s, %s\n", nameISize(size),
3326                             dis_buf, nameIReg(size,gregOfRM(rm)));
3327      return alen+delta0;
3328   }
3329}
3330
3331
3332/* IMUL I * E -> G.  Supplied eip points to the modR/M byte. */
3333static
3334UInt dis_imul_I_E_G ( UChar       sorb,
3335                      Int         size,
3336                      Int         delta,
3337                      Int         litsize )
3338{
3339   Int    d32, alen;
3340   HChar  dis_buf[50];
3341   UChar  rm = getIByte(delta);
3342   IRType ty = szToITy(size);
3343   IRTemp te = newTemp(ty);
3344   IRTemp tl = newTemp(ty);
3345   IRTemp resLo = newTemp(ty);
3346
3347   vassert(size == 1 || size == 2 || size == 4);
3348
3349   if (epartIsReg(rm)) {
3350      assign(te, getIReg(size, eregOfRM(rm)));
3351      delta++;
3352   } else {
3353      IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
3354      assign(te, loadLE(ty, mkexpr(addr)));
3355      delta += alen;
3356   }
3357   d32 = getSDisp(litsize,delta);
3358   delta += litsize;
3359
3360   if (size == 1) d32 &= 0xFF;
3361   if (size == 2) d32 &= 0xFFFF;
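   /* d32 was sign-extended to 32 bits by getSDisp; the masking above
      truncates it so that mkU(ty,d32) below is given a value that
      fits the operand size. */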
3362
3363   assign(tl, mkU(ty,d32));
3364
3365   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
3366
3367   setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
3368
3369   putIReg(size, gregOfRM(rm), mkexpr(resLo));
3370
3371   DIP("imul %d, %s, %s\n", d32,
3372       ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
3373       nameIReg(size,gregOfRM(rm)) );
3374   return delta;
3375}
3376
3377
3378/* Generate an IR sequence to do a count-leading-zeroes operation on
3379   the supplied IRTemp, and return a new IRTemp holding the result.
3380   'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
3381   argument is zero, return the number of bits in the word (the
3382   natural semantics). */
3383static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
3384{
3385   vassert(ty == Ity_I32 || ty == Ity_I16);
3386
3387   IRTemp src32 = newTemp(Ity_I32);
3388   assign(src32, widenUto32( mkexpr(src) ));
3389
3390   IRTemp src32x = newTemp(Ity_I32);
3391   assign(src32x,
3392          binop(Iop_Shl32, mkexpr(src32),
3393                           mkU8(32 - 8 * sizeofIRType(ty))));
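   // The shift above moves the operand's most significant bit up to
   // bit 31, so Clz32 on src32x counts the leading zeroes of the
   // original 16- or 32-bit value.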
3394
3395   // Clz32 has undefined semantics when its input is zero, so
3396   // special-case around that.
3397   IRTemp res32 = newTemp(Ity_I32);
3398   assign(res32,
3399          IRExpr_Mux0X(
3400             unop(Iop_1Uto8,
3401                  binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0))),
3402             unop(Iop_Clz32, mkexpr(src32x)),
3403             mkU32(8 * sizeofIRType(ty))
3404   ));
3405
3406   IRTemp res = newTemp(ty);
3407   assign(res, narrowTo(ty, mkexpr(res32)));
3408   return res;
3409}
3410
3411
3412/*------------------------------------------------------------*/
3413/*---                                                      ---*/
3414/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
3415/*---                                                      ---*/
3416/*------------------------------------------------------------*/
3417
3418/* --- Helper functions for dealing with the register stack. --- */
3419
3420/* --- Set the emulation-warning pseudo-register. --- */
3421
3422static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
3423{
3424   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3425   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
3426}
3427
3428/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
3429
3430static IRExpr* mkQNaN64 ( void )
3431{
3432  /* QNaN is: sign 0, exponent 2047 (0b11111111111), top mantissa bit 1,
3433     followed by 51 zero bits
3434     == 0x7FF8 0000 0000 0000
3435   */
3436   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
3437}
3438
3439/* --------- Get/put the top-of-stack pointer. --------- */
3440
3441static IRExpr* get_ftop ( void )
3442{
3443   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
3444}
3445
3446static void put_ftop ( IRExpr* e )
3447{
3448   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3449   stmt( IRStmt_Put( OFFB_FTOP, e ) );
3450}
3451
3452/* --------- Get/put the C3210 bits. --------- */
3453
3454static IRExpr* get_C3210 ( void )
3455{
3456   return IRExpr_Get( OFFB_FC3210, Ity_I32 );
3457}
3458
3459static void put_C3210 ( IRExpr* e )
3460{
3461   stmt( IRStmt_Put( OFFB_FC3210, e ) );
3462}
3463
3464/* --------- Get/put the FPU rounding mode. --------- */
3465static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
3466{
3467   return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
3468}
3469
3470static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
3471{
3472   stmt( IRStmt_Put( OFFB_FPROUND, e ) );
3473}
3474
3475
3476/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
3477/* Produces a value in 0 .. 3, which is encoded as per the type
3478   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
3479   per IRRoundingMode, we merely need to get it and mask it for
3480   safety.
3481*/
3482static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
3483{
3484   return binop( Iop_And32, get_fpround(), mkU32(3) );
3485}
3486
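/* Hard-wired round-to-nearest, for operations that do not yet honour
   the guest rounding mode; uses of it are tagged XXXROUNDINGFIXME. */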
3487static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
3488{
3489   return mkU32(Irrm_NEAREST);
3490}
3491
3492
3493/* --------- Get/set FP register tag bytes. --------- */
3494
3495/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
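/* Note that the GetI/PutI index expression is FTOP+i; GetI/PutI reduce
   the index modulo the array size (8), so ST(i) is always addressed
   relative to the current top of stack. */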
3496
3497static void put_ST_TAG ( Int i, IRExpr* value )
3498{
3499   IRRegArray* descr;
3500   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
3501   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3502   stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
3503}
3504
3505/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
3506   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
3507
3508static IRExpr* get_ST_TAG ( Int i )
3509{
3510   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3511   return IRExpr_GetI( descr, get_ftop(), i );
3512}
3513
3514
3515/* --------- Get/set FP registers. --------- */
3516
3517/* Given i, and some expression e, emit 'ST(i) = e' and set the
3518   register's tag to indicate the register is full.  The previous
3519   state of the register is not checked. */
3520
3521static void put_ST_UNCHECKED ( Int i, IRExpr* value )
3522{
3523   IRRegArray* descr;
3524   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
3525   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3526   stmt( IRStmt_PutI( descr, get_ftop(), i, value ) );
3527   /* Mark the register as in-use. */
3528   put_ST_TAG(i, mkU8(1));
3529}
3530
3531/* Given i, and some expression e, emit
3532      ST(i) = is_full(i) ? NaN : e
3533   and set the tag accordingly.
3534*/
3535
3536static void put_ST ( Int i, IRExpr* value )
3537{
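   /* Note on Mux0X: it yields its second argument when the I8 selector
      is zero and its third argument otherwise.  So an empty slot
      (tag 0) receives 'value', while a slot that is already full
      receives the QNaN instead. */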
3538   put_ST_UNCHECKED( i,
3539                     IRExpr_Mux0X( get_ST_TAG(i),
3540                                   /* 0 means empty */
3541                                   value,
3542                                   /* non-0 means full */
3543                                   mkQNaN64()
3544                   )
3545   );
3546}
3547
3548
3549/* Given i, generate an expression yielding 'ST(i)'. */
3550
3551static IRExpr* get_ST_UNCHECKED ( Int i )
3552{
3553   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3554   return IRExpr_GetI( descr, get_ftop(), i );
3555}
3556
3557
3558/* Given i, generate an expression yielding
3559  is_full(i) ? ST(i) : NaN
3560*/
3561
3562static IRExpr* get_ST ( Int i )
3563{
3564   return
3565      IRExpr_Mux0X( get_ST_TAG(i),
3566                    /* 0 means empty */
3567                    mkQNaN64(),
3568                    /* non-0 means full */
3569                    get_ST_UNCHECKED(i));
3570}
3571
3572
3573/* Adjust FTOP downwards by one register. */
3574
3575static void fp_push ( void )
3576{
3577   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
3578}
3579
3580/* Adjust FTOP upwards by one register, and mark the vacated register
3581   as empty.  */
3582
3583static void fp_pop ( void )
3584{
3585   put_ST_TAG(0, mkU8(0));
3586   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
3587}
3588
3589/* Clear the C2 bit of the FPU status register, for
3590   sin/cos/tan/sincos. */
3591
3592static void clear_C2 ( void )
3593{
3594   put_C3210( binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)) );
3595}
3596
3597/* Invent a plausible-looking FPU status word value:
3598      ((ftop & 7) << 11) | (c3210 & 0x4700)
3599 */
3600static IRExpr* get_FPU_sw ( void )
3601{
3602   return
3603      unop(Iop_32to16,
3604           binop(Iop_Or32,
3605                 binop(Iop_Shl32,
3606                       binop(Iop_And32, get_ftop(), mkU32(7)),
3607                       mkU8(11)),
3608                 binop(Iop_And32, get_C3210(), mkU32(0x4700))
3609      ));
3610}
3611
3612
3613/* ------------------------------------------------------- */
3614/* Given all that stack-mangling junk, we can now go ahead
3615   and describe FP instructions.
3616*/
3617
3618/* ST(0) = ST(0) `op` mem64/32(addr)
3619   Need to check ST(0)'s tag on read, but not on write.
3620*/
3621static
3622void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
3623                         IROp op, Bool dbl )
3624{
3625   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3626   if (dbl) {
3627      put_ST_UNCHECKED(0,
3628         triop( op,
3629                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3630                get_ST(0),
3631                loadLE(Ity_F64,mkexpr(addr))
3632         ));
3633   } else {
3634      put_ST_UNCHECKED(0,
3635         triop( op,
3636                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3637                get_ST(0),
3638                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
3639         ));
3640   }
3641}
3642
3643
3644/* ST(0) = mem64/32(addr) `op` ST(0)
3645   Need to check ST(0)'s tag on read, but not on write.
3646*/
3647static
3648void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
3649                            IROp op, Bool dbl )
3650{
3651   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3652   if (dbl) {
3653      put_ST_UNCHECKED(0,
3654         triop( op,
3655                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3656                loadLE(Ity_F64,mkexpr(addr)),
3657                get_ST(0)
3658         ));
3659   } else {
3660      put_ST_UNCHECKED(0,
3661         triop( op,
3662                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3663                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
3664                get_ST(0)
3665         ));
3666   }
3667}
3668
3669
3670/* ST(dst) = ST(dst) `op` ST(src).
3671   Check dst and src tags when reading but not on write.
3672*/
3673static
3674void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3675                      Bool pop_after )
3676{
3677   DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
3678                                 (Int)st_src, (Int)st_dst );
3679   put_ST_UNCHECKED(
3680      st_dst,
3681      triop( op,
3682             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3683             get_ST(st_dst),
3684             get_ST(st_src) )
3685   );
3686   if (pop_after)
3687      fp_pop();
3688}
3689
3690/* ST(dst) = ST(src) `op` ST(dst).
3691   Check dst and src tags when reading but not on write.
3692*/
3693static
3694void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3695                         Bool pop_after )
3696{
3697   DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
3698                                 (Int)st_src, (Int)st_dst );
3699   put_ST_UNCHECKED(
3700      st_dst,
3701      triop( op,
3702             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3703             get_ST(st_src),
3704             get_ST(st_dst) )
3705   );
3706   if (pop_after)
3707      fp_pop();
3708}
3709
3710/* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
3711static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
3712{
3713   DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
3714   /* This is a bit of a hack (and isn't really right).  It sets
3715      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
3716      documentation implies A and S are unchanged.
3717   */
3718   /* It's also fishy in that it is used both for COMIP and
3719      UCOMIP, and they aren't the same (although similar). */
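   /* With CC_OP_COPY, CC_DEP1 holds the %eflags value directly.
      Iop_CmpF64 yields 0x45 (unordered), 0x40 (equal), 0x01 (less)
      or 0x00 (greater); those values sit in the ZF (bit 6), PF (bit 2)
      and CF (bit 0) positions, so masking with 0x45 keeps exactly the
      Z/P/C result bits. */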
3720   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
3721   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
3722   stmt( IRStmt_Put( OFFB_CC_DEP1,
3723                     binop( Iop_And32,
3724                            binop(Iop_CmpF64, get_ST(0), get_ST(i)),
3725                            mkU32(0x45)
3726       )));
3727   /* Set NDEP even though it isn't used.  This makes redundant-PUT
3728      elimination of previous stores to this field work better. */
3729   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
3730   if (pop_after)
3731      fp_pop();
3732}
3733
3734
3735static
3736UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
3737{
3738   Int    len;
3739   UInt   r_src, r_dst;
3740   HChar  dis_buf[50];
3741   IRTemp t1, t2;
3742
3743   /* On entry, delta points at the second byte of the insn (the modrm
3744      byte).*/
3745   UChar first_opcode = getIByte(delta-1);
3746   UChar modrm        = getIByte(delta+0);
3747
3748   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
3749
3750   if (first_opcode == 0xD8) {
3751      if (modrm < 0xC0) {
3752
3753         /* bits 5,4,3 are an opcode extension, and the modRM also
3754           specifies an address. */
3755         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
3756         delta += len;
3757
3758         switch (gregOfRM(modrm)) {
3759
3760            case 0: /* FADD single-real */
3761               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
3762               break;
3763
3764            case 1: /* FMUL single-real */
3765               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
3766               break;
3767
3768            case 2: /* FCOM single-real */
3769               DIP("fcoms %s\n", dis_buf);
3770               /* This forces C1 to zero, which isn't right. */
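               /* Iop_CmpF64 yields 0x45/0x40/0x01/0x00 (unordered/
                  eq/lt/gt); shifting left by 8 and masking with 0x4500
                  drops that into the C3 (bit 14), C2 (bit 10) and C0
                  (bit 8) slots of the status word, with C1 (bit 9)
                  cleared.  The same pattern is used for the other
                  FCOM-style comparisons below. */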
3771               put_C3210(
3772                   binop( Iop_And32,
3773                          binop(Iop_Shl32,
3774                                binop(Iop_CmpF64,
3775                                      get_ST(0),
3776                                      unop(Iop_F32toF64,
3777                                           loadLE(Ity_F32,mkexpr(addr)))),
3778                                mkU8(8)),
3779                          mkU32(0x4500)
3780                   ));
3781               break;
3782
3783            case 3: /* FCOMP single-real */
3784               DIP("fcomps %s\n", dis_buf);
3785               /* This forces C1 to zero, which isn't right. */
3786               put_C3210(
3787                   binop( Iop_And32,
3788                          binop(Iop_Shl32,
3789                                binop(Iop_CmpF64,
3790                                      get_ST(0),
3791                                      unop(Iop_F32toF64,
3792                                           loadLE(Ity_F32,mkexpr(addr)))),
3793                                mkU8(8)),
3794                          mkU32(0x4500)
3795                   ));
3796               fp_pop();
3797               break;
3798
3799            case 4: /* FSUB single-real */
3800               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
3801               break;
3802
3803            case 5: /* FSUBR single-real */
3804               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
3805               break;
3806
3807            case 6: /* FDIV single-real */
3808               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
3809               break;
3810
3811            case 7: /* FDIVR single-real */
3812               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
3813               break;
3814
3815            default:
3816               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
3817               vex_printf("first_opcode == 0xD8\n");
3818               goto decode_fail;
3819         }
3820      } else {
3821         delta++;
3822         switch (modrm) {
3823
3824            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
3825               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
3826               break;
3827
3828            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
3829               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
3830               break;
3831
3832            /* Dunno if this is right */
3833            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
3834               r_dst = (UInt)modrm - 0xD0;
3835               DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
3836               /* This forces C1 to zero, which isn't right. */
3837               put_C3210(
3838                   binop( Iop_And32,
3839                          binop(Iop_Shl32,
3840                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3841                                mkU8(8)),
3842                          mkU32(0x4500)
3843                   ));
3844               break;
3845
3846            /* Dunno if this is right */
3847            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
3848               r_dst = (UInt)modrm - 0xD8;
3849               DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
3850               /* This forces C1 to zero, which isn't right. */
3851               put_C3210(
3852                   binop( Iop_And32,
3853                          binop(Iop_Shl32,
3854                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3855                                mkU8(8)),
3856                          mkU32(0x4500)
3857                   ));
3858               fp_pop();
3859               break;
3860
3861            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
3862               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
3863               break;
3864
3865            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
3866               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
3867               break;
3868
3869            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
3870               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
3871               break;
3872
3873            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
3874               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
3875               break;
3876
3877            default:
3878               goto decode_fail;
3879         }
3880      }
3881   }
3882
3883   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
3884   else
3885   if (first_opcode == 0xD9) {
3886      if (modrm < 0xC0) {
3887
3888         /* bits 5,4,3 are an opcode extension, and the modRM also
3889            specifies an address. */
3890         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
3891         delta += len;
3892
3893         switch (gregOfRM(modrm)) {
3894
3895            case 0: /* FLD single-real */
3896               DIP("flds %s\n", dis_buf);
3897               fp_push();
3898               put_ST(0, unop(Iop_F32toF64,
3899                              loadLE(Ity_F32, mkexpr(addr))));
3900               break;
3901
3902            case 2: /* FST single-real */
3903               DIP("fsts %s\n", dis_buf);
3904               storeLE(mkexpr(addr),
3905                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
3906               break;
3907
3908            case 3: /* FSTP single-real */
3909               DIP("fstps %s\n", dis_buf);
3910               storeLE(mkexpr(addr),
3911                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
3912               fp_pop();
3913               break;
3914
3915            case 4: { /* FLDENV m28 */
3916               /* Uses dirty helper:
3917                     VexEmWarn x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
3918               IRTemp   ew = newTemp(Ity_I32);
3919               IRDirty* d  = unsafeIRDirty_0_N (
3920                                0/*regparms*/,
3921                                "x86g_dirtyhelper_FLDENV",
3922                                &x86g_dirtyhelper_FLDENV,
3923                                mkIRExprVec_1( mkexpr(addr) )
3924                             );
3925               d->needsBBP = True;
3926               d->tmp      = ew;
3927               /* declare we're reading memory */
3928               d->mFx   = Ifx_Read;
3929               d->mAddr = mkexpr(addr);
3930               d->mSize = 28;
3931
3932               /* declare we're writing guest state */
3933               d->nFxState = 4;
3934
3935               d->fxState[0].fx     = Ifx_Write;
3936               d->fxState[0].offset = OFFB_FTOP;
3937               d->fxState[0].size   = sizeof(UInt);
3938
3939               d->fxState[1].fx     = Ifx_Write;
3940               d->fxState[1].offset = OFFB_FPTAGS;
3941               d->fxState[1].size   = 8 * sizeof(UChar);
3942
3943               d->fxState[2].fx     = Ifx_Write;
3944               d->fxState[2].offset = OFFB_FPROUND;
3945               d->fxState[2].size   = sizeof(UInt);
3946
3947               d->fxState[3].fx     = Ifx_Write;
3948               d->fxState[3].offset = OFFB_FC3210;
3949               d->fxState[3].size   = sizeof(UInt);
3950
3951               stmt( IRStmt_Dirty(d) );
3952
3953               /* ew contains any emulation warning we may need to
3954                  issue.  If needed, side-exit to the next insn,
3955                  reporting the warning, so that Valgrind's dispatcher
3956                  sees the warning. */
3957               put_emwarn( mkexpr(ew) );
3958               stmt(
3959                  IRStmt_Exit(
3960                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
3961                     Ijk_EmWarn,
3962                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
3963                  )
3964               );
3965
3966               DIP("fldenv %s\n", dis_buf);
3967               break;
3968            }
3969
3970            case 5: {/* FLDCW */
3971               /* The only thing we observe in the control word is the
3972                  rounding mode.  Therefore, pass the 16-bit value
3973                  (x87 native-format control word) to a clean helper,
3974                  getting back a 64-bit value, the lower half of which
3975                  is the FPROUND value to store, and the upper half of
3976                  which is the emulation-warning token which may be
3977                  generated.
3978               */
3979               /* ULong x86g_check_fldcw ( UInt ); */
3980               IRTemp t64 = newTemp(Ity_I64);
3981               IRTemp ew = newTemp(Ity_I32);
3982               DIP("fldcw %s\n", dis_buf);
3983               assign( t64, mkIRExprCCall(
3984                               Ity_I64, 0/*regparms*/,
3985                               "x86g_check_fldcw",
3986                               &x86g_check_fldcw,
3987                               mkIRExprVec_1(
3988                                  unop( Iop_16Uto32,
3989                                        loadLE(Ity_I16, mkexpr(addr)))
3990                               )
3991                            )
3992                     );
3993
3994               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
3995               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
3996               put_emwarn( mkexpr(ew) );
3997               /* Finally, if an emulation warning was reported,
3998                  side-exit to the next insn, reporting the warning,
3999                  so that Valgrind's dispatcher sees the warning. */
4000               stmt(
4001                  IRStmt_Exit(
4002                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4003                     Ijk_EmWarn,
4004                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
4005                  )
4006               );
4007               break;
4008            }
4009
4010            case 6: { /* FNSTENV m28 */
4011               /* Uses dirty helper:
4012                     void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
4013               IRDirty* d = unsafeIRDirty_0_N (
4014                               0/*regparms*/,
4015                               "x86g_dirtyhelper_FSTENV",
4016                               &x86g_dirtyhelper_FSTENV,
4017                               mkIRExprVec_1( mkexpr(addr) )
4018                            );
4019               d->needsBBP = True;
4020               /* declare we're writing memory */
4021               d->mFx   = Ifx_Write;
4022               d->mAddr = mkexpr(addr);
4023               d->mSize = 28;
4024
4025               /* declare we're reading guest state */
4026               d->nFxState = 4;
4027
4028               d->fxState[0].fx     = Ifx_Read;
4029               d->fxState[0].offset = OFFB_FTOP;
4030               d->fxState[0].size   = sizeof(UInt);
4031
4032               d->fxState[1].fx     = Ifx_Read;
4033               d->fxState[1].offset = OFFB_FPTAGS;
4034               d->fxState[1].size   = 8 * sizeof(UChar);
4035
4036               d->fxState[2].fx     = Ifx_Read;
4037               d->fxState[2].offset = OFFB_FPROUND;
4038               d->fxState[2].size   = sizeof(UInt);
4039
4040               d->fxState[3].fx     = Ifx_Read;
4041               d->fxState[3].offset = OFFB_FC3210;
4042               d->fxState[3].size   = sizeof(UInt);
4043
4044               stmt( IRStmt_Dirty(d) );
4045
4046               DIP("fnstenv %s\n", dis_buf);
4047               break;
4048            }
4049
4050            case 7: /* FNSTCW */
4051              /* Fake up a native x87 FPU control word.  The only
4052                 thing it depends on is FPROUND[1:0], so call a clean
4053                 helper to cook it up. */
4054               /* UInt x86g_create_fpucw ( UInt fpround ) */
4055               DIP("fnstcw %s\n", dis_buf);
4056               storeLE(
4057                  mkexpr(addr),
4058                  unop( Iop_32to16,
4059                        mkIRExprCCall(
4060                           Ity_I32, 0/*regp*/,
4061                           "x86g_create_fpucw", &x86g_create_fpucw,
4062                           mkIRExprVec_1( get_fpround() )
4063                        )
4064                  )
4065               );
4066               break;
4067
4068            default:
4069               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4070               vex_printf("first_opcode == 0xD9\n");
4071               goto decode_fail;
4072         }
4073
4074      } else {
4075         delta++;
4076         switch (modrm) {
4077
4078            case 0xC0 ... 0xC7: /* FLD %st(?) */
4079               r_src = (UInt)modrm - 0xC0;
4080               DIP("fld %%st(%d)\n", (Int)r_src);
4081               t1 = newTemp(Ity_F64);
4082               assign(t1, get_ST(r_src));
4083               fp_push();
4084               put_ST(0, mkexpr(t1));
4085               break;
4086
4087            case 0xC8 ... 0xCF: /* FXCH %st(?) */
4088               r_src = (UInt)modrm - 0xC8;
4089               DIP("fxch %%st(%d)\n", (Int)r_src);
4090               t1 = newTemp(Ity_F64);
4091               t2 = newTemp(Ity_F64);
4092               assign(t1, get_ST(0));
4093               assign(t2, get_ST(r_src));
4094               put_ST_UNCHECKED(0, mkexpr(t2));
4095               put_ST_UNCHECKED(r_src, mkexpr(t1));
4096               break;
4097
4098            case 0xE0: /* FCHS */
4099               DIP("fchs\n");
4100               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
4101               break;
4102
4103            case 0xE1: /* FABS */
4104               DIP("fabs\n");
4105               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
4106               break;
4107
4108            case 0xE4: /* FTST */
4109               DIP("ftst\n");
4110               /* This forces C1 to zero, which isn't right. */
4111               /* Well, in fact the Intel docs say (bizarrely): "C1 is
4112                  set to 0 if stack underflow occurred; otherwise, set
4113                  to 0" which is pretty nonsensical.  I guess it's a
4114                   typo. */
4115               put_C3210(
4116                   binop( Iop_And32,
4117                          binop(Iop_Shl32,
4118                                binop(Iop_CmpF64,
4119                                      get_ST(0),
4120                                      IRExpr_Const(IRConst_F64i(0x0ULL))),
4121                                mkU8(8)),
4122                          mkU32(0x4500)
4123                   ));
4124               break;
4125
4126            case 0xE5: { /* FXAM */
4127               /* This is an interesting one.  It examines %st(0),
4128                  regardless of whether the tag says it's empty or not.
4129                  Here, just pass both the tag (in our format) and the
4130                  value (as a double, actually a ULong) to a helper
4131                  function. */
4132               IRExpr** args
4133                  = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
4134                                   unop(Iop_ReinterpF64asI64,
4135                                        get_ST_UNCHECKED(0)) );
4136               put_C3210(mkIRExprCCall(
4137                            Ity_I32,
4138                            0/*regparm*/,
4139                            "x86g_calculate_FXAM", &x86g_calculate_FXAM,
4140                            args
4141                        ));
4142               DIP("fxam\n");
4143               break;
4144            }
4145
4146            case 0xE8: /* FLD1 */
4147               DIP("fld1\n");
4148               fp_push();
4149               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
4150               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
4151               break;
4152
4153            case 0xE9: /* FLDL2T */
4154               DIP("fldl2t\n");
4155               fp_push();
4156               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
4157               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
4158               break;
4159
4160            case 0xEA: /* FLDL2E */
4161               DIP("fldl2e\n");
4162               fp_push();
4163               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
4164               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
4165               break;
4166
4167            case 0xEB: /* FLDPI */
4168               DIP("fldpi\n");
4169               fp_push();
4170               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
4171               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
4172               break;
4173
4174            case 0xEC: /* FLDLG2 */
4175               DIP("fldlg2\n");
4176               fp_push();
4177               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
4178               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
4179               break;
4180
4181            case 0xED: /* FLDLN2 */
4182               DIP("fldln2\n");
4183               fp_push();
4184               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
4185               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
4186               break;
4187
4188            case 0xEE: /* FLDZ */
4189               DIP("fldz\n");
4190               fp_push();
4191               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
4192               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
4193               break;
4194
4195            case 0xF0: /* F2XM1 */
4196               DIP("f2xm1\n");
4197               put_ST_UNCHECKED(0,
4198                  binop(Iop_2xm1F64,
4199                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4200                        get_ST(0)));
4201               break;
4202
4203            case 0xF1: /* FYL2X */
4204               DIP("fyl2x\n");
4205               put_ST_UNCHECKED(1,
4206                  triop(Iop_Yl2xF64,
4207                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4208                        get_ST(1),
4209                        get_ST(0)));
4210               fp_pop();
4211               break;
4212
4213            case 0xF2: /* FPTAN */
4214               DIP("ftan\n");
4215               put_ST_UNCHECKED(0,
4216                  binop(Iop_TanF64,
4217                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4218                        get_ST(0)));
4219               fp_push();
4220               put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
4221               clear_C2(); /* HACK */
4222               break;
4223
4224            case 0xF3: /* FPATAN */
4225               DIP("fpatan\n");
4226               put_ST_UNCHECKED(1,
4227                  triop(Iop_AtanF64,
4228                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4229                        get_ST(1),
4230                        get_ST(0)));
4231               fp_pop();
4232               break;
4233
4234            case 0xF4: { /* FXTRACT */
4235               IRTemp argF = newTemp(Ity_F64);
4236               IRTemp sigF = newTemp(Ity_F64);
4237               IRTemp expF = newTemp(Ity_F64);
4238               IRTemp argI = newTemp(Ity_I64);
4239               IRTemp sigI = newTemp(Ity_I64);
4240               IRTemp expI = newTemp(Ity_I64);
4241               DIP("fxtract\n");
4242               assign( argF, get_ST(0) );
4243               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
4244               assign( sigI,
4245                       mkIRExprCCall(
4246                          Ity_I64, 0/*regparms*/,
4247                          "x86amd64g_calculate_FXTRACT",
4248                          &x86amd64g_calculate_FXTRACT,
4249                          mkIRExprVec_2( mkexpr(argI),
4250                                         mkIRExpr_HWord(0)/*sig*/ ))
4251               );
4252               assign( expI,
4253                       mkIRExprCCall(
4254                          Ity_I64, 0/*regparms*/,
4255                          "x86amd64g_calculate_FXTRACT",
4256                          &x86amd64g_calculate_FXTRACT,
4257                          mkIRExprVec_2( mkexpr(argI),
4258                                         mkIRExpr_HWord(1)/*exp*/ ))
4259               );
4260               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
4261               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
4262               /* exponent */
4263               put_ST_UNCHECKED(0, mkexpr(expF) );
4264               fp_push();
4265               /* significand */
4266               put_ST(0, mkexpr(sigF) );
4267               break;
4268            }
4269
4270            case 0xF5: { /* FPREM1 -- IEEE compliant */
4271               IRTemp a1 = newTemp(Ity_F64);
4272               IRTemp a2 = newTemp(Ity_F64);
4273               DIP("fprem1\n");
4274               /* Do FPREM1 twice, once to get the remainder, and once
4275                  to get the C3210 flag values. */
4276               assign( a1, get_ST(0) );
4277               assign( a2, get_ST(1) );
4278               put_ST_UNCHECKED(0,
4279                  triop(Iop_PRem1F64,
4280                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4281                        mkexpr(a1),
4282                        mkexpr(a2)));
4283               put_C3210(
4284                  triop(Iop_PRem1C3210F64,
4285                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4286                        mkexpr(a1),
4287                        mkexpr(a2)) );
4288               break;
4289            }
4290
4291            case 0xF7: /* FINCSTP */
4292               DIP("fprem\n");
4293               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
4294               break;
4295
4296            case 0xF8: { /* FPREM -- not IEEE compliant */
4297               IRTemp a1 = newTemp(Ity_F64);
4298               IRTemp a2 = newTemp(Ity_F64);
4299               DIP("fprem\n");
4300               /* Do FPREM twice, once to get the remainder, and once
4301                  to get the C3210 flag values. */
4302               assign( a1, get_ST(0) );
4303               assign( a2, get_ST(1) );
4304               put_ST_UNCHECKED(0,
4305                  triop(Iop_PRemF64,
4306                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4307                        mkexpr(a1),
4308                        mkexpr(a2)));
4309               put_C3210(
4310                  triop(Iop_PRemC3210F64,
4311                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4312                        mkexpr(a1),
4313                        mkexpr(a2)) );
4314               break;
4315            }
4316
4317            case 0xF9: /* FYL2XP1 */
4318               DIP("fyl2xp1\n");
4319               put_ST_UNCHECKED(1,
4320                  triop(Iop_Yl2xp1F64,
4321                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4322                        get_ST(1),
4323                        get_ST(0)));
4324               fp_pop();
4325               break;
4326
4327            case 0xFA: /* FSQRT */
4328               DIP("fsqrt\n");
4329               put_ST_UNCHECKED(0,
4330                  binop(Iop_SqrtF64,
4331                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4332                        get_ST(0)));
4333               break;
4334
4335            case 0xFB: { /* FSINCOS */
4336               IRTemp a1 = newTemp(Ity_F64);
4337               assign( a1, get_ST(0) );
4338               DIP("fsincos\n");
4339               put_ST_UNCHECKED(0,
4340                  binop(Iop_SinF64,
4341                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4342                        mkexpr(a1)));
4343               fp_push();
4344               put_ST(0,
4345                  binop(Iop_CosF64,
4346                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4347                        mkexpr(a1)));
4348               clear_C2(); /* HACK */
4349               break;
4350            }
4351
4352            case 0xFC: /* FRNDINT */
4353               DIP("frndint\n");
4354               put_ST_UNCHECKED(0,
4355                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
4356               break;
4357
4358            case 0xFD: /* FSCALE */
4359               DIP("fscale\n");
4360               put_ST_UNCHECKED(0,
4361                  triop(Iop_ScaleF64,
4362                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4363                        get_ST(0),
4364                        get_ST(1)));
4365               break;
4366
4367            case 0xFE: /* FSIN */
4368               DIP("fsin\n");
4369               put_ST_UNCHECKED(0,
4370                  binop(Iop_SinF64,
4371                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4372                        get_ST(0)));
4373               clear_C2(); /* HACK */
4374               break;
4375
4376            case 0xFF: /* FCOS */
4377               DIP("fcos\n");
4378               put_ST_UNCHECKED(0,
4379                  binop(Iop_CosF64,
4380                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4381                        get_ST(0)));
4382               clear_C2(); /* HACK */
4383               break;
4384
4385            default:
4386               goto decode_fail;
4387         }
4388      }
4389   }
4390
4391   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
4392   else
4393   if (first_opcode == 0xDA) {
4394
4395      if (modrm < 0xC0) {
4396
4397         /* bits 5,4,3 are an opcode extension, and the modRM also
4398            specifies an address. */
4399         IROp   fop;
4400         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4401         delta += len;
4402         switch (gregOfRM(modrm)) {
4403
4404            case 0: /* FIADD m32int */ /* ST(0) += m32int */
4405               DIP("fiaddl %s\n", dis_buf);
4406               fop = Iop_AddF64;
4407               goto do_fop_m32;
4408
4409            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
4410               DIP("fimull %s\n", dis_buf);
4411               fop = Iop_MulF64;
4412               goto do_fop_m32;
4413
4414            case 2: /* FICOM m32int */
4415               DIP("ficoml %s\n", dis_buf);
4416               /* This forces C1 to zero, which isn't right. */
4417               put_C3210(
4418                   binop( Iop_And32,
4419                          binop(Iop_Shl32,
4420                                binop(Iop_CmpF64,
4421                                      get_ST(0),
4422                                      unop(Iop_I32StoF64,
4423                                           loadLE(Ity_I32,mkexpr(addr)))),
4424                                mkU8(8)),
4425                          mkU32(0x4500)
4426                   ));
4427               break;
4428
4429            case 3: /* FICOMP m32int */
4430               DIP("ficompl %s\n", dis_buf);
4431               /* This forces C1 to zero, which isn't right. */
4432               put_C3210(
4433                   binop( Iop_And32,
4434                          binop(Iop_Shl32,
4435                                binop(Iop_CmpF64,
4436                                      get_ST(0),
4437                                      unop(Iop_I32StoF64,
4438                                           loadLE(Ity_I32,mkexpr(addr)))),
4439                                mkU8(8)),
4440                          mkU32(0x4500)
4441                   ));
4442               fp_pop();
4443               break;
4444
4445            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
4446               DIP("fisubl %s\n", dis_buf);
4447               fop = Iop_SubF64;
4448               goto do_fop_m32;
4449
4450            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
4451               DIP("fisubrl %s\n", dis_buf);
4452               fop = Iop_SubF64;
4453               goto do_foprev_m32;
4454
4455            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
4456               DIP("fidivl %s\n", dis_buf);
4457               fop = Iop_DivF64;
4458               goto do_fop_m32;
4459
4460            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
4461               DIP("fidivrl %s\n", dis_buf);
4462               fop = Iop_DivF64;
4463               goto do_foprev_m32;
4464
4465            do_fop_m32:
4466               put_ST_UNCHECKED(0,
4467                  triop(fop,
4468                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4469                        get_ST(0),
4470                        unop(Iop_I32StoF64,
4471                             loadLE(Ity_I32, mkexpr(addr)))));
4472               break;
4473
4474            do_foprev_m32:
4475               put_ST_UNCHECKED(0,
4476                  triop(fop,
4477                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4478                        unop(Iop_I32StoF64,
4479                             loadLE(Ity_I32, mkexpr(addr))),
4480                        get_ST(0)));
4481               break;
4482
4483            default:
4484               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4485               vex_printf("first_opcode == 0xDA\n");
4486               goto decode_fail;
4487         }
4488
4489      } else {
4490
4491         delta++;
4492         switch (modrm) {
4493
4494            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
4495               r_src = (UInt)modrm - 0xC0;
4496               DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
4497               put_ST_UNCHECKED(0,
4498                                IRExpr_Mux0X(
4499                                    unop(Iop_1Uto8,
4500                                         mk_x86g_calculate_condition(X86CondB)),
4501                                    get_ST(0), get_ST(r_src)) );
4502               break;
4503
4504            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
4505               r_src = (UInt)modrm - 0xC8;
4506               DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
4507               put_ST_UNCHECKED(0,
4508                                IRExpr_Mux0X(
4509                                    unop(Iop_1Uto8,
4510                                         mk_x86g_calculate_condition(X86CondZ)),
4511                                    get_ST(0), get_ST(r_src)) );
4512               break;
4513
4514            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
4515               r_src = (UInt)modrm - 0xD0;
4516               DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
4517               put_ST_UNCHECKED(0,
4518                                IRExpr_Mux0X(
4519                                    unop(Iop_1Uto8,
4520                                         mk_x86g_calculate_condition(X86CondBE)),
4521                                    get_ST(0), get_ST(r_src)) );
4522               break;
4523
4524            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
4525               r_src = (UInt)modrm - 0xD8;
4526               DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
4527               put_ST_UNCHECKED(0,
4528                                IRExpr_Mux0X(
4529                                    unop(Iop_1Uto8,
4530                                         mk_x86g_calculate_condition(X86CondP)),
4531                                    get_ST(0), get_ST(r_src)) );
4532               break;
4533
4534            case 0xE9: /* FUCOMPP %st(0),%st(1) */
4535               DIP("fucompp %%st(0),%%st(1)\n");
4536               /* This forces C1 to zero, which isn't right. */
4537               put_C3210(
4538                   binop( Iop_And32,
4539                          binop(Iop_Shl32,
4540                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
4541                                mkU8(8)),
4542                          mkU32(0x4500)
4543                   ));
4544               fp_pop();
4545               fp_pop();
4546               break;
4547
4548            default:
4549               goto decode_fail;
4550         }
4551
4552      }
4553   }
4554
4555   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
4556   else
4557   if (first_opcode == 0xDB) {
4558      if (modrm < 0xC0) {
4559
4560         /* bits 5,4,3 are an opcode extension, and the modRM also
4561            specifies an address. */
4562         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4563         delta += len;
4564
4565         switch (gregOfRM(modrm)) {
4566
4567            case 0: /* FILD m32int */
4568               DIP("fildl %s\n", dis_buf);
4569               fp_push();
4570               put_ST(0, unop(Iop_I32StoF64,
4571                              loadLE(Ity_I32, mkexpr(addr))));
4572               break;
4573
4574            case 1: /* FISTTPL m32 (SSE3) */
4575               DIP("fisttpl %s\n", dis_buf);
4576               storeLE( mkexpr(addr),
4577                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
4578               fp_pop();
4579               break;
4580
4581            case 2: /* FIST m32 */
4582               DIP("fistl %s\n", dis_buf);
4583               storeLE( mkexpr(addr),
4584                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4585               break;
4586
4587            case 3: /* FISTP m32 */
4588               DIP("fistpl %s\n", dis_buf);
4589               storeLE( mkexpr(addr),
4590                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4591               fp_pop();
4592               break;
4593
4594            case 5: { /* FLD extended-real */
4595               /* Uses dirty helper:
4596                     ULong x86g_loadF80le ( UInt )
4597                  addr holds the address.  First, do a dirty call to
4598                  get hold of the data. */
4599               IRTemp   val  = newTemp(Ity_I64);
4600               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
4601
4602               IRDirty* d = unsafeIRDirty_1_N (
4603                               val,
4604                               0/*regparms*/,
4605                               "x86g_dirtyhelper_loadF80le",
4606                               &x86g_dirtyhelper_loadF80le,
4607                               args
4608                            );
4609               /* declare that we're reading memory */
4610               d->mFx   = Ifx_Read;
4611               d->mAddr = mkexpr(addr);
4612               d->mSize = 10;
4613
4614               /* execute the dirty call, dumping the result in val. */
4615               stmt( IRStmt_Dirty(d) );
4616               fp_push();
4617               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
4618
4619               DIP("fldt %s\n", dis_buf);
4620               break;
4621            }
4622
4623            case 7: { /* FSTP extended-real */
4624               /* Uses dirty helper: void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
4625               IRExpr** args
4626                  = mkIRExprVec_2( mkexpr(addr),
4627                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );
4628
4629               IRDirty* d = unsafeIRDirty_0_N (
4630                               0/*regparms*/,
4631                               "x86g_dirtyhelper_storeF80le",
4632                               &x86g_dirtyhelper_storeF80le,
4633                               args
4634                            );
4635               /* declare we're writing memory */
4636               d->mFx   = Ifx_Write;
4637               d->mAddr = mkexpr(addr);
4638               d->mSize = 10;
4639
4640               /* execute the dirty call. */
4641               stmt( IRStmt_Dirty(d) );
4642               fp_pop();
4643
4644               DIP("fstpt %s\n", dis_buf);
4645               break;
4646            }
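
            /* Both 80-bit cases above follow the same pattern: extended
               reals cannot be expressed directly in IR, so a dirty call
               does the 10-byte memory access plus the conversion, and the
               value crosses the IR boundary as an I64 that is
               reinterpreted to/from F64.  Roughly, and only as a sketch
               of the shape (not code from the helpers themselves):

                  ULong  v = x86g_dirtyhelper_loadF80le(addr); // 10-byte read
                  double d;
                  memcpy(&d, &v, 8);                           // ReinterpI64asF64

               The mFx/mAddr/mSize annotations tell instrumenting tools
               exactly which 10 bytes each helper touches. */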
4647
4648            default:
4649               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4650               vex_printf("first_opcode == 0xDB\n");
4651               goto decode_fail;
4652         }
4653
4654      } else {
4655
4656         delta++;
4657         switch (modrm) {
4658
4659            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
4660               r_src = (UInt)modrm - 0xC0;
4661               DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
4662               put_ST_UNCHECKED(0,
4663                                IRExpr_Mux0X(
4664                                    unop(Iop_1Uto8,
4665                                         mk_x86g_calculate_condition(X86CondNB)),
4666                                    get_ST(0), get_ST(r_src)) );
4667               break;
4668
4669            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
4670               r_src = (UInt)modrm - 0xC8;
4671               DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
4672               put_ST_UNCHECKED(0,
4673                                IRExpr_Mux0X(
4674                                    unop(Iop_1Uto8,
4675                                         mk_x86g_calculate_condition(X86CondNZ)),
4676                                    get_ST(0), get_ST(r_src)) );
4677               break;
4678
4679            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
4680               r_src = (UInt)modrm - 0xD0;
4681               DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
4682               put_ST_UNCHECKED(0,
4683                                IRExpr_Mux0X(
4684                                    unop(Iop_1Uto8,
4685                                         mk_x86g_calculate_condition(X86CondNBE)),
4686                                    get_ST(0), get_ST(r_src)) );
4687               break;
4688
4689            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
4690               r_src = (UInt)modrm - 0xD8;
4691               DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
4692               put_ST_UNCHECKED(0,
4693                                IRExpr_Mux0X(
4694                                    unop(Iop_1Uto8,
4695                                         mk_x86g_calculate_condition(X86CondNP)),
4696                                    get_ST(0), get_ST(r_src)) );
4697               break;
4698
4699            case 0xE2:
4700               DIP("fnclex\n");
4701               break;
4702
4703            case 0xE3: {
4704               /* Uses dirty helper:
4705                     void x86g_dirtyhelper_FINIT ( VexGuestX86State* ) */
4706               IRDirty* d  = unsafeIRDirty_0_N (
4707                                0/*regparms*/,
4708                                "x86g_dirtyhelper_FINIT",
4709                                &x86g_dirtyhelper_FINIT,
4710                                mkIRExprVec_0()
4711                             );
4712               d->needsBBP = True;
4713
4714               /* declare we're writing guest state */
4715               d->nFxState = 5;
4716
4717               d->fxState[0].fx     = Ifx_Write;
4718               d->fxState[0].offset = OFFB_FTOP;
4719               d->fxState[0].size   = sizeof(UInt);
4720
4721               d->fxState[1].fx     = Ifx_Write;
4722               d->fxState[1].offset = OFFB_FPREGS;
4723               d->fxState[1].size   = 8 * sizeof(ULong);
4724
4725               d->fxState[2].fx     = Ifx_Write;
4726               d->fxState[2].offset = OFFB_FPTAGS;
4727               d->fxState[2].size   = 8 * sizeof(UChar);
4728
4729               d->fxState[3].fx     = Ifx_Write;
4730               d->fxState[3].offset = OFFB_FPROUND;
4731               d->fxState[3].size   = sizeof(UInt);
4732
4733               d->fxState[4].fx     = Ifx_Write;
4734               d->fxState[4].offset = OFFB_FC3210;
4735               d->fxState[4].size   = sizeof(UInt);
4736
4737               stmt( IRStmt_Dirty(d) );
4738
4739               DIP("fninit\n");
4740               break;
4741            }
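
            /* The five fxState[] ranges above -- FTOP, the eight F64
               registers, the eight tag bytes, the rounding mode and the
               C3210 flags -- make up the complete x87 guest state that
               FNINIT resets.  Declaring them is what lets instrumenting
               tools (e.g. Memcheck) see that the dirty call writes that
               state, since the call itself is opaque to the IR. */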
4742
4743            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
4744               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
4745               break;
4746
4747            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
4748               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
4749               break;
4750
4751            default:
4752               goto decode_fail;
4753         }
4754      }
4755   }
4756
4757   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
4758   else
4759   if (first_opcode == 0xDC) {
4760      if (modrm < 0xC0) {
4761
4762         /* bits 5,4,3 are an opcode extension, and the modRM also
4763            specifies an address. */
4764         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4765         delta += len;
4766
4767         switch (gregOfRM(modrm)) {
4768
4769            case 0: /* FADD double-real */
4770               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
4771               break;
4772
4773            case 1: /* FMUL double-real */
4774               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
4775               break;
4776
4777            case 2: /* FCOM double-real */
4778               DIP("fcoml %s\n", dis_buf);
4779               /* This forces C1 to zero, which isn't right. */
4780               put_C3210(
4781                   binop( Iop_And32,
4782                          binop(Iop_Shl32,
4783                                binop(Iop_CmpF64,
4784                                      get_ST(0),
4785                                      loadLE(Ity_F64,mkexpr(addr))),
4786                                mkU8(8)),
4787                          mkU32(0x4500)
4788                   ));
4789               break;
4790
4791            case 3: /* FCOMP double-real */
4792               DIP("fcompl %s\n", dis_buf);
4793               /* This forces C1 to zero, which isn't right. */
4794               put_C3210(
4795                   binop( Iop_And32,
4796                          binop(Iop_Shl32,
4797                                binop(Iop_CmpF64,
4798                                      get_ST(0),
4799                                      loadLE(Ity_F64,mkexpr(addr))),
4800                                mkU8(8)),
4801                          mkU32(0x4500)
4802                   ));
4803               fp_pop();
4804               break;
4805
4806            case 4: /* FSUB double-real */
4807               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
4808               break;
4809
4810            case 5: /* FSUBR double-real */
4811               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
4812               break;
4813
4814            case 6: /* FDIV double-real */
4815               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
4816               break;
4817
4818            case 7: /* FDIVR double-real */
4819               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
4820               break;
4821
4822            default:
4823               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4824               vex_printf("first_opcode == 0xDC\n");
4825               goto decode_fail;
4826         }
4827
4828      } else {
4829
4830         delta++;
4831         switch (modrm) {
4832
4833            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
4834               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
4835               break;
4836
4837            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
4838               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
4839               break;
4840
4841            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
4842               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
4843               break;
4844
4845            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
4846               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
4847               break;
4848
4849            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
4850               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
4851               break;
4852
4853            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
4854               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
4855               break;
4856
4857            default:
4858               goto decode_fail;
4859         }
4860
4861      }
4862   }
4863
4864   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
4865   else
4866   if (first_opcode == 0xDD) {
4867
4868      if (modrm < 0xC0) {
4869
4870         /* bits 5,4,3 are an opcode extension, and the modRM also
4871            specifies an address. */
4872         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4873         delta += len;
4874
4875         switch (gregOfRM(modrm)) {
4876
4877            case 0: /* FLD double-real */
4878               DIP("fldl %s\n", dis_buf);
4879               fp_push();
4880               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
4881               break;
4882
4883            case 1: /* FISTTPQ m64 (SSE3) */
4884               DIP("fisttpll %s\n", dis_buf);
4885               storeLE( mkexpr(addr),
4886                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
4887               fp_pop();
4888               break;
4889
4890            case 2: /* FST double-real */
4891               DIP("fstl %s\n", dis_buf);
4892               storeLE(mkexpr(addr), get_ST(0));
4893               break;
4894
4895            case 3: /* FSTP double-real */
4896               DIP("fstpl %s\n", dis_buf);
4897               storeLE(mkexpr(addr), get_ST(0));
4898               fp_pop();
4899               break;
4900
4901            case 4: { /* FRSTOR m108 */
4902               /* Uses dirty helper:
4903                     VexEmWarn x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, Addr32 ) */
4904               IRTemp   ew = newTemp(Ity_I32);
4905               IRDirty* d  = unsafeIRDirty_0_N (
4906                                0/*regparms*/,
4907                                "x86g_dirtyhelper_FRSTOR",
4908                                &x86g_dirtyhelper_FRSTOR,
4909                                mkIRExprVec_1( mkexpr(addr) )
4910                             );
4911               d->needsBBP = True;
4912               d->tmp      = ew;
4913               /* declare we're reading memory */
4914               d->mFx   = Ifx_Read;
4915               d->mAddr = mkexpr(addr);
4916               d->mSize = 108;
4917
4918               /* declare we're writing guest state */
4919               d->nFxState = 5;
4920
4921               d->fxState[0].fx     = Ifx_Write;
4922               d->fxState[0].offset = OFFB_FTOP;
4923               d->fxState[0].size   = sizeof(UInt);
4924
4925               d->fxState[1].fx     = Ifx_Write;
4926               d->fxState[1].offset = OFFB_FPREGS;
4927               d->fxState[1].size   = 8 * sizeof(ULong);
4928
4929               d->fxState[2].fx     = Ifx_Write;
4930               d->fxState[2].offset = OFFB_FPTAGS;
4931               d->fxState[2].size   = 8 * sizeof(UChar);
4932
4933               d->fxState[3].fx     = Ifx_Write;
4934               d->fxState[3].offset = OFFB_FPROUND;
4935               d->fxState[3].size   = sizeof(UInt);
4936
4937               d->fxState[4].fx     = Ifx_Write;
4938               d->fxState[4].offset = OFFB_FC3210;
4939               d->fxState[4].size   = sizeof(UInt);
4940
4941               stmt( IRStmt_Dirty(d) );
4942
4943               /* ew contains any emulation warning we may need to
4944                  issue.  If needed, side-exit to the next insn,
4945                  reporting the warning, so that Valgrind's dispatcher
4946                  sees the warning. */
4947               put_emwarn( mkexpr(ew) );
4948               stmt(
4949                  IRStmt_Exit(
4950                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4951                     Ijk_EmWarn,
4952                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
4953                  )
4954               );
4955
4956               DIP("frstor %s\n", dis_buf);
4957               break;
4958            }
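
            /* The ew/IRStmt_Exit pairing above is the usual emulation
               warning idiom: the helper's return value is parked in the
               guest EMWARN field, and a conditional side-exit with
               Ijk_EmWarn returns control to the dispatcher at the next
               insn whenever it is nonzero, so the warning gets reported.
               In the common case (ew == 0) the exit is not taken and
               execution falls straight through. */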
4959
4960            case 6: { /* FNSAVE m108 */
4961               /* Uses dirty helper:
4962                     void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, UInt ) */
4963               IRDirty* d = unsafeIRDirty_0_N (
4964                               0/*regparms*/,
4965                               "x86g_dirtyhelper_FSAVE",
4966                               &x86g_dirtyhelper_FSAVE,
4967                               mkIRExprVec_1( mkexpr(addr) )
4968                            );
4969               d->needsBBP = True;
4970               /* declare we're writing memory */
4971               d->mFx   = Ifx_Write;
4972               d->mAddr = mkexpr(addr);
4973               d->mSize = 108;
4974
4975               /* declare we're reading guest state */
4976               d->nFxState = 5;
4977
4978               d->fxState[0].fx     = Ifx_Read;
4979               d->fxState[0].offset = OFFB_FTOP;
4980               d->fxState[0].size   = sizeof(UInt);
4981
4982               d->fxState[1].fx     = Ifx_Read;
4983               d->fxState[1].offset = OFFB_FPREGS;
4984               d->fxState[1].size   = 8 * sizeof(ULong);
4985
4986               d->fxState[2].fx     = Ifx_Read;
4987               d->fxState[2].offset = OFFB_FPTAGS;
4988               d->fxState[2].size   = 8 * sizeof(UChar);
4989
4990               d->fxState[3].fx     = Ifx_Read;
4991               d->fxState[3].offset = OFFB_FPROUND;
4992               d->fxState[3].size   = sizeof(UInt);
4993
4994               d->fxState[4].fx     = Ifx_Read;
4995               d->fxState[4].offset = OFFB_FC3210;
4996               d->fxState[4].size   = sizeof(UInt);
4997
4998               stmt( IRStmt_Dirty(d) );
4999
5000               DIP("fnsave %s\n", dis_buf);
5001               break;
5002            }
5003
5004            case 7: { /* FNSTSW m16 */
5005               IRExpr* sw = get_FPU_sw();
5006               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
5007               storeLE( mkexpr(addr), sw );
5008               DIP("fnstsw %s\n", dis_buf);
5009               break;
5010            }
5011
5012            default:
5013               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5014               vex_printf("first_opcode == 0xDD\n");
5015               goto decode_fail;
5016         }
5017      } else {
5018         delta++;
5019         switch (modrm) {
5020
5021            case 0xC0 ... 0xC7: /* FFREE %st(?) */
5022               r_dst = (UInt)modrm - 0xC0;
5023               DIP("ffree %%st(%d)\n", (Int)r_dst);
5024               put_ST_TAG ( r_dst, mkU8(0) );
5025               break;
5026
5027            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
5028               r_dst = (UInt)modrm - 0xD0;
5029               DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
5030               /* P4 manual says: "If the destination operand is a
5031                  non-empty register, the invalid-operation exception
5032                  is not generated."  Hence put_ST_UNCHECKED. */
5033               put_ST_UNCHECKED(r_dst, get_ST(0));
5034               break;
5035
5036            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
5037               r_dst = (UInt)modrm - 0xD8;
5038               DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
5039               /* P4 manual says: "If the destination operand is a
5040                  non-empty register, the invalid-operation exception
5041                  is not generated."  Hence put_ST_UNCHECKED. */
5042               put_ST_UNCHECKED(r_dst, get_ST(0));
5043               fp_pop();
5044               break;
5045
5046            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
5047               r_dst = (UInt)modrm - 0xE0;
5048               DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
5049               /* This forces C1 to zero, which isn't right. */
5050               put_C3210(
5051                   binop( Iop_And32,
5052                          binop(Iop_Shl32,
5053                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5054                                mkU8(8)),
5055                          mkU32(0x4500)
5056                   ));
5057               break;
5058
5059            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
5060               r_dst = (UInt)modrm - 0xE8;
5061               DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
5062               /* This forces C1 to zero, which isn't right. */
5063               put_C3210(
5064                   binop( Iop_And32,
5065                          binop(Iop_Shl32,
5066                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5067                                mkU8(8)),
5068                          mkU32(0x4500)
5069                   ));
5070               fp_pop();
5071               break;
5072
5073            default:
5074               goto decode_fail;
5075         }
5076      }
5077   }
5078
5079   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
5080   else
5081   if (first_opcode == 0xDE) {
5082
5083      if (modrm < 0xC0) {
5084
5085         /* bits 5,4,3 are an opcode extension, and the modRM also
5086            specifies an address. */
5087         IROp   fop;
5088         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5089         delta += len;
5090
5091         switch (gregOfRM(modrm)) {
5092
5093            case 0: /* FIADD m16int */ /* ST(0) += m16int */
5094               DIP("fiaddw %s\n", dis_buf);
5095               fop = Iop_AddF64;
5096               goto do_fop_m16;
5097
5098            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
5099               DIP("fimulw %s\n", dis_buf);
5100               fop = Iop_MulF64;
5101               goto do_fop_m16;
5102
5103            case 2: /* FICOM m16int */
5104               DIP("ficomw %s\n", dis_buf);
5105               /* This forces C1 to zero, which isn't right. */
5106               put_C3210(
5107                   binop( Iop_And32,
5108                          binop(Iop_Shl32,
5109                                binop(Iop_CmpF64,
5110                                      get_ST(0),
5111                                      unop(Iop_I32StoF64,
5112                                         unop(Iop_16Sto32,
5113                                           loadLE(Ity_I16,mkexpr(addr))))),
5114                                mkU8(8)),
5115                          mkU32(0x4500)
5116                   ));
5117               break;
5118
5119            case 3: /* FICOMP m16int */
5120               DIP("ficompw %s\n", dis_buf);
5121               /* This forces C1 to zero, which isn't right. */
5122               put_C3210(
5123                   binop( Iop_And32,
5124                          binop(Iop_Shl32,
5125                                binop(Iop_CmpF64,
5126                                      get_ST(0),
5127                                      unop(Iop_I32StoF64,
5128                                         unop(Iop_16Sto32,
5129                                              loadLE(Ity_I16,mkexpr(addr))))),
5130                                mkU8(8)),
5131                          mkU32(0x4500)
5132                   ));
5133               fp_pop();
5134               break;
5135
5136            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
5137               DIP("fisubw %s\n", dis_buf);
5138               fop = Iop_SubF64;
5139               goto do_fop_m16;
5140
5141            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
5142               DIP("fisubrw %s\n", dis_buf);
5143               fop = Iop_SubF64;
5144               goto do_foprev_m16;
5145
5146            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
5147               DIP("fidivw %s\n", dis_buf);
5148               fop = Iop_DivF64;
5149               goto do_fop_m16;
5150
5151            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
5152               DIP("fidivrw %s\n", dis_buf);
5153               fop = Iop_DivF64;
5154               goto do_foprev_m16;
5155
5156            do_fop_m16:
5157               put_ST_UNCHECKED(0,
5158                  triop(fop,
5159                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5160                        get_ST(0),
5161                        unop(Iop_I32StoF64,
5162                             unop(Iop_16Sto32,
5163                                  loadLE(Ity_I16, mkexpr(addr))))));
5164               break;
5165
5166            do_foprev_m16:
5167               put_ST_UNCHECKED(0,
5168                  triop(fop,
5169                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5170                        unop(Iop_I32StoF64,
5171                             unop(Iop_16Sto32,
5172                                  loadLE(Ity_I16, mkexpr(addr)))),
5173                        get_ST(0)));
5174               break;
5175
5176            default:
5177               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5178               vex_printf("first_opcode == 0xDE\n");
5179               goto decode_fail;
5180         }
5181
5182      } else {
5183
5184         delta++;
5185         switch (modrm) {
5186
5187            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
5188               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
5189               break;
5190
5191            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
5192               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
5193               break;
5194
5195            case 0xD9: /* FCOMPP %st(0),%st(1) */
5196               DIP("fcompp %%st(0),%%st(1)\n");
5197               /* This forces C1 to zero, which isn't right. */
5198               put_C3210(
5199                   binop( Iop_And32,
5200                          binop(Iop_Shl32,
5201                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
5202                                mkU8(8)),
5203                          mkU32(0x4500)
5204                   ));
5205               fp_pop();
5206               fp_pop();
5207               break;
5208
5209            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
5210               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
5211               break;
5212
5213            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
5214               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
5215               break;
5216
5217            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
5218               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
5219               break;
5220
5221            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
5222               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
5223               break;
5224
5225            default:
5226               goto decode_fail;
5227         }
5228
5229      }
5230   }
5231
5232   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
5233   else
5234   if (first_opcode == 0xDF) {
5235
5236      if (modrm < 0xC0) {
5237
5238         /* bits 5,4,3 are an opcode extension, and the modRM also
5239            specifies an address. */
5240         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5241         delta += len;
5242
5243         switch (gregOfRM(modrm)) {
5244
5245            case 0: /* FILD m16int */
5246               DIP("fildw %s\n", dis_buf);
5247               fp_push();
5248               put_ST(0, unop(Iop_I32StoF64,
5249                              unop(Iop_16Sto32,
5250                                   loadLE(Ity_I16, mkexpr(addr)))));
5251               break;
5252
5253            case 1: /* FISTTPS m16 (SSE3) */
5254               DIP("fisttps %s\n", dis_buf);
5255               storeLE( mkexpr(addr),
5256                        binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
5257               fp_pop();
5258               break;
5259
5260            case 2: /* FIST m16 */
5261               DIP("fists %s\n", dis_buf);
5262               storeLE( mkexpr(addr),
5263                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5264               break;
5265
5266            case 3: /* FISTP m16 */
5267               DIP("fistps %s\n", dis_buf);
5268               storeLE( mkexpr(addr),
5269                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5270               fp_pop();
5271               break;
5272
5273            case 5: /* FILD m64 */
5274               DIP("fildll %s\n", dis_buf);
5275               fp_push();
5276               put_ST(0, binop(Iop_I64StoF64,
5277                               get_roundingmode(),
5278                               loadLE(Ity_I64, mkexpr(addr))));
5279               break;
5280
5281            case 7: /* FISTP m64 */
5282               DIP("fistpll %s\n", dis_buf);
5283               storeLE( mkexpr(addr),
5284                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
5285               fp_pop();
5286               break;
5287
5288            default:
5289               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5290               vex_printf("first_opcode == 0xDF\n");
5291               goto decode_fail;
5292         }
5293
5294      } else {
5295
5296         delta++;
5297         switch (modrm) {
5298
5299            case 0xC0: /* FFREEP %st(0) */
5300               DIP("ffreep %%st(%d)\n", 0);
5301               put_ST_TAG ( 0, mkU8(0) );
5302               fp_pop();
5303               break;
5304
5305            case 0xE0: /* FNSTSW %ax */
5306               DIP("fnstsw %%ax\n");
5307               /* Get the FPU status word value and dump it in %AX. */
5308               if (0) {
5309                  /* The obvious thing to do is simply dump the 16-bit
5310                     status word value in %AX.  However, due to a
5311                     limitation in Memcheck's origin tracking
5312                     machinery, this causes Memcheck not to track the
5313                     origin of any undefinedness into %AH (only into
5314                     %AL/%AX/%EAX), which means origins are lost in
5315                     the sequence "fnstsw %ax; test $M,%ah; jcond .." */
5316                  putIReg(2, R_EAX, get_FPU_sw());
5317               } else {
5318                  /* So a somewhat lame kludge is to make it very
5319                     clear to Memcheck that the value is written to
5320                     both %AH and %AL.  This generates marginally
5321                     worse code, but I don't think it matters much. */
5322                  IRTemp t16 = newTemp(Ity_I16);
5323                  assign(t16, get_FPU_sw());
5324                  putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
5325                  putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
5326               }
5327               break;
5328
5329            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
5330               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
5331               break;
5332
5333            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
5334               /* not really right since COMIP != UCOMIP */
5335               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
5336               break;
5337
5338            default:
5339               goto decode_fail;
5340         }
5341      }
5342
5343   }
5344
5345   else
5346   vpanic("dis_FPU(x86): invalid primary opcode");
5347
5348   *decode_ok = True;
5349   return delta;
5350
5351  decode_fail:
5352   *decode_ok = False;
5353   return delta;
5354}
5355
5356
5357/*------------------------------------------------------------*/
5358/*---                                                      ---*/
5359/*--- MMX INSTRUCTIONS                                     ---*/
5360/*---                                                      ---*/
5361/*------------------------------------------------------------*/
5362
5363/* Effect of MMX insns on x87 FPU state (table 11-2 of
5364   IA32 arch manual, volume 3):
5365
5366   Read from, or write to MMX register (viz, any insn except EMMS):
5367   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
5368   * FP stack pointer set to zero
5369
5370   EMMS:
5371   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
5372   * FP stack pointer set to zero
5373*/
5374
5375static void do_MMX_preamble ( void )
5376{
5377   Int         i;
5378   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5379   IRExpr*     zero  = mkU32(0);
5380   IRExpr*     tag1  = mkU8(1);
5381   put_ftop(zero);
5382   for (i = 0; i < 8; i++)
5383      stmt( IRStmt_PutI( descr, zero, i, tag1 ) );
5384}
5385
5386static void do_EMMS_preamble ( void )
5387{
5388   Int         i;
5389   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5390   IRExpr*     zero  = mkU32(0);
5391   IRExpr*     tag0  = mkU8(0);
5392   put_ftop(zero);
5393   for (i = 0; i < 8; i++)
5394      stmt( IRStmt_PutI( descr, zero, i, tag0 ) );
5395}
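
/* For illustration only, not part of the translator: assuming the
   FTOP/FPTAGS layout described in the table above, the two preambles
   amount to this effect on the guest FPU state:

      ftop = 0;
      for (i = 0; i < 8; i++)
         fptag[i] = 1;    // do_MMX_preamble: all registers valid
      // do_EMMS_preamble is identical except it stores 0 (all empty)
*/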
5396
5397
5398static IRExpr* getMMXReg ( UInt archreg )
5399{
5400   vassert(archreg < 8);
5401   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
5402}
5403
5404
5405static void putMMXReg ( UInt archreg, IRExpr* e )
5406{
5407   vassert(archreg < 8);
5408   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
5409   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
5410}
5411
5412
5413/* Helper for non-shift MMX insns.  Note this is incomplete in the
5414   sense that it does not first call do_MMX_preamble() -- that is the
5415   responsibility of its caller. */
5416
5417static
5418UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
5419                               Int    delta,
5420                               UChar  opc,
5421                               HChar* name,
5422                               Bool   show_granularity )
5423{
5424   HChar   dis_buf[50];
5425   UChar   modrm = getIByte(delta);
5426   Bool    isReg = epartIsReg(modrm);
5427   IRExpr* argL  = NULL;
5428   IRExpr* argR  = NULL;
5429   IRExpr* argG  = NULL;
5430   IRExpr* argE  = NULL;
5431   IRTemp  res   = newTemp(Ity_I64);
5432
5433   Bool    invG  = False;
5434   IROp    op    = Iop_INVALID;
5435   void*   hAddr = NULL;
5436   HChar*  hName = NULL;
5437   Bool    eLeft = False;
5438
5439#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
5440
5441   switch (opc) {
5442      /* Original MMX ones */
5443      case 0xFC: op = Iop_Add8x8; break;
5444      case 0xFD: op = Iop_Add16x4; break;
5445      case 0xFE: op = Iop_Add32x2; break;
5446
5447      case 0xEC: op = Iop_QAdd8Sx8; break;
5448      case 0xED: op = Iop_QAdd16Sx4; break;
5449
5450      case 0xDC: op = Iop_QAdd8Ux8; break;
5451      case 0xDD: op = Iop_QAdd16Ux4; break;
5452
5453      case 0xF8: op = Iop_Sub8x8;  break;
5454      case 0xF9: op = Iop_Sub16x4; break;
5455      case 0xFA: op = Iop_Sub32x2; break;
5456
5457      case 0xE8: op = Iop_QSub8Sx8; break;
5458      case 0xE9: op = Iop_QSub16Sx4; break;
5459
5460      case 0xD8: op = Iop_QSub8Ux8; break;
5461      case 0xD9: op = Iop_QSub16Ux4; break;
5462
5463      case 0xE5: op = Iop_MulHi16Sx4; break;
5464      case 0xD5: op = Iop_Mul16x4; break;
5465      case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
5466
5467      case 0x74: op = Iop_CmpEQ8x8; break;
5468      case 0x75: op = Iop_CmpEQ16x4; break;
5469      case 0x76: op = Iop_CmpEQ32x2; break;
5470
5471      case 0x64: op = Iop_CmpGT8Sx8; break;
5472      case 0x65: op = Iop_CmpGT16Sx4; break;
5473      case 0x66: op = Iop_CmpGT32Sx2; break;
5474
5475      case 0x6B: op = Iop_QNarrow32Sx2; eLeft = True; break;
5476      case 0x63: op = Iop_QNarrow16Sx4; eLeft = True; break;
5477      case 0x67: op = Iop_QNarrow16Ux4; eLeft = True; break;
5478
5479      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
5480      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
5481      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
5482
5483      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
5484      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
5485      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
5486
5487      case 0xDB: op = Iop_And64; break;
5488      case 0xDF: op = Iop_And64; invG = True; break;
5489      case 0xEB: op = Iop_Or64; break;
5490      case 0xEF: /* Possibly do better here if argL and argR are the
5491                    same reg */
5492                 op = Iop_Xor64; break;
5493
5494      /* Introduced in SSE1 */
5495      case 0xE0: op = Iop_Avg8Ux8;    break;
5496      case 0xE3: op = Iop_Avg16Ux4;   break;
5497      case 0xEE: op = Iop_Max16Sx4;   break;
5498      case 0xDE: op = Iop_Max8Ux8;    break;
5499      case 0xEA: op = Iop_Min16Sx4;   break;
5500      case 0xDA: op = Iop_Min8Ux8;    break;
5501      case 0xE4: op = Iop_MulHi16Ux4; break;
5502      case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
5503
5504      /* Introduced in SSE2 */
5505      case 0xD4: op = Iop_Add64; break;
5506      case 0xFB: op = Iop_Sub64; break;
5507
5508      default:
5509         vex_printf("\n0x%x\n", (Int)opc);
5510         vpanic("dis_MMXop_regmem_to_reg");
5511   }
5512
5513#  undef XXX
5514
5515   argG = getMMXReg(gregOfRM(modrm));
5516   if (invG)
5517      argG = unop(Iop_Not64, argG);
5518
5519   if (isReg) {
5520      delta++;
5521      argE = getMMXReg(eregOfRM(modrm));
5522   } else {
5523      Int    len;
5524      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5525      delta += len;
5526      argE = loadLE(Ity_I64, mkexpr(addr));
5527   }
5528
5529   if (eLeft) {
5530      argL = argE;
5531      argR = argG;
5532   } else {
5533      argL = argG;
5534      argR = argE;
5535   }
5536
5537   if (op != Iop_INVALID) {
5538      vassert(hName == NULL);
5539      vassert(hAddr == NULL);
5540      assign(res, binop(op, argL, argR));
5541   } else {
5542      vassert(hName != NULL);
5543      vassert(hAddr != NULL);
5544      assign( res,
5545              mkIRExprCCall(
5546                 Ity_I64,
5547                 0/*regparms*/, hName, hAddr,
5548                 mkIRExprVec_2( argL, argR )
5549              )
5550            );
5551   }
5552
5553   putMMXReg( gregOfRM(modrm), mkexpr(res) );
5554
5555   DIP("%s%s %s, %s\n",
5556       name, show_granularity ? nameMMXGran(opc & 3) : "",
5557       ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
5558       nameMMXReg(gregOfRM(modrm)) );
5559
5560   return delta;
5561}
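
/* A note on the two operand-shuffling flags used above: eLeft swaps the
   argument order for the non-commutative cases (the pack and unpck
   opcodes want E on the left), and invG complements the G argument
   before the op is applied -- which is how PANDN (0xDF) becomes
   And64(Not64(G), E), matching the x86 "dest = ~dest & src" semantics. */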
5562
5563
5564/* Vector by scalar shift of G by the amount specified at the bottom
5565   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
5566
5567static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
5568                                 HChar* opname, IROp op )
5569{
5570   HChar   dis_buf[50];
5571   Int     alen, size;
5572   IRTemp  addr;
5573   Bool    shl, shr, sar;
5574   UChar   rm   = getIByte(delta);
5575   IRTemp  g0   = newTemp(Ity_I64);
5576   IRTemp  g1   = newTemp(Ity_I64);
5577   IRTemp  amt  = newTemp(Ity_I32);
5578   IRTemp  amt8 = newTemp(Ity_I8);
5579
5580   if (epartIsReg(rm)) {
5581      assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
5582      DIP("%s %s,%s\n", opname,
5583                        nameMMXReg(eregOfRM(rm)),
5584                        nameMMXReg(gregOfRM(rm)) );
5585      delta++;
5586   } else {
5587      addr = disAMode ( &alen, sorb, delta, dis_buf );
5588      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
5589      DIP("%s %s,%s\n", opname,
5590                        dis_buf,
5591                        nameMMXReg(gregOfRM(rm)) );
5592      delta += alen;
5593   }
5594   assign( g0,   getMMXReg(gregOfRM(rm)) );
5595   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
5596
5597   shl = shr = sar = False;
5598   size = 0;
5599   switch (op) {
5600      case Iop_ShlN16x4: shl = True; size = 16; break;
5601      case Iop_ShlN32x2: shl = True; size = 32; break;
5602      case Iop_Shl64:    shl = True; size = 64; break;
5603      case Iop_ShrN16x4: shr = True; size = 16; break;
5604      case Iop_ShrN32x2: shr = True; size = 32; break;
5605      case Iop_Shr64:    shr = True; size = 64; break;
5606      case Iop_SarN16x4: sar = True; size = 16; break;
5607      case Iop_SarN32x2: sar = True; size = 32; break;
5608      default: vassert(0);
5609   }
5610
5611   if (shl || shr) {
5612     assign(
5613        g1,
5614        IRExpr_Mux0X(
5615           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
5616           mkU64(0),
5617           binop(op, mkexpr(g0), mkexpr(amt8))
5618        )
5619     );
5620   } else
5621   if (sar) {
5622     assign(
5623        g1,
5624        IRExpr_Mux0X(
5625           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
5626           binop(op, mkexpr(g0), mkU8(size-1)),
5627           binop(op, mkexpr(g0), mkexpr(amt8))
5628        )
5629     );
5630   } else {
5631      /*NOTREACHED*/
5632      vassert(0);
5633   }
5634
5635   putMMXReg( gregOfRM(rm), mkexpr(g1) );
5636   return delta;
5637}
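
/* The Mux0X guards above implement the x86 rule for shift-by-register
   counts that are >= the lane width: logical shifts zero the destination,
   while arithmetic shifts saturate to a shift by (width - 1), leaving
   every lane filled with its sign bit.  A scalar sketch of the same rule
   for a single, hypothetical 16-bit lane:

      uint16_t psrlw_lane ( uint16_t x, uint32_t amt )
         { return amt >= 16 ? 0 : (uint16_t)(x >> amt); }

      int16_t  psraw_lane ( int16_t x, uint32_t amt )
         { return (int16_t)(x >> (amt >= 16 ? 15 : amt)); }
*/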
5638
5639
5640/* Vector by scalar shift of E by an immediate byte.  This is a
5641   straight copy of dis_SSE_shiftE_imm. */
5642
5643static
5644UInt dis_MMX_shiftE_imm ( Int delta, HChar* opname, IROp op )
5645{
5646   Bool    shl, shr, sar;
5647   UChar   rm   = getIByte(delta);
5648   IRTemp  e0   = newTemp(Ity_I64);
5649   IRTemp  e1   = newTemp(Ity_I64);
5650   UChar   amt, size;
5651   vassert(epartIsReg(rm));
5652   vassert(gregOfRM(rm) == 2
5653           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
5654   amt = getIByte(delta+1);
5655   delta += 2;
5656   DIP("%s $%d,%s\n", opname,
5657                      (Int)amt,
5658                      nameMMXReg(eregOfRM(rm)) );
5659
5660   assign( e0, getMMXReg(eregOfRM(rm)) );
5661
5662   shl = shr = sar = False;
5663   size = 0;
5664   switch (op) {
5665      case Iop_ShlN16x4: shl = True; size = 16; break;
5666      case Iop_ShlN32x2: shl = True; size = 32; break;
5667      case Iop_Shl64:    shl = True; size = 64; break;
5668      case Iop_SarN16x4: sar = True; size = 16; break;
5669      case Iop_SarN32x2: sar = True; size = 32; break;
5670      case Iop_ShrN16x4: shr = True; size = 16; break;
5671      case Iop_ShrN32x2: shr = True; size = 32; break;
5672      case Iop_Shr64:    shr = True; size = 64; break;
5673      default: vassert(0);
5674   }
5675
5676   if (shl || shr) {
5677      assign( e1, amt >= size
5678                     ? mkU64(0)
5679                     : binop(op, mkexpr(e0), mkU8(amt))
5680      );
5681   } else
5682   if (sar) {
5683      assign( e1, amt >= size
5684                     ? binop(op, mkexpr(e0), mkU8(size-1))
5685                     : binop(op, mkexpr(e0), mkU8(amt))
5686      );
5687   } else {
5688      /*NOTREACHED*/
5689      vassert(0);
5690   }
5691
5692   putMMXReg( eregOfRM(rm), mkexpr(e1) );
5693   return delta;
5694}
5695
5696
5697/* Completely handle all MMX instructions except emms. */
5698
5699static
5700UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
5701{
5702   Int   len;
5703   UChar modrm;
5704   HChar dis_buf[50];
5705   UChar opc = getIByte(delta);
5706   delta++;
5707
5708   /* dis_MMX handles all insns except emms. */
5709   do_MMX_preamble();
5710
5711   switch (opc) {
5712
5713      case 0x6E:
5714         /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G)*/
5715         if (sz != 4)
5716            goto mmx_decode_failure;
5717         modrm = getIByte(delta);
5718         if (epartIsReg(modrm)) {
5719            delta++;
5720            putMMXReg(
5721               gregOfRM(modrm),
5722               binop( Iop_32HLto64,
5723                      mkU32(0),
5724                      getIReg(4, eregOfRM(modrm)) ) );
5725            DIP("movd %s, %s\n",
5726                nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5727         } else {
5728            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5729            delta += len;
5730            putMMXReg(
5731               gregOfRM(modrm),
5732               binop( Iop_32HLto64,
5733                      mkU32(0),
5734                      loadLE(Ity_I32, mkexpr(addr)) ) );
5735            DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
5736         }
5737         break;
5738
5739      case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
5740         if (sz != 4)
5741            goto mmx_decode_failure;
5742         modrm = getIByte(delta);
5743         if (epartIsReg(modrm)) {
5744            delta++;
5745            putIReg( 4, eregOfRM(modrm),
5746                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5747            DIP("movd %s, %s\n",
5748                nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
5749         } else {
5750            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5751            delta += len;
5752            storeLE( mkexpr(addr),
5753                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5754            DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
5755         }
5756         break;
5757
5758      case 0x6F:
5759         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
5760         if (sz != 4)
5761            goto mmx_decode_failure;
5762         modrm = getIByte(delta);
5763         if (epartIsReg(modrm)) {
5764            delta++;
5765            putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
5766            DIP("movq %s, %s\n",
5767                nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5768         } else {
5769            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5770            delta += len;
5771            putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
5772            DIP("movq %s, %s\n",
5773                dis_buf, nameMMXReg(gregOfRM(modrm)));
5774         }
5775         break;
5776
5777      case 0x7F:
5778         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
5779         if (sz != 4)
5780            goto mmx_decode_failure;
5781         modrm = getIByte(delta);
5782         if (epartIsReg(modrm)) {
5783            delta++;
5784            putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
5785            DIP("movq %s, %s\n",
5786                nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
5787         } else {
5788            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5789            delta += len;
5790            storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
5791            DIP("mov(nt)q %s, %s\n",
5792                nameMMXReg(gregOfRM(modrm)), dis_buf);
5793         }
5794         break;
5795
5796      case 0xFC:
5797      case 0xFD:
5798      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
5799         if (sz != 4)
5800            goto mmx_decode_failure;
5801         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
5802         break;
5803
5804      case 0xEC:
5805      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
5806         if (sz != 4)
5807            goto mmx_decode_failure;
5808         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
5809         break;
5810
5811      case 0xDC:
5812      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5813         if (sz != 4)
5814            goto mmx_decode_failure;
5815         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
5816         break;
5817
5818      case 0xF8:
5819      case 0xF9:
5820      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
5821         if (sz != 4)
5822            goto mmx_decode_failure;
5823         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
5824         break;
5825
5826      case 0xE8:
5827      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
5828         if (sz != 4)
5829            goto mmx_decode_failure;
5830         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
5831         break;
5832
5833      case 0xD8:
5834      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5835         if (sz != 4)
5836            goto mmx_decode_failure;
5837         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
5838         break;
5839
5840      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
5841         if (sz != 4)
5842            goto mmx_decode_failure;
5843         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
5844         break;
5845
5846      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
5847         if (sz != 4)
5848            goto mmx_decode_failure;
5849         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
5850         break;
5851
5852      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
5853         vassert(sz == 4);
5854         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
5855         break;
5856
5857      case 0x74:
5858      case 0x75:
5859      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
5860         if (sz != 4)
5861            goto mmx_decode_failure;
5862         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
5863         break;
5864
5865      case 0x64:
5866      case 0x65:
5867      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
5868         if (sz != 4)
5869            goto mmx_decode_failure;
5870         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
5871         break;
5872
5873      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
5874         if (sz != 4)
5875            goto mmx_decode_failure;
5876         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
5877         break;
5878
5879      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
5880         if (sz != 4)
5881            goto mmx_decode_failure;
5882         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
5883         break;
5884
5885      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
5886         if (sz != 4)
5887            goto mmx_decode_failure;
5888         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
5889         break;
5890
5891      case 0x68:
5892      case 0x69:
5893      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
5894         if (sz != 4)
5895            goto mmx_decode_failure;
5896         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
5897         break;
5898
5899      case 0x60:
5900      case 0x61:
5901      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
5902         if (sz != 4)
5903            goto mmx_decode_failure;
5904         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
5905         break;
5906
5907      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
5908         if (sz != 4)
5909            goto mmx_decode_failure;
5910         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
5911         break;
5912
5913      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
5914         if (sz != 4)
5915            goto mmx_decode_failure;
5916         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
5917         break;
5918
5919      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
5920         if (sz != 4)
5921            goto mmx_decode_failure;
5922         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
5923         break;
5924
5925      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
5926         if (sz != 4)
5927            goto mmx_decode_failure;
5928         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
5929         break;
5930
5931#     define SHIFT_BY_REG(_name,_op)                                 \
5932                delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
5933                break;
5934
5935      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
5936      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
5937      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
5938      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
5939
5940      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
5941      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
5942      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
5943      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
5944
5945      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
5946      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
5947      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
5948
5949#     undef SHIFT_BY_REG
5950
5951      case 0x71:
5952      case 0x72:
5953      case 0x73: {
5954         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
5955         UChar byte2, subopc;
5956         if (sz != 4)
5957            goto mmx_decode_failure;
5958         byte2  = getIByte(delta);           /* amode / sub-opcode */
5959         subopc = toUChar( (byte2 >> 3) & 7 );
5960
5961#        define SHIFT_BY_IMM(_name,_op)                         \
5962             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
5963             } while (0)
5964
5965              if (subopc == 2 /*SRL*/ && opc == 0x71)
5966                 SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
5967         else if (subopc == 2 /*SRL*/ && opc == 0x72)
5968                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
5969         else if (subopc == 2 /*SRL*/ && opc == 0x73)
5970                 SHIFT_BY_IMM("psrlq", Iop_Shr64);
5971
5972         else if (subopc == 4 /*SAR*/ && opc == 0x71)
5973                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
5974         else if (subopc == 4 /*SAR*/ && opc == 0x72)
5975                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);
5976
5977         else if (subopc == 6 /*SHL*/ && opc == 0x71)
5978                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
5979         else if (subopc == 6 /*SHL*/ && opc == 0x72)
5980                 SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
5981         else if (subopc == 6 /*SHL*/ && opc == 0x73)
5982                 SHIFT_BY_IMM("psllq", Iop_Shl64);
5983
5984         else goto mmx_decode_failure;
5985
5986#        undef SHIFT_BY_IMM
5987         break;
5988      }
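
      /* For the three groups above, the byte following the opcode is
         itself a modrm byte whose reg field (bits 5..3, extracted as
         subopc) selects the operation: /2 = shift right logical,
         /4 = shift right arithmetic, /6 = shift left, in the same way
         as the integer shift-group encodings. */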
5989
5990      case 0xF7: {
5991         IRTemp addr    = newTemp(Ity_I32);
5992         IRTemp regD    = newTemp(Ity_I64);
5993         IRTemp regM    = newTemp(Ity_I64);
5994         IRTemp mask    = newTemp(Ity_I64);
5995         IRTemp olddata = newTemp(Ity_I64);
5996         IRTemp newdata = newTemp(Ity_I64);
5997
5998         modrm = getIByte(delta);
5999         if (sz != 4 || (!epartIsReg(modrm)))
6000            goto mmx_decode_failure;
6001         delta++;
6002
6003         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
6004         assign( regM, getMMXReg( eregOfRM(modrm) ));
6005         assign( regD, getMMXReg( gregOfRM(modrm) ));
6006         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
6007         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
6008         assign( newdata,
6009                 binop(Iop_Or64,
6010                       binop(Iop_And64,
6011                             mkexpr(regD),
6012                             mkexpr(mask) ),
6013                       binop(Iop_And64,
6014                             mkexpr(olddata),
6015                             unop(Iop_Not64, mkexpr(mask)))) );
6016         storeLE( mkexpr(addr), mkexpr(newdata) );
6017         DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
6018                                 nameMMXReg( gregOfRM(modrm) ) );
6019         break;
6020      }
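
      /* How the MASKMOVQ store mask works: SarN8x8 by 7 copies each
         byte's sign bit across that byte, so mask is 0xFF wherever the
         selector byte in regM has its top bit set and 0x00 elsewhere.
         The Or/And merge then takes regD's byte where the mask is 0xFF
         and keeps the old memory byte where it is 0x00.  For one lane:

            sel = 0x80  ->  mask = 0xFF  ->  stored byte = regD byte
            sel = 0x7F  ->  mask = 0x00  ->  stored byte = old memory byte

         Note this translation is a non-atomic read-modify-write of the
         whole quadword at %edi rather than a byte-granular store. */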
6021
6022      /* --- MMX decode failure --- */
6023      default:
6024      mmx_decode_failure:
6025         *decode_ok = False;
6026         return delta; /* ignored */
6027
6028   }
6029
6030   *decode_ok = True;
6031   return delta;
6032}
6033
6034
6035/*------------------------------------------------------------*/
6036/*--- More misc arithmetic and other obscure insns.        ---*/
6037/*------------------------------------------------------------*/
6038
6039/* Double length left and right shifts.  Apparently only required in
6040   v-size (no b- variant). */
6041static
6042UInt dis_SHLRD_Gv_Ev ( UChar sorb,
6043                       Int delta, UChar modrm,
6044                       Int sz,
6045                       IRExpr* shift_amt,
6046                       Bool amt_is_literal,
6047                       HChar* shift_amt_txt,
6048                       Bool left_shift )
6049{
6050   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
6051      for printing it.  On entry, eip points at the modrm byte. */
6052   Int len;
6053   HChar dis_buf[50];
6054
6055   IRType ty       = szToITy(sz);
6056   IRTemp gsrc     = newTemp(ty);
6057   IRTemp esrc     = newTemp(ty);
6058   IRTemp addr     = IRTemp_INVALID;
6059   IRTemp tmpSH    = newTemp(Ity_I8);
6060   IRTemp tmpL     = IRTemp_INVALID;
6061   IRTemp tmpRes   = IRTemp_INVALID;
6062   IRTemp tmpSubSh = IRTemp_INVALID;
6063   IROp   mkpair;
6064   IROp   getres;
6065   IROp   shift;
6066   IRExpr* mask = NULL;
6067
6068   vassert(sz == 2 || sz == 4);
6069
6070   /* The E-part is the destination; this is shifted.  The G-part
6071      supplies bits to be shifted into the E-part, but is not
6072      changed.
6073
6074      If shifting left, form a double-length word with E at the top
6075      and G at the bottom, and shift this left.  The result is then in
6076      the high part.
6077
6078      If shifting right, form a double-length word with G at the top
6079      and E at the bottom, and shift this right.  The result is then
6080      at the bottom.  */
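
   /* Concrete example (sz == 4, left shift): with E = 0xAAAAAAAA,
      G = 0xBBBBBBBB and a shift amount of 8, the pair is
      0xAAAAAAAABBBBBBBB; shifting it left by 8 and taking the high 32
      bits gives 0xAAAAAABB, i.e. (E << 8) | (G >> 24), which is the
      SHLD result written back to the E operand. */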
6081
6082   /* Fetch the operands. */
6083
6084   assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
6085
6086   if (epartIsReg(modrm)) {
6087      delta++;
6088      assign( esrc, getIReg(sz, eregOfRM(modrm)) );
6089      DIP("sh%cd%c %s, %s, %s\n",
6090          ( left_shift ? 'l' : 'r' ), nameISize(sz),
6091          shift_amt_txt,
6092          nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
6093   } else {
6094      addr = disAMode ( &len, sorb, delta, dis_buf );
6095      delta += len;
6096      assign( esrc, loadLE(ty, mkexpr(addr)) );
6097      DIP("sh%cd%c %s, %s, %s\n",
6098          ( left_shift ? 'l' : 'r' ), nameISize(sz),
6099          shift_amt_txt,
6100          nameIReg(sz, gregOfRM(modrm)), dis_buf);
6101   }
6102
6103   /* Round up the relevant primops. */
6104
6105   if (sz == 4) {
6106      tmpL     = newTemp(Ity_I64);
6107      tmpRes   = newTemp(Ity_I32);
6108      tmpSubSh = newTemp(Ity_I32);
6109      mkpair   = Iop_32HLto64;
6110      getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
6111      shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
6112      mask     = mkU8(31);
6113   } else {
6114      /* sz == 2 */
6115      tmpL     = newTemp(Ity_I32);
6116      tmpRes   = newTemp(Ity_I16);
6117      tmpSubSh = newTemp(Ity_I16);
6118      mkpair   = Iop_16HLto32;
6119      getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
6120      shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
6121      mask     = mkU8(15);
6122   }
6123
6124   /* Do the shift, calculate the subshift value, and set
6125      the flag thunk. */
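   /* (The "subshift" value is the result of shifting by one place less
      than the masked amount; the eflags helpers later recover CF from
      its top or bottom bit, depending on the shift direction.) */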
6126
6127   assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
6128
6129   if (left_shift)
6130      assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
6131   else
6132      assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
6133
6134   assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
6135   assign( tmpSubSh,
6136           unop(getres,
6137                binop(shift,
6138                      mkexpr(tmpL),
6139                      binop(Iop_And8,
6140                            binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
6141                            mask))) );
6142
6143   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
6144                              tmpRes, tmpSubSh, ty, tmpSH );
6145
6146   /* Put result back. */
6147
6148   if (epartIsReg(modrm)) {
6149      putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
6150   } else {
6151      storeLE( mkexpr(addr), mkexpr(tmpRes) );
6152   }
6153
6154   if (amt_is_literal) delta++;
6155   return delta;
6156}
6157
6158
6159/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
6160   required. */
6161
6162typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
6163
6164static HChar* nameBtOp ( BtOp op )
6165{
6166   switch (op) {
6167      case BtOpNone:  return "";
6168      case BtOpSet:   return "s";
6169      case BtOpReset: return "r";
6170      case BtOpComp:  return "c";
6171      default: vpanic("nameBtOp(x86)");
6172   }
6173}
6174
6175
6176static
6177UInt dis_bt_G_E ( VexAbiInfo* vbi,
6178                  UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
6179{
6180   HChar  dis_buf[50];
6181   UChar  modrm;
6182   Int    len;
6183   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
6184          t_addr1, t_esp, t_mask, t_new;
6185
6186   vassert(sz == 2 || sz == 4);
6187
6188   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
6189             = t_addr0 = t_addr1 = t_esp
6190             = t_mask = t_new = IRTemp_INVALID;
6191
6192   t_fetched = newTemp(Ity_I8);
6193   t_new     = newTemp(Ity_I8);
6194   t_bitno0  = newTemp(Ity_I32);
6195   t_bitno1  = newTemp(Ity_I32);
6196   t_bitno2  = newTemp(Ity_I8);
6197   t_addr1   = newTemp(Ity_I32);
6198   modrm     = getIByte(delta);
6199
6200   assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
6201
6202   if (epartIsReg(modrm)) {
6203      delta++;
6204      /* Get it onto the client's stack. */
6205      t_esp = newTemp(Ity_I32);
6206      t_addr0 = newTemp(Ity_I32);
6207
6208      /* For the choice of the value 128, see comment in dis_bt_G_E in
6209         guest_amd64_toIR.c.  We point out here only that 128 is
6210         fast-cased in Memcheck and is > 0, so seems like a good
6211         choice. */
6212      vassert(vbi->guest_stack_redzone_size == 0);
6213      assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
6214      putIReg(4, R_ESP, mkexpr(t_esp));
6215
6216      storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
6217
6218      /* Make t_addr0 point at it. */
6219      assign( t_addr0, mkexpr(t_esp) );
6220
6221      /* Mask out the upper bits of the bit number, since for a register
6222         destination the bit offset is taken modulo the operand width. */
6223      assign( t_bitno1, binop(Iop_And32,
6224                              mkexpr(t_bitno0),
6225                              mkU32(sz == 4 ? 31 : 15)) );
6226
6227   } else {
6228      t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
6229      delta += len;
6230      assign( t_bitno1, mkexpr(t_bitno0) );
6231   }
6232
6233   /* At this point: t_addr0 is the address being operated on.  If it
6234      was a reg, we will have pushed it onto the client's stack.
6235      t_bitno1 is the bit number, suitably masked in the case of a
6236      reg.  */
6237
6238   /* Now the main sequence. */
6239   assign( t_addr1,
6240           binop(Iop_Add32,
6241                 mkexpr(t_addr0),
6242                 binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
6243
6244   /* t_addr1 now holds effective address */
6245
6246   assign( t_bitno2,
6247           unop(Iop_32to8,
6248                binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
6249
6250   /* t_bitno2 contains offset of bit within byte */
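   /* Example: a bit index of 35 selects the byte at t_addr0 + 4
      (35 >>s 3) and bit 3 within it (35 & 7); the arithmetic shift also
      copes with the negative bit offsets that BT et al permit on memory
      operands. */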
6251
6252   if (op != BtOpNone) {
6253      t_mask = newTemp(Ity_I8);
6254      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
6255   }
6256
6257   /* t_mask is now a suitable byte mask */
6258
6259   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
6260
6261   if (op != BtOpNone) {
6262      switch (op) {
6263         case BtOpSet:
6264            assign( t_new,
6265                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
6266            break;
6267         case BtOpComp:
6268            assign( t_new,
6269                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
6270            break;
6271         case BtOpReset:
6272            assign( t_new,
6273                    binop(Iop_And8, mkexpr(t_fetched),
6274                                    unop(Iop_Not8, mkexpr(t_mask))) );
6275            break;
6276         default:
6277            vpanic("dis_bt_G_E(x86)");
6278      }
6279      if (locked && !epartIsReg(modrm)) {
6280         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
6281                                 mkexpr(t_new)/*new*/,
6282                                 guest_EIP_curr_instr );
6283      } else {
6284         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
6285      }
6286   }
6287
6288   /* Side effect done; now get selected bit into Carry flag */
6289   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
6290   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6291   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6292   stmt( IRStmt_Put(
6293            OFFB_CC_DEP1,
6294            binop(Iop_And32,
6295                  binop(Iop_Shr32,
6296                        unop(Iop_8Uto32, mkexpr(t_fetched)),
6297                        mkexpr(t_bitno2)),
6298                  mkU32(1)))
6299       );
6300   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6301      elimination of previous stores to this field work better. */
6302   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6303
6304   /* Move reg operand from stack back to reg */
6305   if (epartIsReg(modrm)) {
6306      /* t_esp still points at it. */
6307      putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
6308      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
6309   }
6310
6311   DIP("bt%s%c %s, %s\n",
6312       nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
6313       ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
6314
6315   return delta;
6316}
6317
6318
6319
6320/* Handle BSF/BSR.  Only v-size seems necessary. */
6321static
6322UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
6323{
6324   Bool   isReg;
6325   UChar  modrm;
6326   HChar  dis_buf[50];
6327
6328   IRType ty  = szToITy(sz);
6329   IRTemp src = newTemp(ty);
6330   IRTemp dst = newTemp(ty);
6331
6332   IRTemp src32 = newTemp(Ity_I32);
6333   IRTemp dst32 = newTemp(Ity_I32);
6334   IRTemp src8  = newTemp(Ity_I8);
6335
6336   vassert(sz == 4 || sz == 2);
6337
6338   modrm = getIByte(delta);
6339
6340   isReg = epartIsReg(modrm);
6341   if (isReg) {
6342      delta++;
6343      assign( src, getIReg(sz, eregOfRM(modrm)) );
6344   } else {
6345      Int    len;
6346      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6347      delta += len;
6348      assign( src, loadLE(ty, mkexpr(addr)) );
6349   }
6350
6351   DIP("bs%c%c %s, %s\n",
6352       fwds ? 'f' : 'r', nameISize(sz),
6353       ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
6354       nameIReg(sz, gregOfRM(modrm)));
6355
6356   /* Generate an 8-bit expression which is zero iff the
6357      original is zero, and nonzero otherwise */
6358   assign( src8,
6359           unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_CmpNE8),
6360                           mkexpr(src), mkU(ty,0))) );
6361
6362   /* Flags: Z is 1 iff source value is zero.  All others
6363      are undefined -- we force them to zero. */
6364   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6365   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6366   stmt( IRStmt_Put(
6367            OFFB_CC_DEP1,
6368            IRExpr_Mux0X( mkexpr(src8),
6369                          /* src==0 */
6370                          mkU32(X86G_CC_MASK_Z),
6371                          /* src!=0 */
6372                          mkU32(0)
6373                        )
6374       ));
6375   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6376      elimination of previous stores to this field work better. */
6377   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6378
6379   /* Result: if the source value is zero, we can't use
6380      Iop_Clz32/Iop_Ctz32, as they have no defined result in that case.
6381      But anyway, Intel x86 semantics say the result is undefined in
6382      such situations.  Hence handle the zero case specially. */
6383
6384   /* Bleh.  What we compute:
6385
6386          bsf32:  if src == 0 then 0 else  Ctz32(src)
6387          bsr32:  if src == 0 then 0 else  31 - Clz32(src)
6388
6389          bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
6390          bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
6391
6392      First, widen src to 32 bits if it is not already.
6393
6394      Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
6395      dst register unchanged when src == 0.  Hence change accordingly.
6396   */
6397   if (sz == 2)
6398      assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
6399   else
6400      assign( src32, mkexpr(src) );
6401
6402   /* The main computation, guarding against zero. */
6403   assign( dst32,
6404           IRExpr_Mux0X(
6405              mkexpr(src8),
6406              /* src == 0 -- leave dst unchanged */
6407              widenUto32( getIReg( sz, gregOfRM(modrm) ) ),
6408              /* src != 0 */
6409              fwds ? unop(Iop_Ctz32, mkexpr(src32))
6410                   : binop(Iop_Sub32,
6411                           mkU32(31),
6412                           unop(Iop_Clz32, mkexpr(src32)))
6413           )
6414         );
6415
6416   if (sz == 2)
6417      assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
6418   else
6419      assign( dst, mkexpr(dst32) );
6420
6421   /* dump result back */
6422   putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
6423
6424   return delta;
6425}
6426
6427
6428static
6429void codegen_xchg_eAX_Reg ( Int sz, Int reg )
6430{
6431   IRType ty = szToITy(sz);
6432   IRTemp t1 = newTemp(ty);
6433   IRTemp t2 = newTemp(ty);
6434   vassert(sz == 2 || sz == 4);
6435   assign( t1, getIReg(sz, R_EAX) );
6436   assign( t2, getIReg(sz, reg) );
6437   putIReg( sz, R_EAX, mkexpr(t2) );
6438   putIReg( sz, reg, mkexpr(t1) );
6439   DIP("xchg%c %s, %s\n",
6440       nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
6441}
6442
6443
6444static
6445void codegen_SAHF ( void )
6446{
6447   /* Set the flags to:
6448      (x86g_calculate_eflags_all() & X86G_CC_MASK_O)  -- retain the old O flag
6449      | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6450                |X86G_CC_MASK_P|X86G_CC_MASK_C))
6451   */
6452   UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6453                       |X86G_CC_MASK_C|X86G_CC_MASK_P;
6454   IRTemp oldflags   = newTemp(Ity_I32);
6455   assign( oldflags, mk_x86g_calculate_eflags_all() );
6456   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6457   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6458   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6459   stmt( IRStmt_Put( OFFB_CC_DEP1,
6460         binop(Iop_Or32,
6461               binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
6462               binop(Iop_And32,
6463                     binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
6464                     mkU32(mask_SZACP))
6465              )
6466   ));
6467   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6468      elimination of previous stores to this field work better. */
6469   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6470}
6471
6472
6473static
6474void codegen_LAHF ( void  )
6475{
6476   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
6477   IRExpr* eax_with_hole;
6478   IRExpr* new_byte;
6479   IRExpr* new_eax;
6480   UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6481                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
6482
6483   IRTemp  flags = newTemp(Ity_I32);
6484   assign( flags, mk_x86g_calculate_eflags_all() );
6485
6486   eax_with_hole
6487      = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
6488   new_byte
6489      = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
6490                        mkU32(1<<1));
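      /* the 1<<1 term is bit 1 of %eflags, which always reads as 1
         (the constant '1' in the SF:ZF:0:AF:0:PF:1:CF layout above) */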
6491   new_eax
6492      = binop(Iop_Or32, eax_with_hole,
6493                        binop(Iop_Shl32, new_byte, mkU8(8)));
6494   putIReg(4, R_EAX, new_eax);
6495}
6496
6497
6498static
6499UInt dis_cmpxchg_G_E ( UChar       sorb,
6500                       Bool        locked,
6501                       Int         size,
6502                       Int         delta0 )
6503{
6504   HChar dis_buf[50];
6505   Int   len;
6506
6507   IRType ty    = szToITy(size);
6508   IRTemp acc   = newTemp(ty);
6509   IRTemp src   = newTemp(ty);
6510   IRTemp dest  = newTemp(ty);
6511   IRTemp dest2 = newTemp(ty);
6512   IRTemp acc2  = newTemp(ty);
6513   IRTemp cond8 = newTemp(Ity_I8);
6514   IRTemp addr  = IRTemp_INVALID;
6515   UChar  rm    = getUChar(delta0);
6516
6517   /* There are 3 cases to consider:
6518
6519      reg-reg: ignore any lock prefix, generate sequence based
6520               on Mux0X
6521
6522      reg-mem, not locked: ignore any lock prefix, generate sequence
6523                           based on Mux0X
6524
6525      reg-mem, locked: use IRCAS
6526   */
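   /* Note: cond8 below is 1 iff %EAX equals the old destination value
      (the ZF of the subtraction), and IRExpr_Mux0X(c, e0, eX) selects
      e0 when c == 0 and eX otherwise.  Hence dest2 is the new value on
      success and the old value on failure, and acc2 reloads %EAX with
      the old value only on failure, as CMPXCHG requires. */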
6527   if (epartIsReg(rm)) {
6528      /* case 1 */
6529      assign( dest, getIReg(size, eregOfRM(rm)) );
6530      delta0++;
6531      assign( src, getIReg(size, gregOfRM(rm)) );
6532      assign( acc, getIReg(size, R_EAX) );
6533      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6534      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6535      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
6536      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6537      putIReg(size, R_EAX, mkexpr(acc2));
6538      putIReg(size, eregOfRM(rm), mkexpr(dest2));
6539      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6540                               nameIReg(size,gregOfRM(rm)),
6541                               nameIReg(size,eregOfRM(rm)) );
6542   }
6543   else if (!epartIsReg(rm) && !locked) {
6544      /* case 2 */
6545      addr = disAMode ( &len, sorb, delta0, dis_buf );
6546      assign( dest, loadLE(ty, mkexpr(addr)) );
6547      delta0 += len;
6548      assign( src, getIReg(size, gregOfRM(rm)) );
6549      assign( acc, getIReg(size, R_EAX) );
6550      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6551      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6552      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
6553      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6554      putIReg(size, R_EAX, mkexpr(acc2));
6555      storeLE( mkexpr(addr), mkexpr(dest2) );
6556      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6557                               nameIReg(size,gregOfRM(rm)), dis_buf);
6558   }
6559   else if (!epartIsReg(rm) && locked) {
6560      /* case 3 */
6561      /* src is new value.  acc is expected value.  dest is old value.
6562         Compute success from the output of the IRCAS, and steer the
6563         new value for EAX accordingly: in case of success, EAX is
6564         unchanged. */
6565      addr = disAMode ( &len, sorb, delta0, dis_buf );
6566      delta0 += len;
6567      assign( src, getIReg(size, gregOfRM(rm)) );
6568      assign( acc, getIReg(size, R_EAX) );
6569      stmt( IRStmt_CAS(
6570         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
6571                  NULL, mkexpr(acc), NULL, mkexpr(src) )
6572      ));
6573      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6574      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6575      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6576      putIReg(size, R_EAX, mkexpr(acc2));
6577      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6578                               nameIReg(size,gregOfRM(rm)), dis_buf);
6579   }
6580   else vassert(0);
6581
6582   return delta0;
6583}
6584
6585
6586/* Handle conditional move instructions of the form
6587      cmovcc E(reg-or-mem), G(reg)
6588
6589   E(src) is reg-or-mem
6590   G(dst) is reg.
6591
6592   If E is reg, -->    GET %E, tmps
6593                       GET %G, tmpd
6594                       CMOVcc tmps, tmpd
6595                       PUT tmpd, %G
6596
6597   If E is mem  -->    (getAddr E) -> tmpa
6598                       LD (tmpa), tmps
6599                       GET %G, tmpd
6600                       CMOVcc tmps, tmpd
6601                       PUT tmpd, %G
6602*/
6603static
6604UInt dis_cmov_E_G ( UChar       sorb,
6605                    Int         sz,
6606                    X86Condcode cond,
6607                    Int         delta0 )
6608{
6609   UChar rm  = getIByte(delta0);
6610   HChar dis_buf[50];
6611   Int   len;
6612
6613   IRType ty   = szToITy(sz);
6614   IRTemp tmps = newTemp(ty);
6615   IRTemp tmpd = newTemp(ty);
6616
6617   if (epartIsReg(rm)) {
6618      assign( tmps, getIReg(sz, eregOfRM(rm)) );
6619      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6620
6621      putIReg(sz, gregOfRM(rm),
6622                  IRExpr_Mux0X( unop(Iop_1Uto8,
6623                                     mk_x86g_calculate_condition(cond)),
6624                                mkexpr(tmpd),
6625                                mkexpr(tmps) )
6626             );
6627      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6628                              name_X86Condcode(cond),
6629                              nameIReg(sz,eregOfRM(rm)),
6630                              nameIReg(sz,gregOfRM(rm)));
6631      return 1+delta0;
6632   }
6633
6634   /* E refers to memory */
6635   {
6636      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6637      assign( tmps, loadLE(ty, mkexpr(addr)) );
6638      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6639
6640      putIReg(sz, gregOfRM(rm),
6641                  IRExpr_Mux0X( unop(Iop_1Uto8,
6642                                     mk_x86g_calculate_condition(cond)),
6643                                mkexpr(tmpd),
6644                                mkexpr(tmps) )
6645             );
6646
6647      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6648                              name_X86Condcode(cond),
6649                              dis_buf,
6650                              nameIReg(sz,gregOfRM(rm)));
6651      return len+delta0;
6652   }
6653}
6654
6655
6656static
6657UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
6658                    Bool* decodeOK )
6659{
6660   Int   len;
6661   UChar rm = getIByte(delta0);
6662   HChar dis_buf[50];
6663
6664   IRType ty    = szToITy(sz);
6665   IRTemp tmpd  = newTemp(ty);
6666   IRTemp tmpt0 = newTemp(ty);
6667   IRTemp tmpt1 = newTemp(ty);
6668
6669   /* There are 3 cases to consider:
6670
6671      reg-reg: ignore any lock prefix,
6672               generate 'naive' (non-atomic) sequence
6673
6674      reg-mem, not locked: ignore any lock prefix, generate 'naive'
6675                           (non-atomic) sequence
6676
6677      reg-mem, locked: use IRCAS
6678   */
6679
6680   if (epartIsReg(rm)) {
6681      /* case 1 */
6682      assign( tmpd,  getIReg(sz, eregOfRM(rm)));
6683      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6684      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6685                           mkexpr(tmpd), mkexpr(tmpt0)) );
6686      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6687      putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
6688      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6689      DIP("xadd%c %s, %s\n",
6690          nameISize(sz), nameIReg(sz,gregOfRM(rm)),
6691          nameIReg(sz,eregOfRM(rm)));
6692      *decodeOK = True;
6693      return 1+delta0;
6694   }
6695   else if (!epartIsReg(rm) && !locked) {
6696      /* case 2 */
6697      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6698      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
6699      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6700      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6701                           mkexpr(tmpd), mkexpr(tmpt0)) );
6702      storeLE( mkexpr(addr), mkexpr(tmpt1) );
6703      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6704      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6705      DIP("xadd%c %s, %s\n",
6706          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
6707      *decodeOK = True;
6708      return len+delta0;
6709   }
6710   else if (!epartIsReg(rm) && locked) {
6711      /* case 3 */
6712      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6713      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
6714      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6715      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6716                           mkexpr(tmpd), mkexpr(tmpt0)) );
6717      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
6718                           mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
6719      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6720      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6721      DIP("xadd%c %s, %s\n",
6722          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
6723      *decodeOK = True;
6724      return len+delta0;
6725   }
6726   /*UNREACHED*/
6727   vassert(0);
6728}
6729
6730/* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
6731
6732static
6733UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
6734{
6735   Int    len;
6736   IRTemp addr;
6737   UChar  rm  = getIByte(delta0);
6738   HChar  dis_buf[50];
6739
6740   if (epartIsReg(rm)) {
6741      putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
6742      DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
6743      return 1+delta0;
6744   } else {
6745      addr = disAMode ( &len, sorb, delta0, dis_buf );
6746      putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
6747      DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
6748      return len+delta0;
6749   }
6750}
6751
6752/* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
6753   dst is ireg and sz==4, zero out top half of it.  */
6754
6755static
6756UInt dis_mov_Sw_Ew ( UChar sorb,
6757                     Int   sz,
6758                     Int   delta0 )
6759{
6760   Int    len;
6761   IRTemp addr;
6762   UChar  rm  = getIByte(delta0);
6763   HChar  dis_buf[50];
6764
6765   vassert(sz == 2 || sz == 4);
6766
6767   if (epartIsReg(rm)) {
6768      if (sz == 4)
6769         putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
6770      else
6771         putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
6772
6773      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
6774      return 1+delta0;
6775   } else {
6776      addr = disAMode ( &len, sorb, delta0, dis_buf );
6777      storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
6778      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
6779      return len+delta0;
6780   }
6781}
6782
6783
6784static
6785void dis_push_segreg ( UInt sreg, Int sz )
6786{
6787    IRTemp t1 = newTemp(Ity_I16);
6788    IRTemp ta = newTemp(Ity_I32);
6789    vassert(sz == 2 || sz == 4);
6790
6791    assign( t1, getSReg(sreg) );
6792    assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
6793    putIReg(4, R_ESP, mkexpr(ta));
6794    storeLE( mkexpr(ta), mkexpr(t1) );
6795
6796    DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6797}
6798
6799static
6800void dis_pop_segreg ( UInt sreg, Int sz )
6801{
6802    IRTemp t1 = newTemp(Ity_I16);
6803    IRTemp ta = newTemp(Ity_I32);
6804    vassert(sz == 2 || sz == 4);
6805
6806    assign( ta, getIReg(4, R_ESP) );
6807    assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
6808
6809    putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
6810    putSReg( sreg, mkexpr(t1) );
6811    DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6812}
6813
6814static
6815void dis_ret ( UInt d32 )
6816{
6817   IRTemp t1 = newTemp(Ity_I32), t2 = newTemp(Ity_I32);
6818   assign(t1, getIReg(4,R_ESP));
6819   assign(t2, loadLE(Ity_I32,mkexpr(t1)));
6820   putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
6821   jmp_treg(Ijk_Ret,t2);
6822}
6823
6824/*------------------------------------------------------------*/
6825/*--- SSE/SSE2/SSE3 helpers                                ---*/
6826/*------------------------------------------------------------*/
6827
6828/* Worker function; do not call directly.
6829   Handles full width G = G `op` E   and   G = (not G) `op` E.
6830*/
6831
6832static UInt dis_SSE_E_to_G_all_wrk (
6833               UChar sorb, Int delta,
6834               HChar* opname, IROp op,
6835               Bool   invertG
6836            )
6837{
6838   HChar   dis_buf[50];
6839   Int     alen;
6840   IRTemp  addr;
6841   UChar   rm = getIByte(delta);
6842   IRExpr* gpart
6843      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
6844                : getXMMReg(gregOfRM(rm));
6845   if (epartIsReg(rm)) {
6846      putXMMReg( gregOfRM(rm),
6847                 binop(op, gpart,
6848                           getXMMReg(eregOfRM(rm))) );
6849      DIP("%s %s,%s\n", opname,
6850                        nameXMMReg(eregOfRM(rm)),
6851                        nameXMMReg(gregOfRM(rm)) );
6852      return delta+1;
6853   } else {
6854      addr = disAMode ( &alen, sorb, delta, dis_buf );
6855      putXMMReg( gregOfRM(rm),
6856                 binop(op, gpart,
6857                           loadLE(Ity_V128, mkexpr(addr))) );
6858      DIP("%s %s,%s\n", opname,
6859                        dis_buf,
6860                        nameXMMReg(gregOfRM(rm)) );
6861      return delta+alen;
6862   }
6863}
6864
6865
6866/* All lanes SSE binary operation, G = G `op` E. */
6867
6868static
6869UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, HChar* opname, IROp op )
6870{
6871   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
6872}
6873
6874/* All lanes SSE binary operation, G = (not G) `op` E. */
6875
6876static
6877UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
6878                               HChar* opname, IROp op )
6879{
6880   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
6881}
6882
6883
6884/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
6885
6886static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
6887                                  HChar* opname, IROp op )
6888{
6889   HChar   dis_buf[50];
6890   Int     alen;
6891   IRTemp  addr;
6892   UChar   rm = getIByte(delta);
6893   IRExpr* gpart = getXMMReg(gregOfRM(rm));
6894   if (epartIsReg(rm)) {
6895      putXMMReg( gregOfRM(rm),
6896                 binop(op, gpart,
6897                           getXMMReg(eregOfRM(rm))) );
6898      DIP("%s %s,%s\n", opname,
6899                        nameXMMReg(eregOfRM(rm)),
6900                        nameXMMReg(gregOfRM(rm)) );
6901      return delta+1;
6902   } else {
6903      /* We can only do a 32-bit memory read, so the upper 3/4 of the
6904         E operand needs to be made simply of zeroes. */
6905      IRTemp epart = newTemp(Ity_V128);
6906      addr = disAMode ( &alen, sorb, delta, dis_buf );
6907      assign( epart, unop( Iop_32UtoV128,
6908                           loadLE(Ity_I32, mkexpr(addr))) );
6909      putXMMReg( gregOfRM(rm),
6910                 binop(op, gpart, mkexpr(epart)) );
6911      DIP("%s %s,%s\n", opname,
6912                        dis_buf,
6913                        nameXMMReg(gregOfRM(rm)) );
6914      return delta+alen;
6915   }
6916}
6917
6918
6919/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
6920
6921static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
6922                                  HChar* opname, IROp op )
6923{
6924   HChar   dis_buf[50];
6925   Int     alen;
6926   IRTemp  addr;
6927   UChar   rm = getIByte(delta);
6928   IRExpr* gpart = getXMMReg(gregOfRM(rm));
6929   if (epartIsReg(rm)) {
6930      putXMMReg( gregOfRM(rm),
6931                 binop(op, gpart,
6932                           getXMMReg(eregOfRM(rm))) );
6933      DIP("%s %s,%s\n", opname,
6934                        nameXMMReg(eregOfRM(rm)),
6935                        nameXMMReg(gregOfRM(rm)) );
6936      return delta+1;
6937   } else {
6938      /* We can only do a 64-bit memory read, so the upper half of the
6939         E operand needs to be made simply of zeroes. */
6940      IRTemp epart = newTemp(Ity_V128);
6941      addr = disAMode ( &alen, sorb, delta, dis_buf );
6942      assign( epart, unop( Iop_64UtoV128,
6943                           loadLE(Ity_I64, mkexpr(addr))) );
6944      putXMMReg( gregOfRM(rm),
6945                 binop(op, gpart, mkexpr(epart)) );
6946      DIP("%s %s,%s\n", opname,
6947                        dis_buf,
6948                        nameXMMReg(gregOfRM(rm)) );
6949      return delta+alen;
6950   }
6951}
6952
6953
6954/* All lanes unary SSE operation, G = op(E). */
6955
6956static UInt dis_SSE_E_to_G_unary_all (
6957               UChar sorb, Int delta,
6958               HChar* opname, IROp op
6959            )
6960{
6961   HChar   dis_buf[50];
6962   Int     alen;
6963   IRTemp  addr;
6964   UChar   rm = getIByte(delta);
6965   if (epartIsReg(rm)) {
6966      putXMMReg( gregOfRM(rm),
6967                 unop(op, getXMMReg(eregOfRM(rm))) );
6968      DIP("%s %s,%s\n", opname,
6969                        nameXMMReg(eregOfRM(rm)),
6970                        nameXMMReg(gregOfRM(rm)) );
6971      return delta+1;
6972   } else {
6973      addr = disAMode ( &alen, sorb, delta, dis_buf );
6974      putXMMReg( gregOfRM(rm),
6975                 unop(op, loadLE(Ity_V128, mkexpr(addr))) );
6976      DIP("%s %s,%s\n", opname,
6977                        dis_buf,
6978                        nameXMMReg(gregOfRM(rm)) );
6979      return delta+alen;
6980   }
6981}
6982
6983
6984/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
6985
6986static UInt dis_SSE_E_to_G_unary_lo32 (
6987               UChar sorb, Int delta,
6988               HChar* opname, IROp op
6989            )
6990{
6991   /* First we need to get the old G value and patch the low 32 bits
6992      of the E operand into it.  Then apply op and write back to G. */
6993   HChar   dis_buf[50];
6994   Int     alen;
6995   IRTemp  addr;
6996   UChar   rm = getIByte(delta);
6997   IRTemp  oldG0 = newTemp(Ity_V128);
6998   IRTemp  oldG1 = newTemp(Ity_V128);
6999
7000   assign( oldG0, getXMMReg(gregOfRM(rm)) );
7001
7002   if (epartIsReg(rm)) {
7003      assign( oldG1,
7004              binop( Iop_SetV128lo32,
7005                     mkexpr(oldG0),
7006                     getXMMRegLane32(eregOfRM(rm), 0)) );
7007      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7008      DIP("%s %s,%s\n", opname,
7009                        nameXMMReg(eregOfRM(rm)),
7010                        nameXMMReg(gregOfRM(rm)) );
7011      return delta+1;
7012   } else {
7013      addr = disAMode ( &alen, sorb, delta, dis_buf );
7014      assign( oldG1,
7015              binop( Iop_SetV128lo32,
7016                     mkexpr(oldG0),
7017                     loadLE(Ity_I32, mkexpr(addr)) ));
7018      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7019      DIP("%s %s,%s\n", opname,
7020                        dis_buf,
7021                        nameXMMReg(gregOfRM(rm)) );
7022      return delta+alen;
7023   }
7024}
7025
7026
7027/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
7028
7029static UInt dis_SSE_E_to_G_unary_lo64 (
7030               UChar sorb, Int delta,
7031               HChar* opname, IROp op
7032            )
7033{
7034   /* First we need to get the old G value and patch the low 64 bits
7035      of the E operand into it.  Then apply op and write back to G. */
7036   HChar   dis_buf[50];
7037   Int     alen;
7038   IRTemp  addr;
7039   UChar   rm = getIByte(delta);
7040   IRTemp  oldG0 = newTemp(Ity_V128);
7041   IRTemp  oldG1 = newTemp(Ity_V128);
7042
7043   assign( oldG0, getXMMReg(gregOfRM(rm)) );
7044
7045   if (epartIsReg(rm)) {
7046      assign( oldG1,
7047              binop( Iop_SetV128lo64,
7048                     mkexpr(oldG0),
7049                     getXMMRegLane64(eregOfRM(rm), 0)) );
7050      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7051      DIP("%s %s,%s\n", opname,
7052                        nameXMMReg(eregOfRM(rm)),
7053                        nameXMMReg(gregOfRM(rm)) );
7054      return delta+1;
7055   } else {
7056      addr = disAMode ( &alen, sorb, delta, dis_buf );
7057      assign( oldG1,
7058              binop( Iop_SetV128lo64,
7059                     mkexpr(oldG0),
7060                     loadLE(Ity_I64, mkexpr(addr)) ));
7061      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7062      DIP("%s %s,%s\n", opname,
7063                        dis_buf,
7064                        nameXMMReg(gregOfRM(rm)) );
7065      return delta+alen;
7066   }
7067}
7068
7069
7070/* SSE integer binary operation:
7071      G = G `op` E   (eLeft == False)
7072      G = E `op` G   (eLeft == True)
7073*/
7074static UInt dis_SSEint_E_to_G(
7075               UChar sorb, Int delta,
7076               HChar* opname, IROp op,
7077               Bool   eLeft
7078            )
7079{
7080   HChar   dis_buf[50];
7081   Int     alen;
7082   IRTemp  addr;
7083   UChar   rm = getIByte(delta);
7084   IRExpr* gpart = getXMMReg(gregOfRM(rm));
7085   IRExpr* epart = NULL;
7086   if (epartIsReg(rm)) {
7087      epart = getXMMReg(eregOfRM(rm));
7088      DIP("%s %s,%s\n", opname,
7089                        nameXMMReg(eregOfRM(rm)),
7090                        nameXMMReg(gregOfRM(rm)) );
7091      delta += 1;
7092   } else {
7093      addr  = disAMode ( &alen, sorb, delta, dis_buf );
7094      epart = loadLE(Ity_V128, mkexpr(addr));
7095      DIP("%s %s,%s\n", opname,
7096                        dis_buf,
7097                        nameXMMReg(gregOfRM(rm)) );
7098      delta += alen;
7099   }
7100   putXMMReg( gregOfRM(rm),
7101              eLeft ? binop(op, epart, gpart)
7102	            : binop(op, gpart, epart) );
7103   return delta;
7104}
7105
7106
7107/* Helper for doing SSE FP comparisons. */
7108
7109static void findSSECmpOp ( Bool* needNot, IROp* op,
7110                           Int imm8, Bool all_lanes, Int sz )
7111{
7112   imm8 &= 7;
7113   *needNot = False;
7114   *op      = Iop_INVALID;
7115   if (imm8 >= 4) {
7116      *needNot = True;
7117      imm8 -= 4;
7118   }
7119
7120   if (sz == 4 && all_lanes) {
7121      switch (imm8) {
7122         case 0: *op = Iop_CmpEQ32Fx4; return;
7123         case 1: *op = Iop_CmpLT32Fx4; return;
7124         case 2: *op = Iop_CmpLE32Fx4; return;
7125         case 3: *op = Iop_CmpUN32Fx4; return;
7126         default: break;
7127      }
7128   }
7129   if (sz == 4 && !all_lanes) {
7130      switch (imm8) {
7131         case 0: *op = Iop_CmpEQ32F0x4; return;
7132         case 1: *op = Iop_CmpLT32F0x4; return;
7133         case 2: *op = Iop_CmpLE32F0x4; return;
7134         case 3: *op = Iop_CmpUN32F0x4; return;
7135         default: break;
7136      }
7137   }
7138   if (sz == 8 && all_lanes) {
7139      switch (imm8) {
7140         case 0: *op = Iop_CmpEQ64Fx2; return;
7141         case 1: *op = Iop_CmpLT64Fx2; return;
7142         case 2: *op = Iop_CmpLE64Fx2; return;
7143         case 3: *op = Iop_CmpUN64Fx2; return;
7144         default: break;
7145      }
7146   }
7147   if (sz == 8 && !all_lanes) {
7148      switch (imm8) {
7149         case 0: *op = Iop_CmpEQ64F0x2; return;
7150         case 1: *op = Iop_CmpLT64F0x2; return;
7151         case 2: *op = Iop_CmpLE64F0x2; return;
7152         case 3: *op = Iop_CmpUN64F0x2; return;
7153         default: break;
7154      }
7155   }
7156   vpanic("findSSECmpOp(x86,guest)");
7157}
7158
7159/* Handles SSE 32F/64F comparisons. */
7160
7161static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
7162				HChar* opname, Bool all_lanes, Int sz )
7163{
7164   HChar   dis_buf[50];
7165   Int     alen, imm8;
7166   IRTemp  addr;
7167   Bool    needNot = False;
7168   IROp    op      = Iop_INVALID;
7169   IRTemp  plain   = newTemp(Ity_V128);
7170   UChar   rm      = getIByte(delta);
7171   UShort  mask    = 0;
7172   vassert(sz == 4 || sz == 8);
7173   if (epartIsReg(rm)) {
7174      imm8 = getIByte(delta+1);
7175      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7176      assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
7177                               getXMMReg(eregOfRM(rm))) );
7178      delta += 2;
7179      DIP("%s $%d,%s,%s\n", opname,
7180                            (Int)imm8,
7181                            nameXMMReg(eregOfRM(rm)),
7182                            nameXMMReg(gregOfRM(rm)) );
7183   } else {
7184      addr = disAMode ( &alen, sorb, delta, dis_buf );
7185      imm8 = getIByte(delta+alen);
7186      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7187      assign( plain,
7188              binop(
7189                 op,
7190                 getXMMReg(gregOfRM(rm)),
7191                   all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
7192                 : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
7193                 : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
7194             )
7195      );
7196      delta += alen+1;
7197      DIP("%s $%d,%s,%s\n", opname,
7198                            (Int)imm8,
7199                            dis_buf,
7200                            nameXMMReg(gregOfRM(rm)) );
7201   }
7202
7203   if (needNot && all_lanes) {
7204      putXMMReg( gregOfRM(rm),
7205                 unop(Iop_NotV128, mkexpr(plain)) );
7206   }
7207   else
7208   if (needNot && !all_lanes) {
7209      mask = toUShort( sz==4 ? 0x000F : 0x00FF );
7210      putXMMReg( gregOfRM(rm),
7211                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
7212   }
7213   else {
7214      putXMMReg( gregOfRM(rm), mkexpr(plain) );
7215   }
7216
7217   return delta;
7218}
7219
7220
7221/* Vector by scalar shift of G by the amount specified at the bottom
7222   of E. */
7223
7224static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
7225                                 HChar* opname, IROp op )
7226{
7227   HChar   dis_buf[50];
7228   Int     alen, size;
7229   IRTemp  addr;
7230   Bool    shl, shr, sar;
7231   UChar   rm   = getIByte(delta);
7232   IRTemp  g0   = newTemp(Ity_V128);
7233   IRTemp  g1   = newTemp(Ity_V128);
7234   IRTemp  amt  = newTemp(Ity_I32);
7235   IRTemp  amt8 = newTemp(Ity_I8);
7236   if (epartIsReg(rm)) {
7237      assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
7238      DIP("%s %s,%s\n", opname,
7239                        nameXMMReg(eregOfRM(rm)),
7240                        nameXMMReg(gregOfRM(rm)) );
7241      delta++;
7242   } else {
7243      addr = disAMode ( &alen, sorb, delta, dis_buf );
7244      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
7245      DIP("%s %s,%s\n", opname,
7246                        dis_buf,
7247                        nameXMMReg(gregOfRM(rm)) );
7248      delta += alen;
7249   }
7250   assign( g0,   getXMMReg(gregOfRM(rm)) );
7251   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
7252
7253   shl = shr = sar = False;
7254   size = 0;
7255   switch (op) {
7256      case Iop_ShlN16x8: shl = True; size = 32; break;
7257      case Iop_ShlN32x4: shl = True; size = 32; break;
7258      case Iop_ShlN64x2: shl = True; size = 64; break;
7259      case Iop_SarN16x8: sar = True; size = 16; break;
7260      case Iop_SarN32x4: sar = True; size = 32; break;
7261      case Iop_ShrN16x8: shr = True; size = 16; break;
7262      case Iop_ShrN32x4: shr = True; size = 32; break;
7263      case Iop_ShrN64x2: shr = True; size = 64; break;
7264      default: vassert(0);
7265   }
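   /* x86 honours the full shift count here: a count >= the lane width
      zeroes the result for logical shifts and replicates the sign bit
      for arithmetic shifts, hence the CmpLT32U guards below. */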
7266
7267   if (shl || shr) {
7268     assign(
7269        g1,
7270        IRExpr_Mux0X(
7271           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
7272           mkV128(0x0000),
7273           binop(op, mkexpr(g0), mkexpr(amt8))
7274        )
7275     );
7276   } else
7277   if (sar) {
7278     assign(
7279        g1,
7280        IRExpr_Mux0X(
7281           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
7282           binop(op, mkexpr(g0), mkU8(size-1)),
7283           binop(op, mkexpr(g0), mkexpr(amt8))
7284        )
7285     );
7286   } else {
7287      /*NOTREACHED*/
7288      vassert(0);
7289   }
7290
7291   putXMMReg( gregOfRM(rm), mkexpr(g1) );
7292   return delta;
7293}
7294
7295
7296/* Vector by scalar shift of E by an immediate byte. */
7297
7298static
7299UInt dis_SSE_shiftE_imm ( Int delta, HChar* opname, IROp op )
7300{
7301   Bool    shl, shr, sar;
7302   UChar   rm   = getIByte(delta);
7303   IRTemp  e0   = newTemp(Ity_V128);
7304   IRTemp  e1   = newTemp(Ity_V128);
7305   UChar   amt, size;
7306   vassert(epartIsReg(rm));
7307   vassert(gregOfRM(rm) == 2
7308           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
7309   amt = getIByte(delta+1);
7310   delta += 2;
7311   DIP("%s $%d,%s\n", opname,
7312                      (Int)amt,
7313                      nameXMMReg(eregOfRM(rm)) );
7314   assign( e0, getXMMReg(eregOfRM(rm)) );
7315
7316   shl = shr = sar = False;
7317   size = 0;
7318   switch (op) {
7319      case Iop_ShlN16x8: shl = True; size = 16; break;
7320      case Iop_ShlN32x4: shl = True; size = 32; break;
7321      case Iop_ShlN64x2: shl = True; size = 64; break;
7322      case Iop_SarN16x8: sar = True; size = 16; break;
7323      case Iop_SarN32x4: sar = True; size = 32; break;
7324      case Iop_ShrN16x8: shr = True; size = 16; break;
7325      case Iop_ShrN32x4: shr = True; size = 32; break;
7326      case Iop_ShrN64x2: shr = True; size = 64; break;
7327      default: vassert(0);
7328   }
7329
7330   if (shl || shr) {
7331      assign( e1, amt >= size
7332                     ? mkV128(0x0000)
7333                     : binop(op, mkexpr(e0), mkU8(amt))
7334      );
7335   } else
7336   if (sar) {
7337      assign( e1, amt >= size
7338                     ? binop(op, mkexpr(e0), mkU8(size-1))
7339                     : binop(op, mkexpr(e0), mkU8(amt))
7340      );
7341   } else {
7342      /*NOTREACHED*/
7343      vassert(0);
7344   }
7345
7346   putXMMReg( eregOfRM(rm), mkexpr(e1) );
7347   return delta;
7348}
7349
7350
7351/* Get the current SSE rounding mode. */
7352
7353static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
7354{
7355   return binop( Iop_And32,
7356                 IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
7357                 mkU32(3) );
7358}
7359
7360static void put_sse_roundingmode ( IRExpr* sseround )
7361{
7362   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
7363   stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
7364}
7365
7366/* Break a 128-bit value up into four 32-bit ints. */
7367
7368static void breakup128to32s ( IRTemp t128,
7369			      /*OUTs*/
7370                              IRTemp* t3, IRTemp* t2,
7371                              IRTemp* t1, IRTemp* t0 )
7372{
7373   IRTemp hi64 = newTemp(Ity_I64);
7374   IRTemp lo64 = newTemp(Ity_I64);
7375   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
7376   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
7377
7378   vassert(t0 && *t0 == IRTemp_INVALID);
7379   vassert(t1 && *t1 == IRTemp_INVALID);
7380   vassert(t2 && *t2 == IRTemp_INVALID);
7381   vassert(t3 && *t3 == IRTemp_INVALID);
7382
7383   *t0 = newTemp(Ity_I32);
7384   *t1 = newTemp(Ity_I32);
7385   *t2 = newTemp(Ity_I32);
7386   *t3 = newTemp(Ity_I32);
7387   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
7388   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
7389   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
7390   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
7391}
7392
7393/* Construct a 128-bit value from four 32-bit ints. */
7394
7395static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
7396                              IRTemp t1, IRTemp t0 )
7397{
7398   return
7399      binop( Iop_64HLtoV128,
7400             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
7401             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
7402   );
7403}
7404
7405/* Break a 64-bit value up into four 16-bit ints. */
7406
7407static void breakup64to16s ( IRTemp t64,
7408                             /*OUTs*/
7409                             IRTemp* t3, IRTemp* t2,
7410                             IRTemp* t1, IRTemp* t0 )
7411{
7412   IRTemp hi32 = newTemp(Ity_I32);
7413   IRTemp lo32 = newTemp(Ity_I32);
7414   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
7415   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
7416
7417   vassert(t0 && *t0 == IRTemp_INVALID);
7418   vassert(t1 && *t1 == IRTemp_INVALID);
7419   vassert(t2 && *t2 == IRTemp_INVALID);
7420   vassert(t3 && *t3 == IRTemp_INVALID);
7421
7422   *t0 = newTemp(Ity_I16);
7423   *t1 = newTemp(Ity_I16);
7424   *t2 = newTemp(Ity_I16);
7425   *t3 = newTemp(Ity_I16);
7426   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
7427   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
7428   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
7429   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
7430}
7431
7432/* Construct a 64-bit value from four 16-bit ints. */
7433
7434static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
7435                             IRTemp t1, IRTemp t0 )
7436{
7437   return
7438      binop( Iop_32HLto64,
7439             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
7440             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
7441   );
7442}
7443
7444/* Generate IR to set the guest %EFLAGS from the pushfl-format image
7445   in the given 32-bit temporary.  The flags that are set are: O S Z A
7446   C P D ID AC.
7447
7448   In all cases, code to set AC is generated.  However, VEX ignores
7449   the AC value, and so can optionally emit an emulation warning when
7450   AC is set to 1.  In this routine, an emulation warning
7451   is only emitted if emit_AC_emwarn is True, in which case
7452   next_insn_EIP must be correct (this allows for correct code
7453   generation for popfl/popfw).  If emit_AC_emwarn is False,
7454   next_insn_EIP is unimportant (this allows for easy if kludgey code
7455   generation for IRET.) */
7456
7457static
7458void set_EFLAGS_from_value ( IRTemp t1,
7459                             Bool   emit_AC_emwarn,
7460                             Addr32 next_insn_EIP )
7461{
7462   vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
7463
7464   /* t1 is the flag word.  Mask out everything except OSZACP and set
7465      the flags thunk to X86G_CC_OP_COPY. */
7466   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
7467   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
7468   stmt( IRStmt_Put( OFFB_CC_DEP1,
7469                     binop(Iop_And32,
7470                           mkexpr(t1),
7471                           mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
7472                                  | X86G_CC_MASK_A | X86G_CC_MASK_Z
7473                                  | X86G_CC_MASK_S| X86G_CC_MASK_O )
7474                          )
7475                    )
7476       );
7477   /* Set NDEP even though it isn't used.  This makes redundant-PUT
7478      elimination of previous stores to this field work better. */
7479   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
7480
7481   /* Also need to set the D flag, which is held in bit 10 of t1.
7482      If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
7483   stmt( IRStmt_Put(
7484            OFFB_DFLAG,
7485            IRExpr_Mux0X(
7486               unop(Iop_32to8,
7487                    binop(Iop_And32,
7488                          binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
7489                          mkU32(1))),
7490               mkU32(1),
7491               mkU32(0xFFFFFFFF)))
7492       );
7493
7494   /* Set the ID flag */
7495   stmt( IRStmt_Put(
7496            OFFB_IDFLAG,
7497            IRExpr_Mux0X(
7498               unop(Iop_32to8,
7499                    binop(Iop_And32,
7500                          binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
7501                          mkU32(1))),
7502               mkU32(0),
7503               mkU32(1)))
7504       );
7505
7506   /* And set the AC flag.  If setting it to 1, possibly emit an
7507      emulation warning. */
7508   stmt( IRStmt_Put(
7509            OFFB_ACFLAG,
7510            IRExpr_Mux0X(
7511               unop(Iop_32to8,
7512                    binop(Iop_And32,
7513                          binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
7514                          mkU32(1))),
7515               mkU32(0),
7516               mkU32(1)))
7517       );
7518
7519   if (emit_AC_emwarn) {
7520      put_emwarn( mkU32(EmWarn_X86_acFlag) );
7521      stmt(
7522         IRStmt_Exit(
7523            binop( Iop_CmpNE32,
7524                   binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
7525                   mkU32(0) ),
7526            Ijk_EmWarn,
7527            IRConst_U32( next_insn_EIP )
7528         )
7529      );
7530   }
7531}
7532
7533
7534/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
7535   values (aa,bb), computes, for each of the 4 16-bit lanes:
7536
7537   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
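
   e.g. with aa_lane = bb_lane = 0x4000 (0.5 in Q15): the 32-bit product
   is 0x10000000, >>u 14 gives 0x4000, +1 gives 0x4001, and the final
   >>u 1 gives 0x2000 (0.25) -- a rounding multiply returning the high
   half.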
7538*/
7539static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
7540{
7541   IRTemp aa      = newTemp(Ity_I64);
7542   IRTemp bb      = newTemp(Ity_I64);
7543   IRTemp aahi32s = newTemp(Ity_I64);
7544   IRTemp aalo32s = newTemp(Ity_I64);
7545   IRTemp bbhi32s = newTemp(Ity_I64);
7546   IRTemp bblo32s = newTemp(Ity_I64);
7547   IRTemp rHi     = newTemp(Ity_I64);
7548   IRTemp rLo     = newTemp(Ity_I64);
7549   IRTemp one32x2 = newTemp(Ity_I64);
7550   assign(aa, aax);
7551   assign(bb, bbx);
7552   assign( aahi32s,
7553           binop(Iop_SarN32x2,
7554                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
7555                 mkU8(16) ));
7556   assign( aalo32s,
7557           binop(Iop_SarN32x2,
7558                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
7559                 mkU8(16) ));
7560   assign( bbhi32s,
7561           binop(Iop_SarN32x2,
7562                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
7563                 mkU8(16) ));
7564   assign( bblo32s,
7565           binop(Iop_SarN32x2,
7566                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
7567                 mkU8(16) ));
7568   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
7569   assign(
7570      rHi,
7571      binop(
7572         Iop_ShrN32x2,
7573         binop(
7574            Iop_Add32x2,
7575            binop(
7576               Iop_ShrN32x2,
7577               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
7578               mkU8(14)
7579            ),
7580            mkexpr(one32x2)
7581         ),
7582         mkU8(1)
7583      )
7584   );
7585   assign(
7586      rLo,
7587      binop(
7588         Iop_ShrN32x2,
7589         binop(
7590            Iop_Add32x2,
7591            binop(
7592               Iop_ShrN32x2,
7593               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
7594               mkU8(14)
7595            ),
7596            mkexpr(one32x2)
7597         ),
7598         mkU8(1)
7599      )
7600   );
7601   return
7602      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
7603}
7604
7605/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
7606   values (aa,bb), computes, for each lane:
7607
7608          if aa_lane < 0 then - bb_lane
7609     else if aa_lane > 0 then bb_lane
7610     else 0
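
   e.g. for 16-bit lanes: aa_lane = -3 maps bb_lane = 7 to -7 (0xFFF9),
   aa_lane = +3 leaves it at 7, and aa_lane = 0 gives 0.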
7611*/
7612static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
7613{
7614   IRTemp aa       = newTemp(Ity_I64);
7615   IRTemp bb       = newTemp(Ity_I64);
7616   IRTemp zero     = newTemp(Ity_I64);
7617   IRTemp bbNeg    = newTemp(Ity_I64);
7618   IRTemp negMask  = newTemp(Ity_I64);
7619   IRTemp posMask  = newTemp(Ity_I64);
7620   IROp   opSub    = Iop_INVALID;
7621   IROp   opCmpGTS = Iop_INVALID;
7622
7623   switch (laneszB) {
7624      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
7625      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
7626      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
7627      default: vassert(0);
7628   }
7629
7630   assign( aa,      aax );
7631   assign( bb,      bbx );
7632   assign( zero,    mkU64(0) );
7633   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
7634   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
7635   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
7636
7637   return
7638      binop(Iop_Or64,
7639            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
7640            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
7641
7642}
7643
7644/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
7645   value aa, computes, for each lane
7646
7647   if aa < 0 then -aa else aa
7648
7649   Note that the result is interpreted as unsigned, so that the
7650   absolute value of the most negative signed input can be
7651   represented.
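   (For example, an 8-bit lane holding 0x80, i.e. -128, yields 0x80,
   read back as +128.)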
7652*/
7653static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
7654{
7655   IRTemp aa      = newTemp(Ity_I64);
7656   IRTemp zero    = newTemp(Ity_I64);
7657   IRTemp aaNeg   = newTemp(Ity_I64);
7658   IRTemp negMask = newTemp(Ity_I64);
7659   IRTemp posMask = newTemp(Ity_I64);
7660   IROp   opSub   = Iop_INVALID;
7661   IROp   opSarN  = Iop_INVALID;
7662
7663   switch (laneszB) {
7664      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
7665      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
7666      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
7667      default: vassert(0);
7668   }
7669
7670   assign( aa,      aax );
7671   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
7672   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
7673   assign( zero,    mkU64(0) );
7674   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
7675   return
7676      binop(Iop_Or64,
7677            binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
7678            binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
7679}
7680
7681static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
7682                                        IRTemp lo64, Int byteShift )
7683{
7684   vassert(byteShift >= 1 && byteShift <= 7);
7685   return
7686      binop(Iop_Or64,
7687            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
7688            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
7689      );
7690}
7691
7692/* Generate a SIGSEGV followed by a restart of the current instruction
7693   if effective_addr is not 16-aligned.  This is required behaviour
7694   for some SSE3 instructions and all 128-bit SSSE3 instructions.
7695   This assumes that guest_EIP_curr_instr is set correctly! */
7696/* TODO(glider): we've replaced the 0xF mask with 0x0, effectively disabling
7697 * the check. Need to enable it once TSan stops generating unaligned
7698 * accesses in the wrappers.
7699 * See http://code.google.com/p/data-race-test/issues/detail?id=49 */
7700static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
7701{
7702   stmt(
7703      IRStmt_Exit(
7704         binop(Iop_CmpNE32,
7705               binop(Iop_And32,mkexpr(effective_addr),mkU32(0x0)),
7706               mkU32(0)),
7707         Ijk_SigSEGV,
7708         IRConst_U32(guest_EIP_curr_instr)
7709      )
7710   );
7711}
7712
7713
7714/* Helper for deciding whether a given insn (starting at the opcode
7715   byte) may validly be used with a LOCK prefix.  The following insns
7716   may be used with LOCK when their destination operand is in memory.
7717   AFAICS this is exactly the same for both 32-bit and 64-bit mode.
7718
7719   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
7720   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
7721   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
7722   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
7723   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
7724   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
7725   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
7726
7727   DEC        FE /1,  FF /1
7728   INC        FE /0,  FF /0
7729
7730   NEG        F6 /3,  F7 /3
7731   NOT        F6 /2,  F7 /2
7732
7733   XCHG       86, 87
7734
7735   BTC        0F BB,  0F BA /7
7736   BTR        0F B3,  0F BA /6
7737   BTS        0F AB,  0F BA /5
7738
7739   CMPXCHG    0F B0,  0F B1
7740   CMPXCHG8B  0F C7 /1
7741
7742   XADD       0F C0,  0F C1
7743
7744   ------------------------------
7745
7746   80 /0  =  addb $imm8,  rm8
7747   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
7748   82 /0  =  addb $imm8,  rm8
7749   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
7750
7751   00     =  addb r8,  rm8
7752   01     =  addl r32, rm32  and  addw r16, rm16
7753
7754   Same for ADD OR ADC SBB AND SUB XOR
7755
7756   FE /1  = dec rm8
7757   FF /1  = dec rm32  and  dec rm16
7758
7759   FE /0  = inc rm8
7760   FF /0  = inc rm32  and  inc rm16
7761
7762   F6 /3  = neg rm8
7763   F7 /3  = neg rm32  and  neg rm16
7764
7765   F6 /2  = not rm8
7766   F7 /2  = not rm32  and  not rm16
7767
7768   0F BB     = btcw r16, rm16    and  btcl r32, rm32
7769   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
7770
7771   Same for BTS, BTR
7772*/
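/* A concrete example of the table above: the byte sequence F0 FF 00
   is "lock incl (%eax)".  This function is handed a pointer to the
   FF byte, so opc[0] == 0xFF, gregOfRM(0x00) == 0 (the INC
   sub-opcode), and the modrm byte denotes a memory operand; hence it
   returns True. */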
7773static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
7774{
7775   switch (opc[0]) {
7776      case 0x00: case 0x01: case 0x08: case 0x09:
7777      case 0x10: case 0x11: case 0x18: case 0x19:
7778      case 0x20: case 0x21: case 0x28: case 0x29:
7779      case 0x30: case 0x31:
7780         if (!epartIsReg(opc[1]))
7781            return True;
7782         break;
7783
7784      case 0x80: case 0x81: case 0x82: case 0x83:
7785         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
7786             && !epartIsReg(opc[1]))
7787            return True;
7788         break;
7789
7790      case 0xFE: case 0xFF:
7791         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
7792             && !epartIsReg(opc[1]))
7793            return True;
7794         break;
7795
7796      case 0xF6: case 0xF7:
7797         if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
7798             && !epartIsReg(opc[1]))
7799            return True;
7800         break;
7801
7802      case 0x86: case 0x87:
7803         if (!epartIsReg(opc[1]))
7804            return True;
7805         break;
7806
7807      case 0x0F: {
7808         switch (opc[1]) {
7809            case 0xBB: case 0xB3: case 0xAB:
7810               if (!epartIsReg(opc[2]))
7811                  return True;
7812               break;
7813            case 0xBA:
7814               if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
7815                   && !epartIsReg(opc[2]))
7816                  return True;
7817               break;
7818            case 0xB0: case 0xB1:
7819               if (!epartIsReg(opc[2]))
7820                  return True;
7821               break;
7822            case 0xC7:
7823               if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
7824                  return True;
7825               break;
7826            case 0xC0: case 0xC1:
7827               if (!epartIsReg(opc[2]))
7828                  return True;
7829               break;
7830            default:
7831               break;
7832         } /* switch (opc[1]) */
7833         break;
7834      }
7835
7836      default:
7837         break;
7838   } /* switch (opc[0]) */
7839
7840   return False;
7841}
7842
7843
7844/*------------------------------------------------------------*/
7845/*--- Disassemble a single instruction                     ---*/
7846/*------------------------------------------------------------*/
7847
7848/* Disassemble a single instruction into IR.  The instruction is
7849   located in host memory at &guest_code[delta].  *expect_CAS is set
7850   to True if the resulting IR is expected to contain an IRCAS
7851   statement, and False if it's not expected to.  This makes it
7852   possible for the caller of disInstr_X86_WRK to check that
7853   LOCK-prefixed instructions are at least plausibly translated, in
7854   that it becomes possible to check that a (validly) LOCK-prefixed
7855   instruction generates a translation containing an IRCAS, and
7856   instructions without LOCK prefixes don't generate translations
7857   containing an IRCAS.
7858*/
7859static
7860DisResult disInstr_X86_WRK (
7861             /*OUT*/Bool* expect_CAS,
7862             Bool         put_IP,
7863             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
7864             Bool         resteerCisOk,
7865             void*        callback_opaque,
7866             Long         delta64,
7867             VexArchInfo* archinfo,
7868             VexAbiInfo*  vbi
7869          )
7870{
7871   IRType    ty;
7872   IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
7873   Int       alen;
7874   UChar     opc, modrm, abyte, pre;
7875   UInt      d32;
7876   HChar     dis_buf[50];
7877   Int       am_sz, d_sz, n_prefixes;
7878   DisResult dres;
7879   UChar*    insn; /* used in SSE decoders */
7880
7881   /* The running delta */
7882   Int delta = (Int)delta64;
7883
7884   /* Holds eip at the start of the insn, so that we can print
7885      consistent error messages for unimplemented insns. */
7886   Int delta_start = delta;
7887
7888   /* sz denotes the nominal data-op size of the insn; we change it to
7889      2 if an 0x66 prefix is seen */
7890   Int sz = 4;
7891
7892   /* sorb holds the segment-override-prefix byte, if any.  Zero if no
7893      prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
7894      indicating the prefix.  */
7895   UChar sorb = 0;
7896
7897   /* Gets set to True if a LOCK prefix is seen. */
7898   Bool pfx_lock = False;
7899
7900   /* Set result defaults. */
7901   dres.whatNext   = Dis_Continue;
7902   dres.len        = 0;
7903   dres.continueAt = 0;
7904
7905   *expect_CAS = False;
7906
7907   addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
7908
7909   vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
7910   DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
7911
7912   /* We may be asked to update the guest EIP before going further. */
7913   if (put_IP)
7914      stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr)) );
7915
7916   /* Spot "Special" instructions (see comment at top of file). */
7917   {
7918      UChar* code = (UChar*)(guest_code + delta);
7919      /* Spot the 12-byte preamble:
7920         C1C703   roll $3,  %edi
7921         C1C70D   roll $13, %edi
7922         C1C71D   roll $29, %edi
7923         C1C713   roll $19, %edi
7924      */
7925      if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
7926          code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
7927          code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
7928          code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
7929         /* Got a "Special" instruction preamble.  Which one is it? */
7930         if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
7931            /* %EDX = client_request ( %EAX ) */
7932            DIP("%%edx = client_request ( %%eax )\n");
7933            delta += 14;
7934            jmp_lit(Ijk_ClientReq, guest_EIP_bbstart+delta);
7935            dres.whatNext = Dis_StopHere;
7936            goto decode_success;
7937         }
7938         else
7939         if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
7940            /* %EAX = guest_NRADDR */
7941            DIP("%%eax = guest_NRADDR\n");
7942            delta += 14;
7943            putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
7944            goto decode_success;
7945         }
7946         else
7947         if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
7948            /* call-noredir *%EAX */
7949            DIP("call-noredir *%%eax\n");
7950            delta += 14;
7951            t1 = newTemp(Ity_I32);
7952            assign(t1, getIReg(4,R_EAX));
7953            t2 = newTemp(Ity_I32);
7954            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
7955            putIReg(4, R_ESP, mkexpr(t2));
7956            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
7957            jmp_treg(Ijk_NoRedir,t1);
7958            dres.whatNext = Dis_StopHere;
7959            goto decode_success;
7960         }
7961         /* We don't know what it is. */
7962         goto decode_failure;
7963         /*NOTREACHED*/
7964      }
7965   }
7966
7967   /* Handle a couple of weird-ass NOPs that have been observed in the
7968      wild. */
7969   {
7970      UChar* code = (UChar*)(guest_code + delta);
7971      /* Sun's JVM 1.5.0 uses the following as a NOP:
7972         26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
7973      if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
7974          && code[3] == 0x65 && code[4] == 0x90) {
7975         DIP("%%es:%%cs:%%fs:%%gs:nop\n");
7976         delta += 5;
7977         goto decode_success;
7978      }
7979      /* Don't barf on recent binutils padding,
7980         all variants of which are: nopw %cs:0x0(%eax,%eax,1)
7981         66 2e 0f 1f 84 00 00 00 00 00
7982         66 66 2e 0f 1f 84 00 00 00 00 00
7983         66 66 66 2e 0f 1f 84 00 00 00 00 00
7984         66 66 66 66 2e 0f 1f 84 00 00 00 00 00
7985         66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
7986         66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
7987      */
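      /* In the test below, data16_cnt ends up as the number of
         leading 0x66 bytes (1 .. 6), so the total length skipped is
         9 + data16_cnt bytes. */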
7988      if (code[0] == 0x66) {
7989         Int data16_cnt;
7990         for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
7991            if (code[data16_cnt] != 0x66)
7992               break;
7993         if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
7994             && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
7995             && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
7996             && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
7997             && code[data16_cnt + 8] == 0x00 ) {
7998            DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
7999            delta += 9 + data16_cnt;
8000            goto decode_success;
8001         }
8002      }
8003   }
8004
8005   /* Normal instruction handling starts here. */
8006
8007   /* Deal with some but not all prefixes:
8008         66(oso)
8009         F0(lock)
8010         2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
8011      Not dealt with (left in place):
8012         F2 F3
8013   */
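   /* For example, given the bytes 66 F0 0F B1 0B (lock cmpxchgw
      %cx,(%ebx)), this loop consumes the 66 and F0 prefixes, setting
      sz to 2 and both pfx_lock and *expect_CAS to True, and leaves
      delta pointing at the 0F opcode byte. */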
8014   n_prefixes = 0;
8015   while (True) {
8016      if (n_prefixes > 7) goto decode_failure;
8017      pre = getUChar(delta);
8018      switch (pre) {
8019         case 0x66:
8020            sz = 2;
8021            break;
8022         case 0xF0:
8023            pfx_lock = True;
8024            *expect_CAS = True;
8025            break;
8026         case 0x3E: /* %DS: */
8027         case 0x26: /* %ES: */
8028         case 0x64: /* %FS: */
8029         case 0x65: /* %GS: */
8030            if (sorb != 0)
8031               goto decode_failure; /* only one seg override allowed */
8032            sorb = pre;
8033            break;
8034         case 0x2E: { /* %CS: */
8035            /* 2E prefix on a conditional branch instruction is a
8036               branch-prediction hint, which can safely be ignored.  */
8037            UChar op1 = getIByte(delta+1);
8038            UChar op2 = getIByte(delta+2);
8039            if ((op1 >= 0x70 && op1 <= 0x7F)
8040                || (op1 == 0xE3)
8041                || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
8042               if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
8043            } else {
8044               /* All other CS override cases are not handled */
8045               goto decode_failure;
8046            }
8047            break;
8048         }
8049         case 0x36: /* %SS: */
8050            /* SS override cases are not handled */
8051            goto decode_failure;
8052         default:
8053            goto not_a_prefix;
8054      }
8055      n_prefixes++;
8056      delta++;
8057   }
8058
8059   not_a_prefix:
8060
8061   /* Now we should be looking at the primary opcode byte or the
8062      leading F2 or F3.  Check that any LOCK prefix is actually
8063      allowed. */
8064
8065   if (pfx_lock) {
8066      if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
8067         DIP("lock ");
8068      } else {
8069         *expect_CAS = False;
8070         goto decode_failure;
8071      }
8072   }
8073
8074
8075   /* ---------------------------------------------------- */
8076   /* --- The SSE decoder.                             --- */
8077   /* ---------------------------------------------------- */
8078
8079   /* What did I do to deserve SSE ?  Perhaps I was really bad in a
8080      previous life? */
8081
8082   /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
8083      later section, further on. */
8084
8085   insn = (UChar*)&guest_code[delta];
8086
8087   /* Treat fxsave specially.  It should be doable even on an SSE0
8088      (Pentium-II class) CPU.  Hence be prepared to handle it on
8089      any subarchitecture variant.
8090   */
8091
8092   /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
8093   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8094       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
8095      IRDirty* d;
8096      modrm = getIByte(delta+2);
8097      vassert(sz == 4);
8098      vassert(!epartIsReg(modrm));
8099
8100      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8101      delta += 2+alen;
8102
8103      DIP("fxsave %s\n", dis_buf);
8104
8105      /* Uses dirty helper:
8106            void x86g_dirtyhelper_FXSAVE ( VexGuestX86State*, UInt ) */
8107      d = unsafeIRDirty_0_N (
8108             0/*regparms*/,
8109             "x86g_dirtyhelper_FXSAVE",
8110             &x86g_dirtyhelper_FXSAVE,
8111             mkIRExprVec_1( mkexpr(addr) )
8112          );
8113      d->needsBBP = True;
8114
8115      /* declare we're writing memory */
8116      d->mFx   = Ifx_Write;
8117      d->mAddr = mkexpr(addr);
8118      d->mSize = 512;
8119
8120      /* declare we're reading guest state */
8121      d->nFxState = 7;
8122
8123      d->fxState[0].fx     = Ifx_Read;
8124      d->fxState[0].offset = OFFB_FTOP;
8125      d->fxState[0].size   = sizeof(UInt);
8126
8127      d->fxState[1].fx     = Ifx_Read;
8128      d->fxState[1].offset = OFFB_FPREGS;
8129      d->fxState[1].size   = 8 * sizeof(ULong);
8130
8131      d->fxState[2].fx     = Ifx_Read;
8132      d->fxState[2].offset = OFFB_FPTAGS;
8133      d->fxState[2].size   = 8 * sizeof(UChar);
8134
8135      d->fxState[3].fx     = Ifx_Read;
8136      d->fxState[3].offset = OFFB_FPROUND;
8137      d->fxState[3].size   = sizeof(UInt);
8138
8139      d->fxState[4].fx     = Ifx_Read;
8140      d->fxState[4].offset = OFFB_FC3210;
8141      d->fxState[4].size   = sizeof(UInt);
8142
8143      d->fxState[5].fx     = Ifx_Read;
8144      d->fxState[5].offset = OFFB_XMM0;
8145      d->fxState[5].size   = 8 * sizeof(U128);
8146
8147      d->fxState[6].fx     = Ifx_Read;
8148      d->fxState[6].offset = OFFB_SSEROUND;
8149      d->fxState[6].size   = sizeof(UInt);
8150
8151      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8152	 images are packed back-to-back.  If not, the value of
8153	 d->fxState[5].size is wrong. */
8154      vassert(16 == sizeof(U128));
8155      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8156
8157      stmt( IRStmt_Dirty(d) );
8158
8159      goto decode_success;
8160   }
8161
8162   /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
8163   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8164       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
8165      IRDirty* d;
8166      modrm = getIByte(delta+2);
8167      vassert(sz == 4);
8168      vassert(!epartIsReg(modrm));
8169
8170      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8171      delta += 2+alen;
8172
8173      DIP("fxrstor %s\n", dis_buf);
8174
8175      /* Uses dirty helper:
8176            void x86g_dirtyhelper_FXRSTOR ( VexGuestX86State*, UInt ) */
8177      d = unsafeIRDirty_0_N (
8178             0/*regparms*/,
8179             "x86g_dirtyhelper_FXRSTOR",
8180             &x86g_dirtyhelper_FXRSTOR,
8181             mkIRExprVec_1( mkexpr(addr) )
8182          );
8183      d->needsBBP = True;
8184
8185      /* declare we're reading memory */
8186      d->mFx   = Ifx_Read;
8187      d->mAddr = mkexpr(addr);
8188      d->mSize = 512;
8189
8190      /* declare we're writing guest state */
8191      d->nFxState = 7;
8192
8193      d->fxState[0].fx     = Ifx_Write;
8194      d->fxState[0].offset = OFFB_FTOP;
8195      d->fxState[0].size   = sizeof(UInt);
8196
8197      d->fxState[1].fx     = Ifx_Write;
8198      d->fxState[1].offset = OFFB_FPREGS;
8199      d->fxState[1].size   = 8 * sizeof(ULong);
8200
8201      d->fxState[2].fx     = Ifx_Write;
8202      d->fxState[2].offset = OFFB_FPTAGS;
8203      d->fxState[2].size   = 8 * sizeof(UChar);
8204
8205      d->fxState[3].fx     = Ifx_Write;
8206      d->fxState[3].offset = OFFB_FPROUND;
8207      d->fxState[3].size   = sizeof(UInt);
8208
8209      d->fxState[4].fx     = Ifx_Write;
8210      d->fxState[4].offset = OFFB_FC3210;
8211      d->fxState[4].size   = sizeof(UInt);
8212
8213      d->fxState[5].fx     = Ifx_Write;
8214      d->fxState[5].offset = OFFB_XMM0;
8215      d->fxState[5].size   = 8 * sizeof(U128);
8216
8217      d->fxState[6].fx     = Ifx_Write;
8218      d->fxState[6].offset = OFFB_SSEROUND;
8219      d->fxState[6].size   = sizeof(UInt);
8220
8221      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8222	 images are packed back-to-back.  If not, the value of
8223	 d->fxState[5].size is wrong. */
8224      vassert(16 == sizeof(U128));
8225      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8226
8227      stmt( IRStmt_Dirty(d) );
8228
8229      goto decode_success;
8230   }
8231
8232   /* ------ SSE decoder main ------ */
8233
8234   /* Skip parts of the decoder which don't apply given the stated
8235      guest subarchitecture. */
8236   if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
8237      goto after_sse_decoders;
8238
8239   /* Otherwise we must be doing sse1 or sse2, so we can at least try
8240      for SSE1 here. */
8241
8242   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
8243   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
8244      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
8245      goto decode_success;
8246   }
8247
8248   /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
8249   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
8250      vassert(sz == 4);
8251      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
8252      goto decode_success;
8253   }
8254
8255   /* 0F 55 = ANDNPS -- G = (not G) and E */
8256   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
8257      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
8258      goto decode_success;
8259   }
8260
8261   /* 0F 54 = ANDPS -- G = G and E */
8262   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
8263      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
8264      goto decode_success;
8265   }
8266
8267   /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
8268   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
8269      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
8270      goto decode_success;
8271   }
8272
8273   /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
8274   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
8275      vassert(sz == 4);
8276      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
8277      goto decode_success;
8278   }
8279
8280   /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
8281   /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
8282   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
8283      IRTemp argL = newTemp(Ity_F32);
8284      IRTemp argR = newTemp(Ity_F32);
8285      modrm = getIByte(delta+2);
8286      if (epartIsReg(modrm)) {
8287         assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
8288         delta += 2+1;
8289         DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8290                                  nameXMMReg(gregOfRM(modrm)) );
8291      } else {
8292         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8293	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
8294         delta += 2+alen;
8295         DIP("[u]comiss %s,%s\n", dis_buf,
8296                                  nameXMMReg(gregOfRM(modrm)) );
8297      }
8298      assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
8299
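      /* Iop_CmpF64 yields an IRCmpF64Result: 0x45 for unordered,
         0x40 for equal, 0x01 for less-than, 0x00 for greater-than.
         Masking with 0x45 keeps exactly the bits which land in the
         eflags Z, P and C positions, which is what [U]COMISS sets. */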
8300      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
8301      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
8302      stmt( IRStmt_Put(
8303               OFFB_CC_DEP1,
8304               binop( Iop_And32,
8305                      binop(Iop_CmpF64,
8306                            unop(Iop_F32toF64,mkexpr(argL)),
8307                            unop(Iop_F32toF64,mkexpr(argR))),
8308                      mkU32(0x45)
8309          )));
8310      /* Set NDEP even though it isn't used.  This makes redundant-PUT
8311         elimination of previous stores to this field work better. */
8312      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
8313      goto decode_success;
8314   }
8315
8316   /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
8317      half xmm */
8318   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
8319      IRTemp arg64 = newTemp(Ity_I64);
8320      IRTemp rmode = newTemp(Ity_I32);
8321      vassert(sz == 4);
8322
8323      modrm = getIByte(delta+2);
8324      do_MMX_preamble();
8325      if (epartIsReg(modrm)) {
8326         assign( arg64, getMMXReg(eregOfRM(modrm)) );
8327         delta += 2+1;
8328         DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
8329                                 nameXMMReg(gregOfRM(modrm)));
8330      } else {
8331         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8332	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
8333         delta += 2+alen;
8334         DIP("cvtpi2ps %s,%s\n", dis_buf,
8335                                 nameXMMReg(gregOfRM(modrm)) );
8336      }
8337
8338      assign( rmode, get_sse_roundingmode() );
8339
8340      putXMMRegLane32F(
8341         gregOfRM(modrm), 0,
8342         binop(Iop_F64toF32,
8343               mkexpr(rmode),
8344               unop(Iop_I32StoF64,
8345                    unop(Iop_64to32, mkexpr(arg64)) )) );
8346
8347      putXMMRegLane32F(
8348         gregOfRM(modrm), 1,
8349         binop(Iop_F64toF32,
8350               mkexpr(rmode),
8351               unop(Iop_I32StoF64,
8352                    unop(Iop_64HIto32, mkexpr(arg64)) )) );
8353
8354      goto decode_success;
8355   }
8356
8357   /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
8358      quarter xmm */
8359   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
8360      IRTemp arg32 = newTemp(Ity_I32);
8361      IRTemp rmode = newTemp(Ity_I32);
8362      vassert(sz == 4);
8363
8364      modrm = getIByte(delta+3);
8365      if (epartIsReg(modrm)) {
8366         assign( arg32, getIReg(4, eregOfRM(modrm)) );
8367         delta += 3+1;
8368         DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
8369                                 nameXMMReg(gregOfRM(modrm)));
8370      } else {
8371         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8372	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
8373         delta += 3+alen;
8374         DIP("cvtsi2ss %s,%s\n", dis_buf,
8375                                 nameXMMReg(gregOfRM(modrm)) );
8376      }
8377
8378      assign( rmode, get_sse_roundingmode() );
8379
8380      putXMMRegLane32F(
8381         gregOfRM(modrm), 0,
8382         binop(Iop_F64toF32,
8383               mkexpr(rmode),
8384               unop(Iop_I32StoF64, mkexpr(arg32)) ) );
8385
8386      goto decode_success;
8387   }
8388
8389   /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
8390      I32 in mmx, according to prevailing SSE rounding mode */
8391   /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
8392      I32 in mmx, rounding towards zero */
8393   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
8394      IRTemp dst64  = newTemp(Ity_I64);
8395      IRTemp rmode  = newTemp(Ity_I32);
8396      IRTemp f32lo  = newTemp(Ity_F32);
8397      IRTemp f32hi  = newTemp(Ity_F32);
8398      Bool   r2zero = toBool(insn[1] == 0x2C);
8399
8400      do_MMX_preamble();
8401      modrm = getIByte(delta+2);
8402
8403      if (epartIsReg(modrm)) {
8404         delta += 2+1;
8405	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
8406	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
8407         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
8408                                   nameXMMReg(eregOfRM(modrm)),
8409                                   nameMMXReg(gregOfRM(modrm)));
8410      } else {
8411         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8412	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
8413	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
8414                                              mkexpr(addr),
8415                                              mkU32(4) )));
8416         delta += 2+alen;
8417         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
8418                                   dis_buf,
8419                                   nameMMXReg(gregOfRM(modrm)));
8420      }
8421
8422      if (r2zero) {
8423         assign(rmode, mkU32((UInt)Irrm_ZERO) );
8424      } else {
8425         assign( rmode, get_sse_roundingmode() );
8426      }
8427
8428      assign(
8429         dst64,
8430         binop( Iop_32HLto64,
8431                binop( Iop_F64toI32S,
8432                       mkexpr(rmode),
8433                       unop( Iop_F32toF64, mkexpr(f32hi) ) ),
8434                binop( Iop_F64toI32S,
8435                       mkexpr(rmode),
8436                       unop( Iop_F32toF64, mkexpr(f32lo) ) )
8437              )
8438      );
8439
8440      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
8441      goto decode_success;
8442   }
8443
8444   /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
8445      I32 in ireg, according to prevailing SSE rounding mode */
8446   /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
8447      I32 in ireg, rounding towards zero */
8448   if (insn[0] == 0xF3 && insn[1] == 0x0F
8449       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
8450      IRTemp rmode = newTemp(Ity_I32);
8451      IRTemp f32lo = newTemp(Ity_F32);
8452      Bool   r2zero = toBool(insn[2] == 0x2C);
8453      vassert(sz == 4);
8454
8455      modrm = getIByte(delta+3);
8456      if (epartIsReg(modrm)) {
8457         delta += 3+1;
8458	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
8459         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
8460                                   nameXMMReg(eregOfRM(modrm)),
8461                                   nameIReg(4, gregOfRM(modrm)));
8462      } else {
8463         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8464	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
8465         delta += 3+alen;
8466         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
8467                                   dis_buf,
8468                                   nameIReg(4, gregOfRM(modrm)));
8469      }
8470
8471      if (r2zero) {
8472         assign( rmode, mkU32((UInt)Irrm_ZERO) );
8473      } else {
8474         assign( rmode, get_sse_roundingmode() );
8475      }
8476
8477      putIReg(4, gregOfRM(modrm),
8478                 binop( Iop_F64toI32S,
8479                        mkexpr(rmode),
8480                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
8481      );
8482
8483      goto decode_success;
8484   }
8485
8486   /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
8487   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
8488      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
8489      goto decode_success;
8490   }
8491
8492   /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
8493   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
8494      vassert(sz == 4);
8495      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
8496      goto decode_success;
8497   }
8498
8499   /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
8500   if (insn[0] == 0x0F && insn[1] == 0xAE
8501       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
8502
8503      IRTemp t64 = newTemp(Ity_I64);
8504      IRTemp ew = newTemp(Ity_I32);
8505
8506      modrm = getIByte(delta+2);
8507      vassert(!epartIsReg(modrm));
8508      vassert(sz == 4);
8509
8510      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8511      delta += 2+alen;
8512      DIP("ldmxcsr %s\n", dis_buf);
8513
8514      /* The only thing we observe in %mxcsr is the rounding mode.
8515         Therefore, pass the 32-bit value (SSE native-format control
8516         word) to a clean helper, getting back a 64-bit value, the
8517         lower half of which is the SSEROUND value to store, and the
8518         upper half of which is the emulation-warning token which may
8519         be generated.
8520      */
8521      /* ULong x86g_check_ldmxcsr ( UInt ); */
8522      assign( t64, mkIRExprCCall(
8523                      Ity_I64, 0/*regparms*/,
8524                      "x86g_check_ldmxcsr",
8525                      &x86g_check_ldmxcsr,
8526                      mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
8527                   )
8528            );
8529
8530      put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
8531      assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
8532      put_emwarn( mkexpr(ew) );
8533      /* Finally, if an emulation warning was reported, side-exit to
8534         the next insn, reporting the warning, so that Valgrind's
8535         dispatcher sees the warning. */
8536      stmt(
8537         IRStmt_Exit(
8538            binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
8539            Ijk_EmWarn,
8540            IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta)
8541         )
8542      );
8543      goto decode_success;
8544   }
8545
8546   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8547   /* 0F F7 = MASKMOVQ -- 8x8 masked store */
8548   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
8549      Bool ok = False;
8550      delta = dis_MMX( &ok, sorb, sz, delta+1 );
8551      if (!ok)
8552         goto decode_failure;
8553      goto decode_success;
8554   }
8555
8556   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
8557   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
8558      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
8559      goto decode_success;
8560   }
8561
8562   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
8563   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
8564      vassert(sz == 4);
8565      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
8566      goto decode_success;
8567   }
8568
8569   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
8570   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
8571      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
8572      goto decode_success;
8573   }
8574
8575   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
8576   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
8577      vassert(sz == 4);
8578      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
8579      goto decode_success;
8580   }
8581
8582   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
8583   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
8584   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
8585      modrm = getIByte(delta+2);
8586      if (epartIsReg(modrm)) {
8587         putXMMReg( gregOfRM(modrm),
8588                    getXMMReg( eregOfRM(modrm) ));
8589         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8590                                  nameXMMReg(gregOfRM(modrm)));
8591         delta += 2+1;
8592      } else {
8593         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8594         if (insn[1] == 0x28/*movaps*/)
8595            gen_SEGV_if_not_16_aligned( addr );
8596         putXMMReg( gregOfRM(modrm),
8597                    loadLE(Ity_V128, mkexpr(addr)) );
8598         DIP("mov[ua]ps %s,%s\n", dis_buf,
8599                                  nameXMMReg(gregOfRM(modrm)));
8600         delta += 2+alen;
8601      }
8602      goto decode_success;
8603   }
8604
8605   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
8606   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
8607   if (sz == 4 && insn[0] == 0x0F
8608       && (insn[1] == 0x29 || insn[1] == 0x11)) {
8609      modrm = getIByte(delta+2);
8610      if (epartIsReg(modrm)) {
8611         /* fall through; awaiting test case */
8612      } else {
8613         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8614         if (insn[1] == 0x29/*movaps*/)
8615            gen_SEGV_if_not_16_aligned( addr );
8616         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
8617         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
8618                                  dis_buf );
8619         delta += 2+alen;
8620         goto decode_success;
8621      }
8622   }
8623
8624   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
8625   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
8626   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
8627      modrm = getIByte(delta+2);
8628      if (epartIsReg(modrm)) {
8629         delta += 2+1;
8630         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
8631                          getXMMRegLane64( eregOfRM(modrm), 0 ) );
8632         DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8633                               nameXMMReg(gregOfRM(modrm)));
8634      } else {
8635         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8636         delta += 2+alen;
8637         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
8638                          loadLE(Ity_I64, mkexpr(addr)) );
8639         DIP("movhps %s,%s\n", dis_buf,
8640                               nameXMMReg( gregOfRM(modrm) ));
8641      }
8642      goto decode_success;
8643   }
8644
8645   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
8646   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
8647      if (!epartIsReg(insn[2])) {
8648         delta += 2;
8649         addr = disAMode ( &alen, sorb, delta, dis_buf );
8650         delta += alen;
8651         storeLE( mkexpr(addr),
8652                  getXMMRegLane64( gregOfRM(insn[2]),
8653                                   1/*upper lane*/ ) );
8654         DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
8655                               dis_buf);
8656         goto decode_success;
8657      }
8658      /* else fall through */
8659   }
8660
8661   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
8662   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
8663   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
8664      modrm = getIByte(delta+2);
8665      if (epartIsReg(modrm)) {
8666         delta += 2+1;
8667         putXMMRegLane64( gregOfRM(modrm),
8668                          0/*lower lane*/,
8669                          getXMMRegLane64( eregOfRM(modrm), 1 ));
8670         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
8671                                 nameXMMReg(gregOfRM(modrm)));
8672      } else {
8673         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8674         delta += 2+alen;
8675         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
8676                          loadLE(Ity_I64, mkexpr(addr)) );
8677         DIP("movlps %s, %s\n",
8678             dis_buf, nameXMMReg( gregOfRM(modrm) ));
8679      }
8680      goto decode_success;
8681   }
8682
8683   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
8684   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
8685      if (!epartIsReg(insn[2])) {
8686         delta += 2;
8687         addr = disAMode ( &alen, sorb, delta, dis_buf );
8688         delta += alen;
8689         storeLE( mkexpr(addr),
8690                  getXMMRegLane64( gregOfRM(insn[2]),
8691                                   0/*lower lane*/ ) );
8692         DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
8693                                dis_buf);
8694         goto decode_success;
8695      }
8696      /* else fall through */
8697   }
8698
8699   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
8700      to 4 lowest bits of ireg(G) */
8701   if (insn[0] == 0x0F && insn[1] == 0x50) {
8702      modrm = getIByte(delta+2);
8703      if (sz == 4 && epartIsReg(modrm)) {
8704         Int src;
8705         t0 = newTemp(Ity_I32);
8706         t1 = newTemp(Ity_I32);
8707         t2 = newTemp(Ity_I32);
8708         t3 = newTemp(Ity_I32);
8709         delta += 2+1;
8710         src = eregOfRM(modrm);
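         /* For lane n, shift the lane's sign bit (bit 31 within the
            lane) down so that it ends up at bit n of the result, mask
            off everything else, and OR the four together to form the
            4-bit sign mask. */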
8711         assign( t0, binop( Iop_And32,
8712                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
8713                            mkU32(1) ));
8714         assign( t1, binop( Iop_And32,
8715                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
8716                            mkU32(2) ));
8717         assign( t2, binop( Iop_And32,
8718                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
8719                            mkU32(4) ));
8720         assign( t3, binop( Iop_And32,
8721                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
8722                            mkU32(8) ));
8723         putIReg(4, gregOfRM(modrm),
8724                    binop(Iop_Or32,
8725                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
8726                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
8727                         )
8728                 );
8729         DIP("movmskps %s,%s\n", nameXMMReg(src),
8730                                 nameIReg(4, gregOfRM(modrm)));
8731         goto decode_success;
8732      }
8733      /* else fall through */
8734   }
8735
8736   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
8737   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
8738   if (insn[0] == 0x0F && insn[1] == 0x2B) {
8739      modrm = getIByte(delta+2);
8740      if (!epartIsReg(modrm)) {
8741         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8742         gen_SEGV_if_not_16_aligned( addr );
8743         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
8744         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
8745                                 nameXMMReg(gregOfRM(modrm)),
8746                                 dis_buf);
8747         delta += 2+alen;
8748         goto decode_success;
8749      }
8750      /* else fall through */
8751   }
8752
8753   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8754   /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
8755      Intel manual does not say anything about the usual business of
8756      the FP reg tags getting trashed whenever an MMX insn happens.
8757      So we just leave them alone.
8758   */
8759   if (insn[0] == 0x0F && insn[1] == 0xE7) {
8760      modrm = getIByte(delta+2);
8761      if (sz == 4 && !epartIsReg(modrm)) {
8762         /* do_MMX_preamble(); Intel docs don't specify this */
8763         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8764         storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
8765         DIP("movntq %s,%s\n", nameMMXReg(gregOfRM(modrm)),
8766                               dis_buf);
8767         delta += 2+alen;
8768         goto decode_success;
8769      }
8770      /* else fall through */
8771   }
8772
8773   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
8774      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
8775   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
8776      vassert(sz == 4);
8777      modrm = getIByte(delta+3);
8778      if (epartIsReg(modrm)) {
8779         putXMMRegLane32( gregOfRM(modrm), 0,
8780                          getXMMRegLane32( eregOfRM(modrm), 0 ));
8781         DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8782                              nameXMMReg(gregOfRM(modrm)));
8783         delta += 3+1;
8784      } else {
8785         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8786         /* zero bits 127:64 */
8787         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
8788         /* zero bits 63:32 */
8789         putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
8790         /* write bits 31:0 */
8791         putXMMRegLane32( gregOfRM(modrm), 0,
8792                          loadLE(Ity_I32, mkexpr(addr)) );
8793         DIP("movss %s,%s\n", dis_buf,
8794                              nameXMMReg(gregOfRM(modrm)));
8795         delta += 3+alen;
8796      }
8797      goto decode_success;
8798   }
8799
8800   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
8801      or lo 1/4 xmm). */
8802   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
8803      vassert(sz == 4);
8804      modrm = getIByte(delta+3);
8805      if (epartIsReg(modrm)) {
8806         /* fall through, we don't yet have a test case */
8807      } else {
8808         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8809         storeLE( mkexpr(addr),
8810                  getXMMRegLane32(gregOfRM(modrm), 0) );
8811         DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
8812                              dis_buf);
8813         delta += 3+alen;
8814         goto decode_success;
8815      }
8816   }
8817
8818   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
8819   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
8820      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
8821      goto decode_success;
8822   }
8823
8824   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
8825   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
8826      vassert(sz == 4);
8827      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
8828      goto decode_success;
8829   }
8830
8831   /* 0F 56 = ORPS -- G = G or E */
8832   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
8833      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
8834      goto decode_success;
8835   }
8836
8837   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8838   /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
8839   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
8840      do_MMX_preamble();
8841      delta = dis_MMXop_regmem_to_reg (
8842                sorb, delta+2, insn[1], "pavgb", False );
8843      goto decode_success;
8844   }
8845
8846   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8847   /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
8848   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
8849      do_MMX_preamble();
8850      delta = dis_MMXop_regmem_to_reg (
8851                sorb, delta+2, insn[1], "pavgw", False );
8852      goto decode_success;
8853   }
8854
8855   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8856   /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
8857      zero-extend of it in ireg(G). */
8858   if (insn[0] == 0x0F && insn[1] == 0xC5) {
8859      modrm = insn[2];
8860      if (sz == 4 && epartIsReg(modrm)) {
8861         IRTemp sV = newTemp(Ity_I64);
8862         t5 = newTemp(Ity_I16);
8863         do_MMX_preamble();
8864         assign(sV, getMMXReg(eregOfRM(modrm)));
8865         breakup64to16s( sV, &t3, &t2, &t1, &t0 );
8866         switch (insn[3] & 3) {
8867            case 0:  assign(t5, mkexpr(t0)); break;
8868            case 1:  assign(t5, mkexpr(t1)); break;
8869            case 2:  assign(t5, mkexpr(t2)); break;
8870            case 3:  assign(t5, mkexpr(t3)); break;
8871            default: vassert(0); /*NOTREACHED*/
8872         }
8873         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
8874         DIP("pextrw $%d,%s,%s\n",
8875             (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
8876                           nameIReg(4,gregOfRM(modrm)));
8877         delta += 4;
8878         goto decode_success;
8879      }
8880      /* else fall through */
8881   }
8882
8883   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8884   /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
8885      put it into the specified lane of mmx(G). */
8886   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
8887      /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
8888         mmx reg.  t4 is the new lane value.  t5 is the original
8889         mmx value. t6 is the new mmx value. */
8890      Int lane;
8891      t4 = newTemp(Ity_I16);
8892      t5 = newTemp(Ity_I64);
8893      t6 = newTemp(Ity_I64);
8894      modrm = insn[2];
8895      do_MMX_preamble();
8896
8897      assign(t5, getMMXReg(gregOfRM(modrm)));
8898      breakup64to16s( t5, &t3, &t2, &t1, &t0 );
8899
8900      if (epartIsReg(modrm)) {
8901         assign(t4, getIReg(2, eregOfRM(modrm)));
8902         delta += 3+1;
8903         lane = insn[3+1-1];
8904         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
8905                                   nameIReg(2,eregOfRM(modrm)),
8906                                   nameMMXReg(gregOfRM(modrm)));
8907      } else {
8908         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8909         delta += 3+alen;
8910         lane = insn[3+alen-1];
8911         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
8912         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
8913                                   dis_buf,
8914                                   nameMMXReg(gregOfRM(modrm)));
8915      }
8916
8917      switch (lane & 3) {
8918         case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
8919         case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
8920         case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
8921         case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
8922         default: vassert(0); /*NOTREACHED*/
8923      }
8924      putMMXReg(gregOfRM(modrm), mkexpr(t6));
8925      goto decode_success;
8926   }
8927
8928   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8929   /* 0F EE = PMAXSW -- 16x4 signed max */
8930   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
8931      do_MMX_preamble();
8932      delta = dis_MMXop_regmem_to_reg (
8933                sorb, delta+2, insn[1], "pmaxsw", False );
8934      goto decode_success;
8935   }
8936
8937   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8938   /* 0F DE = PMAXUB -- 8x8 unsigned max */
8939   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
8940      do_MMX_preamble();
8941      delta = dis_MMXop_regmem_to_reg (
8942                sorb, delta+2, insn[1], "pmaxub", False );
8943      goto decode_success;
8944   }
8945
8946   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8947   /* 0F EA = PMINSW -- 16x4 signed min */
8948   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
8949      do_MMX_preamble();
8950      delta = dis_MMXop_regmem_to_reg (
8951                sorb, delta+2, insn[1], "pminsw", False );
8952      goto decode_success;
8953   }
8954
8955   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8956   /* 0F DA = PMINUB -- 8x8 unsigned min */
8957   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
8958      do_MMX_preamble();
8959      delta = dis_MMXop_regmem_to_reg (
8960                sorb, delta+2, insn[1], "pminub", False );
8961      goto decode_success;
8962   }
8963
8964   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8965   /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
8966      mmx(E), turn them into a byte, and put zero-extend of it in
8967      ireg(G). */
8968   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
8969      modrm = insn[2];
8970      if (epartIsReg(modrm)) {
8971         do_MMX_preamble();
8972         t0 = newTemp(Ity_I64);
8973         t1 = newTemp(Ity_I32);
8974         assign(t0, getMMXReg(eregOfRM(modrm)));
8975         assign(t1, mkIRExprCCall(
8976                       Ity_I32, 0/*regparms*/,
8977                       "x86g_calculate_mmx_pmovmskb",
8978                       &x86g_calculate_mmx_pmovmskb,
8979                       mkIRExprVec_1(mkexpr(t0))));
8980         putIReg(4, gregOfRM(modrm), mkexpr(t1));
8981         DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
8982                                 nameIReg(4,gregOfRM(modrm)));
8983         delta += 3;
8984         goto decode_success;
8985      }
8986      /* else fall through */
8987   }
8988
8989   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8990   /* 0F E4 = PMULUH -- 16x4 hi-half of unsigned widening multiply */
8991   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
8992      do_MMX_preamble();
8993      delta = dis_MMXop_regmem_to_reg (
8994                sorb, delta+2, insn[1], "pmuluh", False );
8995      goto decode_success;
8996   }
8997
8998   /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
8999   /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
9000   /* 0F 18 /2 = PREFETCHT1 */
9001   /* 0F 18 /3 = PREFETCHT2 */
9002   if (insn[0] == 0x0F && insn[1] == 0x18
9003       && !epartIsReg(insn[2])
9004       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
9005      HChar* hintstr = "??";
9006
9007      modrm = getIByte(delta+2);
9008      vassert(!epartIsReg(modrm));
9009
9010      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9011      delta += 2+alen;
9012
9013      switch (gregOfRM(modrm)) {
9014         case 0: hintstr = "nta"; break;
9015         case 1: hintstr = "t0"; break;
9016         case 2: hintstr = "t1"; break;
9017         case 3: hintstr = "t2"; break;
9018         default: vassert(0); /*NOTREACHED*/
9019      }
9020
9021      DIP("prefetch%s %s\n", hintstr, dis_buf);
9022      goto decode_success;
9023   }
9024
9025   /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
9026   /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
9027   if (insn[0] == 0x0F && insn[1] == 0x0D
9028       && !epartIsReg(insn[2])
9029       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
9030      HChar* hintstr = "??";
9031
9032      modrm = getIByte(delta+2);
9033      vassert(!epartIsReg(modrm));
9034
9035      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9036      delta += 2+alen;
9037
9038      switch (gregOfRM(modrm)) {
9039         case 0: hintstr = ""; break;
9040         case 1: hintstr = "w"; break;
9041         default: vassert(0); /*NOTREACHED*/
9042      }
9043
9044      DIP("prefetch%s %s\n", hintstr, dis_buf);
9045      goto decode_success;
9046   }
9047
9048   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9049   /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
9050   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
9051      do_MMX_preamble();
9052      delta = dis_MMXop_regmem_to_reg (
9053                 sorb, delta+2, insn[1], "psadbw", False );
9054      goto decode_success;
9055   }
9056
9057   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9058   /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
9059   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
9060      Int order;
9061      IRTemp sV, dV, s3, s2, s1, s0;
9062      s3 = s2 = s1 = s0 = IRTemp_INVALID;
9063      sV = newTemp(Ity_I64);
9064      dV = newTemp(Ity_I64);
9065      do_MMX_preamble();
9066      modrm = insn[2];
9067      if (epartIsReg(modrm)) {
9068         assign( sV, getMMXReg(eregOfRM(modrm)) );
9069         order = (Int)insn[3];
9070         delta += 2+2;
9071         DIP("pshufw $%d,%s,%s\n", order,
9072                                   nameMMXReg(eregOfRM(modrm)),
9073                                   nameMMXReg(gregOfRM(modrm)));
9074      } else {
9075         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9076         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
9077	 order = (Int)insn[2+alen];
9078         delta += 3+alen;
9079         DIP("pshufw $%d,%s,%s\n", order,
9080                                   dis_buf,
9081                                   nameMMXReg(gregOfRM(modrm)));
9082      }
9083      breakup64to16s( sV, &s3, &s2, &s1, &s0 );
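      /* Each 2-bit field of the immediate selects a source lane:
         bits 1:0 choose result lane 0, bits 3:2 lane 1, and so on.
         Hence order == 0xE4 (binary 11 10 01 00) just copies sV. */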
9084
9085#     define SEL(n) \
9086                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
9087      assign(dV,
9088	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
9089                          SEL((order>>2)&3), SEL((order>>0)&3) )
9090      );
9091      putMMXReg(gregOfRM(modrm), mkexpr(dV));
9092#     undef SEL
9093      goto decode_success;
9094   }
9095
9096   /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
9097   if (insn[0] == 0x0F && insn[1] == 0x53) {
9098      vassert(sz == 4);
9099      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9100                                        "rcpps", Iop_Recip32Fx4 );
9101      goto decode_success;
9102   }
9103
9104   /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
9105   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
9106      vassert(sz == 4);
9107      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9108                                         "rcpss", Iop_Recip32F0x4 );
9109      goto decode_success;
9110   }
9111
9112   /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
9113   if (insn[0] == 0x0F && insn[1] == 0x52) {
9114      vassert(sz == 4);
9115      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9116                                        "rsqrtps", Iop_RSqrt32Fx4 );
9117      goto decode_success;
9118   }
9119
9120   /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
9121   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
9122      vassert(sz == 4);
9123      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9124                                         "rsqrtss", Iop_RSqrt32F0x4 );
9125      goto decode_success;
9126   }
9127
9128   /* 0F AE /7 = SFENCE -- flush pending operations to memory */
9129   if (insn[0] == 0x0F && insn[1] == 0xAE
9130       && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
9131      vassert(sz == 4);
9132      delta += 3;
9133      /* Insert a memory fence.  It's sometimes important that these
9134         are carried through to the generated code. */
9135      stmt( IRStmt_MBE(Imbe_Fence) );
9136      DIP("sfence\n");
9137      goto decode_success;
9138   }
9139
9140   /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
9141   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
9142      Int    select;
9143      IRTemp sV, dV;
9144      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9145      sV = newTemp(Ity_V128);
9146      dV = newTemp(Ity_V128);
9147      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9148      modrm = insn[2];
9149      assign( dV, getXMMReg(gregOfRM(modrm)) );
9150
9151      if (epartIsReg(modrm)) {
9152         assign( sV, getXMMReg(eregOfRM(modrm)) );
9153         select = (Int)insn[3];
9154         delta += 2+2;
9155         DIP("shufps $%d,%s,%s\n", select,
9156                                   nameXMMReg(eregOfRM(modrm)),
9157                                   nameXMMReg(gregOfRM(modrm)));
9158      } else {
9159         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9160         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9161         select = (Int)insn[2+alen];
9162         delta += 3+alen;
9163         DIP("shufps $%d,%s,%s\n", select,
9164                                   dis_buf,
9165                                   nameXMMReg(gregOfRM(modrm)));
9166      }
9167
9168      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9169      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
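      /* Result lanes 0 and 1 are selected from the destination (dV)
         by imm bits 1:0 and 3:2 respectively; lanes 2 and 3 are
         selected from the source (sV) by bits 5:4 and 7:6.  This
         matches the SHUFPS definition. */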
9170
9171#     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
9172#     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
9173
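      /* The result lanes, numbered 3..0, are
            { s[(select>>6)&3], s[(select>>4)&3],
              d[(select>>2)&3], d[select&3] },
         assuming mk128from32s(w3,w2,w1,w0) puts w3 in the highest lane.
         For example, select == 0x1B yields { s0, s1, d2, d3 }. */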
9174      putXMMReg(
9175         gregOfRM(modrm),
9176         mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
9177                       SELD((select>>2)&3), SELD((select>>0)&3) )
9178      );
9179
9180#     undef SELD
9181#     undef SELS
9182
9183      goto decode_success;
9184   }
9185
9186   /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
9187   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
9188      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9189                                        "sqrtps", Iop_Sqrt32Fx4 );
9190      goto decode_success;
9191   }
9192
9193   /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
9194   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
9195      vassert(sz == 4);
9196      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9197                                         "sqrtss", Iop_Sqrt32F0x4 );
9198      goto decode_success;
9199   }
9200
9201   /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
9202   if (insn[0] == 0x0F && insn[1] == 0xAE
9203       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
9204      modrm = getIByte(delta+2);
9205      vassert(sz == 4);
9206      vassert(!epartIsReg(modrm));
9207
9208      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9209      delta += 2+alen;
9210
9211      /* Fake up a native SSE mxcsr word.  The only thing it depends
9212         on is SSEROUND[1:0], so call a clean helper to cook it up.
9213      */
9214      /* UInt x86h_create_mxcsr ( UInt sseround ) */
9215      DIP("stmxcsr %s\n", dis_buf);
9216      storeLE( mkexpr(addr),
9217               mkIRExprCCall(
9218                  Ity_I32, 0/*regp*/,
9219                  "x86g_create_mxcsr", &x86g_create_mxcsr,
9220                  mkIRExprVec_1( get_sse_roundingmode() )
9221               )
9222             );
9223      goto decode_success;
9224   }
9225
9226   /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
9227   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
9228      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
9229      goto decode_success;
9230   }
9231
9232   /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
9233   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
9234      vassert(sz == 4);
9235      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
9236      goto decode_success;
9237   }
9238
9239   /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
9240   /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
9241   /* These just appear to be special cases of SHUFPS */
9242   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
9243      IRTemp sV, dV;
9244      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9245      Bool hi = toBool(insn[1] == 0x15);
9246      sV = newTemp(Ity_V128);
9247      dV = newTemp(Ity_V128);
9248      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9249      modrm = insn[2];
9250      assign( dV, getXMMReg(gregOfRM(modrm)) );
9251
9252      if (epartIsReg(modrm)) {
9253         assign( sV, getXMMReg(eregOfRM(modrm)) );
9254         delta += 2+1;
9255         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9256                                  nameXMMReg(eregOfRM(modrm)),
9257                                  nameXMMReg(gregOfRM(modrm)));
9258      } else {
9259         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9260         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9261         delta += 2+alen;
9262         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9263                                  dis_buf,
9264                                  nameXMMReg(gregOfRM(modrm)));
9265      }
9266
9267      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9268      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9269
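      /* Lanes 3..0 of the result are { s3, d3, s2, d2 } for unpckhps and
         { s1, d1, s0, d0 } for unpcklps: the destination supplies the
         even-numbered result lanes and the source the odd-numbered ones. */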
9270      if (hi) {
9271         putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
9272      } else {
9273         putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
9274      }
9275
9276      goto decode_success;
9277   }
9278
9279   /* 0F 57 = XORPS -- G = G xor E */
9280   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
9281      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
9282      goto decode_success;
9283   }
9284
9285   /* ---------------------------------------------------- */
9286   /* --- end of the SSE decoder.                      --- */
9287   /* ---------------------------------------------------- */
9288
9289   /* ---------------------------------------------------- */
9290   /* --- start of the SSE2 decoder.                   --- */
9291   /* ---------------------------------------------------- */
9292
9293   /* Skip parts of the decoder which don't apply given the stated
9294      guest subarchitecture. */
9295   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
9296      goto after_sse_decoders; /* no SSE2 capabilities */
9297
9298   insn = (UChar*)&guest_code[delta];
9299
9300   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
9301   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
9302      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
9303      goto decode_success;
9304   }
9305
9306   /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
9307   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
9308      vassert(sz == 4);
9309      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
9310      goto decode_success;
9311   }
9312
9313   /* 66 0F 55 = ANDNPD -- G = (not G) and E */
9314   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
9315      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
9316      goto decode_success;
9317   }
9318
9319   /* 66 0F 54 = ANDPD -- G = G and E */
9320   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
9321      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
9322      goto decode_success;
9323   }
9324
9325   /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
9326   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
9327      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
9328      goto decode_success;
9329   }
9330
9331   /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
9332   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
9333      vassert(sz == 4);
9334      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
9335      goto decode_success;
9336   }
9337
9338   /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
9339   /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
9340   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
9341      IRTemp argL = newTemp(Ity_F64);
9342      IRTemp argR = newTemp(Ity_F64);
9343      modrm = getIByte(delta+2);
9344      if (epartIsReg(modrm)) {
9345         assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
9346         delta += 2+1;
9347         DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9348                                  nameXMMReg(gregOfRM(modrm)) );
9349      } else {
9350         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9351	 assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
9352         delta += 2+alen;
9353         DIP("[u]comisd %s,%s\n", dis_buf,
9354                                  nameXMMReg(gregOfRM(modrm)) );
9355      }
9356      assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
9357
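      /* Iop_CmpF64 returns an IRCmpF64Result, whose encoding
         (Ircr_UN = 0x45, Ircr_LT = 0x01, Ircr_GT = 0x00, Ircr_EQ = 0x40)
         is designed to line up with the x86 flag bits.  Masking with 0x45
         therefore keeps exactly ZF (bit 6), PF (bit 2) and CF (bit 0),
         which is what [u]comisd defines. */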
9358      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
9359      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
9360      stmt( IRStmt_Put(
9361               OFFB_CC_DEP1,
9362               binop( Iop_And32,
9363                      binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
9364                      mkU32(0x45)
9365          )));
9366      /* Set NDEP even though it isn't used.  This makes redundant-PUT
9367         elimination of previous stores to this field work better. */
9368      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
9369      goto decode_success;
9370   }
9371
9372   /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
9373      F64 in xmm(G) */
9374   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
9375      IRTemp arg64 = newTemp(Ity_I64);
9376      vassert(sz == 4);
9377
9378      modrm = getIByte(delta+3);
9379      if (epartIsReg(modrm)) {
9380         assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
9381         delta += 3+1;
9382         DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9383                                 nameXMMReg(gregOfRM(modrm)));
9384      } else {
9385         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9386	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9387         delta += 3+alen;
9388         DIP("cvtdq2pd %s,%s\n", dis_buf,
9389                                 nameXMMReg(gregOfRM(modrm)) );
9390      }
9391
9392      putXMMRegLane64F(
9393         gregOfRM(modrm), 0,
9394         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
9395      );
9396
9397      putXMMRegLane64F(
9398         gregOfRM(modrm), 1,
9399         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
9400      );
9401
9402      goto decode_success;
9403   }
9404
9405   /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
9406      xmm(G) */
9407   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
9408      IRTemp argV  = newTemp(Ity_V128);
9409      IRTemp rmode = newTemp(Ity_I32);
9410
9411      modrm = getIByte(delta+2);
9412      if (epartIsReg(modrm)) {
9413         assign( argV, getXMMReg(eregOfRM(modrm)) );
9414         delta += 2+1;
9415         DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9416                                 nameXMMReg(gregOfRM(modrm)));
9417      } else {
9418         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9419	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9420         delta += 2+alen;
9421         DIP("cvtdq2ps %s,%s\n", dis_buf,
9422                                 nameXMMReg(gregOfRM(modrm)) );
9423      }
9424
9425      assign( rmode, get_sse_roundingmode() );
9426      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9427
9428#     define CVT(_t)  binop( Iop_F64toF32,                    \
9429                             mkexpr(rmode),                   \
9430                             unop(Iop_I32StoF64,mkexpr(_t)))
9431
9432      putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
9433      putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
9434      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
9435      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
9436
9437#     undef CVT
9438
9439      goto decode_success;
9440   }
9441
9442   /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
9443      lo half xmm(G), and zero upper half */
9444   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
9445      IRTemp argV  = newTemp(Ity_V128);
9446      IRTemp rmode = newTemp(Ity_I32);
9447      vassert(sz == 4);
9448
9449      modrm = getIByte(delta+3);
9450      if (epartIsReg(modrm)) {
9451         assign( argV, getXMMReg(eregOfRM(modrm)) );
9452         delta += 3+1;
9453         DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9454                                 nameXMMReg(gregOfRM(modrm)));
9455      } else {
9456         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9457	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9458         delta += 3+alen;
9459         DIP("cvtpd2dq %s,%s\n", dis_buf,
9460                                 nameXMMReg(gregOfRM(modrm)) );
9461      }
9462
9463      assign( rmode, get_sse_roundingmode() );
9464      t0 = newTemp(Ity_F64);
9465      t1 = newTemp(Ity_F64);
9466      assign( t0, unop(Iop_ReinterpI64asF64,
9467                       unop(Iop_V128to64, mkexpr(argV))) );
9468      assign( t1, unop(Iop_ReinterpI64asF64,
9469                       unop(Iop_V128HIto64, mkexpr(argV))) );
9470
9471#     define CVT(_t)  binop( Iop_F64toI32S,                   \
9472                             mkexpr(rmode),                   \
9473                             mkexpr(_t) )
9474
9475      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
9476      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
9477      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9478      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9479
9480#     undef CVT
9481
9482      goto decode_success;
9483   }
9484
9485   /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9486      I32 in mmx, according to prevailing SSE rounding mode */
9487   /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9488      I32 in mmx, rounding towards zero */
9489   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
9490      IRTemp dst64  = newTemp(Ity_I64);
9491      IRTemp rmode  = newTemp(Ity_I32);
9492      IRTemp f64lo  = newTemp(Ity_F64);
9493      IRTemp f64hi  = newTemp(Ity_F64);
9494      Bool   r2zero = toBool(insn[1] == 0x2C);
9495
9496      do_MMX_preamble();
9497      modrm = getIByte(delta+2);
9498
9499      if (epartIsReg(modrm)) {
9500         delta += 2+1;
9501	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9502	 assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
9503         DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
9504                                   nameXMMReg(eregOfRM(modrm)),
9505                                   nameMMXReg(gregOfRM(modrm)));
9506      } else {
9507         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9508	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9509	 assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
9510                                              mkexpr(addr),
9511                                              mkU32(8) )));
9512         delta += 2+alen;
9513         DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
9514                                   dis_buf,
9515                                   nameMMXReg(gregOfRM(modrm)));
9516      }
9517
9518      if (r2zero) {
9519         assign(rmode, mkU32((UInt)Irrm_ZERO) );
9520      } else {
9521         assign( rmode, get_sse_roundingmode() );
9522      }
9523
9524      assign(
9525         dst64,
9526         binop( Iop_32HLto64,
9527                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
9528                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
9529              )
9530      );
9531
9532      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
9533      goto decode_success;
9534   }
9535
9536   /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
9537      lo half xmm(G), and zero upper half */
9538   /* Note, this is practically identical to CVTPD2DQ.  It would have
9539      been nicer to merge them together, but the insn[] offsets differ
9540      by one. */
9541   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
9542      IRTemp argV  = newTemp(Ity_V128);
9543      IRTemp rmode = newTemp(Ity_I32);
9544
9545      modrm = getIByte(delta+2);
9546      if (epartIsReg(modrm)) {
9547         assign( argV, getXMMReg(eregOfRM(modrm)) );
9548         delta += 2+1;
9549         DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9550                                 nameXMMReg(gregOfRM(modrm)));
9551      } else {
9552         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9553	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9554         delta += 2+alen;
9555         DIP("cvtpd2ps %s,%s\n", dis_buf,
9556                                 nameXMMReg(gregOfRM(modrm)) );
9557      }
9558
9559      assign( rmode, get_sse_roundingmode() );
9560      t0 = newTemp(Ity_F64);
9561      t1 = newTemp(Ity_F64);
9562      assign( t0, unop(Iop_ReinterpI64asF64,
9563                       unop(Iop_V128to64, mkexpr(argV))) );
9564      assign( t1, unop(Iop_ReinterpI64asF64,
9565                       unop(Iop_V128HIto64, mkexpr(argV))) );
9566
9567#     define CVT(_t)  binop( Iop_F64toF32,                    \
9568                             mkexpr(rmode),                   \
9569                             mkexpr(_t) )
9570
9571      putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
9572      putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
9573      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
9574      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
9575
9576#     undef CVT
9577
9578      goto decode_success;
9579   }
9580
9581   /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
9582      xmm(G) */
9583   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
9584      IRTemp arg64 = newTemp(Ity_I64);
9585
9586      modrm = getIByte(delta+2);
9587      if (epartIsReg(modrm)) {
9588         /* Only switch to MMX mode if the source is a MMX register.
9589            This is inconsistent with all other instructions which
9590            convert between XMM and (M64 or MMX), which always switch
9591            to MMX mode even if the 64-bit operand is M64 rather than
9592            an MMX register.  At least, that's what the Intel docs seem
9593            to say.  Fixes #210264. */
9594         do_MMX_preamble();
9595         assign( arg64, getMMXReg(eregOfRM(modrm)) );
9596         delta += 2+1;
9597         DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
9598                                 nameXMMReg(gregOfRM(modrm)));
9599      } else {
9600         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9601	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9602         delta += 2+alen;
9603         DIP("cvtpi2pd %s,%s\n", dis_buf,
9604                                 nameXMMReg(gregOfRM(modrm)) );
9605      }
9606
9607      putXMMRegLane64F(
9608         gregOfRM(modrm), 0,
9609         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
9610      );
9611
9612      putXMMRegLane64F(
9613         gregOfRM(modrm), 1,
9614         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
9615      );
9616
9617      goto decode_success;
9618   }
9619
9620   /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
9621      xmm(G) */
9622   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
9623      IRTemp argV  = newTemp(Ity_V128);
9624      IRTemp rmode = newTemp(Ity_I32);
9625
9626      modrm = getIByte(delta+2);
9627      if (epartIsReg(modrm)) {
9628         assign( argV, getXMMReg(eregOfRM(modrm)) );
9629         delta += 2+1;
9630         DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9631                                 nameXMMReg(gregOfRM(modrm)));
9632      } else {
9633         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9634	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9635         delta += 2+alen;
9636         DIP("cvtps2dq %s,%s\n", dis_buf,
9637                                 nameXMMReg(gregOfRM(modrm)) );
9638      }
9639
9640      assign( rmode, get_sse_roundingmode() );
9641      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9642
9643      /* This is less than ideal.  If it turns out to be a performance
9644	 bottleneck it can be improved. */
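      /* Going via F64 is safe: every F32 value is exactly representable
         as an F64, so the only rounding happens in the final F64->I32S
         step, which honours the chosen SSE rounding mode. */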
9645#     define CVT(_t)                            \
9646        binop( Iop_F64toI32S,                   \
9647               mkexpr(rmode),                   \
9648               unop( Iop_F32toF64,              \
9649                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
9650
9651      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
9652      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
9653      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9654      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9655
9656#     undef CVT
9657
9658      goto decode_success;
9659   }
9660
9661   /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
9662      F64 in xmm(G). */
9663   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
9664      IRTemp f32lo = newTemp(Ity_F32);
9665      IRTemp f32hi = newTemp(Ity_F32);
9666
9667      modrm = getIByte(delta+2);
9668      if (epartIsReg(modrm)) {
9669         assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
9670         assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
9671         delta += 2+1;
9672         DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9673                                 nameXMMReg(gregOfRM(modrm)));
9674      } else {
9675         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9676	 assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
9677	 assign( f32hi, loadLE(Ity_F32,
9678                               binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
9679         delta += 2+alen;
9680         DIP("cvtps2pd %s,%s\n", dis_buf,
9681                                 nameXMMReg(gregOfRM(modrm)) );
9682      }
9683
9684      putXMMRegLane64F( gregOfRM(modrm), 1,
9685                        unop(Iop_F32toF64, mkexpr(f32hi)) );
9686      putXMMRegLane64F( gregOfRM(modrm), 0,
9687                        unop(Iop_F32toF64, mkexpr(f32lo)) );
9688
9689      goto decode_success;
9690   }
9691
9692   /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
9693      I32 in ireg, according to prevailing SSE rounding mode */
9694   /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
9695      I32 in ireg, rounding towards zero */
9696   if (insn[0] == 0xF2 && insn[1] == 0x0F
9697       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
9698      IRTemp rmode = newTemp(Ity_I32);
9699      IRTemp f64lo = newTemp(Ity_F64);
9700      Bool   r2zero = toBool(insn[2] == 0x2C);
9701      vassert(sz == 4);
9702
9703      modrm = getIByte(delta+3);
9704      if (epartIsReg(modrm)) {
9705         delta += 3+1;
9706	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9707         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
9708                                   nameXMMReg(eregOfRM(modrm)),
9709                                   nameIReg(4, gregOfRM(modrm)));
9710      } else {
9711         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9712	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9713         delta += 3+alen;
9714         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
9715                                   dis_buf,
9716                                   nameIReg(4, gregOfRM(modrm)));
9717      }
9718
9719      if (r2zero) {
9720         assign( rmode, mkU32((UInt)Irrm_ZERO) );
9721      } else {
9722         assign( rmode, get_sse_roundingmode() );
9723      }
9724
9725      putIReg(4, gregOfRM(modrm),
9726                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
9727
9728      goto decode_success;
9729   }
9730
9731   /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
9732      low 1/4 xmm(G), according to prevailing SSE rounding mode */
9733   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
9734      IRTemp rmode = newTemp(Ity_I32);
9735      IRTemp f64lo = newTemp(Ity_F64);
9736      vassert(sz == 4);
9737
9738      modrm = getIByte(delta+3);
9739      if (epartIsReg(modrm)) {
9740         delta += 3+1;
9741	 assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9742         DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9743                                 nameXMMReg(gregOfRM(modrm)));
9744      } else {
9745         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9746	 assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9747         delta += 3+alen;
9748         DIP("cvtsd2ss %s,%s\n", dis_buf,
9749                                 nameXMMReg(gregOfRM(modrm)));
9750      }
9751
9752      assign( rmode, get_sse_roundingmode() );
9753      putXMMRegLane32F(
9754         gregOfRM(modrm), 0,
9755         binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
9756      );
9757
9758      goto decode_success;
9759   }
9760
9761   /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
9762      half xmm */
9763   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
9764      IRTemp arg32 = newTemp(Ity_I32);
9765      vassert(sz == 4);
9766
9767      modrm = getIByte(delta+3);
9768      if (epartIsReg(modrm)) {
9769         assign( arg32, getIReg(4, eregOfRM(modrm)) );
9770         delta += 3+1;
9771         DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
9772                                 nameXMMReg(gregOfRM(modrm)));
9773      } else {
9774         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9775	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
9776         delta += 3+alen;
9777         DIP("cvtsi2sd %s,%s\n", dis_buf,
9778                                 nameXMMReg(gregOfRM(modrm)) );
9779      }
9780
9781      putXMMRegLane64F(
9782         gregOfRM(modrm), 0,
9783         unop(Iop_I32StoF64, mkexpr(arg32)) );
9784
9785      goto decode_success;
9786   }
9787
9788   /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
9789      low half xmm(G) */
9790   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
9791      IRTemp f32lo = newTemp(Ity_F32);
9792      vassert(sz == 4);
9793
9794      modrm = getIByte(delta+3);
9795      if (epartIsReg(modrm)) {
9796         delta += 3+1;
9797	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
9798         DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9799                                 nameXMMReg(gregOfRM(modrm)));
9800      } else {
9801         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9802	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
9803         delta += 3+alen;
9804         DIP("cvtss2sd %s,%s\n", dis_buf,
9805                                 nameXMMReg(gregOfRM(modrm)));
9806      }
9807
9808      putXMMRegLane64F( gregOfRM(modrm), 0,
9809                        unop( Iop_F32toF64, mkexpr(f32lo) ) );
9810
9811      goto decode_success;
9812   }
9813
9814   /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
9815      lo half xmm(G), and zero upper half, rounding towards zero */
9816   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
9817      IRTemp argV  = newTemp(Ity_V128);
9818      IRTemp rmode = newTemp(Ity_I32);
9819
9820      modrm = getIByte(delta+2);
9821      if (epartIsReg(modrm)) {
9822         assign( argV, getXMMReg(eregOfRM(modrm)) );
9823         delta += 2+1;
9824         DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9825                                  nameXMMReg(gregOfRM(modrm)));
9826      } else {
9827         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9828	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9829         delta += 2+alen;
9830         DIP("cvttpd2dq %s,%s\n", dis_buf,
9831                                  nameXMMReg(gregOfRM(modrm)) );
9832      }
9833
9834      assign( rmode, mkU32((UInt)Irrm_ZERO) );
9835
9836      t0 = newTemp(Ity_F64);
9837      t1 = newTemp(Ity_F64);
9838      assign( t0, unop(Iop_ReinterpI64asF64,
9839                       unop(Iop_V128to64, mkexpr(argV))) );
9840      assign( t1, unop(Iop_ReinterpI64asF64,
9841                       unop(Iop_V128HIto64, mkexpr(argV))) );
9842
9843#     define CVT(_t)  binop( Iop_F64toI32S,                   \
9844                             mkexpr(rmode),                   \
9845                             mkexpr(_t) )
9846
9847      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
9848      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
9849      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9850      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9851
9852#     undef CVT
9853
9854      goto decode_success;
9855   }
9856
9857   /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
9858      xmm(G), rounding towards zero */
9859   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
9860      IRTemp argV  = newTemp(Ity_V128);
9861      IRTemp rmode = newTemp(Ity_I32);
9862      vassert(sz == 4);
9863
9864      modrm = getIByte(delta+3);
9865      if (epartIsReg(modrm)) {
9866         assign( argV, getXMMReg(eregOfRM(modrm)) );
9867         delta += 3+1;
9868         DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9869                                  nameXMMReg(gregOfRM(modrm)));
9870      } else {
9871         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9872	 assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9873         delta += 3+alen;
9874         DIP("cvttps2dq %s,%s\n", dis_buf,
9875                                  nameXMMReg(gregOfRM(modrm)) );
9876      }
9877
9878      assign( rmode, mkU32((UInt)Irrm_ZERO) );
9879      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9880
9881      /* This is less than ideal.  If it turns out to be a performance
9882	 bottleneck it can be improved. */
9883#     define CVT(_t)                            \
9884        binop( Iop_F64toI32S,                   \
9885               mkexpr(rmode),                   \
9886               unop( Iop_F32toF64,              \
9887                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
9888
9889      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
9890      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
9891      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9892      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9893
9894#     undef CVT
9895
9896      goto decode_success;
9897   }
9898
9899   /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
9900   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
9901      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
9902      goto decode_success;
9903   }
9904
9905   /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
9906   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
9907      vassert(sz == 4);
9908      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
9909      goto decode_success;
9910   }
9911
9912   /* 0F AE /5 = LFENCE -- flush pending operations to memory */
9913   /* 0F AE /6 = MFENCE -- flush pending operations to memory */
9914   if (insn[0] == 0x0F && insn[1] == 0xAE
9915       && epartIsReg(insn[2])
9916       && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
9917      vassert(sz == 4);
9918      delta += 3;
9919      /* Insert a memory fence.  It's sometimes important that these
9920         are carried through to the generated code. */
9921      stmt( IRStmt_MBE(Imbe_Fence) );
9922      DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
9923      goto decode_success;
9924   }
9925
9926   /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
9927   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
9928      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
9929      goto decode_success;
9930   }
9931
9932   /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
9933   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
9934      vassert(sz == 4);
9935      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
9936      goto decode_success;
9937   }
9938
9939   /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
9940   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
9941      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
9942      goto decode_success;
9943   }
9944
9945   /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
9946   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
9947      vassert(sz == 4);
9948      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
9949      goto decode_success;
9950   }
9951
9952   /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
9953   /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
9954   /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
9955   if (sz == 2 && insn[0] == 0x0F
9956       && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
9957      HChar* wot = insn[1]==0x28 ? "apd" :
9958                   insn[1]==0x10 ? "upd" : "dqa";
9959      modrm = getIByte(delta+2);
9960      if (epartIsReg(modrm)) {
9961         putXMMReg( gregOfRM(modrm),
9962                    getXMMReg( eregOfRM(modrm) ));
9963         DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
9964                                   nameXMMReg(gregOfRM(modrm)));
9965         delta += 2+1;
9966      } else {
9967         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9968         if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
9969            gen_SEGV_if_not_16_aligned( addr );
9970         putXMMReg( gregOfRM(modrm),
9971                    loadLE(Ity_V128, mkexpr(addr)) );
9972         DIP("mov%s %s,%s\n", wot, dis_buf,
9973                                   nameXMMReg(gregOfRM(modrm)));
9974         delta += 2+alen;
9975      }
9976      goto decode_success;
9977   }
9978
9979   /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
9980   /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
9981   if (sz == 2 && insn[0] == 0x0F
9982       && (insn[1] == 0x29 || insn[1] == 0x11)) {
9983      HChar* wot = insn[1]==0x29 ? "apd" : "upd";
9984      modrm = getIByte(delta+2);
9985      if (epartIsReg(modrm)) {
9986         /* fall through; awaiting test case */
9987      } else {
9988         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9989         if (insn[1] == 0x29/*movapd*/)
9990            gen_SEGV_if_not_16_aligned( addr );
9991         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
9992         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
9993                                   dis_buf );
9994         delta += 2+alen;
9995         goto decode_success;
9996      }
9997   }
9998
9999   /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
10000   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
10001      modrm = getIByte(delta+2);
10002      if (epartIsReg(modrm)) {
10003         delta += 2+1;
10004         putXMMReg(
10005            gregOfRM(modrm),
10006            unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
10007         );
10008         DIP("movd %s, %s\n",
10009             nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
10010      } else {
10011         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10012         delta += 2+alen;
10013         putXMMReg(
10014            gregOfRM(modrm),
10015            unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
10016         );
10017         DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
10018      }
10019      goto decode_success;
10020   }
10021
10022   /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
10023   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
10024      modrm = getIByte(delta+2);
10025      if (epartIsReg(modrm)) {
10026         delta += 2+1;
10027         putIReg( 4, eregOfRM(modrm),
10028                  getXMMRegLane32(gregOfRM(modrm), 0) );
10029         DIP("movd %s, %s\n",
10030             nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
10031      } else {
10032         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10033         delta += 2+alen;
10034         storeLE( mkexpr(addr),
10035                  getXMMRegLane32(gregOfRM(modrm), 0) );
10036         DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10037      }
10038      goto decode_success;
10039   }
10040
10041   /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
10042   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
10043      modrm = getIByte(delta+2);
10044      if (epartIsReg(modrm)) {
10045         delta += 2+1;
10046         putXMMReg( eregOfRM(modrm),
10047                    getXMMReg(gregOfRM(modrm)) );
10048         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10049                                nameXMMReg(eregOfRM(modrm)));
10050      } else {
10051         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10052         delta += 2+alen;
10053         gen_SEGV_if_not_16_aligned( addr );
10054         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10055         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10056      }
10057      goto decode_success;
10058   }
10059
10060   /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
10061   /* Unfortunately can't simply use the MOVDQA case since the
10062      prefix handling differs (66 is consumed beforehand, F3 is not) */
10063   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
10064      vassert(sz == 4);
10065      modrm = getIByte(delta+3);
10066      if (epartIsReg(modrm)) {
10067         putXMMReg( gregOfRM(modrm),
10068                    getXMMReg( eregOfRM(modrm) ));
10069         DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10070                               nameXMMReg(gregOfRM(modrm)));
10071         delta += 3+1;
10072      } else {
10073         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10074         putXMMReg( gregOfRM(modrm),
10075                    loadLE(Ity_V128, mkexpr(addr)) );
10076         DIP("movdqu %s,%s\n", dis_buf,
10077                               nameXMMReg(gregOfRM(modrm)));
10078         delta += 3+alen;
10079      }
10080      goto decode_success;
10081   }
10082
10083   /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
10084   /* Unfortunately can't simply use the MOVDQA case since the
10085      prefix handling differs (66 is consumed beforehand, F3 is not) */
10086   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
10087      vassert(sz == 4);
10088      modrm = getIByte(delta+3);
10089      if (epartIsReg(modrm)) {
10090         delta += 3+1;
10091         putXMMReg( eregOfRM(modrm),
10092                    getXMMReg(gregOfRM(modrm)) );
10093         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10094                                nameXMMReg(eregOfRM(modrm)));
10095      } else {
10096         addr = disAMode( &alen, sorb, delta+3, dis_buf );
10097         delta += 3+alen;
10098         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10099         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10100      }
10101      goto decode_success;
10102   }
10103
10104   /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
10105   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
10106      vassert(sz == 4);
10107      modrm = getIByte(delta+3);
10108      if (epartIsReg(modrm)) {
10109         do_MMX_preamble();
10110         putMMXReg( gregOfRM(modrm),
10111                    getXMMRegLane64( eregOfRM(modrm), 0 ));
10112         DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10113                                nameMMXReg(gregOfRM(modrm)));
10114         delta += 3+1;
10115         goto decode_success;
10116      } else {
10117         /* fall through, apparently no mem case for this insn */
10118      }
10119   }
10120
10121   /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
10122   /* This seems identical to MOVHPS.  This instruction encoding is
10123      completely crazy. */
10124   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
10125      modrm = getIByte(delta+2);
10126      if (epartIsReg(modrm)) {
10127         /* fall through; apparently reg-reg is not possible */
10128      } else {
10129         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10130         delta += 2+alen;
10131         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
10132                          loadLE(Ity_I64, mkexpr(addr)) );
10133         DIP("movhpd %s,%s\n", dis_buf,
10134                               nameXMMReg( gregOfRM(modrm) ));
10135         goto decode_success;
10136      }
10137   }
10138
10139   /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
10140   /* Again, this seems identical to MOVHPS. */
10141   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
10142      if (!epartIsReg(insn[2])) {
10143         delta += 2;
10144         addr = disAMode ( &alen, sorb, delta, dis_buf );
10145         delta += alen;
10146         storeLE( mkexpr(addr),
10147                  getXMMRegLane64( gregOfRM(insn[2]),
10148                                   1/*upper lane*/ ) );
10149         DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
10150                               dis_buf);
10151         goto decode_success;
10152      }
10153      /* else fall through */
10154   }
10155
10156   /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
10157   /* Identical to MOVLPS ? */
10158   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
10159      modrm = getIByte(delta+2);
10160      if (epartIsReg(modrm)) {
10161         /* fall through; apparently reg-reg is not possible */
10162      } else {
10163         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10164         delta += 2+alen;
10165         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
10166                          loadLE(Ity_I64, mkexpr(addr)) );
10167         DIP("movlpd %s, %s\n",
10168             dis_buf, nameXMMReg( gregOfRM(modrm) ));
10169         goto decode_success;
10170      }
10171   }
10172
10173   /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
10174   /* Identical to MOVLPS ? */
10175   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
10176      if (!epartIsReg(insn[2])) {
10177         delta += 2;
10178         addr = disAMode ( &alen, sorb, delta, dis_buf );
10179         delta += alen;
10180         storeLE( mkexpr(addr),
10181                  getXMMRegLane64( gregOfRM(insn[2]),
10182                                   0/*lower lane*/ ) );
10183         DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
10184                                dis_buf);
10185         goto decode_success;
10186      }
10187      /* else fall through */
10188   }
10189
10190   /* 66 0F 50 = MOVMSKPD - move 2 sign bits from 2 x F64 in xmm(E) to
10191      2 lowest bits of ireg(G) */
10192   if (insn[0] == 0x0F && insn[1] == 0x50) {
10193      modrm = getIByte(delta+2);
10194      if (sz == 2 && epartIsReg(modrm)) {
10195         Int src;
10196         t0 = newTemp(Ity_I32);
10197         t1 = newTemp(Ity_I32);
10198         delta += 2+1;
10199         src = eregOfRM(modrm);
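         /* The sign of the low F64 is bit 31 of 32-bit lane 1, and the
            sign of the high F64 is bit 31 of lane 3.  The first shift
            lands the former at bit 0; the second shifts by 30 and masks
            with 2, landing the latter at bit 1. */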
10200         assign( t0, binop( Iop_And32,
10201                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
10202                            mkU32(1) ));
10203         assign( t1, binop( Iop_And32,
10204                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
10205                            mkU32(2) ));
10206         putIReg(4, gregOfRM(modrm),
10207                    binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
10208                 );
10209         DIP("movmskpd %s,%s\n", nameXMMReg(src),
10210                                 nameIReg(4, gregOfRM(modrm)));
10211         goto decode_success;
10212      }
10213      /* else fall through */
10214   }
10215
10216   /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
10217   if (insn[0] == 0x0F && insn[1] == 0xF7) {
10218      modrm = getIByte(delta+2);
10219      if (sz == 2 && epartIsReg(modrm)) {
10220         IRTemp regD    = newTemp(Ity_V128);
10221         IRTemp mask    = newTemp(Ity_V128);
10222         IRTemp olddata = newTemp(Ity_V128);
10223         IRTemp newdata = newTemp(Ity_V128);
10224                addr    = newTemp(Ity_I32);
10225
10226         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
10227         assign( regD, getXMMReg( gregOfRM(modrm) ));
10228
10229         /* Unfortunately can't do the obvious thing with SarN8x16
10230            here since that can't be re-emitted as SSE2 code - no such
10231            insn. */
10232	 assign(
10233            mask,
10234            binop(Iop_64HLtoV128,
10235                  binop(Iop_SarN8x8,
10236                        getXMMRegLane64( eregOfRM(modrm), 1 ),
10237                        mkU8(7) ),
10238                  binop(Iop_SarN8x8,
10239                        getXMMRegLane64( eregOfRM(modrm), 0 ),
10240                        mkU8(7) ) ));
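         /* Each byte of 'mask' is now 0xFF where the corresponding byte
            of xmm(E) has its top bit set, and 0x00 otherwise.  The store
            below is expressed as a read-modify-write which leaves the
            unselected bytes of memory unchanged. */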
10241         assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
10242         assign( newdata,
10243                 binop(Iop_OrV128,
10244                       binop(Iop_AndV128,
10245                             mkexpr(regD),
10246                             mkexpr(mask) ),
10247                       binop(Iop_AndV128,
10248                             mkexpr(olddata),
10249                             unop(Iop_NotV128, mkexpr(mask)))) );
10250         storeLE( mkexpr(addr), mkexpr(newdata) );
10251
10252         delta += 2+1;
10253         DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
10254                                   nameXMMReg( gregOfRM(modrm) ) );
10255         goto decode_success;
10256      }
10257      /* else fall through */
10258   }
10259
10260   /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
10261   if (insn[0] == 0x0F && insn[1] == 0xE7) {
10262      modrm = getIByte(delta+2);
10263      if (sz == 2 && !epartIsReg(modrm)) {
10264         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10265         gen_SEGV_if_not_16_aligned( addr );
10266         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10267         DIP("movntdq %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10268                                dis_buf);
10269         delta += 2+alen;
10270         goto decode_success;
10271      }
10272      /* else fall through */
10273   }
10274
10275   /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
10276   if (insn[0] == 0x0F && insn[1] == 0xC3) {
10277      vassert(sz == 4);
10278      modrm = getIByte(delta+2);
10279      if (!epartIsReg(modrm)) {
10280         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10281         storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
10282         DIP("movnti %s,%s\n", nameIReg(4, gregOfRM(modrm)),
10283                               dis_buf);
10284         delta += 2+alen;
10285         goto decode_success;
10286      }
10287      /* else fall through */
10288   }
10289
10290   /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
10291      or lo half xmm).  */
10292   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
10293      modrm = getIByte(delta+2);
10294      if (epartIsReg(modrm)) {
10295         /* fall through, awaiting test case */
10296         /* dst: lo half copied, hi half zeroed */
10297      } else {
10298         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10299         storeLE( mkexpr(addr),
10300                  getXMMRegLane64( gregOfRM(modrm), 0 ));
10301         DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
10302         delta += 2+alen;
10303         goto decode_success;
10304      }
10305   }
10306
10307   /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
10308      hi half). */
10309   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
10310      vassert(sz == 4);
10311      modrm = getIByte(delta+3);
10312      if (epartIsReg(modrm)) {
10313         do_MMX_preamble();
10314         putXMMReg( gregOfRM(modrm),
10315                    unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
10316         DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
10317                                nameXMMReg(gregOfRM(modrm)));
10318         delta += 3+1;
10319         goto decode_success;
10320      } else {
10321         /* fall through, apparently no mem case for this insn */
10322      }
10323   }
10324
10325   /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
10326      G (lo half xmm).  Upper half of G is zeroed out. */
10327   /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
10328      G (lo half xmm).  If E is mem, upper half of G is zeroed out.
10329      If E is reg, upper half of G is unchanged. */
10330   if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
10331       || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
10332      vassert(sz == 4);
10333      modrm = getIByte(delta+3);
10334      if (epartIsReg(modrm)) {
10335         putXMMRegLane64( gregOfRM(modrm), 0,
10336                          getXMMRegLane64( eregOfRM(modrm), 0 ));
10337         if (insn[0] == 0xF3/*MOVQ*/) {
10338            /* zero bits 127:64 */
10339            putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
10340         }
10341         DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10342                              nameXMMReg(gregOfRM(modrm)));
10343         delta += 3+1;
10344      } else {
10345         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10346         /* zero bits 127:64 */
10347         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
10348         /* write bits 63:0 */
10349         putXMMRegLane64( gregOfRM(modrm), 0,
10350                          loadLE(Ity_I64, mkexpr(addr)) );
10351         DIP("movsd %s,%s\n", dis_buf,
10352                              nameXMMReg(gregOfRM(modrm)));
10353         delta += 3+alen;
10354      }
10355      goto decode_success;
10356   }
10357
10358   /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
10359      or lo half xmm). */
10360   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
10361      vassert(sz == 4);
10362      modrm = getIByte(delta+3);
10363      if (epartIsReg(modrm)) {
10364         putXMMRegLane64( eregOfRM(modrm), 0,
10365                          getXMMRegLane64( gregOfRM(modrm), 0 ));
10366         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10367                              nameXMMReg(eregOfRM(modrm)));
10368         delta += 3+1;
10369      } else {
10370         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10371         storeLE( mkexpr(addr),
10372                  getXMMRegLane64(gregOfRM(modrm), 0) );
10373         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10374                              dis_buf);
10375         delta += 3+alen;
10376      }
10377      goto decode_success;
10378   }
10379
10380   /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
10381   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
10382      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
10383      goto decode_success;
10384   }
10385
10386   /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
10387   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
10388      vassert(sz == 4);
10389      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
10390      goto decode_success;
10391   }
10392
10393   /* 66 0F 56 = ORPD -- G = G or E */
10394   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
10395      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
10396      goto decode_success;
10397   }
10398
10399   /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
10400   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
10401      Int    select;
10402      IRTemp sV = newTemp(Ity_V128);
10403      IRTemp dV = newTemp(Ity_V128);
10404      IRTemp s1 = newTemp(Ity_I64);
10405      IRTemp s0 = newTemp(Ity_I64);
10406      IRTemp d1 = newTemp(Ity_I64);
10407      IRTemp d0 = newTemp(Ity_I64);
10408
10409      modrm = insn[2];
10410      assign( dV, getXMMReg(gregOfRM(modrm)) );
10411
10412      if (epartIsReg(modrm)) {
10413         assign( sV, getXMMReg(eregOfRM(modrm)) );
10414         select = (Int)insn[3];
10415         delta += 2+2;
10416         DIP("shufpd $%d,%s,%s\n", select,
10417                                   nameXMMReg(eregOfRM(modrm)),
10418                                   nameXMMReg(gregOfRM(modrm)));
10419      } else {
10420         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10421         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10422         select = (Int)insn[2+alen];
10423         delta += 3+alen;
10424         DIP("shufpd $%d,%s,%s\n", select,
10425                                   dis_buf,
10426                                   nameXMMReg(gregOfRM(modrm)));
10427      }
10428
10429      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10430      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10431      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10432      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10433
10434#     define SELD(n) mkexpr((n)==0 ? d0 : d1)
10435#     define SELS(n) mkexpr((n)==0 ? s0 : s1)
10436
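      /* Bit 0 of select picks the low result half from d0/d1 and bit 1
         picks the high half from s0/s1; for example select == 1 gives
         { hi: s0, lo: d1 }. */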
10437      putXMMReg(
10438         gregOfRM(modrm),
10439         binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
10440      );
10441
10442#     undef SELD
10443#     undef SELS
10444
10445      goto decode_success;
10446   }
10447
10448   /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
10449   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
10450      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
10451                                        "sqrtpd", Iop_Sqrt64Fx2 );
10452      goto decode_success;
10453   }
10454
10455   /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
10456   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
10457      vassert(sz == 4);
10458      delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
10459                                         "sqrtsd", Iop_Sqrt64F0x2 );
10460      goto decode_success;
10461   }
10462
10463   /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
10464   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
10465      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
10466      goto decode_success;
10467   }
10468
10469   /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
10470   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
10471      vassert(sz == 4);
10472      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
10473      goto decode_success;
10474   }
10475
10476   /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
10477   /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
10478   /* These just appear to be special cases of SHUFPD */
10479   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
10480      IRTemp s1 = newTemp(Ity_I64);
10481      IRTemp s0 = newTemp(Ity_I64);
10482      IRTemp d1 = newTemp(Ity_I64);
10483      IRTemp d0 = newTemp(Ity_I64);
10484      IRTemp sV = newTemp(Ity_V128);
10485      IRTemp dV = newTemp(Ity_V128);
10486      Bool   hi = toBool(insn[1] == 0x15);
10487
10488      modrm = insn[2];
10489      assign( dV, getXMMReg(gregOfRM(modrm)) );
10490
10491      if (epartIsReg(modrm)) {
10492         assign( sV, getXMMReg(eregOfRM(modrm)) );
10493         delta += 2+1;
10494         DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
10495                                  nameXMMReg(eregOfRM(modrm)),
10496                                  nameXMMReg(gregOfRM(modrm)));
10497      } else {
10498         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10499         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10500         delta += 2+alen;
10501         DIP("unpck%spd %s,%s\n", hi ? "h" : "l",
10502                                  dis_buf,
10503                                  nameXMMReg(gregOfRM(modrm)));
10504      }
10505
10506      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10507      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10508      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10509      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10510
10511      if (hi) {
10512         putXMMReg( gregOfRM(modrm),
10513                    binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
10514      } else {
10515         putXMMReg( gregOfRM(modrm),
10516                    binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
10517      }
10518
10519      goto decode_success;
10520   }
10521
10522   /* 66 0F 57 = XORPD -- G = G xor E */
10523   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
10524      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
10525      goto decode_success;
10526   }
10527
10528   /* 66 0F 6B = PACKSSDW */
10529   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
10530      delta = dis_SSEint_E_to_G( sorb, delta+2,
10531                                 "packssdw", Iop_QNarrow32Sx4, True );
10532      goto decode_success;
10533   }
10534
10535   /* 66 0F 63 = PACKSSWB */
10536   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
10537      delta = dis_SSEint_E_to_G( sorb, delta+2,
10538                                 "packsswb", Iop_QNarrow16Sx8, True );
10539      goto decode_success;
10540   }
10541
10542   /* 66 0F 67 = PACKUSWB */
10543   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
10544      delta = dis_SSEint_E_to_G( sorb, delta+2,
10545                                 "packuswb", Iop_QNarrow16Ux8, True );
10546      goto decode_success;
10547   }
10548
10549   /* 66 0F FC = PADDB */
10550   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
10551      delta = dis_SSEint_E_to_G( sorb, delta+2,
10552                                 "paddb", Iop_Add8x16, False );
10553      goto decode_success;
10554   }
10555
10556   /* 66 0F FE = PADDD */
10557   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
10558      delta = dis_SSEint_E_to_G( sorb, delta+2,
10559                                 "paddd", Iop_Add32x4, False );
10560      goto decode_success;
10561   }
10562
10563   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
10564   /* 0F D4 = PADDQ -- add 64x1 */
10565   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
10566      do_MMX_preamble();
10567      delta = dis_MMXop_regmem_to_reg (
10568                sorb, delta+2, insn[1], "paddq", False );
10569      goto decode_success;
10570   }
10571
10572   /* 66 0F D4 = PADDQ */
10573   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
10574      delta = dis_SSEint_E_to_G( sorb, delta+2,
10575                                 "paddq", Iop_Add64x2, False );
10576      goto decode_success;
10577   }
10578
10579   /* 66 0F FD = PADDW */
10580   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
10581      delta = dis_SSEint_E_to_G( sorb, delta+2,
10582                                 "paddw", Iop_Add16x8, False );
10583      goto decode_success;
10584   }
10585
10586   /* 66 0F EC = PADDSB */
10587   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
10588      delta = dis_SSEint_E_to_G( sorb, delta+2,
10589                                 "paddsb", Iop_QAdd8Sx16, False );
10590      goto decode_success;
10591   }
10592
10593   /* 66 0F ED = PADDSW */
10594   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
10595      delta = dis_SSEint_E_to_G( sorb, delta+2,
10596                                 "paddsw", Iop_QAdd16Sx8, False );
10597      goto decode_success;
10598   }
10599
10600   /* 66 0F DC = PADDUSB */
10601   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
10602      delta = dis_SSEint_E_to_G( sorb, delta+2,
10603                                 "paddusb", Iop_QAdd8Ux16, False );
10604      goto decode_success;
10605   }
10606
10607   /* 66 0F DD = PADDUSW */
10608   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
10609      delta = dis_SSEint_E_to_G( sorb, delta+2,
10610                                 "paddusw", Iop_QAdd16Ux8, False );
10611      goto decode_success;
10612   }
10613
10614   /* 66 0F DB = PAND */
10615   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
10616      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
10617      goto decode_success;
10618   }
10619
10620   /* 66 0F DF = PANDN */
10621   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
10622      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
10623      goto decode_success;
10624   }
10625
10626   /* 66 0F E0 = PAVGB */
10627   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
10628      delta = dis_SSEint_E_to_G( sorb, delta+2,
10629                                 "pavgb", Iop_Avg8Ux16, False );
10630      goto decode_success;
10631   }
10632
10633   /* 66 0F E3 = PAVGW */
10634   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
10635      delta = dis_SSEint_E_to_G( sorb, delta+2,
10636                                 "pavgw", Iop_Avg16Ux8, False );
10637      goto decode_success;
10638   }
10639
10640   /* 66 0F 74 = PCMPEQB */
10641   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
10642      delta = dis_SSEint_E_to_G( sorb, delta+2,
10643                                 "pcmpeqb", Iop_CmpEQ8x16, False );
10644      goto decode_success;
10645   }
10646
10647   /* 66 0F 76 = PCMPEQD */
10648   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
10649      delta = dis_SSEint_E_to_G( sorb, delta+2,
10650                                 "pcmpeqd", Iop_CmpEQ32x4, False );
10651      goto decode_success;
10652   }
10653
10654   /* 66 0F 75 = PCMPEQW */
10655   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
10656      delta = dis_SSEint_E_to_G( sorb, delta+2,
10657                                 "pcmpeqw", Iop_CmpEQ16x8, False );
10658      goto decode_success;
10659   }
10660
10661   /* 66 0F 64 = PCMPGTB */
10662   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
10663      delta = dis_SSEint_E_to_G( sorb, delta+2,
10664                                 "pcmpgtb", Iop_CmpGT8Sx16, False );
10665      goto decode_success;
10666   }
10667
10668   /* 66 0F 66 = PCMPGTD */
10669   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
10670      delta = dis_SSEint_E_to_G( sorb, delta+2,
10671                                 "pcmpgtd", Iop_CmpGT32Sx4, False );
10672      goto decode_success;
10673   }
10674
10675   /* 66 0F 65 = PCMPGTW */
10676   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
10677      delta = dis_SSEint_E_to_G( sorb, delta+2,
10678                                 "pcmpgtw", Iop_CmpGT16Sx8, False );
10679      goto decode_success;
10680   }
10681
10682   /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
10683      zero-extend of it in ireg(G). */
10684   if (insn[0] == 0x0F && insn[1] == 0xC5) {
10685      modrm = insn[2];
10686      if (sz == 2 && epartIsReg(modrm)) {
10687         t5 = newTemp(Ity_V128);
10688         t4 = newTemp(Ity_I16);
10689         assign(t5, getXMMReg(eregOfRM(modrm)));
10690         breakup128to32s( t5, &t3, &t2, &t1, &t0 );
10691         switch (insn[3] & 7) {
10692            case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
10693            case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
10694            case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
10695            case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
10696            case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
10697            case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
10698            case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
10699            case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
10700            default: vassert(0); /*NOTREACHED*/
10701         }
10702         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
10703         DIP("pextrw $%d,%s,%s\n",
10704             (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
10705                           nameIReg(4,gregOfRM(modrm)));
10706         delta += 4;
10707         goto decode_success;
10708      }
10709      /* else fall through */
10710   }
10711
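   /* Illustration (not part of the generated IR): the low 3 bits of
      the immediate select one of the eight 16-bit lanes, so lane n
      is bits [16n+15 : 16n] of xmm(E).  E.g. an immediate of 5
      selects bits [95:80], which is the high half of 32-bit word t2
      in the breakup above. */
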
10712   /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
10713      put it into the specified lane of xmm(G). */
10714   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
10715      Int lane;
10716      t4 = newTemp(Ity_I16);
10717      modrm = insn[2];
10718
10719      if (epartIsReg(modrm)) {
10720         assign(t4, getIReg(2, eregOfRM(modrm)));
10721         delta += 3+1;
10722         lane = insn[3+1-1];
10723         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
10724                                   nameIReg(2,eregOfRM(modrm)),
10725                                   nameXMMReg(gregOfRM(modrm)));
10726      } else {
10727         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10728         delta += 3+alen;
10729         lane = insn[3+alen-1];
10730         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
10731         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
10732                                   dis_buf,
10733                                   nameXMMReg(gregOfRM(modrm)));
10734      }
10735
10736      putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
10737      goto decode_success;
10738   }
10739
10740   /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
10741      E(xmm or mem) to G(xmm) */
10742   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
10743      IRTemp s1V  = newTemp(Ity_V128);
10744      IRTemp s2V  = newTemp(Ity_V128);
10745      IRTemp dV   = newTemp(Ity_V128);
10746      IRTemp s1Hi = newTemp(Ity_I64);
10747      IRTemp s1Lo = newTemp(Ity_I64);
10748      IRTemp s2Hi = newTemp(Ity_I64);
10749      IRTemp s2Lo = newTemp(Ity_I64);
10750      IRTemp dHi  = newTemp(Ity_I64);
10751      IRTemp dLo  = newTemp(Ity_I64);
10752      modrm = insn[2];
10753      if (epartIsReg(modrm)) {
10754         assign( s1V, getXMMReg(eregOfRM(modrm)) );
10755         delta += 2+1;
10756         DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10757                                nameXMMReg(gregOfRM(modrm)));
10758      } else {
10759         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10760         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
10761         delta += 2+alen;
10762         DIP("pmaddwd %s,%s\n", dis_buf,
10763                                nameXMMReg(gregOfRM(modrm)));
10764      }
10765      assign( s2V, getXMMReg(gregOfRM(modrm)) );
10766      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
10767      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
10768      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
10769      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
10770      assign( dHi, mkIRExprCCall(
10771                      Ity_I64, 0/*regparms*/,
10772                      "x86g_calculate_mmx_pmaddwd",
10773                      &x86g_calculate_mmx_pmaddwd,
10774                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
10775                   ));
10776      assign( dLo, mkIRExprCCall(
10777                      Ity_I64, 0/*regparms*/,
10778                      "x86g_calculate_mmx_pmaddwd",
10779                      &x86g_calculate_mmx_pmaddwd,
10780                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
10781                   ));
10782      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
10783      putXMMReg(gregOfRM(modrm), mkexpr(dV));
10784      goto decode_success;
10785   }
10786
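   /* Sketch of the intended per-lane arithmetic (the helper is
      assumed to implement the architectural PMADDWD semantics): for
      each 32-bit result lane i,
         dst[i] = (Int)(Short)s1[2i]   * (Int)(Short)s2[2i]
                + (Int)(Short)s1[2i+1] * (Int)(Short)s2[2i+1]
      where s1[], s2[] are the 16-bit lanes of the two sources.  The
      call is made once per 64-bit half of the 128-bit operands. */
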
10787   /* 66 0F EE = PMAXSW -- 16x8 signed max */
10788   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
10789      delta = dis_SSEint_E_to_G( sorb, delta+2,
10790                                 "pmaxsw", Iop_Max16Sx8, False );
10791      goto decode_success;
10792   }
10793
10794   /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
10795   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
10796      delta = dis_SSEint_E_to_G( sorb, delta+2,
10797                                 "pmaxub", Iop_Max8Ux16, False );
10798      goto decode_success;
10799   }
10800
10801   /* 66 0F EA = PMINSW -- 16x8 signed min */
10802   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
10803      delta = dis_SSEint_E_to_G( sorb, delta+2,
10804                                 "pminsw", Iop_Min16Sx8, False );
10805      goto decode_success;
10806   }
10807
10808   /* 66 0F DA = PMINUB -- 8x16 unsigned min */
10809   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
10810      delta = dis_SSEint_E_to_G( sorb, delta+2,
10811                                 "pminub", Iop_Min8Ux16, False );
10812      goto decode_success;
10813   }
10814
10815   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
10816      xmm(E), turn them into a 16-bit value, and put zero-extend of it in
10817      ireg(G).  Doing this directly is just too cumbersome; give up
10818      therefore and call a helper. */
10819   /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
10820   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
10821      modrm = insn[2];
10822      if (epartIsReg(modrm)) {
10823         t0 = newTemp(Ity_I64);
10824         t1 = newTemp(Ity_I64);
10825         assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
10826         assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
10827         t5 = newTemp(Ity_I32);
10828         assign(t5, mkIRExprCCall(
10829                       Ity_I32, 0/*regparms*/,
10830                       "x86g_calculate_sse_pmovmskb",
10831                       &x86g_calculate_sse_pmovmskb,
10832                       mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
10833         putIReg(4, gregOfRM(modrm), mkexpr(t5));
10834         DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10835                                 nameIReg(4,gregOfRM(modrm)));
10836         delta += 3;
10837         goto decode_success;
10838      }
10839      /* else fall through */
10840   }
10841
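   /* Illustration (not part of the generated IR): architecturally,
      bit i of the 32-bit result is the most significant bit of byte
      i of xmm(E), and bits 31:16 are zero.  So if byte 3 of the
      source is 0x80..0xFF, bit 3 of ireg(G) is set.  The helper is
      assumed to compute exactly this from the two 64-bit halves. */
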
10842   /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
10843   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
10844      delta = dis_SSEint_E_to_G( sorb, delta+2,
10845                                 "pmulhuw", Iop_MulHi16Ux8, False );
10846      goto decode_success;
10847   }
10848
10849   /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
10850   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
10851      delta = dis_SSEint_E_to_G( sorb, delta+2,
10852                                 "pmulhw", Iop_MulHi16Sx8, False );
10853      goto decode_success;
10854   }
10855
10856   /* 66 0F D5 = PMULLW -- 16x8 multiply (low halves of results) */
10857   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
10858      delta = dis_SSEint_E_to_G( sorb, delta+2,
10859                                 "pmullw", Iop_Mul16x8, False );
10860      goto decode_success;
10861   }
10862
10863   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
10864   /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
10865      0 to form 64-bit result */
10866   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
10867      IRTemp sV = newTemp(Ity_I64);
10868      IRTemp dV = newTemp(Ity_I64);
10869      t1 = newTemp(Ity_I32);
10870      t0 = newTemp(Ity_I32);
10871      modrm = insn[2];
10872
10873      do_MMX_preamble();
10874      assign( dV, getMMXReg(gregOfRM(modrm)) );
10875
10876      if (epartIsReg(modrm)) {
10877         assign( sV, getMMXReg(eregOfRM(modrm)) );
10878         delta += 2+1;
10879         DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
10880                                nameMMXReg(gregOfRM(modrm)));
10881      } else {
10882         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10883         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
10884         delta += 2+alen;
10885         DIP("pmuludq %s,%s\n", dis_buf,
10886                                nameMMXReg(gregOfRM(modrm)));
10887      }
10888
10889      assign( t0, unop(Iop_64to32, mkexpr(dV)) );
10890      assign( t1, unop(Iop_64to32, mkexpr(sV)) );
10891      putMMXReg( gregOfRM(modrm),
10892                 binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
10893      goto decode_success;
10894   }
10895
10896   /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
10897      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
10898      half */
10899   /* This is a really poor translation -- could be improved if
10900      performance critical */
10901   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
10902      IRTemp sV, dV;
10903      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10904      sV = newTemp(Ity_V128);
10905      dV = newTemp(Ity_V128);
10906      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10907      t1 = newTemp(Ity_I64);
10908      t0 = newTemp(Ity_I64);
10909      modrm = insn[2];
10910      assign( dV, getXMMReg(gregOfRM(modrm)) );
10911
10912      if (epartIsReg(modrm)) {
10913         assign( sV, getXMMReg(eregOfRM(modrm)) );
10914         delta += 2+1;
10915         DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10916                                nameXMMReg(gregOfRM(modrm)));
10917      } else {
10918         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10919         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10920         delta += 2+alen;
10921         DIP("pmuludq %s,%s\n", dis_buf,
10922                                nameXMMReg(gregOfRM(modrm)));
10923      }
10924
10925      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
10926      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
10927
10928      assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
10929      putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
10930      assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
10931      putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
10932      goto decode_success;
10933   }
10934
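   /* Worked example (not part of the generated IR): Iop_MullU32 is
      a widening 32x32->64 unsigned multiply, so only 32-bit lanes 0
      and 2 of each operand contribute: the low 64-bit half of the
      result is d0*s0 and the high half is d2*s2.  For instance
      0xFFFFFFFF * 0xFFFFFFFF = 0xFFFFFFFE00000001, which does not
      fit in 32 bits -- hence the widening form. */
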
10935   /* 66 0F EB = POR */
10936   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
10937      delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
10938      goto decode_success;
10939   }
10940
10941   /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
10942      from E(xmm or mem) to G(xmm) */
10943   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
10944      IRTemp s1V  = newTemp(Ity_V128);
10945      IRTemp s2V  = newTemp(Ity_V128);
10946      IRTemp dV   = newTemp(Ity_V128);
10947      IRTemp s1Hi = newTemp(Ity_I64);
10948      IRTemp s1Lo = newTemp(Ity_I64);
10949      IRTemp s2Hi = newTemp(Ity_I64);
10950      IRTemp s2Lo = newTemp(Ity_I64);
10951      IRTemp dHi  = newTemp(Ity_I64);
10952      IRTemp dLo  = newTemp(Ity_I64);
10953      modrm = insn[2];
10954      if (epartIsReg(modrm)) {
10955         assign( s1V, getXMMReg(eregOfRM(modrm)) );
10956         delta += 2+1;
10957         DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10958                               nameXMMReg(gregOfRM(modrm)));
10959      } else {
10960         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10961         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
10962         delta += 2+alen;
10963         DIP("psadbw %s,%s\n", dis_buf,
10964                               nameXMMReg(gregOfRM(modrm)));
10965      }
10966      assign( s2V, getXMMReg(gregOfRM(modrm)) );
10967      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
10968      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
10969      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
10970      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
10971      assign( dHi, mkIRExprCCall(
10972                      Ity_I64, 0/*regparms*/,
10973                      "x86g_calculate_mmx_psadbw",
10974                      &x86g_calculate_mmx_psadbw,
10975                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
10976                   ));
10977      assign( dLo, mkIRExprCCall(
10978                      Ity_I64, 0/*regparms*/,
10979                      "x86g_calculate_mmx_psadbw",
10980                      &x86g_calculate_mmx_psadbw,
10981                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
10982                   ));
10983      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo))) ;
10984      putXMMReg(gregOfRM(modrm), mkexpr(dV));
10985      goto decode_success;
10986   }
10987
10988   /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
10989   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
10990      Int order;
10991      IRTemp sV, dV, s3, s2, s1, s0;
10992      s3 = s2 = s1 = s0 = IRTemp_INVALID;
10993      sV = newTemp(Ity_V128);
10994      dV = newTemp(Ity_V128);
10995      modrm = insn[2];
10996      if (epartIsReg(modrm)) {
10997         assign( sV, getXMMReg(eregOfRM(modrm)) );
10998         order = (Int)insn[3];
10999         delta += 2+2;
11000         DIP("pshufd $%d,%s,%s\n", order,
11001                                   nameXMMReg(eregOfRM(modrm)),
11002                                   nameXMMReg(gregOfRM(modrm)));
11003      } else {
11004         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11005         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11006         order = (Int)insn[2+alen];
11007         delta += 3+alen;
11008         DIP("pshufd $%d,%s,%s\n", order,
11009                                   dis_buf,
11010                                   nameXMMReg(gregOfRM(modrm)));
11011      }
11012      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11013
11014#     define SEL(n) \
11015                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11016      assign(dV,
11017             mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
11018                           SEL((order>>2)&3), SEL((order>>0)&3) )
11019      );
11020      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11021#     undef SEL
11022      goto decode_success;
11023   }
11024
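   /* Worked example (not part of the generated IR): destination
      lane k takes source lane (order >> (2*k)) & 3.  So order 0xE4
      (binary 11 10 01 00) is the identity permutation, and order
      0x1B (binary 00 01 10 11) reverses the four 32-bit lanes. */
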
11025   /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
11026      mem) to G(xmm), and copy lower half */
11027   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
11028      Int order;
11029      IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
11030      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11031      sV   = newTemp(Ity_V128);
11032      dV   = newTemp(Ity_V128);
11033      sVhi = newTemp(Ity_I64);
11034      dVhi = newTemp(Ity_I64);
11035      modrm = insn[3];
11036      if (epartIsReg(modrm)) {
11037         assign( sV, getXMMReg(eregOfRM(modrm)) );
11038         order = (Int)insn[4];
11039         delta += 4+1;
11040         DIP("pshufhw $%d,%s,%s\n", order,
11041                                    nameXMMReg(eregOfRM(modrm)),
11042                                    nameXMMReg(gregOfRM(modrm)));
11043      } else {
11044         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11045         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11046         order = (Int)insn[3+alen];
11047         delta += 4+alen;
11048         DIP("pshufhw $%d,%s,%s\n", order,
11049                                    dis_buf,
11050                                    nameXMMReg(gregOfRM(modrm)));
11051      }
11052      assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
11053      breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
11054
11055#     define SEL(n) \
11056                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11057      assign(dVhi,
11058             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
11059                          SEL((order>>2)&3), SEL((order>>0)&3) )
11060      );
11061      assign(dV, binop( Iop_64HLtoV128,
11062                        mkexpr(dVhi),
11063                        unop(Iop_V128to64, mkexpr(sV))) );
11064      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11065#     undef SEL
11066      goto decode_success;
11067   }
11068
11069   /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
11070      mem) to G(xmm), and copy upper half */
11071   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
11072      Int order;
11073      IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
11074      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11075      sV   = newTemp(Ity_V128);
11076      dV   = newTemp(Ity_V128);
11077      sVlo = newTemp(Ity_I64);
11078      dVlo = newTemp(Ity_I64);
11079      modrm = insn[3];
11080      if (epartIsReg(modrm)) {
11081         assign( sV, getXMMReg(eregOfRM(modrm)) );
11082         order = (Int)insn[4];
11083         delta += 4+1;
11084         DIP("pshuflw $%d,%s,%s\n", order,
11085                                    nameXMMReg(eregOfRM(modrm)),
11086                                    nameXMMReg(gregOfRM(modrm)));
11087      } else {
11088         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11089         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11090         order = (Int)insn[3+alen];
11091         delta += 4+alen;
11092         DIP("pshuflw $%d,%s,%s\n", order,
11093                                    dis_buf,
11094                                    nameXMMReg(gregOfRM(modrm)));
11095      }
11096      assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
11097      breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
11098
11099#     define SEL(n) \
11100                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11101      assign(dVlo,
11102             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
11103                          SEL((order>>2)&3), SEL((order>>0)&3) )
11104      );
11105      assign(dV, binop( Iop_64HLtoV128,
11106                        unop(Iop_V128HIto64, mkexpr(sV)),
11107                        mkexpr(dVlo) ) );
11108      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11109#     undef SEL
11110      goto decode_success;
11111   }
11112
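   /* Note (illustration only): PSHUFHW and PSHUFLW above use the
      same 2-bit-per-lane selector scheme as PSHUFD, but applied to
      the four 16-bit lanes of the upper (resp. lower) 64-bit half;
      the other half is copied through unchanged by the final
      Iop_64HLtoV128. */
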
11113   /* 66 0F 72 /6 ib = PSLLD by immediate */
11114   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11115       && epartIsReg(insn[2])
11116       && gregOfRM(insn[2]) == 6) {
11117      delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
11118      goto decode_success;
11119   }
11120
11121   /* 66 0F F2 = PSLLD by E */
11122   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
11123      delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
11124      goto decode_success;
11125   }
11126
11127   /* 66 0F 73 /7 ib = PSLLDQ by immediate */
11128   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11129       && epartIsReg(insn[2])
11130       && gregOfRM(insn[2]) == 7) {
11131      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11132      Int    imm = (Int)insn[3];
11133      Int    reg = eregOfRM(insn[2]);
11134      DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
11135      vassert(imm >= 0 && imm <= 255);
11136      delta += 4;
11137
11138      sV    = newTemp(Ity_V128);
11139      dV    = newTemp(Ity_V128);
11140      hi64  = newTemp(Ity_I64);
11141      lo64  = newTemp(Ity_I64);
11142      hi64r = newTemp(Ity_I64);
11143      lo64r = newTemp(Ity_I64);
11144
11145      if (imm >= 16) {
11146         putXMMReg(reg, mkV128(0x0000));
11147         goto decode_success;
11148      }
11149
11150      assign( sV, getXMMReg(reg) );
11151      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11152      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
11153
11154      if (imm == 0) {
11155         assign( lo64r, mkexpr(lo64) );
11156         assign( hi64r, mkexpr(hi64) );
11157      }
11158      else
11159      if (imm == 8) {
11160         assign( lo64r, mkU64(0) );
11161         assign( hi64r, mkexpr(lo64) );
11162      }
11163      else
11164      if (imm > 8) {
11165         assign( lo64r, mkU64(0) );
11166         assign( hi64r, binop( Iop_Shl64,
11167                               mkexpr(lo64),
11168                               mkU8( 8*(imm-8) ) ));
11169      } else {
11170         assign( lo64r, binop( Iop_Shl64,
11171                               mkexpr(lo64),
11172                               mkU8(8 * imm) ));
11173         assign( hi64r,
11174                 binop( Iop_Or64,
11175                        binop(Iop_Shl64, mkexpr(hi64),
11176                                         mkU8(8 * imm)),
11177                        binop(Iop_Shr64, mkexpr(lo64),
11178                                         mkU8(8 * (8 - imm)) )
11179                      )
11180               );
11181      }
11182      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11183      putXMMReg(reg, mkexpr(dV));
11184      goto decode_success;
11185   }
11186
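   /* Worked example (not part of the generated IR): for 0 < imm < 8
      the 128-bit left shift by imm bytes is assembled from the two
      64-bit halves as
         lo' = lo << (8*imm)
         hi' = (hi << (8*imm)) | (lo >> (8*(8-imm)))
      e.g. imm == 3 moves the top 3 bytes of lo into the bottom 3
      bytes of hi.  For imm == 8, hi' = lo and lo' = 0; for imm > 8,
      lo' = 0 and hi' = lo << (8*(imm-8)). */
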
11187   /* 66 0F 73 /6 ib = PSLLQ by immediate */
11188   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11189       && epartIsReg(insn[2])
11190       && gregOfRM(insn[2]) == 6) {
11191      delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
11192      goto decode_success;
11193   }
11194
11195   /* 66 0F F3 = PSLLQ by E */
11196   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
11197      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
11198      goto decode_success;
11199   }
11200
11201   /* 66 0F 71 /6 ib = PSLLW by immediate */
11202   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11203       && epartIsReg(insn[2])
11204       && gregOfRM(insn[2]) == 6) {
11205      delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
11206      goto decode_success;
11207   }
11208
11209   /* 66 0F F1 = PSLLW by E */
11210   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
11211      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
11212      goto decode_success;
11213   }
11214
11215   /* 66 0F 72 /4 ib = PSRAD by immediate */
11216   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11217       && epartIsReg(insn[2])
11218       && gregOfRM(insn[2]) == 4) {
11219      delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
11220      goto decode_success;
11221   }
11222
11223   /* 66 0F E2 = PSRAD by E */
11224   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
11225      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
11226      goto decode_success;
11227   }
11228
11229   /* 66 0F 71 /4 ib = PSRAW by immediate */
11230   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11231       && epartIsReg(insn[2])
11232       && gregOfRM(insn[2]) == 4) {
11233      delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
11234      goto decode_success;
11235   }
11236
11237   /* 66 0F E1 = PSRAW by E */
11238   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
11239      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
11240      goto decode_success;
11241   }
11242
11243   /* 66 0F 72 /2 ib = PSRLD by immediate */
11244   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11245       && epartIsReg(insn[2])
11246       && gregOfRM(insn[2]) == 2) {
11247      delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
11248      goto decode_success;
11249   }
11250
11251   /* 66 0F D2 = PSRLD by E */
11252   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
11253      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
11254      goto decode_success;
11255   }
11256
11257   /* 66 0F 73 /3 ib = PSRLDQ by immediate */
11258   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11259       && epartIsReg(insn[2])
11260       && gregOfRM(insn[2]) == 3) {
11261      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11262      Int    imm = (Int)insn[3];
11263      Int    reg = eregOfRM(insn[2]);
11264      DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
11265      vassert(imm >= 0 && imm <= 255);
11266      delta += 4;
11267
11268      sV    = newTemp(Ity_V128);
11269      dV    = newTemp(Ity_V128);
11270      hi64  = newTemp(Ity_I64);
11271      lo64  = newTemp(Ity_I64);
11272      hi64r = newTemp(Ity_I64);
11273      lo64r = newTemp(Ity_I64);
11274
11275      if (imm >= 16) {
11276         putXMMReg(reg, mkV128(0x0000));
11277         goto decode_success;
11278      }
11279
11280      assign( sV, getXMMReg(reg) );
11281      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11282      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
11283
11284      if (imm == 0) {
11285         assign( lo64r, mkexpr(lo64) );
11286         assign( hi64r, mkexpr(hi64) );
11287      }
11288      else
11289      if (imm == 8) {
11290         assign( hi64r, mkU64(0) );
11291         assign( lo64r, mkexpr(hi64) );
11292      }
11293      else
11294      if (imm > 8) {
11295         assign( hi64r, mkU64(0) );
11296         assign( lo64r, binop( Iop_Shr64,
11297                               mkexpr(hi64),
11298                               mkU8( 8*(imm-8) ) ));
11299      } else {
11300         assign( hi64r, binop( Iop_Shr64,
11301                               mkexpr(hi64),
11302                               mkU8(8 * imm) ));
11303         assign( lo64r,
11304                 binop( Iop_Or64,
11305                        binop(Iop_Shr64, mkexpr(lo64),
11306                                         mkU8(8 * imm)),
11307                        binop(Iop_Shl64, mkexpr(hi64),
11308                                         mkU8(8 * (8 - imm)) )
11309                      )
11310               );
11311      }
11312
11313      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11314      putXMMReg(reg, mkexpr(dV));
11315      goto decode_success;
11316   }
11317
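   /* Note (illustration only): this is the mirror image of the
      PSLLDQ case above -- for 0 < imm < 8,
         hi' = hi >> (8*imm)
         lo' = (lo >> (8*imm)) | (hi << (8*(8-imm)))
      so the bottom imm bytes of hi flow into the top of lo. */
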
11318   /* 66 0F 73 /2 ib = PSRLQ by immediate */
11319   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11320       && epartIsReg(insn[2])
11321       && gregOfRM(insn[2]) == 2) {
11322      delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
11323      goto decode_success;
11324   }
11325
11326   /* 66 0F D3 = PSRLQ by E */
11327   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
11328      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
11329      goto decode_success;
11330   }
11331
11332   /* 66 0F 71 /2 ib = PSRLW by immediate */
11333   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11334       && epartIsReg(insn[2])
11335       && gregOfRM(insn[2]) == 2) {
11336      delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
11337      goto decode_success;
11338   }
11339
11340   /* 66 0F D1 = PSRLW by E */
11341   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
11342      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
11343      goto decode_success;
11344   }
11345
11346   /* 66 0F F8 = PSUBB */
11347   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
11348      delta = dis_SSEint_E_to_G( sorb, delta+2,
11349                                 "psubb", Iop_Sub8x16, False );
11350      goto decode_success;
11351   }
11352
11353   /* 66 0F FA = PSUBD */
11354   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
11355      delta = dis_SSEint_E_to_G( sorb, delta+2,
11356                                 "psubd", Iop_Sub32x4, False );
11357      goto decode_success;
11358   }
11359
11360   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
11361   /* 0F FB = PSUBQ -- sub 64x1 */
11362   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
11363      do_MMX_preamble();
11364      delta = dis_MMXop_regmem_to_reg (
11365                sorb, delta+2, insn[1], "psubq", False );
11366      goto decode_success;
11367   }
11368
11369   /* 66 0F FB = PSUBQ */
11370   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
11371      delta = dis_SSEint_E_to_G( sorb, delta+2,
11372                                 "psubq", Iop_Sub64x2, False );
11373      goto decode_success;
11374   }
11375
11376   /* 66 0F F9 = PSUBW */
11377   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
11378      delta = dis_SSEint_E_to_G( sorb, delta+2,
11379                                 "psubw", Iop_Sub16x8, False );
11380      goto decode_success;
11381   }
11382
11383   /* 66 0F E8 = PSUBSB */
11384   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
11385      delta = dis_SSEint_E_to_G( sorb, delta+2,
11386                                 "psubsb", Iop_QSub8Sx16, False );
11387      goto decode_success;
11388   }
11389
11390   /* 66 0F E9 = PSUBSW */
11391   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
11392      delta = dis_SSEint_E_to_G( sorb, delta+2,
11393                                 "psubsw", Iop_QSub16Sx8, False );
11394      goto decode_success;
11395   }
11396
11397   /* 66 0F D8 = PSUBUSB */
11398   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
11399      delta = dis_SSEint_E_to_G( sorb, delta+2,
11400                                 "psubusb", Iop_QSub8Ux16, False );
11401      goto decode_success;
11402   }
11403
11404   /* 66 0F D9 = PSUBUSW */
11405   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
11406      delta = dis_SSEint_E_to_G( sorb, delta+2,
11407                                 "psubusw", Iop_QSub16Ux8, False );
11408      goto decode_success;
11409   }
11410
11411   /* 66 0F 68 = PUNPCKHBW */
11412   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
11413      delta = dis_SSEint_E_to_G( sorb, delta+2,
11414                                 "punpckhbw",
11415                                 Iop_InterleaveHI8x16, True );
11416      goto decode_success;
11417   }
11418
11419   /* 66 0F 6A = PUNPCKHDQ */
11420   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
11421      delta = dis_SSEint_E_to_G( sorb, delta+2,
11422                                 "punpckhdq",
11423                                 Iop_InterleaveHI32x4, True );
11424      goto decode_success;
11425   }
11426
11427   /* 66 0F 6D = PUNPCKHQDQ */
11428   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
11429      delta = dis_SSEint_E_to_G( sorb, delta+2,
11430                                 "punpckhqdq",
11431                                 Iop_InterleaveHI64x2, True );
11432      goto decode_success;
11433   }
11434
11435   /* 66 0F 69 = PUNPCKHWD */
11436   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
11437      delta = dis_SSEint_E_to_G( sorb, delta+2,
11438                                 "punpckhwd",
11439                                 Iop_InterleaveHI16x8, True );
11440      goto decode_success;
11441   }
11442
11443   /* 66 0F 60 = PUNPCKLBW */
11444   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
11445      delta = dis_SSEint_E_to_G( sorb, delta+2,
11446                                 "punpcklbw",
11447                                 Iop_InterleaveLO8x16, True );
11448      goto decode_success;
11449   }
11450
11451   /* 66 0F 62 = PUNPCKLDQ */
11452   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
11453      delta = dis_SSEint_E_to_G( sorb, delta+2,
11454                                 "punpckldq",
11455                                 Iop_InterleaveLO32x4, True );
11456      goto decode_success;
11457   }
11458
11459   /* 66 0F 6C = PUNPCKLQDQ */
11460   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
11461      delta = dis_SSEint_E_to_G( sorb, delta+2,
11462                                 "punpcklqdq",
11463                                 Iop_InterleaveLO64x2, True );
11464      goto decode_success;
11465   }
11466
11467   /* 66 0F 61 = PUNPCKLWD */
11468   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
11469      delta = dis_SSEint_E_to_G( sorb, delta+2,
11470                                 "punpcklwd",
11471                                 Iop_InterleaveLO16x8, True );
11472      goto decode_success;
11473   }
11474
11475   /* 66 0F EF = PXOR */
11476   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
11477      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
11478      goto decode_success;
11479   }
11480
11481//--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
11482//--    if (insn[0] == 0x0F && insn[1] == 0xAE
11483//--        && (!epartIsReg(insn[2]))
11484//--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
11485//--       Bool store = gregOfRM(insn[2]) == 0;
11486//--       vg_assert(sz == 4);
11487//--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
11488//--       t1   = LOW24(pair);
11489//--       eip += 2+HI8(pair);
11490//--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
11491//--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
11492//--                   Lit16, (UShort)insn[2],
11493//--                   TempReg, t1 );
11494//--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
11495//--       goto decode_success;
11496//--    }
11497
11498   /* 0F AE /7 = CLFLUSH -- flush cache line */
11499   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
11500       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
11501
11502      /* This is something of a hack.  We need to know the size of the
11503         cache line containing addr.  Since we don't (easily), assume
11504         256 on the basis that no real cache would have a line that
11505         big.  It's safe to invalidate more stuff than we need, just
11506         inefficient. */
11507      UInt lineszB = 256;
11508
11509      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11510      delta += 2+alen;
11511
11512      /* Round addr down to the start of the containing block. */
11513      stmt( IRStmt_Put(
11514               OFFB_TISTART,
11515               binop( Iop_And32,
11516                      mkexpr(addr),
11517                      mkU32( ~(lineszB-1) ))) );
11518
11519      stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) );
11520
11521      irsb->jumpkind = Ijk_TInval;
11522      irsb->next     = mkU32(guest_EIP_bbstart+delta);
11523      dres.whatNext  = Dis_StopHere;
11524
11525      DIP("clflush %s\n", dis_buf);
11526      goto decode_success;
11527   }
11528
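   /* Worked example (not part of the generated IR): with lineszB ==
      256, ~(lineszB-1) == 0xFFFFFF00, so e.g. an address of
      0x08049A37 is rounded down to 0x08049A00 and a 256-byte range
      starting there is invalidated -- a superset of any real cache
      line containing the address. */
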
11529   /* ---------------------------------------------------- */
11530   /* --- end of the SSE2 decoder.                     --- */
11531   /* ---------------------------------------------------- */
11532
11533   /* ---------------------------------------------------- */
11534   /* --- start of the SSE3 decoder.                   --- */
11535   /* ---------------------------------------------------- */
11536
11537   /* Skip parts of the decoder which don't apply given the stated
11538      guest subarchitecture. */
11539   /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
11540   /* In fact this is highly bogus; we accept SSE3 insns even on an
11541      SSE2-only guest since they turn into IR which can be re-emitted
11542      successfully on an SSE2 host. */
11543   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
11544      goto after_sse_decoders; /* no SSE3 capabilities */
11545
11546   insn = (UChar*)&guest_code[delta];
11547
11548   /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
11549      duplicating some lanes (2:2:0:0). */
11550   /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
11551      duplicating some lanes (3:3:1:1). */
11552   if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
11553       && (insn[2] == 0x12 || insn[2] == 0x16)) {
11554      IRTemp s3, s2, s1, s0;
11555      IRTemp sV  = newTemp(Ity_V128);
11556      Bool   isH = insn[2] == 0x16;
11557      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11558
11559      modrm = insn[3];
11560      if (epartIsReg(modrm)) {
11561         assign( sV, getXMMReg( eregOfRM(modrm)) );
11562         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
11563                                  nameXMMReg(eregOfRM(modrm)),
11564                                  nameXMMReg(gregOfRM(modrm)));
11565         delta += 3+1;
11566      } else {
11567         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11568         gen_SEGV_if_not_16_aligned( addr );
11569         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11570         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
11571             dis_buf,
11572             nameXMMReg(gregOfRM(modrm)));
11573         delta += 3+alen;
11574      }
11575
11576      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11577      putXMMReg( gregOfRM(modrm),
11578                 isH ? mk128from32s( s3, s3, s1, s1 )
11579                     : mk128from32s( s2, s2, s0, s0 ) );
11580      goto decode_success;
11581   }
11582
11583   /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
11584      duplicating some lanes (1:0:1:0). */
11585   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
11586      IRTemp sV = newTemp(Ity_V128);
11587      IRTemp d0 = newTemp(Ity_I64);
11588
11589      modrm = insn[3];
11590      if (epartIsReg(modrm)) {
11591         assign( sV, getXMMReg( eregOfRM(modrm)) );
11592         DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11593                                nameXMMReg(gregOfRM(modrm)));
11594         delta += 3+1;
11595         assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
11596      } else {
11597         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11598         assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
11599         DIP("movddup %s,%s\n", dis_buf,
11600                                nameXMMReg(gregOfRM(modrm)));
11601         delta += 3+alen;
11602      }
11603
11604      putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
11605      goto decode_success;
11606   }
11607
11608   /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
11609   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
11610      IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11611      IRTemp eV   = newTemp(Ity_V128);
11612      IRTemp gV   = newTemp(Ity_V128);
11613      IRTemp addV = newTemp(Ity_V128);
11614      IRTemp subV = newTemp(Ity_V128);
11615      a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11616
11617      modrm = insn[3];
11618      if (epartIsReg(modrm)) {
11619         assign( eV, getXMMReg( eregOfRM(modrm)) );
11620         DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11621                                 nameXMMReg(gregOfRM(modrm)));
11622         delta += 3+1;
11623      } else {
11624         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11625         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11626         DIP("addsubps %s,%s\n", dis_buf,
11627                                 nameXMMReg(gregOfRM(modrm)));
11628         delta += 3+alen;
11629      }
11630
11631      assign( gV, getXMMReg(gregOfRM(modrm)) );
11632
11633      assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
11634      assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
11635
11636      breakup128to32s( addV, &a3, &a2, &a1, &a0 );
11637      breakup128to32s( subV, &s3, &s2, &s1, &s0 );
11638
11639      putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
11640      goto decode_success;
11641   }
11642
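   /* Illustration (not part of the generated IR): the recombination
      mk128from32s(a3, s2, a1, s0) selects the subtraction result in
      the even-numbered lanes (0 and 2) and the addition result in
      the odd-numbered lanes (1 and 3), which is exactly the
      alternating subtract/add pattern ADDSUBPS specifies. */
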
11643   /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
11644   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
11645      IRTemp eV   = newTemp(Ity_V128);
11646      IRTemp gV   = newTemp(Ity_V128);
11647      IRTemp addV = newTemp(Ity_V128);
11648      IRTemp subV = newTemp(Ity_V128);
11649      IRTemp a1     = newTemp(Ity_I64);
11650      IRTemp s0     = newTemp(Ity_I64);
11651
11652      modrm = insn[2];
11653      if (epartIsReg(modrm)) {
11654         assign( eV, getXMMReg( eregOfRM(modrm)) );
11655         DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11656                                 nameXMMReg(gregOfRM(modrm)));
11657         delta += 2+1;
11658      } else {
11659         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11660         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11661         DIP("addsubpd %s,%s\n", dis_buf,
11662                                 nameXMMReg(gregOfRM(modrm)));
11663         delta += 2+alen;
11664      }
11665
11666      assign( gV, getXMMReg(gregOfRM(modrm)) );
11667
11668      assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
11669      assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
11670
11671      assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
11672      assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
11673
11674      putXMMReg( gregOfRM(modrm),
11675                 binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
11676      goto decode_success;
11677   }
11678
11679   /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
11680   /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
11681   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
11682       && (insn[2] == 0x7C || insn[2] == 0x7D)) {
11683      IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
11684      IRTemp eV     = newTemp(Ity_V128);
11685      IRTemp gV     = newTemp(Ity_V128);
11686      IRTemp leftV  = newTemp(Ity_V128);
11687      IRTemp rightV = newTemp(Ity_V128);
11688      Bool   isAdd  = insn[2] == 0x7C;
11689      HChar* str    = isAdd ? "add" : "sub";
11690      e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
11691
11692      modrm = insn[3];
11693      if (epartIsReg(modrm)) {
11694         assign( eV, getXMMReg( eregOfRM(modrm)) );
11695         DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
11696                                   nameXMMReg(gregOfRM(modrm)));
11697         delta += 3+1;
11698      } else {
11699         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11700         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11701         DIP("h%sps %s,%s\n", str, dis_buf,
11702                                   nameXMMReg(gregOfRM(modrm)));
11703         delta += 3+alen;
11704      }
11705
11706      assign( gV, getXMMReg(gregOfRM(modrm)) );
11707
11708      breakup128to32s( eV, &e3, &e2, &e1, &e0 );
11709      breakup128to32s( gV, &g3, &g2, &g1, &g0 );
11710
11711      assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
11712      assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
11713
11714      putXMMReg( gregOfRM(modrm),
11715                 binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
11716                       mkexpr(leftV), mkexpr(rightV) ) );
11717      goto decode_success;
11718   }
11719
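   /* Worked example (not part of the generated IR): leftV holds the
      even-indexed elements (e2,e0,g2,g0) and rightV the odd-indexed
      ones (e3,e1,g3,g1), so a single vector op yields all the
      horizontal pairwise results, e.g. for HADDPS lane 0 of the
      result is g0+g1 and lane 3 is e2+e3. */
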
11720   /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
11721   /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
11722   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
11723      IRTemp e1     = newTemp(Ity_I64);
11724      IRTemp e0     = newTemp(Ity_I64);
11725      IRTemp g1     = newTemp(Ity_I64);
11726      IRTemp g0     = newTemp(Ity_I64);
11727      IRTemp eV     = newTemp(Ity_V128);
11728      IRTemp gV     = newTemp(Ity_V128);
11729      IRTemp leftV  = newTemp(Ity_V128);
11730      IRTemp rightV = newTemp(Ity_V128);
11731      Bool   isAdd  = insn[1] == 0x7C;
11732      HChar* str    = isAdd ? "add" : "sub";
11733
11734      modrm = insn[2];
11735      if (epartIsReg(modrm)) {
11736         assign( eV, getXMMReg( eregOfRM(modrm)) );
11737         DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
11738                                   nameXMMReg(gregOfRM(modrm)));
11739         delta += 2+1;
11740      } else {
11741         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11742         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11743         DIP("h%spd %s,%s\n", str, dis_buf,
11744                              nameXMMReg(gregOfRM(modrm)));
11745         delta += 2+alen;
11746      }
11747
11748      assign( gV, getXMMReg(gregOfRM(modrm)) );
11749
11750      assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
11751      assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
11752      assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
11753      assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
11754
11755      assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
11756      assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
11757
11758      putXMMReg( gregOfRM(modrm),
11759                 binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
11760                       mkexpr(leftV), mkexpr(rightV) ) );
11761      goto decode_success;
11762   }
11763
11764   /* F2 0F F0 = LDDQU -- move from E (mem only) to G (xmm). */
11765   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
11766      modrm = getIByte(delta+3);
11767      if (epartIsReg(modrm)) {
11768         goto decode_failure;
11769      } else {
11770         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11771         putXMMReg( gregOfRM(modrm),
11772                    loadLE(Ity_V128, mkexpr(addr)) );
11773         DIP("lddqu %s,%s\n", dis_buf,
11774                              nameXMMReg(gregOfRM(modrm)));
11775         delta += 3+alen;
11776      }
11777      goto decode_success;
11778   }
11779
11780   /* ---------------------------------------------------- */
11781   /* --- end of the SSE3 decoder.                     --- */
11782   /* ---------------------------------------------------- */
11783
11784   /* ---------------------------------------------------- */
11785   /* --- start of the SSSE3 decoder.                  --- */
11786   /* ---------------------------------------------------- */
11787
11788   /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
11789      Unsigned Bytes (MMX) */
11790   if (sz == 4
11791       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
11792      IRTemp sV        = newTemp(Ity_I64);
11793      IRTemp dV        = newTemp(Ity_I64);
11794      IRTemp sVoddsSX  = newTemp(Ity_I64);
11795      IRTemp sVevensSX = newTemp(Ity_I64);
11796      IRTemp dVoddsZX  = newTemp(Ity_I64);
11797      IRTemp dVevensZX = newTemp(Ity_I64);
11798
11799      modrm = insn[3];
11800      do_MMX_preamble();
11801      assign( dV, getMMXReg(gregOfRM(modrm)) );
11802
11803      if (epartIsReg(modrm)) {
11804         assign( sV, getMMXReg(eregOfRM(modrm)) );
11805         delta += 3+1;
11806         DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
11807                                  nameMMXReg(gregOfRM(modrm)));
11808      } else {
11809         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11810         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
11811         delta += 3+alen;
11812         DIP("pmaddubsw %s,%s\n", dis_buf,
11813                                  nameMMXReg(gregOfRM(modrm)));
11814      }
11815
11816      /* compute dV unsigned x sV signed */
11817      assign( sVoddsSX,
11818              binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
11819      assign( sVevensSX,
11820              binop(Iop_SarN16x4,
11821                    binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
11822                    mkU8(8)) );
11823      assign( dVoddsZX,
11824              binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
11825      assign( dVevensZX,
11826              binop(Iop_ShrN16x4,
11827                    binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
11828                    mkU8(8)) );
11829
11830      putMMXReg(
11831         gregOfRM(modrm),
11832         binop(Iop_QAdd16Sx4,
11833               binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
11834               binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
11835         )
11836      );
11837      goto decode_success;
11838   }
11839
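   /* Illustration (not part of the generated IR): the shift pairs
      above split each 16-bit lane into its two bytes -- an
      arithmetic right shift by 8 sign-extends the odd (high) byte,
      and shift-left-then-shift-right by 8 isolates the even (low)
      byte (signed for sV, unsigned for dV).  Each 16-bit result
      lane is then satS16( d_even*s_even + d_odd*s_odd ), matching
      PMADDUBSW's unsigned-times-signed multiply-add. */
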
11840   /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
11841      Unsigned Bytes (XMM) */
11842   if (sz == 2
11843       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
11844      IRTemp sV        = newTemp(Ity_V128);
11845      IRTemp dV        = newTemp(Ity_V128);
11846      IRTemp sVoddsSX  = newTemp(Ity_V128);
11847      IRTemp sVevensSX = newTemp(Ity_V128);
11848      IRTemp dVoddsZX  = newTemp(Ity_V128);
11849      IRTemp dVevensZX = newTemp(Ity_V128);
11850
11851      modrm = insn[3];
11852      assign( dV, getXMMReg(gregOfRM(modrm)) );
11853
11854      if (epartIsReg(modrm)) {
11855         assign( sV, getXMMReg(eregOfRM(modrm)) );
11856         delta += 3+1;
11857         DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11858                                  nameXMMReg(gregOfRM(modrm)));
11859      } else {
11860         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11861         gen_SEGV_if_not_16_aligned( addr );
11862         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11863         delta += 3+alen;
11864         DIP("pmaddubsw %s,%s\n", dis_buf,
11865                                  nameXMMReg(gregOfRM(modrm)));
11866      }
11867
11868      /* compute dV unsigned x sV signed */
11869      assign( sVoddsSX,
11870              binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
11871      assign( sVevensSX,
11872              binop(Iop_SarN16x8,
11873                    binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
11874                    mkU8(8)) );
11875      assign( dVoddsZX,
11876              binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
11877      assign( dVevensZX,
11878              binop(Iop_ShrN16x8,
11879                    binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
11880                    mkU8(8)) );
11881
11882      putXMMReg(
11883         gregOfRM(modrm),
11884         binop(Iop_QAdd16Sx8,
11885               binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
11886               binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
11887         )
11888      );
11889      goto decode_success;
11890   }
11891
11892   /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
11893   /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
11894      mmx) and G to G (mmx). */
11895   /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
11896      mmx) and G to G (mmx). */
11897   /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
11898      to G (mmx). */
11899   /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
11900      to G (mmx). */
11901   /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
11902      to G (mmx). */
11903   /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
11904      to G (mmx). */
11905
11906   if (sz == 4
11907       && insn[0] == 0x0F && insn[1] == 0x38
11908       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
11909           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
11910      HChar* str    = "???";
11911      IROp   opV64  = Iop_INVALID;
11912      IROp   opCatO = Iop_CatOddLanes16x4;
11913      IROp   opCatE = Iop_CatEvenLanes16x4;
11914      IRTemp sV     = newTemp(Ity_I64);
11915      IRTemp dV     = newTemp(Ity_I64);
11916
11917      modrm = insn[3];
11918
11919      switch (insn[2]) {
11920         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
11921         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
11922         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
11923         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
11924         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
11925         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
11926         default: vassert(0);
11927      }
11928      if (insn[2] == 0x02 || insn[2] == 0x06) {
11929         opCatO = Iop_InterleaveHI32x2;
11930         opCatE = Iop_InterleaveLO32x2;
11931      }
11932
11933      do_MMX_preamble();
11934      assign( dV, getMMXReg(gregOfRM(modrm)) );
11935
11936      if (epartIsReg(modrm)) {
11937         assign( sV, getMMXReg(eregOfRM(modrm)) );
11938         delta += 3+1;
11939         DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
11940                                  nameMMXReg(gregOfRM(modrm)));
11941      } else {
11942         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11943         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
11944         delta += 3+alen;
11945         DIP("ph%s %s,%s\n", str, dis_buf,
11946                                  nameMMXReg(gregOfRM(modrm)));
11947      }
11948
11949      putMMXReg(
11950         gregOfRM(modrm),
11951         binop(opV64,
11952               binop(opCatE,mkexpr(sV),mkexpr(dV)),
11953               binop(opCatO,mkexpr(sV),mkexpr(dV))
11954         )
11955      );
11956      goto decode_success;
11957   }
11958
11959   /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
11960      xmm) and G to G (xmm). */
11961   /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
11962      xmm) and G to G (xmm). */
11963   /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
11964      G to G (xmm). */
11965   /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
11966      G to G (xmm). */
11967   /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
11968      G to G (xmm). */
11969   /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
11970      G to G (xmm). */
11971
11972   if (sz == 2
11973       && insn[0] == 0x0F && insn[1] == 0x38
11974       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
11975           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
11976      HChar* str    = "???";
11977      IROp   opV64  = Iop_INVALID;
11978      IROp   opCatO = Iop_CatOddLanes16x4;
11979      IROp   opCatE = Iop_CatEvenLanes16x4;
11980      IRTemp sV     = newTemp(Ity_V128);
11981      IRTemp dV     = newTemp(Ity_V128);
11982      IRTemp sHi    = newTemp(Ity_I64);
11983      IRTemp sLo    = newTemp(Ity_I64);
11984      IRTemp dHi    = newTemp(Ity_I64);
11985      IRTemp dLo    = newTemp(Ity_I64);
11986
11987      modrm = insn[3];
11988
11989      switch (insn[2]) {
11990         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
11991         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
11992         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
11993         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
11994         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
11995         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
11996         default: vassert(0);
11997      }
11998      if (insn[2] == 0x02 || insn[2] == 0x06) {
11999         opCatO = Iop_InterleaveHI32x2;
12000         opCatE = Iop_InterleaveLO32x2;
12001      }
12002
12003      assign( dV, getXMMReg(gregOfRM(modrm)) );
12004
12005      if (epartIsReg(modrm)) {
12006         assign( sV, getXMMReg( eregOfRM(modrm)) );
12007         DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12008                                  nameXMMReg(gregOfRM(modrm)));
12009         delta += 3+1;
12010      } else {
12011         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12012         gen_SEGV_if_not_16_aligned( addr );
12013         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12014         DIP("ph%s %s,%s\n", str, dis_buf,
12015                             nameXMMReg(gregOfRM(modrm)));
12016         delta += 3+alen;
12017      }
12018
12019      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12020      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12021      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12022      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12023
12024      /* This isn't a particularly efficient way to compute the
12025         result, but at least it avoids a proliferation of IROps,
12026         hence avoids complicating all the backends. */
12027      putXMMReg(
12028         gregOfRM(modrm),
12029         binop(Iop_64HLtoV128,
12030               binop(opV64,
12031                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
12032                     binop(opCatO,mkexpr(sHi),mkexpr(sLo))
12033               ),
12034               binop(opV64,
12035                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
12036                     binop(opCatO,mkexpr(dHi),mkexpr(dLo))
12037               )
12038         )
12039      );
12040      goto decode_success;
12041   }
12042
12043   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
12044      (MMX) */
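   /* Per signed 16-bit lane, PMULHRSW computes (scalar sketch):
         Int tmp = (((Int)a * (Int)b) >> 14) + 1;
         res     = (Short)(tmp >> 1);
      i.e. the rounded high half of the 32-bit product.  The lane-wise
      IR is built by dis_PMULHRSW_helper. */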
12045   if (sz == 4
12046       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12047      IRTemp sV = newTemp(Ity_I64);
12048      IRTemp dV = newTemp(Ity_I64);
12049
12050      modrm = insn[3];
12051      do_MMX_preamble();
12052      assign( dV, getMMXReg(gregOfRM(modrm)) );
12053
12054      if (epartIsReg(modrm)) {
12055         assign( sV, getMMXReg(eregOfRM(modrm)) );
12056         delta += 3+1;
12057         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12058                                 nameMMXReg(gregOfRM(modrm)));
12059      } else {
12060         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12061         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12062         delta += 3+alen;
12063         DIP("pmulhrsw %s,%s\n", dis_buf,
12064                                 nameMMXReg(gregOfRM(modrm)));
12065      }
12066
12067      putMMXReg(
12068         gregOfRM(modrm),
12069         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
12070      );
12071      goto decode_success;
12072   }
12073
12074   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
12075      Scale (XMM) */
12076   if (sz == 2
12077       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12078      IRTemp sV  = newTemp(Ity_V128);
12079      IRTemp dV  = newTemp(Ity_V128);
12080      IRTemp sHi = newTemp(Ity_I64);
12081      IRTemp sLo = newTemp(Ity_I64);
12082      IRTemp dHi = newTemp(Ity_I64);
12083      IRTemp dLo = newTemp(Ity_I64);
12084
12085      modrm = insn[3];
12086      assign( dV, getXMMReg(gregOfRM(modrm)) );
12087
12088      if (epartIsReg(modrm)) {
12089         assign( sV, getXMMReg(eregOfRM(modrm)) );
12090         delta += 3+1;
12091         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12092                                 nameXMMReg(gregOfRM(modrm)));
12093      } else {
12094         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12095         gen_SEGV_if_not_16_aligned( addr );
12096         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12097         delta += 3+alen;
12098         DIP("pmulhrsw %s,%s\n", dis_buf,
12099                                 nameXMMReg(gregOfRM(modrm)));
12100      }
12101
12102      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12103      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12104      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12105      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12106
12107      putXMMReg(
12108         gregOfRM(modrm),
12109         binop(Iop_64HLtoV128,
12110               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
12111               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
12112         )
12113      );
12114      goto decode_success;
12115   }
12116
12117   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
12118   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
12119   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
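   /* Lane-wise semantics, as a scalar sketch (d = G lane, s = E lane):
         res = (s < 0) ? -d : (s == 0) ? 0 : d;
      dis_PSIGN_helper builds the equivalent vector IR for the given
      lane size. */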
12120   if (sz == 4
12121       && insn[0] == 0x0F && insn[1] == 0x38
12122       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12123      IRTemp sV      = newTemp(Ity_I64);
12124      IRTemp dV      = newTemp(Ity_I64);
12125      HChar* str     = "???";
12126      Int    laneszB = 0;
12127
12128      switch (insn[2]) {
12129         case 0x08: laneszB = 1; str = "b"; break;
12130         case 0x09: laneszB = 2; str = "w"; break;
12131         case 0x0A: laneszB = 4; str = "d"; break;
12132         default: vassert(0);
12133      }
12134
12135      modrm = insn[3];
12136      do_MMX_preamble();
12137      assign( dV, getMMXReg(gregOfRM(modrm)) );
12138
12139      if (epartIsReg(modrm)) {
12140         assign( sV, getMMXReg(eregOfRM(modrm)) );
12141         delta += 3+1;
12142         DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12143                                     nameMMXReg(gregOfRM(modrm)));
12144      } else {
12145         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12146         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12147         delta += 3+alen;
12148         DIP("psign%s %s,%s\n", str, dis_buf,
12149                                     nameMMXReg(gregOfRM(modrm)));
12150      }
12151
12152      putMMXReg(
12153         gregOfRM(modrm),
12154         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
12155      );
12156      goto decode_success;
12157   }
12158
12159   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
12160   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
12161   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
12162   if (sz == 2
12163       && insn[0] == 0x0F && insn[1] == 0x38
12164       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12165      IRTemp sV      = newTemp(Ity_V128);
12166      IRTemp dV      = newTemp(Ity_V128);
12167      IRTemp sHi     = newTemp(Ity_I64);
12168      IRTemp sLo     = newTemp(Ity_I64);
12169      IRTemp dHi     = newTemp(Ity_I64);
12170      IRTemp dLo     = newTemp(Ity_I64);
12171      HChar* str     = "???";
12172      Int    laneszB = 0;
12173
12174      switch (insn[2]) {
12175         case 0x08: laneszB = 1; str = "b"; break;
12176         case 0x09: laneszB = 2; str = "w"; break;
12177         case 0x0A: laneszB = 4; str = "d"; break;
12178         default: vassert(0);
12179      }
12180
12181      modrm = insn[3];
12182      assign( dV, getXMMReg(gregOfRM(modrm)) );
12183
12184      if (epartIsReg(modrm)) {
12185         assign( sV, getXMMReg(eregOfRM(modrm)) );
12186         delta += 3+1;
12187         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12188                                     nameXMMReg(gregOfRM(modrm)));
12189      } else {
12190         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12191         gen_SEGV_if_not_16_aligned( addr );
12192         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12193         delta += 3+alen;
12194         DIP("psign%s %s,%s\n", str, dis_buf,
12195                                     nameXMMReg(gregOfRM(modrm)));
12196      }
12197
12198      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12199      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12200      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12201      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12202
12203      putXMMReg(
12204         gregOfRM(modrm),
12205         binop(Iop_64HLtoV128,
12206               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
12207               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
12208         )
12209      );
12210      goto decode_success;
12211   }
12212
12213   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
12214   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
12215   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
12216   if (sz == 4
12217       && insn[0] == 0x0F && insn[1] == 0x38
12218       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12219      IRTemp sV      = newTemp(Ity_I64);
12220      HChar* str     = "???";
12221      Int    laneszB = 0;
12222
12223      switch (insn[2]) {
12224         case 0x1C: laneszB = 1; str = "b"; break;
12225         case 0x1D: laneszB = 2; str = "w"; break;
12226         case 0x1E: laneszB = 4; str = "d"; break;
12227         default: vassert(0);
12228      }
12229
12230      modrm = insn[3];
12231      do_MMX_preamble();
12232
12233      if (epartIsReg(modrm)) {
12234         assign( sV, getMMXReg(eregOfRM(modrm)) );
12235         delta += 3+1;
12236         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12237                                    nameMMXReg(gregOfRM(modrm)));
12238      } else {
12239         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12240         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12241         delta += 3+alen;
12242         DIP("pabs%s %s,%s\n", str, dis_buf,
12243                                    nameMMXReg(gregOfRM(modrm)));
12244      }
12245
12246      putMMXReg(
12247         gregOfRM(modrm),
12248         dis_PABS_helper( mkexpr(sV), laneszB )
12249      );
12250      goto decode_success;
12251   }
12252
12253   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
12254   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
12255   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
12256   if (sz == 2
12257       && insn[0] == 0x0F && insn[1] == 0x38
12258       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12259      IRTemp sV      = newTemp(Ity_V128);
12260      IRTemp sHi     = newTemp(Ity_I64);
12261      IRTemp sLo     = newTemp(Ity_I64);
12262      HChar* str     = "???";
12263      Int    laneszB = 0;
12264
12265      switch (insn[2]) {
12266         case 0x1C: laneszB = 1; str = "b"; break;
12267         case 0x1D: laneszB = 2; str = "w"; break;
12268         case 0x1E: laneszB = 4; str = "d"; break;
12269         default: vassert(0);
12270      }
12271
12272      modrm = insn[3];
12273
12274      if (epartIsReg(modrm)) {
12275         assign( sV, getXMMReg(eregOfRM(modrm)) );
12276         delta += 3+1;
12277         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12278                                    nameXMMReg(gregOfRM(modrm)));
12279      } else {
12280         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12281         gen_SEGV_if_not_16_aligned( addr );
12282         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12283         delta += 3+alen;
12284         DIP("pabs%s %s,%s\n", str, dis_buf,
12285                                    nameXMMReg(gregOfRM(modrm)));
12286      }
12287
12288      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12289      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12290
12291      putXMMReg(
12292         gregOfRM(modrm),
12293         binop(Iop_64HLtoV128,
12294               dis_PABS_helper( mkexpr(sHi), laneszB ),
12295               dis_PABS_helper( mkexpr(sLo), laneszB )
12296         )
12297      );
12298      goto decode_success;
12299   }
12300
12301   /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
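   /* PALIGNR concatenates the two 64-bit operands as dV:sV (dV in the
      upper half), shifts the 128-bit value right by 8*imm bits and
      keeps the low 64 bits; imm values of 16..255 therefore give zero.
      The case split on d32 below enumerates which pieces survive for
      each shift amount. */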
12302   if (sz == 4
12303       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12304      IRTemp sV  = newTemp(Ity_I64);
12305      IRTemp dV  = newTemp(Ity_I64);
12306      IRTemp res = newTemp(Ity_I64);
12307
12308      modrm = insn[3];
12309      do_MMX_preamble();
12310      assign( dV, getMMXReg(gregOfRM(modrm)) );
12311
12312      if (epartIsReg(modrm)) {
12313         assign( sV, getMMXReg(eregOfRM(modrm)) );
12314         d32 = (UInt)insn[3+1];
12315         delta += 3+1+1;
12316         DIP("palignr $%d,%s,%s\n",  (Int)d32,
12317                                     nameMMXReg(eregOfRM(modrm)),
12318                                     nameMMXReg(gregOfRM(modrm)));
12319      } else {
12320         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12321         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12322         d32 = (UInt)insn[3+alen];
12323         delta += 3+alen+1;
12324         DIP("palignr $%d,%s,%s\n", (Int)d32,
12325                                   dis_buf,
12326                                   nameMMXReg(gregOfRM(modrm)));
12327      }
12328
12329      if (d32 == 0) {
12330         assign( res, mkexpr(sV) );
12331      }
12332      else if (d32 >= 1 && d32 <= 7) {
12333         assign(res,
12334                binop(Iop_Or64,
12335                      binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
12336                      binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
12337                     )));
12338      }
12339      else if (d32 == 8) {
12340        assign( res, mkexpr(dV) );
12341      }
12342      else if (d32 >= 9 && d32 <= 15) {
12343         assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
12344      }
12345      else if (d32 >= 16 && d32 <= 255) {
12346         assign( res, mkU64(0) );
12347      }
12348      else
12349         vassert(0);
12350
12351      putMMXReg( gregOfRM(modrm), mkexpr(res) );
12352      goto decode_success;
12353   }
12354
12355   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
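   /* Same idea as the MMX case, but on the 256-bit value dV:sV: shift
      right by 8*imm bits and keep the low 128 bits.  The work is done
      64 bits at a time, with dis_PALIGNR_XMM_helper combining the two
      halves that straddle a 64-bit boundary. */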
12356   if (sz == 2
12357       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12358      IRTemp sV  = newTemp(Ity_V128);
12359      IRTemp dV  = newTemp(Ity_V128);
12360      IRTemp sHi = newTemp(Ity_I64);
12361      IRTemp sLo = newTemp(Ity_I64);
12362      IRTemp dHi = newTemp(Ity_I64);
12363      IRTemp dLo = newTemp(Ity_I64);
12364      IRTemp rHi = newTemp(Ity_I64);
12365      IRTemp rLo = newTemp(Ity_I64);
12366
12367      modrm = insn[3];
12368      assign( dV, getXMMReg(gregOfRM(modrm)) );
12369
12370      if (epartIsReg(modrm)) {
12371         assign( sV, getXMMReg(eregOfRM(modrm)) );
12372         d32 = (UInt)insn[3+1];
12373         delta += 3+1+1;
12374         DIP("palignr $%d,%s,%s\n", (Int)d32,
12375                                    nameXMMReg(eregOfRM(modrm)),
12376                                    nameXMMReg(gregOfRM(modrm)));
12377      } else {
12378         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12379         gen_SEGV_if_not_16_aligned( addr );
12380         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12381         d32 = (UInt)insn[3+alen];
12382         delta += 3+alen+1;
12383         DIP("palignr $%d,%s,%s\n", (Int)d32,
12384                                    dis_buf,
12385                                    nameXMMReg(gregOfRM(modrm)));
12386      }
12387
12388      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12389      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12390      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12391      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12392
12393      if (d32 == 0) {
12394         assign( rHi, mkexpr(sHi) );
12395         assign( rLo, mkexpr(sLo) );
12396      }
12397      else if (d32 >= 1 && d32 <= 7) {
12398         assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
12399         assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
12400      }
12401      else if (d32 == 8) {
12402         assign( rHi, mkexpr(dLo) );
12403         assign( rLo, mkexpr(sHi) );
12404      }
12405      else if (d32 >= 9 && d32 <= 15) {
12406         assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
12407         assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
12408      }
12409      else if (d32 == 16) {
12410         assign( rHi, mkexpr(dHi) );
12411         assign( rLo, mkexpr(dLo) );
12412      }
12413      else if (d32 >= 17 && d32 <= 23) {
12414         assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
12415         assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
12416      }
12417      else if (d32 == 24) {
12418         assign( rHi, mkU64(0) );
12419         assign( rLo, mkexpr(dHi) );
12420      }
12421      else if (d32 >= 25 && d32 <= 31) {
12422         assign( rHi, mkU64(0) );
12423         assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
12424      }
12425      else if (d32 >= 32 && d32 <= 255) {
12426         assign( rHi, mkU64(0) );
12427         assign( rLo, mkU64(0) );
12428      }
12429      else
12430         vassert(0);
12431
12432      putXMMReg(
12433         gregOfRM(modrm),
12434         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12435      );
12436      goto decode_success;
12437   }
12438
12439   /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
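   /* Byte-wise semantics (scalar sketch, d = G, s = E):
         res[i] = (s[i] & 0x80) ? 0 : d[ s[i] & 7 ];
      hence the Perm8x8 on the low three index bits below, masked by
      the sign bit of each index byte. */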
12440   if (sz == 4
12441       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12442      IRTemp sV      = newTemp(Ity_I64);
12443      IRTemp dV      = newTemp(Ity_I64);
12444
12445      modrm = insn[3];
12446      do_MMX_preamble();
12447      assign( dV, getMMXReg(gregOfRM(modrm)) );
12448
12449      if (epartIsReg(modrm)) {
12450         assign( sV, getMMXReg(eregOfRM(modrm)) );
12451         delta += 3+1;
12452         DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12453                               nameMMXReg(gregOfRM(modrm)));
12454      } else {
12455         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12456         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12457         delta += 3+alen;
12458         DIP("pshufb %s,%s\n", dis_buf,
12459                               nameMMXReg(gregOfRM(modrm)));
12460      }
12461
12462      putMMXReg(
12463         gregOfRM(modrm),
12464         binop(
12465            Iop_And64,
12466            /* permute the lanes */
12467            binop(
12468               Iop_Perm8x8,
12469               mkexpr(dV),
12470               binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
12471            ),
12472            /* mask off lanes which have (index & 0x80) == 0x80 */
12473            unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
12474         )
12475      );
12476      goto decode_success;
12477   }
12478
12479   /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
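   /* In the 128-bit case the index is s[i] & 0x0F, so bit 3 of each
      index byte selects whether the result byte comes from the upper
      or lower 64 bits of dV.  That is what the maskBit3hi/lo terms
      below implement; the 0x80 test still forces a zero byte. */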
12480   if (sz == 2
12481       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12482      IRTemp sV         = newTemp(Ity_V128);
12483      IRTemp dV         = newTemp(Ity_V128);
12484      IRTemp sHi        = newTemp(Ity_I64);
12485      IRTemp sLo        = newTemp(Ity_I64);
12486      IRTemp dHi        = newTemp(Ity_I64);
12487      IRTemp dLo        = newTemp(Ity_I64);
12488      IRTemp rHi        = newTemp(Ity_I64);
12489      IRTemp rLo        = newTemp(Ity_I64);
12490      IRTemp sevens     = newTemp(Ity_I64);
12491      IRTemp mask0x80hi = newTemp(Ity_I64);
12492      IRTemp mask0x80lo = newTemp(Ity_I64);
12493      IRTemp maskBit3hi = newTemp(Ity_I64);
12494      IRTemp maskBit3lo = newTemp(Ity_I64);
12495      IRTemp sAnd7hi    = newTemp(Ity_I64);
12496      IRTemp sAnd7lo    = newTemp(Ity_I64);
12497      IRTemp permdHi    = newTemp(Ity_I64);
12498      IRTemp permdLo    = newTemp(Ity_I64);
12499
12500      modrm = insn[3];
12501      assign( dV, getXMMReg(gregOfRM(modrm)) );
12502
12503      if (epartIsReg(modrm)) {
12504         assign( sV, getXMMReg(eregOfRM(modrm)) );
12505         delta += 3+1;
12506         DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12507                               nameXMMReg(gregOfRM(modrm)));
12508      } else {
12509         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12510         gen_SEGV_if_not_16_aligned( addr );
12511         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12512         delta += 3+alen;
12513         DIP("pshufb %s,%s\n", dis_buf,
12514                               nameXMMReg(gregOfRM(modrm)));
12515      }
12516
12517      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12518      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12519      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12520      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12521
12522      assign( sevens, mkU64(0x0707070707070707ULL) );
12523
12524      /*
12525      mask0x80hi = Not(SarN8x8(sHi,7))
12526      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
12527      sAnd7hi    = And(sHi,sevens)
12528      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
12529                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
12530      rHi        = And(permdHi,mask0x80hi)
12531      */
12532      assign(
12533         mask0x80hi,
12534         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
12535
12536      assign(
12537         maskBit3hi,
12538         binop(Iop_SarN8x8,
12539               binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
12540               mkU8(7)));
12541
12542      assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
12543
12544      assign(
12545         permdHi,
12546         binop(
12547            Iop_Or64,
12548            binop(Iop_And64,
12549                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
12550                  mkexpr(maskBit3hi)),
12551            binop(Iop_And64,
12552                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
12553                  unop(Iop_Not64,mkexpr(maskBit3hi))) ));
12554
12555      assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
12556
12557      /* And the same for the lower half of the result.  What fun. */
12558
12559      assign(
12560         mask0x80lo,
12561         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
12562
12563      assign(
12564         maskBit3lo,
12565         binop(Iop_SarN8x8,
12566               binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
12567               mkU8(7)));
12568
12569      assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
12570
12571      assign(
12572         permdLo,
12573         binop(
12574            Iop_Or64,
12575            binop(Iop_And64,
12576                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
12577                  mkexpr(maskBit3lo)),
12578            binop(Iop_And64,
12579                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
12580                  unop(Iop_Not64,mkexpr(maskBit3lo))) ));
12581
12582      assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
12583
12584      putXMMReg(
12585         gregOfRM(modrm),
12586         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12587      );
12588      goto decode_success;
12589   }
12590
12591   /* ---------------------------------------------------- */
12592   /* --- end of the SSSE3 decoder.                    --- */
12593   /* ---------------------------------------------------- */
12594
12595   /* ---------------------------------------------------- */
12596   /* --- start of the SSE4 decoder                    --- */
12597   /* ---------------------------------------------------- */
12598
12599   /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
12600      (Partial implementation only -- only deal with cases where
12601      the rounding mode is specified directly by the immediate byte.)
12602      66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
12603      (Limitations ditto)
12604   */
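   /* imm[1:0] uses the usual Intel rounding-control encoding:
      0 = nearest (even), 1 = towards -inf, 2 = towards +inf,
      3 = towards zero -- the same numbering as IRRoundingMode, which
      is why (imm & 3) can be passed straight through below.  Note the
      ROUNDSD (0x0B) case is currently disabled in the test that
      follows. */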
12605   if (sz == 2
12606       && insn[0] == 0x0F && insn[1] == 0x3A
12607       && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
12608
12609      Bool   isD = insn[2] == 0x0B;
12610      IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
12611      IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
12612      Int    imm = 0;
12613
12614      modrm = insn[3];
12615
12616      if (epartIsReg(modrm)) {
12617         assign( src,
12618                 isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
12619                     : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
12620         imm = insn[3+1];
12621         if (imm & ~3) goto decode_failure;
12622         delta += 3+1+1;
12623         DIP( "rounds%c $%d,%s,%s\n",
12624              isD ? 'd' : 's',
12625              imm, nameXMMReg( eregOfRM(modrm) ),
12626                   nameXMMReg( gregOfRM(modrm) ) );
12627      } else {
12628         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12629         assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
12630         imm = insn[3+alen];
12631         if (imm & ~3) goto decode_failure;
12632         delta += 3+alen+1;
12633         DIP( "rounds%c $%d,%s,%s\n", isD ? 'd' : 's',
12634              imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
12635      }
12636
12637      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
12638         that encoding is the same as the encoding for IRRoundingMode,
12639         we can use that value directly in the IR as a rounding
12640         mode. */
12641      assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
12642                  mkU32(imm & 3), mkexpr(src)) );
12643
12644      if (isD)
12645         putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
12646      else
12647         putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
12648
12649      goto decode_success;
12650   }
12651
12652   /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
12653      which we can only decode if we're sure this is an AMD cpu that
12654      supports LZCNT, since otherwise it's BSR, which behaves
12655      differently. */
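   /* Worked example of the flag rules applied below: a 32-bit source
      of 0x00FF0000 gives a result of 8 with C and Z clear; a source
      of 0 gives 32 with C set; a source with bit 31 set gives 0 with
      Z set. */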
12656   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
12657       && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
12658      vassert(sz == 2 || sz == 4);
12659      /*IRType*/ ty  = szToITy(sz);
12660      IRTemp     src = newTemp(ty);
12661      modrm = insn[3];
12662      if (epartIsReg(modrm)) {
12663         assign(src, getIReg(sz, eregOfRM(modrm)));
12664         delta += 3+1;
12665         DIP("lzcnt%c %s, %s\n", nameISize(sz),
12666             nameIReg(sz, eregOfRM(modrm)),
12667             nameIReg(sz, gregOfRM(modrm)));
12668      } else {
12669         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12670         assign(src, loadLE(ty, mkexpr(addr)));
12671         delta += 3+alen;
12672         DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
12673             nameIReg(sz, gregOfRM(modrm)));
12674      }
12675
12676      IRTemp res = gen_LZCNT(ty, src);
12677      putIReg(sz, gregOfRM(modrm), mkexpr(res));
12678
12679      // Update flags.  This is pretty lame .. perhaps can do better
12680      // if this turns out to be performance critical.
12681      // O S A P are cleared.  Z is set if RESULT == 0.
12682      // C is set if SRC is zero.
12683      IRTemp src32 = newTemp(Ity_I32);
12684      IRTemp res32 = newTemp(Ity_I32);
12685      assign(src32, widenUto32(mkexpr(src)));
12686      assign(res32, widenUto32(mkexpr(res)));
12687
12688      IRTemp oszacp = newTemp(Ity_I32);
12689      assign(
12690         oszacp,
12691         binop(Iop_Or32,
12692               binop(Iop_Shl32,
12693                     unop(Iop_1Uto32,
12694                          binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
12695                     mkU8(X86G_CC_SHIFT_Z)),
12696               binop(Iop_Shl32,
12697                     unop(Iop_1Uto32,
12698                          binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
12699                     mkU8(X86G_CC_SHIFT_C))
12700         )
12701      );
12702
12703      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12704      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12705      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12706      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
12707
12708      goto decode_success;
12709   }
12710
12711   /* ---------------------------------------------------- */
12712   /* --- end of the SSE4 decoder                      --- */
12713   /* ---------------------------------------------------- */
12714
12715   after_sse_decoders:
12716
12717   /* ---------------------------------------------------- */
12718   /* --- deal with misc 0x67 pfxs (addr size override) -- */
12719   /* ---------------------------------------------------- */
12720
12721   /* 67 E3 = JCXZ (for JECXZ see below) */
12722   if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
12723      delta += 2;
12724      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
12725      delta ++;
12726      stmt( IRStmt_Exit(
12727               binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
12728               Ijk_Boring,
12729               IRConst_U32(d32)
12730            ));
12731      DIP("jcxz 0x%x\n", d32);
12732      goto decode_success;
12733   }
12734
12735   /* ---------------------------------------------------- */
12736   /* --- start of the baseline insn decoder            -- */
12737   /* ---------------------------------------------------- */
12738
12739   /* Get the primary opcode. */
12740   opc = getIByte(delta); delta++;
12741
12742   /* We get here if the current insn isn't SSE, or this CPU doesn't
12743      support SSE. */
12744
12745   switch (opc) {
12746
12747   /* ------------------------ Control flow --------------- */
12748
12749   case 0xC2: /* RET imm16 */
12750      d32 = getUDisp16(delta);
12751      delta += 2;
12752      dis_ret(d32);
12753      dres.whatNext = Dis_StopHere;
12754      DIP("ret %d\n", (Int)d32);
12755      break;
12756   case 0xC3: /* RET */
12757      dis_ret(0);
12758      dres.whatNext = Dis_StopHere;
12759      DIP("ret\n");
12760      break;
12761
12762   case 0xCF: /* IRET */
12763      /* Note, this is an extremely kludgey and limited implementation
12764         of iret.  All it really does is:
12765            popl %EIP; popl %CS; popl %EFLAGS.
12766         %CS is set but ignored (as it is in (eg) popw %cs). */
12767      t1 = newTemp(Ity_I32); /* ESP */
12768      t2 = newTemp(Ity_I32); /* new EIP */
12769      t3 = newTemp(Ity_I32); /* new CS */
12770      t4 = newTemp(Ity_I32); /* new EFLAGS */
12771      assign(t1, getIReg(4,R_ESP));
12772      assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
12773      assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
12774      assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
12775      /* Get stuff off stack */
12776      putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
12777      /* set %CS (which is ignored anyway) */
12778      putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
12779      /* set %EFLAGS */
12780      set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
12781      /* goto new EIP value */
12782      jmp_treg(Ijk_Ret,t2);
12783      dres.whatNext = Dis_StopHere;
12784      DIP("iret (very kludgey)\n");
12785      break;
12786
12787   case 0xE8: /* CALL J4 */
12788      d32 = getUDisp32(delta); delta += 4;
12789      d32 += (guest_EIP_bbstart+delta);
12790      /* (guest_eip_bbstart+delta) == return-to addr, d32 == call-to addr */
12791      if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
12792                                         && getIByte(delta) <= 0x5F) {
12793         /* Specially treat the position-independent-code idiom
12794                 call X
12795              X: popl %reg
12796            as
12797                 movl %eip, %reg.
12798            since this generates better code, but for no other reason. */
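         /* Concretely, a sequence like
               E8 00 00 00 00    call  next-insn
               58                popl  %eax
            becomes a single "movl $<address of the pop>, %eax", and
            the pop byte itself is stepped over. */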
12799         Int archReg = getIByte(delta) - 0x58;
12800         /* vex_printf("-- fPIC thingy\n"); */
12801         putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
12802         delta++; /* Step over the POP */
12803         DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
12804      } else {
12805         /* The normal sequence for a call. */
12806         t1 = newTemp(Ity_I32);
12807         assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
12808         putIReg(4, R_ESP, mkexpr(t1));
12809         storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
12810         if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
12811            /* follow into the call target. */
12812            dres.whatNext   = Dis_ResteerU;
12813            dres.continueAt = (Addr64)(Addr32)d32;
12814         } else {
12815            jmp_lit(Ijk_Call,d32);
12816            dres.whatNext = Dis_StopHere;
12817         }
12818         DIP("call 0x%x\n",d32);
12819      }
12820      break;
12821
12822//--    case 0xC8: /* ENTER */
12823//--       d32 = getUDisp16(eip); eip += 2;
12824//--       abyte = getIByte(delta); delta++;
12825//--
12826//--       vg_assert(sz == 4);
12827//--       vg_assert(abyte == 0);
12828//--
12829//--       t1 = newTemp(cb); t2 = newTemp(cb);
12830//--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
12831//--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
12832//--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
12833//--       uLiteral(cb, sz);
12834//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
12835//--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
12836//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
12837//--       if (d32) {
12838//--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
12839//--          uLiteral(cb, d32);
12840//--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
12841//--       }
12842//--       DIP("enter 0x%x, 0x%x", d32, abyte);
12843//--       break;
12844
12845   case 0xC9: /* LEAVE */
12846      vassert(sz == 4);
12847      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
12848      assign(t1, getIReg(4,R_EBP));
12849      /* First PUT ESP looks redundant, but need it because ESP must
12850         always be up-to-date for Memcheck to work... */
12851      putIReg(4, R_ESP, mkexpr(t1));
12852      assign(t2, loadLE(Ity_I32,mkexpr(t1)));
12853      putIReg(4, R_EBP, mkexpr(t2));
12854      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
12855      DIP("leave\n");
12856      break;
12857
12858   /* ---------------- Misc weird-ass insns --------------- */
12859
12860   case 0x27: /* DAA */
12861   case 0x2F: /* DAS */
12862   case 0x37: /* AAA */
12863   case 0x3F: /* AAS */
12864      /* An ugly implementation for some ugly instructions.  Oh
12865         well. */
12866      if (sz != 4) goto decode_failure;
12867      t1 = newTemp(Ity_I32);
12868      t2 = newTemp(Ity_I32);
12869      /* Make up a 32-bit value (t1), with the old value of AX in the
12870         bottom 16 bits, and the old OSZACP bitmask in the upper 16
12871         bits. */
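      /* That is, t1 = (OSZACP << 16) | AX; the helper returns its
         result in the same layout, so the new AX is recovered from
         bits 15:0 and the new flags from bits 31:16 below. */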
12872      assign(t1,
12873             binop(Iop_16HLto32,
12874                   unop(Iop_32to16,
12875                        mk_x86g_calculate_eflags_all()),
12876                   getIReg(2, R_EAX)
12877            ));
12878      /* Call the helper fn, to get a new AX and OSZACP value, and
12879         poke both back into the guest state.  Also pass the helper
12880         the actual opcode so it knows which of the 4 instructions it
12881         is doing the computation for. */
12882      vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
12883      assign(t2,
12884              mkIRExprCCall(
12885                 Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
12886                 &x86g_calculate_daa_das_aaa_aas,
12887                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
12888            ));
12889     putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
12890
12891     stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12892     stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12893     stmt( IRStmt_Put( OFFB_CC_DEP1,
12894                       binop(Iop_And32,
12895                             binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
12896                             mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
12897                                    | X86G_CC_MASK_A | X86G_CC_MASK_Z
12898                                    | X86G_CC_MASK_S| X86G_CC_MASK_O )
12899                            )
12900                      )
12901         );
12902     /* Set NDEP even though it isn't used.  This makes redundant-PUT
12903        elimination of previous stores to this field work better. */
12904     stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12905     switch (opc) {
12906        case 0x27: DIP("daa\n"); break;
12907        case 0x2F: DIP("das\n"); break;
12908        case 0x37: DIP("aaa\n"); break;
12909        case 0x3F: DIP("aas\n"); break;
12910        default: vassert(0);
12911     }
12912     break;
12913
12914   case 0xD4: /* AAM */
12915   case 0xD5: /* AAD */
12916      d32 = getIByte(delta); delta++;
12917      if (sz != 4 || d32 != 10) goto decode_failure;
12918      t1 = newTemp(Ity_I32);
12919      t2 = newTemp(Ity_I32);
12920      /* Make up a 32-bit value (t1), with the old value of AX in the
12921         bottom 16 bits, and the old OSZACP bitmask in the upper 16
12922         bits. */
12923      assign(t1,
12924             binop(Iop_16HLto32,
12925                   unop(Iop_32to16,
12926                        mk_x86g_calculate_eflags_all()),
12927                   getIReg(2, R_EAX)
12928            ));
12929      /* Call the helper fn, to get a new AX and OSZACP value, and
12930         poke both back into the guest state.  Also pass the helper
12931         the actual opcode so it knows which of the 2 instructions it
12932         is doing the computation for. */
12933      assign(t2,
12934              mkIRExprCCall(
12935                 Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
12936                 &x86g_calculate_aad_aam,
12937                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
12938            ));
12939      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
12940
12941      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12942      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12943      stmt( IRStmt_Put( OFFB_CC_DEP1,
12944                        binop(Iop_And32,
12945                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
12946                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
12947                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
12948                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
12949                             )
12950                       )
12951          );
12952      /* Set NDEP even though it isn't used.  This makes
12953         redundant-PUT elimination of previous stores to this field
12954         work better. */
12955      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12956
12957      DIP(opc == 0xD4 ? "aam\n" : "aad\n");
12958      break;
12959
12960   /* ------------------------ CWD/CDQ -------------------- */
12961
12962   case 0x98: /* CBW */
12963      if (sz == 4) {
12964         putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
12965         DIP("cwde\n");
12966      } else {
12967         vassert(sz == 2);
12968         putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
12969         DIP("cbw\n");
12970      }
12971      break;
12972
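      /* Sign-extend eAX into eDX by arithmetically shifting eAX right
         by (operand size - 1) bits: e.g. EAX = 0x80000000 gives
         EDX = 0xFFFFFFFF, while EAX = 0x00000001 gives EDX = 0. */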
12973   case 0x99: /* CWD/CDQ */
12974      ty = szToITy(sz);
12975      putIReg(sz, R_EDX,
12976                  binop(mkSizedOp(ty,Iop_Sar8),
12977                        getIReg(sz, R_EAX),
12978                        mkU8(sz == 2 ? 15 : 31)) );
12979      DIP(sz == 2 ? "cwd\n" : "cdq\n");
12980      break;
12981
12982   /* ------------------------ FPU ops -------------------- */
12983
12984   case 0x9E: /* SAHF */
12985      codegen_SAHF();
12986      DIP("sahf\n");
12987      break;
12988
12989   case 0x9F: /* LAHF */
12990      codegen_LAHF();
12991      DIP("lahf\n");
12992      break;
12993
12994   case 0x9B: /* FWAIT */
12995      /* ignore? */
12996      DIP("fwait\n");
12997      break;
12998
12999   case 0xD8:
13000   case 0xD9:
13001   case 0xDA:
13002   case 0xDB:
13003   case 0xDC:
13004   case 0xDD:
13005   case 0xDE:
13006   case 0xDF: {
13007      Int  delta0    = delta;
13008      Bool decode_OK = False;
13009      delta = dis_FPU ( &decode_OK, sorb, delta );
13010      if (!decode_OK) {
13011         delta = delta0;
13012         goto decode_failure;
13013      }
13014      break;
13015   }
13016
13017   /* ------------------------ INC & DEC ------------------ */
13018
13019   case 0x40: /* INC eAX */
13020   case 0x41: /* INC eCX */
13021   case 0x42: /* INC eDX */
13022   case 0x43: /* INC eBX */
13023   case 0x44: /* INC eSP */
13024   case 0x45: /* INC eBP */
13025   case 0x46: /* INC eSI */
13026   case 0x47: /* INC eDI */
13027      vassert(sz == 2 || sz == 4);
13028      ty = szToITy(sz);
13029      t1 = newTemp(ty);
13030      assign( t1, binop(mkSizedOp(ty,Iop_Add8),
13031                        getIReg(sz, (UInt)(opc - 0x40)),
13032                        mkU(ty,1)) );
13033      setFlags_INC_DEC( True, t1, ty );
13034      putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
13035      DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
13036      break;
13037
13038   case 0x48: /* DEC eAX */
13039   case 0x49: /* DEC eCX */
13040   case 0x4A: /* DEC eDX */
13041   case 0x4B: /* DEC eBX */
13042   case 0x4C: /* DEC eSP */
13043   case 0x4D: /* DEC eBP */
13044   case 0x4E: /* DEC eSI */
13045   case 0x4F: /* DEC eDI */
13046      vassert(sz == 2 || sz == 4);
13047      ty = szToITy(sz);
13048      t1 = newTemp(ty);
13049      assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
13050                        getIReg(sz, (UInt)(opc - 0x48)),
13051                        mkU(ty,1)) );
13052      setFlags_INC_DEC( False, t1, ty );
13053      putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
13054      DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
13055      break;
13056
13057   /* ------------------------ INT ------------------------ */
13058
13059   case 0xCC: /* INT 3 */
13060      jmp_lit(Ijk_SigTRAP,((Addr32)guest_EIP_bbstart)+delta);
13061      dres.whatNext = Dis_StopHere;
13062      DIP("int $0x3\n");
13063      break;
13064
13065   case 0xCD: /* INT imm8 */
13066      d32 = getIByte(delta); delta++;
13067
13068      /* For any of the cases where we emit a jump (that is, for all
13069         currently handled cases), it's important that all ArchRegs
13070         carry their up-to-date value at this point.  So we declare an
13071         end-of-block here, which forces any TempRegs caching ArchRegs
13072         to be flushed. */
13073
13074      /* Handle int $0x40 .. $0x43 by synthesising a segfault and a
13075         restart of this instruction (hence the "-2" two lines below,
13076         to get the restart EIP to be this instruction).  This is
13077         probably Linux-specific and it would be more correct to only
13078         do this if the VexAbiInfo says that is what we should do. */
13079      if (d32 >= 0x40 && d32 <= 0x43) {
13080         jmp_lit(Ijk_SigSEGV,((Addr32)guest_EIP_bbstart)+delta-2);
13081         dres.whatNext = Dis_StopHere;
13082         DIP("int $0x%x\n", (Int)d32);
13083         break;
13084      }
13085
13086      /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
13087         (darwin syscalls).  As part of this, note where we are, so we
13088         can back up the guest to this point if the syscall needs to
13089         be restarted. */
13090      if (d32 == 0x80) {
13091         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13092                           mkU32(guest_EIP_curr_instr) ) );
13093         jmp_lit(Ijk_Sys_int128,((Addr32)guest_EIP_bbstart)+delta);
13094         dres.whatNext = Dis_StopHere;
13095         DIP("int $0x80\n");
13096         break;
13097      }
13098      if (d32 == 0x81) {
13099         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13100                           mkU32(guest_EIP_curr_instr) ) );
13101         jmp_lit(Ijk_Sys_int129,((Addr32)guest_EIP_bbstart)+delta);
13102         dres.whatNext = Dis_StopHere;
13103         DIP("int $0x81\n");
13104         break;
13105      }
13106      if (d32 == 0x82) {
13107         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13108                           mkU32(guest_EIP_curr_instr) ) );
13109         jmp_lit(Ijk_Sys_int130,((Addr32)guest_EIP_bbstart)+delta);
13110         dres.whatNext = Dis_StopHere;
13111         DIP("int $0x82\n");
13112         break;
13113      }
13114
13115      /* none of the above */
13116      goto decode_failure;
13117
13118   /* ------------------------ Jcond, byte offset --------- */
13119
13120   case 0xEB: /* Jb (jump, byte offset) */
13121      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13122      delta++;
13123      if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13124         dres.whatNext   = Dis_ResteerU;
13125         dres.continueAt = (Addr64)(Addr32)d32;
13126      } else {
13127         jmp_lit(Ijk_Boring,d32);
13128         dres.whatNext = Dis_StopHere;
13129      }
13130      DIP("jmp-8 0x%x\n", d32);
13131      break;
13132
13133   case 0xE9: /* Jv (jump, 16/32 offset) */
13134      vassert(sz == 4); /* JRS added 2004 July 11 */
13135      d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
13136      delta += sz;
13137      if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13138         dres.whatNext   = Dis_ResteerU;
13139         dres.continueAt = (Addr64)(Addr32)d32;
13140      } else {
13141         jmp_lit(Ijk_Boring,d32);
13142         dres.whatNext = Dis_StopHere;
13143      }
13144      DIP("jmp 0x%x\n", d32);
13145      break;
13146
13147   case 0x70:
13148   case 0x71:
13149   case 0x72: /* JBb/JNAEb (jump below) */
13150   case 0x73: /* JNBb/JAEb (jump not below) */
13151   case 0x74: /* JZb/JEb (jump zero) */
13152   case 0x75: /* JNZb/JNEb (jump not zero) */
13153   case 0x76: /* JBEb/JNAb (jump below or equal) */
13154   case 0x77: /* JNBEb/JAb (jump not below or equal) */
13155   case 0x78: /* JSb (jump negative) */
13156   case 0x79: /* JNSb (jump not negative) */
13157   case 0x7A: /* JP (jump parity even) */
13158   case 0x7B: /* JNP/JPO (jump parity odd) */
13159   case 0x7C: /* JLb/JNGEb (jump less) */
13160   case 0x7D: /* JGEb/JNLb (jump greater or equal) */
13161   case 0x7E: /* JLEb/JNGb (jump less or equal) */
13162   case 0x7F: /* JGb/JNLEb (jump greater) */
13163    { Int    jmpDelta;
13164      HChar* comment  = "";
13165      jmpDelta = (Int)getSDisp8(delta);
13166      vassert(-128 <= jmpDelta && jmpDelta < 128);
13167      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
13168      delta++;
13169      if (resteerCisOk
13170          && vex_control.guest_chase_cond
13171          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13172          && jmpDelta < 0
13173          && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13174         /* Speculation: assume this backward branch is taken.  So we
13175            need to emit a side-exit to the insn following this one,
13176            on the negation of the condition, and continue at the
13177            branch target address (d32).  If we wind up back at the
13178            first instruction of the trace, just stop; it's better to
13179            let the IR loop unroller handle that case. */
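         /* For example, a backward "jl target" assumed taken becomes a
            side-exit "if (!L) goto next-insn" followed by continued
            disassembly at the branch target. */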
13180         stmt( IRStmt_Exit(
13181                  mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
13182                  Ijk_Boring,
13183                  IRConst_U32(guest_EIP_bbstart+delta) ) );
13184         dres.whatNext   = Dis_ResteerC;
13185         dres.continueAt = (Addr64)(Addr32)d32;
13186         comment = "(assumed taken)";
13187      }
13188      else
13189      if (resteerCisOk
13190          && vex_control.guest_chase_cond
13191          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13192          && jmpDelta >= 0
13193          && resteerOkFn( callback_opaque,
13194                          (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
13195         /* Speculation: assume this forward branch is not taken.  So
13196            we need to emit a side-exit to d32 (the dest) and continue
13197            disassembling at the insn immediately following this
13198            one. */
13199         stmt( IRStmt_Exit(
13200                  mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
13201                  Ijk_Boring,
13202                  IRConst_U32(d32) ) );
13203         dres.whatNext   = Dis_ResteerC;
13204         dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
13205         comment = "(assumed not taken)";
13206      }
13207      else {
13208         /* Conservative default translation - end the block at this
13209            point. */
13210         jcc_01( (X86Condcode)(opc - 0x70),
13211                 (Addr32)(guest_EIP_bbstart+delta), d32);
13212         dres.whatNext = Dis_StopHere;
13213      }
13214      DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
13215      break;
13216    }
13217
13218   case 0xE3: /* JECXZ (for JCXZ see above) */
13219      if (sz != 4) goto decode_failure;
13220      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13221      delta ++;
13222      stmt( IRStmt_Exit(
13223               binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
13224            Ijk_Boring,
13225            IRConst_U32(d32)
13226          ));
13227      DIP("jecxz 0x%x\n", d32);
13228      break;
13229
13230   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
13231   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
13232   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
13233    { /* Again, the docs say this uses ECX/CX as a count depending on
13234         the address size override, not the operand one.  Since we
13235         don't handle address size overrides, I guess that means
13236         ECX. */
13237      IRExpr* zbit  = NULL;
13238      IRExpr* count = NULL;
13239      IRExpr* cond  = NULL;
13240      HChar*  xtra  = NULL;
13241
13242      if (sz != 4) goto decode_failure;
13243      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13244      delta++;
13245      putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
13246
13247      count = getIReg(4,R_ECX);
13248      cond = binop(Iop_CmpNE32, count, mkU32(0));
13249      switch (opc) {
13250         case 0xE2:
13251            xtra = "";
13252            break;
13253         case 0xE1:
13254            xtra = "e";
13255            zbit = mk_x86g_calculate_condition( X86CondZ );
13256            cond = mkAnd1(cond, zbit);
13257            break;
13258         case 0xE0:
13259            xtra = "ne";
13260            zbit = mk_x86g_calculate_condition( X86CondNZ );
13261            cond = mkAnd1(cond, zbit);
13262            break;
13263         default:
13264            vassert(0);
13265      }
13266      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32)) );
13267
13268      DIP("loop%s 0x%x\n", xtra, d32);
13269      break;
13270    }
13271
13272   /* ------------------------ IMUL ----------------------- */
13273
13274   case 0x69: /* IMUL Iv, Ev, Gv */
13275      delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
13276      break;
13277   case 0x6B: /* IMUL Ib, Ev, Gv */
13278      delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
13279      break;
13280
13281   /* ------------------------ MOV ------------------------ */
13282
13283   case 0x88: /* MOV Gb,Eb */
13284      delta = dis_mov_G_E(sorb, 1, delta);
13285      break;
13286
13287   case 0x89: /* MOV Gv,Ev */
13288      delta = dis_mov_G_E(sorb, sz, delta);
13289      break;
13290
13291   case 0x8A: /* MOV Eb,Gb */
13292      delta = dis_mov_E_G(sorb, 1, delta);
13293      break;
13294
13295   case 0x8B: /* MOV Ev,Gv */
13296      delta = dis_mov_E_G(sorb, sz, delta);
13297      break;
13298
13299   case 0x8D: /* LEA M,Gv */
13300      if (sz != 4)
13301         goto decode_failure;
13302      modrm = getIByte(delta);
13303      if (epartIsReg(modrm))
13304         goto decode_failure;
13305      /* NOTE!  this is the one place where a segment override prefix
13306         has no effect on the address calculation.  Therefore we pass
13307         zero instead of sorb here. */
13308      addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
13309      delta += alen;
13310      putIReg(sz, gregOfRM(modrm), mkexpr(addr));
13311      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
13312                            nameIReg(sz,gregOfRM(modrm)));
13313      break;
13314
13315   case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
13316      delta = dis_mov_Sw_Ew(sorb, sz, delta);
13317      break;
13318
13319   case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
13320      delta = dis_mov_Ew_Sw(sorb, delta);
13321      break;
13322
13323   case 0xA0: /* MOV Ob,AL */
13324      sz = 1;
13325      /* Fall through ... */
13326   case 0xA1: /* MOV Ov,eAX */
13327      d32 = getUDisp32(delta); delta += 4;
13328      ty = szToITy(sz);
13329      addr = newTemp(Ity_I32);
13330      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
13331      putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
13332      DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
13333                                d32, nameIReg(sz,R_EAX));
13334      break;
13335
13336   case 0xA2: /* MOV AL,Ob */
13337      sz = 1;
13338      /* Fall through ... */
13339   case 0xA3: /* MOV eAX,Ov */
13340      d32 = getUDisp32(delta); delta += 4;
13341      ty = szToITy(sz);
13342      addr = newTemp(Ity_I32);
13343      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
13344      storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
13345      DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
13346                                sorbTxt(sorb), d32);
13347      break;
13348
13349   case 0xB0: /* MOV imm,AL */
13350   case 0xB1: /* MOV imm,CL */
13351   case 0xB2: /* MOV imm,DL */
13352   case 0xB3: /* MOV imm,BL */
13353   case 0xB4: /* MOV imm,AH */
13354   case 0xB5: /* MOV imm,CH */
13355   case 0xB6: /* MOV imm,DH */
13356   case 0xB7: /* MOV imm,BH */
13357      d32 = getIByte(delta); delta += 1;
13358      putIReg(1, opc-0xB0, mkU8(d32));
13359      DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
13360      break;
13361
13362   case 0xB8: /* MOV imm,eAX */
13363   case 0xB9: /* MOV imm,eCX */
13364   case 0xBA: /* MOV imm,eDX */
13365   case 0xBB: /* MOV imm,eBX */
13366   case 0xBC: /* MOV imm,eSP */
13367   case 0xBD: /* MOV imm,eBP */
13368   case 0xBE: /* MOV imm,eSI */
13369   case 0xBF: /* MOV imm,eDI */
13370      d32 = getUDisp(sz,delta); delta += sz;
13371      putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
13372      DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
13373      break;
13374
13375   case 0xC6: /* MOV Ib,Eb */
13376      sz = 1;
13377      goto do_Mov_I_E;
13378   case 0xC7: /* MOV Iv,Ev */
13379      goto do_Mov_I_E;
13380
13381   do_Mov_I_E:
13382      modrm = getIByte(delta);
13383      if (epartIsReg(modrm)) {
13384         delta++; /* mod/rm byte */
13385         d32 = getUDisp(sz,delta); delta += sz;
13386         putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
13387         DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
13388                                  nameIReg(sz,eregOfRM(modrm)));
13389      } else {
13390         addr = disAMode ( &alen, sorb, delta, dis_buf );
13391         delta += alen;
13392         d32 = getUDisp(sz,delta); delta += sz;
13393         storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
13394         DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
13395      }
13396      break;
13397
13398   /* ------------------------ opl imm, A ----------------- */
13399
13400   case 0x04: /* ADD Ib, AL */
13401      delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
13402      break;
13403   case 0x05: /* ADD Iv, eAX */
13404      delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
13405      break;
13406
13407   case 0x0C: /* OR Ib, AL */
13408      delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
13409      break;
13410   case 0x0D: /* OR Iv, eAX */
13411      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
13412      break;
13413
13414   case 0x14: /* ADC Ib, AL */
13415      delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
13416      break;
13417   case 0x15: /* ADC Iv, eAX */
13418      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
13419      break;
13420
13421   case 0x1C: /* SBB Ib, AL */
13422      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
13423      break;
13424   case 0x1D: /* SBB Iv, eAX */
13425      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
13426      break;
13427
13428   case 0x24: /* AND Ib, AL */
13429      delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
13430      break;
13431   case 0x25: /* AND Iv, eAX */
13432      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
13433      break;
13434
13435   case 0x2C: /* SUB Ib, AL */
13436      delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
13437      break;
13438   case 0x2D: /* SUB Iv, eAX */
13439      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
13440      break;
13441
13442   case 0x34: /* XOR Ib, AL */
13443      delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
13444      break;
13445   case 0x35: /* XOR Iv, eAX */
13446      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
13447      break;
13448
13449   case 0x3C: /* CMP Ib, AL */
13450      delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
13451      break;
13452   case 0x3D: /* CMP Iv, eAX */
13453      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
13454      break;
13455
13456   case 0xA8: /* TEST Ib, AL */
13457      delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
13458      break;
13459   case 0xA9: /* TEST Iv, eAX */
13460      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
13461      break;
13462
13463   /* ------------------------ opl Ev, Gv ----------------- */
13464
13465   case 0x02: /* ADD Eb,Gb */
13466      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
13467      break;
13468   case 0x03: /* ADD Ev,Gv */
13469      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
13470      break;
13471
13472   case 0x0A: /* OR Eb,Gb */
13473      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
13474      break;
13475   case 0x0B: /* OR Ev,Gv */
13476      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
13477      break;
13478
13479   case 0x12: /* ADC Eb,Gb */
13480      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
13481      break;
13482   case 0x13: /* ADC Ev,Gv */
13483      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
13484      break;
13485
13486   case 0x1A: /* SBB Eb,Gb */
13487      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
13488      break;
13489   case 0x1B: /* SBB Ev,Gv */
13490      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
13491      break;
13492
13493   case 0x22: /* AND Eb,Gb */
13494      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
13495      break;
13496   case 0x23: /* AND Ev,Gv */
13497      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
13498      break;
13499
13500   case 0x2A: /* SUB Eb,Gb */
13501      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
13502      break;
13503   case 0x2B: /* SUB Ev,Gv */
13504      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
13505      break;
13506
13507   case 0x32: /* XOR Eb,Gb */
13508      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
13509      break;
13510   case 0x33: /* XOR Ev,Gv */
13511      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
13512      break;
13513
13514   case 0x3A: /* CMP Eb,Gb */
13515      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
13516      break;
13517   case 0x3B: /* CMP Ev,Gv */
13518      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
13519      break;
13520
13521   case 0x84: /* TEST Eb,Gb */
13522      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
13523      break;
13524   case 0x85: /* TEST Ev,Gv */
13525      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
13526      break;
13527
13528   /* ------------------------ opl Gv, Ev ----------------- */
13529
13530   case 0x00: /* ADD Gb,Eb */
13531      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13532                            Iop_Add8, True, 1, delta, "add" );
13533      break;
13534   case 0x01: /* ADD Gv,Ev */
13535      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13536                            Iop_Add8, True, sz, delta, "add" );
13537      break;
13538
13539   case 0x08: /* OR Gb,Eb */
13540      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13541                            Iop_Or8, True, 1, delta, "or" );
13542      break;
13543   case 0x09: /* OR Gv,Ev */
13544      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13545                            Iop_Or8, True, sz, delta, "or" );
13546      break;
13547
13548   case 0x10: /* ADC Gb,Eb */
13549      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13550                            Iop_Add8, True, 1, delta, "adc" );
13551      break;
13552   case 0x11: /* ADC Gv,Ev */
13553      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13554                            Iop_Add8, True, sz, delta, "adc" );
13555      break;
13556
13557   case 0x18: /* SBB Gb,Eb */
13558      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13559                            Iop_Sub8, True, 1, delta, "sbb" );
13560      break;
13561   case 0x19: /* SBB Gv,Ev */
13562      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13563                            Iop_Sub8, True, sz, delta, "sbb" );
13564      break;
13565
13566   case 0x20: /* AND Gb,Eb */
13567      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13568                            Iop_And8, True, 1, delta, "and" );
13569      break;
13570   case 0x21: /* AND Gv,Ev */
13571      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13572                            Iop_And8, True, sz, delta, "and" );
13573      break;
13574
13575   case 0x28: /* SUB Gb,Eb */
13576      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13577                            Iop_Sub8, True, 1, delta, "sub" );
13578      break;
13579   case 0x29: /* SUB Gv,Ev */
13580      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13581                            Iop_Sub8, True, sz, delta, "sub" );
13582      break;
13583
13584   case 0x30: /* XOR Gb,Eb */
13585      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13586                            Iop_Xor8, True, 1, delta, "xor" );
13587      break;
13588   case 0x31: /* XOR Gv,Ev */
13589      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13590                            Iop_Xor8, True, sz, delta, "xor" );
13591      break;
13592
13593   case 0x38: /* CMP Gb,Eb */
13594      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13595                            Iop_Sub8, False, 1, delta, "cmp" );
13596      break;
13597   case 0x39: /* CMP Gv,Ev */
13598      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13599                            Iop_Sub8, False, sz, delta, "cmp" );
13600      break;
13601
13602   /* ------------------------ POP ------------------------ */
13603
13604   case 0x58: /* POP eAX */
13605   case 0x59: /* POP eCX */
13606   case 0x5A: /* POP eDX */
13607   case 0x5B: /* POP eBX */
13608   case 0x5D: /* POP eBP */
13609   case 0x5E: /* POP eSI */
13610   case 0x5F: /* POP eDI */
13611   case 0x5C: /* POP eSP */
13612      vassert(sz == 2 || sz == 4);
13613      t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
13614      assign(t2, getIReg(4, R_ESP));
13615      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
13616      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
13617      putIReg(sz, opc-0x58, mkexpr(t1));
13618      DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
13619      break;
13620
13621   case 0x9D: /* POPF */
13622      vassert(sz == 2 || sz == 4);
13623      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
13624      assign(t2, getIReg(4, R_ESP));
13625      assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
13626      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
13627
13628      /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
13629         value in t1. */
13630      set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
13631                                 ((Addr32)guest_EIP_bbstart)+delta );
13632
13633      DIP("popf%c\n", nameISize(sz));
13634      break;
13635
13636   case 0x61: /* POPA */
13637      /* This is almost certainly wrong for sz==2.  So ... */
13638      if (sz != 4) goto decode_failure;
13639
13640      /* t5 is the old %ESP value. */
13641      t5 = newTemp(Ity_I32);
13642      assign( t5, getIReg(4, R_ESP) );
13643
13644      /* Reload all the registers, except %esp. */
13645      putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
13646      putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
13647      putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
13648      putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
13649      /* ignore saved %ESP */
13650      putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
13651      putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
13652      putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
13653
13654      /* and move %ESP back up */
13655      putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
13656
13657      DIP("popa%c\n", nameISize(sz));
13658      break;
13659
13660   case 0x8F: /* POPL/POPW m32 */
13661     { Int    len;
13662       UChar  rm = getIByte(delta);
13663
13664       /* make sure this really is a POP (the reg field must be 0) */
13665       if (epartIsReg(rm) || gregOfRM(rm) != 0)
13666          goto decode_failure;
13667       /* and has correct size */
13668       if (sz != 4 && sz != 2)
13669          goto decode_failure;
13670       ty = szToITy(sz);
13671
13672       t1 = newTemp(Ity_I32); /* stack address */
13673       t3 = newTemp(ty); /* data */
13674       /* set t1 to ESP: t1 = ESP */
13675       assign( t1, getIReg(4, R_ESP) );
13676       /* load M[ESP] to virtual register t3: t3 = M[t1] */
13677       assign( t3, loadLE(ty, mkexpr(t1)) );
13678
13679       /* increase ESP; must be done before the STORE.  Intel manual says:
13680            If the ESP register is used as a base register for addressing
13681            a destination operand in memory, the POP instruction computes
13682            the effective address of the operand after it increments the
13683            ESP register.
13684       */
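            /* E.g. for "popl 4(%esp)", the address 4(%esp) is computed using
               the already-incremented %esp. */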
13685       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
13686
13687       /* resolve MODR/M */
13688       addr = disAMode ( &len, sorb, delta, dis_buf);
13689       storeLE( mkexpr(addr), mkexpr(t3) );
13690
13691       DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
13692
13693       delta += len;
13694       break;
13695     }
13696
13697   case 0x1F: /* POP %DS */
13698      dis_pop_segreg( R_DS, sz ); break;
13699   case 0x07: /* POP %ES */
13700      dis_pop_segreg( R_ES, sz ); break;
13701   case 0x17: /* POP %SS */
13702      dis_pop_segreg( R_SS, sz ); break;
13703
13704   /* ------------------------ PUSH ----------------------- */
13705
13706   case 0x50: /* PUSH eAX */
13707   case 0x51: /* PUSH eCX */
13708   case 0x52: /* PUSH eDX */
13709   case 0x53: /* PUSH eBX */
13710   case 0x55: /* PUSH eBP */
13711   case 0x56: /* PUSH eSI */
13712   case 0x57: /* PUSH eDI */
13713   case 0x54: /* PUSH eSP */
13714      /* This is the Right Way, in that the value to be pushed is
13715         established before %esp is changed, so that pushl %esp
13716         correctly pushes the old value. */
13717      vassert(sz == 2 || sz == 4);
13718      ty = sz==2 ? Ity_I16 : Ity_I32;
13719      t1 = newTemp(ty); t2 = newTemp(Ity_I32);
13720      assign(t1, getIReg(sz, opc-0x50));
13721      assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
13722      putIReg(4, R_ESP, mkexpr(t2) );
13723      storeLE(mkexpr(t2),mkexpr(t1));
13724      DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
13725      break;
13726
13727
13728   case 0x68: /* PUSH Iv */
13729      d32 = getUDisp(sz,delta); delta += sz;
13730      goto do_push_I;
13731   case 0x6A: /* PUSH Ib, sign-extended to sz */
13732      d32 = getSDisp8(delta); delta += 1;
13733      goto do_push_I;
13734   do_push_I:
13735      ty = szToITy(sz);
13736      t1 = newTemp(Ity_I32); t2 = newTemp(ty);
13737      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
13738      putIReg(4, R_ESP, mkexpr(t1) );
13739      /* stop mkU16 asserting if d32 is a negative 16-bit number
13740         (bug #132813) */
13741      if (ty == Ity_I16)
13742         d32 &= 0xFFFF;
13743      storeLE( mkexpr(t1), mkU(ty,d32) );
13744      DIP("push%c $0x%x\n", nameISize(sz), d32);
13745      break;
13746
13747   case 0x9C: /* PUSHF */ {
13748      vassert(sz == 2 || sz == 4);
13749
13750      t1 = newTemp(Ity_I32);
13751      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
13752      putIReg(4, R_ESP, mkexpr(t1) );
13753
13754      /* Calculate OSZACP, and patch in fixed fields as per
13755         Intel docs.
13756         - bit 1 is always 1
13757         - bit 9 is Interrupt Enable (should always be 1 in user mode?)
13758      */
13759      t2 = newTemp(Ity_I32);
13760      assign( t2, binop(Iop_Or32,
13761                        mk_x86g_calculate_eflags_all(),
13762                        mkU32( (1<<1)|(1<<9) ) ));
13763
13764      /* Patch in the D flag.  This can simply be a copy of bit 10 of
13765         the guest-state DFLAG word (which holds 1 or 0xFFFFFFFF). */
13766      t3 = newTemp(Ity_I32);
13767      assign( t3, binop(Iop_Or32,
13768                        mkexpr(t2),
13769                        binop(Iop_And32,
13770                              IRExpr_Get(OFFB_DFLAG,Ity_I32),
13771                              mkU32(1<<10)))
13772            );
13773
13774      /* And patch in the ID flag. */
13775      t4 = newTemp(Ity_I32);
13776      assign( t4, binop(Iop_Or32,
13777                        mkexpr(t3),
13778                        binop(Iop_And32,
13779                              binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
13780                                               mkU8(21)),
13781                              mkU32(1<<21)))
13782            );
13783
13784      /* And patch in the AC flag. */
13785      t5 = newTemp(Ity_I32);
13786      assign( t5, binop(Iop_Or32,
13787                        mkexpr(t4),
13788                        binop(Iop_And32,
13789                              binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
13790                                               mkU8(18)),
13791                              mkU32(1<<18)))
13792            );
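           /* t5 now holds OSZACP from the thunk, DF in bit 10, ID in bit 21,
              AC in bit 18, plus the always-set bits 1 and 9. */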
13793
13794      /* if sz==2, the stored value needs to be narrowed. */
13795      if (sz == 2)
13796        storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
13797      else
13798        storeLE( mkexpr(t1), mkexpr(t5) );
13799
13800      DIP("pushf%c\n", nameISize(sz));
13801      break;
13802   }
13803
13804   case 0x60: /* PUSHA */
13805      /* This is almost certainly wrong for sz==2.  So ... */
13806      if (sz != 4) goto decode_failure;
13807
13808      /* This is the Right Way, in that the value to be pushed is
13809         established before %esp is changed, so that pusha
13810         correctly pushes the old %esp value.  New value of %esp is
13811         pushed at start. */
13812      /* t0 is the %ESP value we're going to push. */
13813      t0 = newTemp(Ity_I32);
13814      assign( t0, getIReg(4, R_ESP) );
13815
13816      /* t5 will be the new %ESP value. */
13817      t5 = newTemp(Ity_I32);
13818      assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
13819
13820      /* Update guest state before prodding memory. */
13821      putIReg(4, R_ESP, mkexpr(t5));
13822
13823      /* Dump all the registers. */
13824      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
13825      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
13826      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
13827      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
13828      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
13829      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
13830      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
13831      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
13832
13833      DIP("pusha%c\n", nameISize(sz));
13834      break;
13835
13836   case 0x0E: /* PUSH %CS */
13837      dis_push_segreg( R_CS, sz ); break;
13838   case 0x1E: /* PUSH %DS */
13839      dis_push_segreg( R_DS, sz ); break;
13840   case 0x06: /* PUSH %ES */
13841      dis_push_segreg( R_ES, sz ); break;
13842   case 0x16: /* PUSH %SS */
13843      dis_push_segreg( R_SS, sz ); break;
13844
13845   /* ------------------------ SCAS et al ----------------- */
13846
13847   case 0xA4: /* MOVS, no REP prefix */
13848   case 0xA5:
13849      if (sorb != 0)
13850         goto decode_failure; /* else dis_string_op asserts */
13851      dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
13852      break;
13853
13854   case 0xA6: /* CMPSb, no REP prefix */
13855   case 0xA7:
13856      if (sorb != 0)
13857         goto decode_failure; /* else dis_string_op asserts */
13858      dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
13859      break;
13860
13861   case 0xAA: /* STOS, no REP prefix */
13862   case 0xAB:
13863      if (sorb != 0)
13864         goto decode_failure; /* else dis_string_op asserts */
13865      dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
13866      break;
13867
13868   case 0xAC: /* LODS, no REP prefix */
13869   case 0xAD:
13870      if (sorb != 0)
13871         goto decode_failure; /* else dis_string_op asserts */
13872      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
13873      break;
13874
13875   case 0xAE: /* SCAS, no REP prefix */
13876   case 0xAF:
13877      if (sorb != 0)
13878         goto decode_failure; /* else dis_string_op asserts */
13879      dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
13880      break;
13881
13882
13883   case 0xFC: /* CLD */
13884      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
13885      DIP("cld\n");
13886      break;
13887
13888   case 0xFD: /* STD */
13889      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
13890      DIP("std\n");
13891      break;
13892
13893   case 0xF8: /* CLC */
13894   case 0xF9: /* STC */
13895   case 0xF5: /* CMC */
13896      t0 = newTemp(Ity_I32);
13897      t1 = newTemp(Ity_I32);
13898      assign( t0, mk_x86g_calculate_eflags_all() );
13899      switch (opc) {
13900         case 0xF8:
13901            assign( t1, binop(Iop_And32, mkexpr(t0),
13902                                         mkU32(~X86G_CC_MASK_C)));
13903            DIP("clc\n");
13904            break;
13905         case 0xF9:
13906            assign( t1, binop(Iop_Or32, mkexpr(t0),
13907                                        mkU32(X86G_CC_MASK_C)));
13908            DIP("stc\n");
13909            break;
13910         case 0xF5:
13911            assign( t1, binop(Iop_Xor32, mkexpr(t0),
13912                                         mkU32(X86G_CC_MASK_C)));
13913            DIP("cmc\n");
13914            break;
13915         default:
13916            vpanic("disInstr(x86)(clc/stc/cmc)");
13917      }
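           /* Write the new value back via the COPY thunk: with
              X86G_CC_OP_COPY, DEP1 holds the complete eflags value. */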
13918      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13919      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13920      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
13921      /* Set NDEP even though it isn't used.  This makes redundant-PUT
13922         elimination of previous stores to this field work better. */
13923      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13924      break;
13925
13926   case 0xD6: /* SALC */
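           /* Undocumented insn: sets AL to 0xFF if CF is set, else to 0. */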
13927      t0 = newTemp(Ity_I32);
13928      t1 = newTemp(Ity_I32);
13929      assign( t0,  binop(Iop_And32,
13930                         mk_x86g_calculate_eflags_c(),
13931                         mkU32(1)) );
13932      assign( t1, binop(Iop_Sar32,
13933                        binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
13934                        mkU8(31)) );
13935      putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
13936      DIP("salc\n");
13937      break;
13938
13939   /* REPNE prefix insn */
13940   case 0xF2: {
13941      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
13942      if (sorb != 0) goto decode_failure;
13943      abyte = getIByte(delta); delta++;
13944
13945      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
13946      dres.whatNext = Dis_StopHere;
13947
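           /* Note: the byte-sized cases below (0xA4, 0xA6, etc) set sz to 1
              and then fall through into the word/long-sized case that
              follows. */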
13948      switch (abyte) {
13949      /* According to the Intel manual, "repne movs" should never occur, but
13950       * in practice it has happened, so allow for it here... */
13951      case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
13952      case 0xA5:
13953         dis_REP_op ( X86CondNZ, dis_MOVS, sz, eip_orig,
13954                                 guest_EIP_bbstart+delta, "repne movs" );
13955         break;
13956
13957      case 0xA6: sz = 1;   /* REPNE CMP<sz> */
13958      case 0xA7:
13959         dis_REP_op ( X86CondNZ, dis_CMPS, sz, eip_orig,
13960                                 guest_EIP_bbstart+delta, "repne cmps" );
13961         break;
13962
13963      case 0xAA: sz = 1;   /* REPNE STOS<sz> */
13964      case 0xAB:
13965         dis_REP_op ( X86CondNZ, dis_STOS, sz, eip_orig,
13966                                 guest_EIP_bbstart+delta, "repne stos" );
13967         break;
13968
13969      case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
13970      case 0xAF:
13971         dis_REP_op ( X86CondNZ, dis_SCAS, sz, eip_orig,
13972                                 guest_EIP_bbstart+delta, "repne scas" );
13973         break;
13974
13975      default:
13976         goto decode_failure;
13977      }
13978      break;
13979   }
13980
13981   /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
13982      for the rest, it means REP) */
13983   case 0xF3: {
13984      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
13985      if (sorb != 0) goto decode_failure;
13986      abyte = getIByte(delta); delta++;
13987
13988      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
13989      dres.whatNext = Dis_StopHere;
13990
13991      switch (abyte) {
13992      case 0xA4: sz = 1;   /* REP MOVS<sz> */
13993      case 0xA5:
13994         dis_REP_op ( X86CondAlways, dis_MOVS, sz, eip_orig,
13995                                     guest_EIP_bbstart+delta, "rep movs" );
13996         break;
13997
13998      case 0xA6: sz = 1;   /* REPE CMP<sz> */
13999      case 0xA7:
14000         dis_REP_op ( X86CondZ, dis_CMPS, sz, eip_orig,
14001                                guest_EIP_bbstart+delta, "repe cmps" );
14002         break;
14003
14004      case 0xAA: sz = 1;   /* REP STOS<sz> */
14005      case 0xAB:
14006         dis_REP_op ( X86CondAlways, dis_STOS, sz, eip_orig,
14007                                     guest_EIP_bbstart+delta, "rep stos" );
14008         break;
14009
14010      case 0xAC: sz = 1;   /* REP LODS<sz> */
14011      case 0xAD:
14012         dis_REP_op ( X86CondAlways, dis_LODS, sz, eip_orig,
14013                                     guest_EIP_bbstart+delta, "rep lods" );
14014         break;
14015
14016      case 0xAE: sz = 1;   /* REPE SCAS<sz> */
14017      case 0xAF:
14018         dis_REP_op ( X86CondZ, dis_SCAS, sz, eip_orig,
14019                                guest_EIP_bbstart+delta, "repe scas" );
14020         break;
14021
14022      case 0x90:           /* REP NOP (PAUSE) */
14023         /* a hint to the P4 re spin-wait loop */
14024         DIP("rep nop (P4 pause)\n");
14025         /* "observe" the hint.  The Vex client needs to be careful not
14026            to cause very long delays as a result, though. */
14027         jmp_lit(Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
14028         dres.whatNext = Dis_StopHere;
14029         break;
14030
14031      case 0xC3:           /* REP RET -- same as normal ret? */
14032         dis_ret(0);
14033         dres.whatNext = Dis_StopHere;
14034         DIP("rep ret\n");
14035         break;
14036
14037      default:
14038         goto decode_failure;
14039      }
14040      break;
14041   }
14042
14043   /* ------------------------ XCHG ----------------------- */
14044
14045   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
14046      prefix; hence it must be translated with an IRCAS (at least, the
14047      memory variant). */
14048   case 0x86: /* XCHG Gb,Eb */
14049      sz = 1;
14050      /* Fall through ... */
14051   case 0x87: /* XCHG Gv,Ev */
14052      modrm = getIByte(delta);
14053      ty = szToITy(sz);
14054      t1 = newTemp(ty); t2 = newTemp(ty);
14055      if (epartIsReg(modrm)) {
14056         assign(t1, getIReg(sz, eregOfRM(modrm)));
14057         assign(t2, getIReg(sz, gregOfRM(modrm)));
14058         putIReg(sz, gregOfRM(modrm), mkexpr(t1));
14059         putIReg(sz, eregOfRM(modrm), mkexpr(t2));
14060         delta++;
14061         DIP("xchg%c %s, %s\n",
14062             nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
14063                            nameIReg(sz,eregOfRM(modrm)));
14064      } else {
14065         *expect_CAS = True;
14066         addr = disAMode ( &alen, sorb, delta, dis_buf );
14067         assign( t1, loadLE(ty,mkexpr(addr)) );
14068         assign( t2, getIReg(sz,gregOfRM(modrm)) );
14069         casLE( mkexpr(addr),
14070                mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
14071         putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
14072         delta += alen;
14073         DIP("xchg%c %s, %s\n", nameISize(sz),
14074                                nameIReg(sz,gregOfRM(modrm)), dis_buf);
14075      }
14076      break;
14077
14078   case 0x90: /* XCHG eAX,eAX */
14079      DIP("nop\n");
14080      break;
14081   case 0x91: /* XCHG eAX,eCX */
14082   case 0x92: /* XCHG eAX,eDX */
14083   case 0x93: /* XCHG eAX,eBX */
14084   case 0x94: /* XCHG eAX,eSP */
14085   case 0x95: /* XCHG eAX,eBP */
14086   case 0x96: /* XCHG eAX,eSI */
14087   case 0x97: /* XCHG eAX,eDI */
14088      codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
14089      break;
14090
14091   /* ------------------------ XLAT ----------------------- */
14092
14093   case 0xD7: /* XLAT */
14094      if (sz != 4) goto decode_failure; /* sz == 2 is also allowed (0x66) */
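           /* AL = M[ EBX + zero-extended AL ], subject to any segment
              override. */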
14095      putIReg(
14096         1,
14097         R_EAX/*AL*/,
14098         loadLE(Ity_I8,
14099                handleSegOverride(
14100                   sorb,
14101                   binop(Iop_Add32,
14102                         getIReg(4, R_EBX),
14103                         unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
14104
14105      DIP("xlat%c [ebx]\n", nameISize(sz));
14106      break;
14107
14108   /* ------------------------ IN / OUT ----------------------- */
14109
14110   case 0xE4: /* IN imm8, AL */
14111      sz = 1;
14112      t1 = newTemp(Ity_I32);
14113      abyte = getIByte(delta); delta++;
14114      assign(t1, mkU32( abyte & 0xFF ));
14115      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
14116      goto do_IN;
14117   case 0xE5: /* IN imm8, eAX */
14118      vassert(sz == 2 || sz == 4);
14119      t1 = newTemp(Ity_I32);
14120      abyte = getIByte(delta); delta++;
14121      assign(t1, mkU32( abyte & 0xFF ));
14122      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
14123      goto do_IN;
14124   case 0xEC: /* IN %DX, AL */
14125      sz = 1;
14126      t1 = newTemp(Ity_I32);
14127      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14128      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14129                                         nameIReg(sz,R_EAX));
14130      goto do_IN;
14131   case 0xED: /* IN %DX, eAX */
14132      vassert(sz == 2 || sz == 4);
14133      t1 = newTemp(Ity_I32);
14134      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14135      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14136                                         nameIReg(sz,R_EAX));
14137      goto do_IN;
14138   do_IN: {
14139      /* At this point, sz indicates the width, and t1 is a 32-bit
14140         value giving port number. */
14141      IRDirty* d;
14142      vassert(sz == 1 || sz == 2 || sz == 4);
14143      ty = szToITy(sz);
14144      t2 = newTemp(Ity_I32);
14145      d = unsafeIRDirty_1_N(
14146             t2,
14147             0/*regparms*/,
14148             "x86g_dirtyhelper_IN",
14149             &x86g_dirtyhelper_IN,
14150             mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
14151          );
14152      /* do the call, dumping the result in t2. */
14153      stmt( IRStmt_Dirty(d) );
14154      putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
14155      break;
14156   }
14157
14158   case 0xE6: /* OUT AL, imm8 */
14159      sz = 1;
14160      t1 = newTemp(Ity_I32);
14161      abyte = getIByte(delta); delta++;
14162      assign( t1, mkU32( abyte & 0xFF ) );
14163      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
14164      goto do_OUT;
14165   case 0xE7: /* OUT eAX, imm8 */
14166      vassert(sz == 2 || sz == 4);
14167      t1 = newTemp(Ity_I32);
14168      abyte = getIByte(delta); delta++;
14169      assign( t1, mkU32( abyte & 0xFF ) );
14170      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
14171      goto do_OUT;
14172   case 0xEE: /* OUT AL, %DX */
14173      sz = 1;
14174      t1 = newTemp(Ity_I32);
14175      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
14176      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
14177                                          nameIReg(2,R_EDX));
14178      goto do_OUT;
14179   case 0xEF: /* OUT eAX, %DX */
14180      vassert(sz == 2 || sz == 4);
14181      t1 = newTemp(Ity_I32);
14182      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
14183      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
14184                                          nameIReg(2,R_EDX));
14185      goto do_OUT;
14186   do_OUT: {
14187      /* At this point, sz indicates the width, and t1 is a 32-bit
14188         value giving port number. */
14189      IRDirty* d;
14190      vassert(sz == 1 || sz == 2 || sz == 4);
14191      ty = szToITy(sz);
14192      d = unsafeIRDirty_0_N(
14193             0/*regparms*/,
14194             "x86g_dirtyhelper_OUT",
14195             &x86g_dirtyhelper_OUT,
14196             mkIRExprVec_3( mkexpr(t1),
14197                            widenUto32( getIReg(sz, R_EAX) ),
14198                            mkU32(sz) )
14199          );
14200      stmt( IRStmt_Dirty(d) );
14201      break;
14202   }
14203
14204   /* ------------------------ (Grp1 extensions) ---------- */
14205
14206   case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
14207                 case 0x80, but only in 32-bit mode. */
14208      /* fallthru */
14209   case 0x80: /* Grp1 Ib,Eb */
14210      modrm = getIByte(delta);
14211      am_sz = lengthAMode(delta);
14212      sz    = 1;
14213      d_sz  = 1;
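           /* the immediate byte follows the addressing mode, hence the
              fetch from delta + am_sz */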
14214      d32   = getUChar(delta + am_sz);
14215      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14216      break;
14217
14218   case 0x81: /* Grp1 Iv,Ev */
14219      modrm = getIByte(delta);
14220      am_sz = lengthAMode(delta);
14221      d_sz  = sz;
14222      d32   = getUDisp(d_sz, delta + am_sz);
14223      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14224      break;
14225
14226   case 0x83: /* Grp1 Ib,Ev */
14227      modrm = getIByte(delta);
14228      am_sz = lengthAMode(delta);
14229      d_sz  = 1;
14230      d32   = getSDisp8(delta + am_sz);
14231      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14232      break;
14233
14234   /* ------------------------ (Grp2 extensions) ---------- */
14235
14236   case 0xC0: { /* Grp2 Ib,Eb */
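           /* am_sz is the length of the addressing mode; d_sz is the number
              of immediate bytes following it (0 for the '1' and %cl count
              forms). */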
14237      Bool decode_OK = True;
14238      modrm = getIByte(delta);
14239      am_sz = lengthAMode(delta);
14240      d_sz  = 1;
14241      d32   = getUChar(delta + am_sz);
14242      sz    = 1;
14243      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14244                         mkU8(d32 & 0xFF), NULL, &decode_OK );
14245      if (!decode_OK)
14246         goto decode_failure;
14247      break;
14248   }
14249   case 0xC1: { /* Grp2 Ib,Ev */
14250      Bool decode_OK = True;
14251      modrm = getIByte(delta);
14252      am_sz = lengthAMode(delta);
14253      d_sz  = 1;
14254      d32   = getUChar(delta + am_sz);
14255      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14256                         mkU8(d32 & 0xFF), NULL, &decode_OK );
14257      if (!decode_OK)
14258         goto decode_failure;
14259      break;
14260   }
14261   case 0xD0: { /* Grp2 1,Eb */
14262      Bool decode_OK = True;
14263      modrm = getIByte(delta);
14264      am_sz = lengthAMode(delta);
14265      d_sz  = 0;
14266      d32   = 1;
14267      sz    = 1;
14268      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14269                         mkU8(d32), NULL, &decode_OK );
14270      if (!decode_OK)
14271         goto decode_failure;
14272      break;
14273   }
14274   case 0xD1: { /* Grp2 1,Ev */
14275      Bool decode_OK = True;
14276      modrm = getUChar(delta);
14277      am_sz = lengthAMode(delta);
14278      d_sz  = 0;
14279      d32   = 1;
14280      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14281                         mkU8(d32), NULL, &decode_OK );
14282      if (!decode_OK)
14283         goto decode_failure;
14284      break;
14285   }
14286   case 0xD2: { /* Grp2 CL,Eb */
14287      Bool decode_OK = True;
14288      modrm = getUChar(delta);
14289      am_sz = lengthAMode(delta);
14290      d_sz  = 0;
14291      sz    = 1;
14292      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14293                         getIReg(1,R_ECX), "%cl", &decode_OK );
14294      if (!decode_OK)
14295         goto decode_failure;
14296      break;
14297   }
14298   case 0xD3: { /* Grp2 CL,Ev */
14299      Bool decode_OK = True;
14300      modrm = getIByte(delta);
14301      am_sz = lengthAMode(delta);
14302      d_sz  = 0;
14303      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14304                         getIReg(1,R_ECX), "%cl", &decode_OK );
14305      if (!decode_OK)
14306         goto decode_failure;
14307      break;
14308   }
14309
14310   /* ------------------------ (Grp3 extensions) ---------- */
14311
14312   case 0xF6: { /* Grp3 Eb */
14313      Bool decode_OK = True;
14314      delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
14315      if (!decode_OK)
14316         goto decode_failure;
14317      break;
14318   }
14319   case 0xF7: { /* Grp3 Ev */
14320      Bool decode_OK = True;
14321      delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
14322      if (!decode_OK)
14323         goto decode_failure;
14324      break;
14325   }
14326
14327   /* ------------------------ (Grp4 extensions) ---------- */
14328
14329   case 0xFE: { /* Grp4 Eb */
14330      Bool decode_OK = True;
14331      delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
14332      if (!decode_OK)
14333         goto decode_failure;
14334      break;
14335   }
14336
14337   /* ------------------------ (Grp5 extensions) ---------- */
14338
14339   case 0xFF: { /* Grp5 Ev */
14340      Bool decode_OK = True;
14341      delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
14342      if (!decode_OK)
14343         goto decode_failure;
14344      break;
14345   }
14346
14347   /* ------------------------ Escapes to 2-byte opcodes -- */
14348
14349   case 0x0F: {
14350      opc = getIByte(delta); delta++;
14351      switch (opc) {
14352
14353      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
14354
14355      case 0xBA: { /* Grp8 Ib,Ev */
14356         Bool decode_OK = False;
14357         modrm = getUChar(delta);
14358         am_sz = lengthAMode(delta);
14359         d32   = getSDisp8(delta + am_sz);
14360         delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
14361                                am_sz, sz, d32, &decode_OK );
14362         if (!decode_OK)
14363            goto decode_failure;
14364         break;
14365      }
14366
14367      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
14368
14369      case 0xBC: /* BSF Gv,Ev */
14370         delta = dis_bs_E_G ( sorb, sz, delta, True );
14371         break;
14372      case 0xBD: /* BSR Gv,Ev */
14373         delta = dis_bs_E_G ( sorb, sz, delta, False );
14374         break;
14375
14376      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
14377
14378      case 0xC8: /* BSWAP %eax */
14379      case 0xC9:
14380      case 0xCA:
14381      case 0xCB:
14382      case 0xCC:
14383      case 0xCD:
14384      case 0xCE:
14385      case 0xCF: /* BSWAP %edi */
14386         /* AFAICS from the Intel docs, this only exists at size 4. */
14387         vassert(sz == 4);
14388         t1 = newTemp(Ity_I32);
14389         t2 = newTemp(Ity_I32);
14390         assign( t1, getIReg(4, opc-0xC8) );
14391
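              /* Reverse the byte order of t1:
                 t2 = (t1 << 24) | ((t1 << 8) & 0x00FF0000)
                    | ((t1 >> 8) & 0x0000FF00) | ((t1 >> 24) & 0x000000FF) */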
14392         assign( t2,
14393            binop(Iop_Or32,
14394               binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
14395            binop(Iop_Or32,
14396               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
14397                                mkU32(0x00FF0000)),
14398            binop(Iop_Or32,
14399               binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
14400                                mkU32(0x0000FF00)),
14401               binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
14402                                mkU32(0x000000FF) )
14403            )))
14404         );
14405
14406         putIReg(4, opc-0xC8, mkexpr(t2));
14407         DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
14408         break;
14409
14410      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
14411
14412      case 0xA3: /* BT Gv,Ev */
14413         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
14414         break;
14415      case 0xB3: /* BTR Gv,Ev */
14416         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
14417         break;
14418      case 0xAB: /* BTS Gv,Ev */
14419         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
14420         break;
14421      case 0xBB: /* BTC Gv,Ev */
14422         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
14423         break;
14424
14425      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
14426
14427      case 0x40: /* CMOVOb (cmov overflow) */
14428      case 0x41: /* CMOVNOb (cmov no overflow) */
14429      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
14430      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
14431      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
14432      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
14433      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
14434      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
14435      case 0x48: /* CMOVSb (cmov negative) */
14436      case 0x49: /* CMOVNSb (cmov not negative) */
14437      case 0x4A: /* CMOVP (cmov parity even) */
14438      case 0x4B: /* CMOVNP (cmov parity odd) */
14439      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
14440      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
14441      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
14442      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
14443         delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
14444         break;
14445
14446      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
14447
14448      case 0xB0: /* CMPXCHG Gb,Eb */
14449         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
14450         break;
14451      case 0xB1: /* CMPXCHG Gv,Ev */
14452         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
14453         break;
14454
14455      case 0xC7: { /* CMPXCHG8B Gv (0F C7 /1) */
14456         IRTemp expdHi    = newTemp(Ity_I32);
14457         IRTemp expdLo    = newTemp(Ity_I32);
14458         IRTemp dataHi    = newTemp(Ity_I32);
14459         IRTemp dataLo    = newTemp(Ity_I32);
14460         IRTemp oldHi     = newTemp(Ity_I32);
14461         IRTemp oldLo     = newTemp(Ity_I32);
14462         IRTemp flags_old = newTemp(Ity_I32);
14463         IRTemp flags_new = newTemp(Ity_I32);
14464         IRTemp success   = newTemp(Ity_I1);
14465
14466         /* Translate this using a DCAS, even if there is no LOCK
14467            prefix.  Life is too short to bother with generating two
14468            different translations for the with/without-LOCK-prefix
14469            cases. */
14470         *expect_CAS = True;
14471
14472         /* Decode, and generate address. */
14473         if (sz != 4) goto decode_failure;
14474         modrm = getIByte(delta);
14475         if (epartIsReg(modrm)) goto decode_failure;
14476         if (gregOfRM(modrm) != 1) goto decode_failure;
14477         addr = disAMode ( &alen, sorb, delta, dis_buf );
14478         delta += alen;
14479
14480         /* Get the expected and new values. */
14481         assign( expdHi, getIReg(4,R_EDX) );
14482         assign( expdLo, getIReg(4,R_EAX) );
14483         assign( dataHi, getIReg(4,R_ECX) );
14484         assign( dataLo, getIReg(4,R_EBX) );
14485
14486         /* Do the DCAS */
14487         stmt( IRStmt_CAS(
14488                  mkIRCAS( oldHi, oldLo,
14489                           Iend_LE, mkexpr(addr),
14490                           mkexpr(expdHi), mkexpr(expdLo),
14491                           mkexpr(dataHi), mkexpr(dataLo)
14492               )));
14493
14494         /* success when oldHi:oldLo == expdHi:expdLo */
14495         assign( success,
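              /* OR together the XORs of the two halves: the result is zero
                 iff both halves match, giving a single compare against 0. */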
14496                 binop(Iop_CasCmpEQ32,
14497                       binop(Iop_Or32,
14498                             binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
14499                             binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
14500                       ),
14501                       mkU32(0)
14502                 ));
14503
14504         /* If the DCAS is successful, that is to say oldHi:oldLo ==
14505            expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
14506            which is where they came from originally.  Both the actual
14507            contents of these two regs, and any shadow values, are
14508            unchanged.  If the DCAS fails then we're putting into
14509            EDX:EAX the value seen in memory. */
14510         putIReg(4, R_EDX,
14511                    IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
14512                                  mkexpr(oldHi),
14513                                  mkexpr(expdHi)
14514                ));
14515         putIReg(4, R_EAX,
14516                    IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
14517                                  mkexpr(oldLo),
14518                                  mkexpr(expdLo)
14519                ));
14520
14521         /* Copy the success bit into the Z flag and leave the others
14522            unchanged */
14523         assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
14524         assign(
14525            flags_new,
14526            binop(Iop_Or32,
14527                  binop(Iop_And32, mkexpr(flags_old),
14528                                   mkU32(~X86G_CC_MASK_Z)),
14529                  binop(Iop_Shl32,
14530                        binop(Iop_And32,
14531                              unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
14532                        mkU8(X86G_CC_SHIFT_Z)) ));
14533
14534         stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
14535         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
14536         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
14537         /* Set NDEP even though it isn't used.  This makes
14538            redundant-PUT elimination of previous stores to this field
14539            work better. */
14540         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
14541
14542         /* Sheesh.  Aren't you glad it was me and not you that had to
14543            write and validate all this grunge? */
14544
14545         DIP("cmpxchg8b %s\n", dis_buf);
14546         break;
14547      }
14548
14549      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
14550
14551      case 0xA2: { /* CPUID */
14552         /* Uses dirty helper:
14553               void dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
14554            declared to mod eax and ecx, wr ebx and edx
14555         */
14556         IRDirty* d     = NULL;
14557         HChar*   fName = NULL;
14558         void*    fAddr = NULL;
14559         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
14560            fName = "x86g_dirtyhelper_CPUID_sse2";
14561            fAddr = &x86g_dirtyhelper_CPUID_sse2;
14562         }
14563         else
14564         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
14565            fName = "x86g_dirtyhelper_CPUID_sse1";
14566            fAddr = &x86g_dirtyhelper_CPUID_sse1;
14567         }
14568         else
14569         if (archinfo->hwcaps == 0/*no SSE*/) {
14570            fName = "x86g_dirtyhelper_CPUID_sse0";
14571            fAddr = &x86g_dirtyhelper_CPUID_sse0;
14572         } else
14573            vpanic("disInstr(x86)(cpuid)");
14574
14575         vassert(fName); vassert(fAddr);
14576         d = unsafeIRDirty_0_N ( 0/*regparms*/,
14577                                 fName, fAddr, mkIRExprVec_0() );
14578         /* declare guest state effects */
14579         d->needsBBP = True;
14580         d->nFxState = 4;
14581         d->fxState[0].fx     = Ifx_Modify;
14582         d->fxState[0].offset = OFFB_EAX;
14583         d->fxState[0].size   = 4;
14584         d->fxState[1].fx     = Ifx_Write;
14585         d->fxState[1].offset = OFFB_EBX;
14586         d->fxState[1].size   = 4;
14587         d->fxState[2].fx     = Ifx_Modify;
14588         d->fxState[2].offset = OFFB_ECX;
14589         d->fxState[2].size   = 4;
14590         d->fxState[3].fx     = Ifx_Write;
14591         d->fxState[3].offset = OFFB_EDX;
14592         d->fxState[3].size   = 4;
14593         /* execute the dirty call, side-effecting guest state */
14594         stmt( IRStmt_Dirty(d) );
14595         /* CPUID is a serialising insn.  So, just in case someone is
14596            using it as a memory fence ... */
14597         stmt( IRStmt_MBE(Imbe_Fence) );
14598         DIP("cpuid\n");
14599         break;
14600      }
14601
14602//--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
14603//--             goto decode_failure;
14604//--
14605//--          t1 = newTemp(cb);
14606//--          t2 = newTemp(cb);
14607//--          t3 = newTemp(cb);
14608//--          t4 = newTemp(cb);
14609//--          uInstr0(cb, CALLM_S, 0);
14610//--
14611//--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
14612//--          uInstr1(cb, PUSH,  4, TempReg, t1);
14613//--
14614//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
14615//--          uLiteral(cb, 0);
14616//--          uInstr1(cb, PUSH,  4, TempReg, t2);
14617//--
14618//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
14619//--          uLiteral(cb, 0);
14620//--          uInstr1(cb, PUSH,  4, TempReg, t3);
14621//--
14622//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
14623//--          uLiteral(cb, 0);
14624//--          uInstr1(cb, PUSH,  4, TempReg, t4);
14625//--
14626//--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
14627//--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
14628//--
14629//--          uInstr1(cb, POP,   4, TempReg, t4);
14630//--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
14631//--
14632//--          uInstr1(cb, POP,   4, TempReg, t3);
14633//--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
14634//--
14635//--          uInstr1(cb, POP,   4, TempReg, t2);
14636//--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
14637//--
14638//--          uInstr1(cb, POP,   4, TempReg, t1);
14639//--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
14640//--
14641//--          uInstr0(cb, CALLM_E, 0);
14642//--          DIP("cpuid\n");
14643//--          break;
14644//--
14645      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
14646
14647      case 0xB6: /* MOVZXb Eb,Gv */
14648         if (sz != 2 && sz != 4)
14649            goto decode_failure;
14650         delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
14651         break;
14652
14653      case 0xB7: /* MOVZXw Ew,Gv */
14654         if (sz != 4)
14655            goto decode_failure;
14656         delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
14657         break;
14658
14659      case 0xBE: /* MOVSXb Eb,Gv */
14660         if (sz != 2 && sz != 4)
14661            goto decode_failure;
14662         delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
14663         break;
14664
14665      case 0xBF: /* MOVSXw Ew,Gv */
14666         if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
14667            goto decode_failure;
14668         delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
14669         break;
14670
14671//--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
14672//--
14673//--       case 0xC3: /* MOVNTI Gv,Ev */
14674//--          vg_assert(sz == 4);
14675//--          modrm = getUChar(eip);
14676//--          vg_assert(!epartIsReg(modrm));
14677//--          t1 = newTemp(cb);
14678//--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
14679//--          pair = disAMode ( cb, sorb, eip, dis_buf );
14680//--          t2 = LOW24(pair);
14681//--          eip += HI8(pair);
14682//--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
14683//--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
14684//--          break;
14685
14686      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
14687
14688      case 0xAF: /* IMUL Ev, Gv */
14689         delta = dis_mul_E_G ( sorb, sz, delta );
14690         break;
14691
14692      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
14693
14694      case 0x1F: /* NOP Ev (the long multi-byte NOP, 0F 1F) */
14695         modrm = getUChar(delta);
14696         if (epartIsReg(modrm)) goto decode_failure;
14697         addr = disAMode ( &alen, sorb, delta, dis_buf );
14698         delta += alen;
14699         DIP("nop%c %s\n", nameISize(sz), dis_buf);
14700         break;
14701
14702      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
14703      case 0x80: /* JOb (jump overflow) */
14704      case 0x81: /* JNOb (jump no overflow) */
14705      case 0x82: /* JBb/JNAEb (jump below) */
14706      case 0x83: /* JNBb/JAEb (jump not below) */
14707      case 0x84: /* JZb/JEb (jump zero) */
14708      case 0x85: /* JNZb/JNEb (jump not zero) */
14709      case 0x86: /* JBEb/JNAb (jump below or equal) */
14710      case 0x87: /* JNBEb/JAb (jump not below or equal) */
14711      case 0x88: /* JSb (jump negative) */
14712      case 0x89: /* JNSb (jump not negative) */
14713      case 0x8A: /* JP (jump parity even) */
14714      case 0x8B: /* JNP/JPO (jump parity odd) */
14715      case 0x8C: /* JLb/JNGEb (jump less) */
14716      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
14717      case 0x8E: /* JLEb/JNGb (jump less or equal) */
14718      case 0x8F: /* JGb/JNLEb (jump greater) */
14719       { Int    jmpDelta;
14720         HChar* comment  = "";
14721         jmpDelta = (Int)getUDisp32(delta);
14722         d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
14723         delta += 4;
14724         if (resteerCisOk
14725             && vex_control.guest_chase_cond
14726             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
14727             && jmpDelta < 0
14728             && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
14729            /* Speculation: assume this backward branch is taken.  So
14730               we need to emit a side-exit to the insn following this
14731               one, on the negation of the condition, and continue at
14732               the branch target address (d32).  If we wind up back at
14733               the first instruction of the trace, just stop; it's
13734               better to let the IR loop unroller handle that case. */
14735            stmt( IRStmt_Exit(
14736                     mk_x86g_calculate_condition((X86Condcode)
14737                                                 (1 ^ (opc - 0x80))),
14738                     Ijk_Boring,
14739                     IRConst_U32(guest_EIP_bbstart+delta) ) );
14740            dres.whatNext   = Dis_ResteerC;
14741            dres.continueAt = (Addr64)(Addr32)d32;
14742            comment = "(assumed taken)";
14743         }
14744         else
14745         if (resteerCisOk
14746             && vex_control.guest_chase_cond
14747             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
14748             && jmpDelta >= 0
14749             && resteerOkFn( callback_opaque,
14750                             (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
14751            /* Speculation: assume this forward branch is not taken.
14752               So we need to emit a side-exit to d32 (the dest) and
14753               continue disassembling at the insn immediately
14754               following this one. */
14755            stmt( IRStmt_Exit(
14756                     mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
14757                     Ijk_Boring,
14758                     IRConst_U32(d32) ) );
14759            dres.whatNext   = Dis_ResteerC;
14760            dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
14761            comment = "(assumed not taken)";
14762         }
14763         else {
14764            /* Conservative default translation - end the block at
14765               this point. */
14766            jcc_01( (X86Condcode)(opc - 0x80),
14767                    (Addr32)(guest_EIP_bbstart+delta), d32);
14768            dres.whatNext = Dis_StopHere;
14769         }
14770         DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
14771         break;
14772       }
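      /* Worked example of the speculation above (addresses are
         illustrative only, not from the source): suppose a backward
         "jz" -- bytes 0F 84 plus a negative disp32 -- is decoded at
         0x8000 and targets 0x7F00.  opc-0x80 is X86CondZ, and because
         the x86 condition encoding pairs each condition with its
         complement in the low bit, 1 ^ (opc-0x80) is X86CondNZ.  With
         chasing allowed, the block therefore gets

            if (NZ) goto 0x8006;    -- side exit to the following insn

         and disassembly continues at 0x7F00, the assumed-taken target.
         A forward conditional gets the mirror image: a side exit on the
         un-negated condition to the target, with disassembly continuing
         at the fall-through address. */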
14773
14774      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
14775      case 0x31: { /* RDTSC */
14776         IRTemp   val  = newTemp(Ity_I64);
14777         IRExpr** args = mkIRExprVec_0();
14778         IRDirty* d    = unsafeIRDirty_1_N (
14779                            val,
14780                            0/*regparms*/,
14781                            "x86g_dirtyhelper_RDTSC",
14782                            &x86g_dirtyhelper_RDTSC,
14783                            args
14784                         );
14785         /* execute the dirty call, dumping the result in val. */
14786         stmt( IRStmt_Dirty(d) );
14787         putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
14788         putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
14789         DIP("rdtsc\n");
14790         break;
14791      }
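      /* For example (value chosen purely for illustration): if the
         helper returned 0x00000123456789AB, Iop_64HIto32 would put
         0x00000123 in %edx and Iop_64to32 would put 0x456789AB in
         %eax, matching the EDX:EAX split that rdtsc specifies. */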
14792
14793      /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
14794
14795      case 0xA1: /* POP %FS */
14796         dis_pop_segreg( R_FS, sz ); break;
14797      case 0xA9: /* POP %GS */
14798         dis_pop_segreg( R_GS, sz ); break;
14799
14800      case 0xA0: /* PUSH %FS */
14801         dis_push_segreg( R_FS, sz ); break;
14802      case 0xA8: /* PUSH %GS */
14803         dis_push_segreg( R_GS, sz ); break;
14804
14805      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
14806      case 0x90:
14807      case 0x91:
14808      case 0x92: /* set-Bb/set-NAEb (jump below) */
14809      case 0x93: /* set-NBb/set-AEb (jump not below) */
14810      case 0x94: /* set-Zb/set-Eb (jump zero) */
14811      case 0x95: /* set-NZb/set-NEb (jump not zero) */
14812      case 0x96: /* set-BEb/set-NAb (jump below or equal) */
14813      case 0x97: /* set-NBEb/set-Ab (jump not below or equal) */
14814      case 0x98: /* set-Sb (jump negative) */
14815      case 0x99: /* set-NSb (jump not negative) */
14816      case 0x9A: /* set-P (jump parity even) */
14817      case 0x9B: /* set-NP (jump parity odd) */
14818      case 0x9C: /* set-Lb/set-NGEb (jump less) */
14819      case 0x9D: /* set-GEb/set-NLb (jump greater or equal) */
14820      case 0x9E: /* set-LEb/set-NGb (jump less or equal) */
14821      case 0x9F: /* set-Gb/set-NLEb (jump greater) */
14822         t1 = newTemp(Ity_I8);
14823         assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
14824         modrm = getIByte(delta);
14825         if (epartIsReg(modrm)) {
14826            delta++;
14827            putIReg(1, eregOfRM(modrm), mkexpr(t1));
14828            DIP("set%s %s\n", name_X86Condcode(opc-0x90),
14829                              nameIReg(1,eregOfRM(modrm)));
14830         } else {
14831           addr = disAMode ( &alen, sorb, delta, dis_buf );
14832           delta += alen;
14833           storeLE( mkexpr(addr), mkexpr(t1) );
14834           DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
14835         }
14836         break;
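      /* Worked example (illustrative): "setz %al" encodes as 0F 94 C0.
         The Z condition is computed as a 1-bit value, widened to 8 bits
         with Iop_1Uto8, and -- since modrm byte C0 selects register 0 --
         written to %al via putIReg(1, ...).  A memory destination takes
         the disAMode/storeLE path instead. */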
14837
14838      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
14839
14840      case 0xA4: /* SHLDv imm8,Gv,Ev */
14841         modrm = getIByte(delta);
14842         d32   = delta + lengthAMode(delta);
14843         vex_sprintf(dis_buf, "$%d", getIByte(d32));
14844         delta = dis_SHLRD_Gv_Ev (
14845                  sorb, delta, modrm, sz,
14846                  mkU8(getIByte(d32)), True, /* literal */
14847                  dis_buf, True );
14848         break;
14849      case 0xA5: /* SHLDv %cl,Gv,Ev */
14850         modrm = getIByte(delta);
14851         delta = dis_SHLRD_Gv_Ev (
14852                    sorb, delta, modrm, sz,
14853                    getIReg(1,R_ECX), False, /* not literal */
14854                    "%cl", True );
14855         break;
14856
14857      case 0xAC: /* SHRDv imm8,Gv,Ev */
14858         modrm = getIByte(delta);
14859         d32   = delta + lengthAMode(delta);
14860         vex_sprintf(dis_buf, "$%d", getIByte(d32));
14861         delta = dis_SHLRD_Gv_Ev (
14862                    sorb, delta, modrm, sz,
14863                    mkU8(getIByte(d32)), True, /* literal */
14864                    dis_buf, False );
14865         break;
14866      case 0xAD: /* SHRDv %cl,Gv,Ev */
14867         modrm = getIByte(delta);
14868         delta = dis_SHLRD_Gv_Ev (
14869                    sorb, delta, modrm, sz,
14870                    getIReg(1,R_ECX), False, /* not literal */
14871                    "%cl", False );
14872         break;
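      /* Note on the imm8 forms (0xA4/0xAC) above: delta still indexes
         the modrm byte at that point, so delta + lengthAMode(delta),
         saved in d32, indexes the byte just past the addressing mode,
         which is the imm8 shift count.  For example (illustration
         only), "shld $4, %edx, %eax" encodes as 0F A4 D0 04: the amode
         is the single byte D0, so the count 04 sits one byte past it. */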
14873
14874      /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
14875
14876      case 0x34:
14877         /* Simple implementation needing a long explanation.
14878
14879            sysenter is a kind of syscall entry.  The key thing here
14880            is that the return address is not known -- that is
14881            something that is beyond Vex's knowledge.  So this IR
14882            forces a return to the scheduler, which can do what it
14883            likes to simulate the sysenter, but it MUST set this
14884            thread's guest_EIP field with the continuation address
14885            before resuming execution.  If that doesn't happen, the
14886            thread will jump to address zero, which is probably
14887            fatal.
14888         */
14889
14890         /* Note where we are, so we can back up the guest to this
14891            point if the syscall needs to be restarted. */
14892         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
14893                           mkU32(guest_EIP_curr_instr) ) );
14894         jmp_lit(Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
14895         dres.whatNext = Dis_StopHere;
14896         DIP("sysenter\n");
14897         break;
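      /* A minimal sketch of the scheduler-side obligation described in
         the comment above (pseudo-code; the names are illustrative and
         not part of the Vex API):

            on seeing an exit with jumpkind Ijk_Sys_sysenter:
               guest_state.guest_EIP = <continuation address chosen by
                                        the syscall emulation>;
               resume running the guest from guest_state.guest_EIP;

         The IR above only records guest_EIP_curr_instr in
         IP_AT_SYSCALL (so the syscall can be restarted) and exits with
         a deliberately bogus next-EIP of zero. */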
14898
14899      /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
14900
14901      case 0xC0: { /* XADD Gb,Eb */
14902         Bool decodeOK;
14903         delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
14904         if (!decodeOK) goto decode_failure;
14905         break;
14906      }
14907      case 0xC1: { /* XADD Gv,Ev */
14908         Bool decodeOK;
14909         delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
14910         if (!decodeOK) goto decode_failure;
14911         break;
14912      }
14913
14914      /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
14915
14916      case 0x71:
14917      case 0x72:
14918      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
14919
14920      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
14921      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
14922      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
14923      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
14924
14925      case 0xFC:
14926      case 0xFD:
14927      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
14928
14929      case 0xEC:
14930      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
14931
14932      case 0xDC:
14933      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
14934
14935      case 0xF8:
14936      case 0xF9:
14937      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
14938
14939      case 0xE8:
14940      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
14941
14942      case 0xD8:
14943      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
14944
14945      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
14946      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
14947
14948      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
14949
14950      case 0x74:
14951      case 0x75:
14952      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
14953
14954      case 0x64:
14955      case 0x65:
14956      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
14957
14958      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
14959      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
14960      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
14961
14962      case 0x68:
14963      case 0x69:
14964      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
14965
14966      case 0x60:
14967      case 0x61:
14968      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
14969
14970      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
14971      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
14972      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
14973      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
14974
14975      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
14976      case 0xF2:
14977      case 0xF3:
14978
14979      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
14980      case 0xD2:
14981      case 0xD3:
14982
14983      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
14984      case 0xE2:
14985      {
14986         Int  delta0    = delta-1;
14987         Bool decode_OK = False;
14988
14989         /* If sz==2 this is SSE, and we assume the SSE decoder has
14990            already spotted those cases by now. */
14991         if (sz != 4)
14992            goto decode_failure;
14993
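         /* dis_MMX dispatches on the opcode byte itself, so hand it
            delta-1 (the opcode's position); if it cannot decode, put
            delta back and bail out. */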
14994         delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
14995         if (!decode_OK) {
14996            delta = delta0;
14997            goto decode_failure;
14998         }
14999         break;
15000      }
15001
15002      case 0x77: /* EMMS */
15003         if (sz != 4)
15004            goto decode_failure;
15005         do_EMMS_preamble();
15006         DIP("emms\n");
15007         break;
15008
15009      /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
15010      case 0x01: /* 0F 01 /0 -- SGDT */
15011                 /* 0F 01 /1 -- SIDT */
15012      {
15013          /* This is really revolting, but ... since each processor
15014             (core) only has one IDT and one GDT, just let the guest
15015             see it (pass-through semantics).  I can't see any way to
15016             construct a faked-up value, so don't bother to try. */
15017         modrm = getUChar(delta);
15018         addr = disAMode ( &alen, sorb, delta, dis_buf );
15019         delta += alen;
15020         if (epartIsReg(modrm)) goto decode_failure;
15021         if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
15022            goto decode_failure;
15023         switch (gregOfRM(modrm)) {
15024            case 0: DIP("sgdt %s\n", dis_buf); break;
15025            case 1: DIP("sidt %s\n", dis_buf); break;
15026            default: vassert(0); /*NOTREACHED*/
15027         }
15028
15029         IRDirty* d = unsafeIRDirty_0_N (
15030                          0/*regparms*/,
15031                          "x86g_dirtyhelper_SxDT",
15032                          &x86g_dirtyhelper_SxDT,
15033                          mkIRExprVec_2( mkexpr(addr),
15034                                         mkU32(gregOfRM(modrm)) )
15035                      );
15036         /* declare we're writing memory */
15037         d->mFx   = Ifx_Write;
15038         d->mAddr = mkexpr(addr);
15039         d->mSize = 6;
15040         stmt( IRStmt_Dirty(d) );
15041         break;
15042      }
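      /* The mFx/mAddr/mSize annotations are what tell instrumenting
         tools (Memcheck and friends) about the helper's side effect:
         here, a write of the 6-byte pseudo-descriptor (16-bit limit
         followed by 32-bit linear base) that sgdt/sidt store.  A dirty
         call that instead read guest memory would use the same pattern
         with Ifx_Read -- sketch only:

            d->mFx   = Ifx_Read;
            d->mAddr = mkexpr(addr);
            d->mSize = 6;
      */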
15043
15044      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
15045
15046      default:
15047         goto decode_failure;
15048   } /* switch (opc) for the 2-byte opcodes */
15049   goto decode_success;
15050   } /* case 0x0F: of primary opcode */
15051
15052   /* ------------------------ ??? ------------------------ */
15053
15054  default:
15055  decode_failure:
15056   /* All decode failures end up here. */
15057   vex_printf("vex x86->IR: unhandled instruction bytes: "
15058              "0x%x 0x%x 0x%x 0x%x\n",
15059              (Int)getIByte(delta_start+0),
15060              (Int)getIByte(delta_start+1),
15061              (Int)getIByte(delta_start+2),
15062              (Int)getIByte(delta_start+3) );
15063
15064   /* Tell the dispatcher that this insn cannot be decoded, and so has
15065      not been executed, and (currently) is the next to be executed.
15066      EIP should be up-to-date, since it was made so at the start of
15067      each insn, but nevertheless be paranoid and update it again
15068      right now. */
15069   stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
15070   jmp_lit(Ijk_NoDecode, guest_EIP_curr_instr);
15071   dres.whatNext = Dis_StopHere;
15072   dres.len = 0;
15073   /* We also need to say that a CAS is not expected now, regardless
15074      of what it might have been set to at the start of the function,
15075      since the IR that we've emitted just above (to synthesise a
15076      SIGILL) does not involve any CAS, and presumably no other IR has
15077      been emitted for this (non-decoded) insn. */
15078   *expect_CAS = False;
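   /* The net effect is a minimal block along the lines of (sketch of
      the pretty-printed IR, not literal output):

         PUT(EIP) = <address of the undecodable insn>
         goto {NoDecode} <that same address>

      which lets the dispatcher raise SIGILL at the right place. */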
15079   return dres;
15080
15081   } /* switch (opc) for the main (primary) opcode switch. */
15082
15083  decode_success:
15084   /* All decode successes end up here. */
15085   DIP("\n");
15086   dres.len = delta - delta_start;
15087   return dres;
15088}
15089
15090#undef DIP
15091#undef DIS
15092
15093
15094/*------------------------------------------------------------*/
15095/*--- Top-level fn                                         ---*/
15096/*------------------------------------------------------------*/
15097
15098/* Disassemble a single instruction into IR.  The instruction
15099   is located in host memory at &guest_code[delta]. */
15100
15101DisResult disInstr_X86 ( IRSB*        irsb_IN,
15102                         Bool         put_IP,
15103                         Bool         (*resteerOkFn) ( void*, Addr64 ),
15104                         Bool         resteerCisOk,
15105                         void*        callback_opaque,
15106                         UChar*       guest_code_IN,
15107                         Long         delta,
15108                         Addr64       guest_IP,
15109                         VexArch      guest_arch,
15110                         VexArchInfo* archinfo,
15111                         VexAbiInfo*  abiinfo,
15112                         Bool         host_bigendian_IN )
15113{
15114   Int       i, x1, x2;
15115   Bool      expect_CAS, has_CAS;
15116   DisResult dres;
15117
15118   /* Set globals (see top of this file) */
15119   vassert(guest_arch == VexArchX86);
15120   guest_code           = guest_code_IN;
15121   irsb                 = irsb_IN;
15122   host_is_bigendian    = host_bigendian_IN;
15123   guest_EIP_curr_instr = (Addr32)guest_IP;
15124   guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
15125
15126   x1 = irsb_IN->stmts_used;
15127   expect_CAS = False;
15128   dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
15129                             resteerCisOk,
15130                             callback_opaque,
15131                             delta, archinfo, abiinfo );
15132   x2 = irsb_IN->stmts_used;
15133   vassert(x2 >= x1);
15134
15135   /* See comment at the top of disInstr_X86_WRK for meaning of
15136      expect_CAS.  Here, we (sanity-)check for the presence/absence of
15137      IRCAS as directed by the returned expect_CAS value. */
15138   has_CAS = False;
15139   for (i = x1; i < x2; i++) {
15140      if (irsb_IN->stmts[i]->tag == Ist_CAS)
15141         has_CAS = True;
15142   }
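   /* For example (illustrative): a LOCK-prefixed "xadd %eax,(%ebx)" is
      expected both to set expect_CAS and to be translated using an
      IRCAS, while the unlocked form should do neither; a decoder bug
      that honours the LOCK prefix in only one of those two places
      shows up here as a mismatch. */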
15143
15144   if (expect_CAS != has_CAS) {
15145      /* inconsistency detected.  re-disassemble the instruction so as
15146         to generate a useful error message; then assert. */
15147      vex_traceflags |= VEX_TRACE_FE;
15148      dres = disInstr_X86_WRK ( &expect_CAS, put_IP, resteerOkFn,
15149                                resteerCisOk,
15150                                callback_opaque,
15151                                delta, archinfo, abiinfo );
15152      for (i = x1; i < x2; i++) {
15153         vex_printf("\t\t");
15154         ppIRStmt(irsb_IN->stmts[i]);
15155         vex_printf("\n");
15156      }
15157      /* Failure of this assertion is serious and denotes a bug in
15158         disInstr. */
15159      vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
15160   }
15161
15162   return dres;
15163}
15164
15165
15166/*--------------------------------------------------------------------*/
15167/*--- end                                         guest_x86_toIR.c ---*/
15168/*--------------------------------------------------------------------*/
15169