1
2/*--------------------------------------------------------------------*/
3/*--- begin                                       guest_x86_toIR.c ---*/
4/*--------------------------------------------------------------------*/
5
6/*
7   This file is part of Valgrind, a dynamic binary instrumentation
8   framework.
9
10   Copyright (C) 2004-2012 OpenWorks LLP
11      info@open-works.net
12
13   This program is free software; you can redistribute it and/or
14   modify it under the terms of the GNU General Public License as
15   published by the Free Software Foundation; either version 2 of the
16   License, or (at your option) any later version.
17
18   This program is distributed in the hope that it will be useful, but
19   WITHOUT ANY WARRANTY; without even the implied warranty of
20   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
21   General Public License for more details.
22
23   You should have received a copy of the GNU General Public License
24   along with this program; if not, write to the Free Software
25   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
26   02110-1301, USA.
27
28   The GNU General Public License is contained in the file COPYING.
29
30   Neither the names of the U.S. Department of Energy nor the
31   University of California nor the names of its contributors may be
32   used to endorse or promote products derived from this software
33   without prior written permission.
34*/
35
36/* Translates x86 code to IR. */
37
38/* TODO:
39
40   All Puts to CC_OP/CC_DEP1/CC_DEP2/CC_NDEP should really be checked
41   to ensure a 32-bit value is being written.
42
   FUCOMI(P): what happens to the A and S flags?  Currently they are
      forced to zero.
45
46   x87 FP Limitations:
47
48   * all arithmetic done at 64 bits
49
50   * no FP exceptions, except for handling stack over/underflow
51
52   * FP rounding mode observed only for float->int conversions
53     and int->float conversions which could lose accuracy, and
54     for float-to-float rounding.  For all other operations,
55     round-to-nearest is used, regardless.
56
57   * FP sin/cos/tan/sincos: C2 flag is always cleared.  IOW the
58     simulation claims the argument is in-range (-2^63 <= arg <= 2^63)
59     even when it isn't.
60
61   * some of the FCOM cases could do with testing -- not convinced
62     that the args are the right way round.
63
64   * FSAVE does not re-initialise the FPU; it should do
65
66   * FINIT not only initialises the FPU environment, it also
67     zeroes all the FP registers.  It should leave the registers
68     unchanged.
69
70   SAHF should cause eflags[1] == 1, and in fact it produces 0.  As
71   per Intel docs this bit has no meaning anyway.  Since PUSHF is the
72   only way to observe eflags[1], a proper fix would be to make that
73   bit be set by PUSHF.
74
75   The state of %eflags.AC (alignment check, bit 18) is recorded by
76   the simulation (viz, if you set it with popf then a pushf produces
77   the value you set it to), but it is otherwise ignored.  In
78   particular, setting it to 1 does NOT cause alignment checking to
79   happen.  Programs that set it to 1 and then rely on the resulting
80   SIGBUSs to inform them of misaligned accesses will not work.
81
82   Implementation of sysenter is necessarily partial.  sysenter is a
83   kind of system call entry.  When doing a sysenter, the return
84   address is not known -- that is something that is beyond Vex's
85   knowledge.  So the generated IR forces a return to the scheduler,
   which can do what it likes to simulate the sysenter, but it MUST
87   set this thread's guest_EIP field with the continuation address
88   before resuming execution.  If that doesn't happen, the thread will
89   jump to address zero, which is probably fatal.
90
91   This module uses global variables and so is not MT-safe (if that
92   should ever become relevant).
93
94   The delta values are 32-bit ints, not 64-bit ints.  That means
95   this module may not work right if run on a 64-bit host.  That should
96   be fixed properly, really -- if anyone ever wants to use Vex to
97   translate x86 code for execution on a 64-bit host.
98
99   casLE (implementation of lock-prefixed insns) and rep-prefixed
100   insns: the side-exit back to the start of the insn is done with
101   Ijk_Boring.  This is quite wrong, it should be done with
102   Ijk_NoRedir, since otherwise the side exit, which is intended to
103   restart the instruction for whatever reason, could go somewhere
104   entirely else.  Doing it right (with Ijk_NoRedir jumps) would make
105   no-redir jumps performance critical, at least for rep-prefixed
106   instructions, since all iterations thereof would involve such a
107   jump.  It's not such a big deal with casLE since the side exit is
108   only taken if the CAS fails, that is, the location is contended,
109   which is relatively unlikely.
110
111   XXXX: Nov 2009: handling of SWP on ARM suffers from the same
112   problem.
113
114   Note also, the test for CAS success vs failure is done using
115   Iop_CasCmp{EQ,NE}{8,16,32,64} rather than the ordinary
116   Iop_Cmp{EQ,NE} equivalents.  This is so as to tell Memcheck that it
117   shouldn't definedness-check these comparisons.  See
118   COMMENT_ON_CasCmpEQ in memcheck/mc_translate.c for
119   background/rationale.
120*/
121
122/* Performance holes:
123
124   - fcom ; fstsw %ax ; sahf
125     sahf does not update the O flag (sigh) and so O needs to
126     be computed.  This is done expensively; it would be better
127     to have a calculate_eflags_o helper.
128
129   - emwarns; some FP codes can generate huge numbers of these
130     if the fpucw is changed in an inner loop.  It would be
131     better for the guest state to have an emwarn-enable reg
132     which can be set zero or nonzero.  If it is zero, emwarns
133     are not flagged, and instead control just flows all the
134     way through bbs as usual.
135*/
136
137/* "Special" instructions.
138
139   This instruction decoder can decode three special instructions
140   which mean nothing natively (are no-ops as far as regs/mem are
141   concerned) but have meaning for supporting Valgrind.  A special
142   instruction is flagged by the 12-byte preamble C1C703 C1C70D C1C71D
143   C1C713 (in the standard interpretation, that means: roll $3, %edi;
144   roll $13, %edi; roll $29, %edi; roll $19, %edi).  Following that,
   one of the following 3 is allowed (standard interpretation in
   parentheses):
147
148      87DB (xchgl %ebx,%ebx)   %EDX = client_request ( %EAX )
149      87C9 (xchgl %ecx,%ecx)   %EAX = guest_NRADDR
150      87D2 (xchgl %edx,%edx)   call-noredir *%EAX
151
152   Any other bytes following the 12-byte preamble are illegal and
153   constitute a failure in instruction decoding.  This all assumes
154   that the preamble will never occur except in specific code
155   fragments designed for Valgrind to catch.
156
157   No prefixes may precede a "Special" instruction.
158*/
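
/* Illustrative example, assembled by hand from the description above
   (not taken from any particular client): the complete marker for a
   client request would be the byte sequence

      C1 C7 03  C1 C7 0D  C1 C7 1D  C1 C7 13    -- 12-byte preamble
      87 DB                                     -- xchgl %ebx,%ebx

   after which %EDX = client_request ( %EAX ). */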
159
160/* LOCK prefixed instructions.  These are translated using IR-level
161   CAS statements (IRCAS) and are believed to preserve atomicity, even
162   from the point of view of some other process racing against a
163   simulated one (presumably they communicate via a shared memory
164   segment).
165
166   Handlers which are aware of LOCK prefixes are:
167      dis_op2_G_E      (add, or, adc, sbb, and, sub, xor)
168      dis_cmpxchg_G_E  (cmpxchg)
169      dis_Grp1         (add, or, adc, sbb, and, sub, xor)
170      dis_Grp3         (not, neg)
171      dis_Grp4         (inc, dec)
172      dis_Grp5         (inc, dec)
173      dis_Grp8_Imm     (bts, btc, btr)
174      dis_bt_G_E       (bts, btc, btr)
175      dis_xadd_G_E     (xadd)
176*/
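
/* Sketch of the general scheme (illustrative only; the real handlers
   are below): for a locked read-modify-write such as "lock incl (%eax)",
   the generated IR loads the old value into a temporary, computes the
   new value, and hands both to casLE().  casLE issues an IRCAS whose
   expected value is the old value; if the CAS fails, a side exit
   restarts the whole instruction (see the comments on casLE and the
   Ijk_Boring caveat at the top of this file). */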
177
178
179#include "libvex_basictypes.h"
180#include "libvex_ir.h"
181#include "libvex.h"
182#include "libvex_guest_x86.h"
183
184#include "main_util.h"
185#include "main_globals.h"
186#include "guest_generic_bb_to_IR.h"
187#include "guest_generic_x87.h"
188#include "guest_x86_defs.h"
189
190
191/*------------------------------------------------------------*/
192/*--- Globals                                              ---*/
193/*------------------------------------------------------------*/
194
195/* These are set at the start of the translation of an insn, right
196   down in disInstr_X86, so that we don't have to pass them around
197   endlessly.  They are all constant during the translation of any
198   given insn. */
199
200/* We need to know this to do sub-register accesses correctly. */
201static Bool host_is_bigendian;
202
203/* Pointer to the guest code area (points to start of BB, not to the
204   insn being processed). */
205static UChar* guest_code;
206
207/* The guest address corresponding to guest_code[0]. */
208static Addr32 guest_EIP_bbstart;
209
210/* The guest address for the instruction currently being
211   translated. */
212static Addr32 guest_EIP_curr_instr;
213
214/* The IRSB* into which we're generating code. */
215static IRSB* irsb;
216
217
218/*------------------------------------------------------------*/
219/*--- Debugging output                                     ---*/
220/*------------------------------------------------------------*/
221
222#define DIP(format, args...)           \
223   if (vex_traceflags & VEX_TRACE_FE)  \
224      vex_printf(format, ## args)
225
226#define DIS(buf, format, args...)      \
227   if (vex_traceflags & VEX_TRACE_FE)  \
228      vex_sprintf(buf, format, ## args)
229
230
231/*------------------------------------------------------------*/
232/*--- Offsets of various parts of the x86 guest state.     ---*/
233/*------------------------------------------------------------*/
234
235#define OFFB_EAX       offsetof(VexGuestX86State,guest_EAX)
236#define OFFB_EBX       offsetof(VexGuestX86State,guest_EBX)
237#define OFFB_ECX       offsetof(VexGuestX86State,guest_ECX)
238#define OFFB_EDX       offsetof(VexGuestX86State,guest_EDX)
239#define OFFB_ESP       offsetof(VexGuestX86State,guest_ESP)
240#define OFFB_EBP       offsetof(VexGuestX86State,guest_EBP)
241#define OFFB_ESI       offsetof(VexGuestX86State,guest_ESI)
242#define OFFB_EDI       offsetof(VexGuestX86State,guest_EDI)
243
244#define OFFB_EIP       offsetof(VexGuestX86State,guest_EIP)
245
246#define OFFB_CC_OP     offsetof(VexGuestX86State,guest_CC_OP)
247#define OFFB_CC_DEP1   offsetof(VexGuestX86State,guest_CC_DEP1)
248#define OFFB_CC_DEP2   offsetof(VexGuestX86State,guest_CC_DEP2)
249#define OFFB_CC_NDEP   offsetof(VexGuestX86State,guest_CC_NDEP)
250
251#define OFFB_FPREGS    offsetof(VexGuestX86State,guest_FPREG[0])
252#define OFFB_FPTAGS    offsetof(VexGuestX86State,guest_FPTAG[0])
253#define OFFB_DFLAG     offsetof(VexGuestX86State,guest_DFLAG)
254#define OFFB_IDFLAG    offsetof(VexGuestX86State,guest_IDFLAG)
255#define OFFB_ACFLAG    offsetof(VexGuestX86State,guest_ACFLAG)
256#define OFFB_FTOP      offsetof(VexGuestX86State,guest_FTOP)
257#define OFFB_FC3210    offsetof(VexGuestX86State,guest_FC3210)
258#define OFFB_FPROUND   offsetof(VexGuestX86State,guest_FPROUND)
259
260#define OFFB_CS        offsetof(VexGuestX86State,guest_CS)
261#define OFFB_DS        offsetof(VexGuestX86State,guest_DS)
262#define OFFB_ES        offsetof(VexGuestX86State,guest_ES)
263#define OFFB_FS        offsetof(VexGuestX86State,guest_FS)
264#define OFFB_GS        offsetof(VexGuestX86State,guest_GS)
265#define OFFB_SS        offsetof(VexGuestX86State,guest_SS)
266#define OFFB_LDT       offsetof(VexGuestX86State,guest_LDT)
267#define OFFB_GDT       offsetof(VexGuestX86State,guest_GDT)
268
269#define OFFB_SSEROUND  offsetof(VexGuestX86State,guest_SSEROUND)
270#define OFFB_XMM0      offsetof(VexGuestX86State,guest_XMM0)
271#define OFFB_XMM1      offsetof(VexGuestX86State,guest_XMM1)
272#define OFFB_XMM2      offsetof(VexGuestX86State,guest_XMM2)
273#define OFFB_XMM3      offsetof(VexGuestX86State,guest_XMM3)
274#define OFFB_XMM4      offsetof(VexGuestX86State,guest_XMM4)
275#define OFFB_XMM5      offsetof(VexGuestX86State,guest_XMM5)
276#define OFFB_XMM6      offsetof(VexGuestX86State,guest_XMM6)
277#define OFFB_XMM7      offsetof(VexGuestX86State,guest_XMM7)
278
279#define OFFB_EMWARN    offsetof(VexGuestX86State,guest_EMWARN)
280
281#define OFFB_TISTART   offsetof(VexGuestX86State,guest_TISTART)
282#define OFFB_TILEN     offsetof(VexGuestX86State,guest_TILEN)
283#define OFFB_NRADDR    offsetof(VexGuestX86State,guest_NRADDR)
284
285#define OFFB_IP_AT_SYSCALL offsetof(VexGuestX86State,guest_IP_AT_SYSCALL)
286
287
288/*------------------------------------------------------------*/
289/*--- Helper bits and pieces for deconstructing the        ---*/
290/*--- x86 insn stream.                                     ---*/
291/*------------------------------------------------------------*/
292
293/* This is the Intel register encoding -- integer regs. */
294#define R_EAX 0
295#define R_ECX 1
296#define R_EDX 2
297#define R_EBX 3
298#define R_ESP 4
299#define R_EBP 5
300#define R_ESI 6
301#define R_EDI 7
302
303#define R_AL (0+R_EAX)
304#define R_AH (4+R_EAX)
305
306/* This is the Intel register encoding -- segment regs. */
307#define R_ES 0
308#define R_CS 1
309#define R_SS 2
310#define R_DS 3
311#define R_FS 4
312#define R_GS 5
313
314
/* Add a statement to the list held by "irsb". */
316static void stmt ( IRStmt* st )
317{
318   addStmtToIRSB( irsb, st );
319}
320
321/* Generate a new temporary of the given type. */
322static IRTemp newTemp ( IRType ty )
323{
324   vassert(isPlausibleIRType(ty));
325   return newIRTemp( irsb->tyenv, ty );
326}
327
328/* Various simple conversions */
329
330static UInt extend_s_8to32( UInt x )
331{
332   return (UInt)((((Int)x) << 24) >> 24);
333}
334
335static UInt extend_s_16to32 ( UInt x )
336{
337   return (UInt)((((Int)x) << 16) >> 16);
338}
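
/* For example, extend_s_8to32(0x80) == 0xFFFFFF80 and
   extend_s_16to32(0x8000) == 0xFFFF8000; values with the sign bit
   clear are returned unchanged. */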
339
340/* Fetch a byte from the guest insn stream. */
341static UChar getIByte ( Int delta )
342{
343   return guest_code[delta];
344}
345
346/* Extract the reg field from a modRM byte. */
347static Int gregOfRM ( UChar mod_reg_rm )
348{
349   return (Int)( (mod_reg_rm >> 3) & 7 );
350}
351
/* Figure out whether the mod and rm parts of a modRM byte refer to a
   register or to memory.  If a register, the byte will have the form
   11XXXYYY, where YYY is the register number. */
355static Bool epartIsReg ( UChar mod_reg_rm )
356{
357   return toBool(0xC0 == (mod_reg_rm & 0xC0));
358}
359
360/* ... and extract the register number ... */
361static Int eregOfRM ( UChar mod_reg_rm )
362{
363   return (Int)(mod_reg_rm & 0x7);
364}
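
/* Worked example: for the modRM byte 0xD9 (binary 11 011 001),
   epartIsReg() is True, gregOfRM() == 3 (%ebx) and eregOfRM() == 1
   (%ecx). */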
365
/* Get an 8/16/32-bit unsigned value out of the insn stream. */
367
368static UChar getUChar ( Int delta )
369{
370   UChar v = guest_code[delta+0];
371   return toUChar(v);
372}
373
374static UInt getUDisp16 ( Int delta )
375{
376   UInt v = guest_code[delta+1]; v <<= 8;
377   v |= guest_code[delta+0];
378   return v & 0xFFFF;
379}
380
381static UInt getUDisp32 ( Int delta )
382{
383   UInt v = guest_code[delta+3]; v <<= 8;
384   v |= guest_code[delta+2]; v <<= 8;
385   v |= guest_code[delta+1]; v <<= 8;
386   v |= guest_code[delta+0];
387   return v;
388}
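
/* Example: if the next four guest bytes are 78 56 34 12, getUDisp32
   returns 0x12345678, since displacements are stored little-endian. */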
389
390static UInt getUDisp ( Int size, Int delta )
391{
392   switch (size) {
393      case 4: return getUDisp32(delta);
394      case 2: return getUDisp16(delta);
395      case 1: return (UInt)getUChar(delta);
396      default: vpanic("getUDisp(x86)");
397   }
398   return 0; /*notreached*/
399}
400
401
402/* Get a byte value out of the insn stream and sign-extend to 32
403   bits. */
404static UInt getSDisp8 ( Int delta )
405{
406   return extend_s_8to32( (UInt) (guest_code[delta]) );
407}
408
409static UInt getSDisp16 ( Int delta0 )
410{
411   UChar* eip = (UChar*)(&guest_code[delta0]);
412   UInt d = *eip++;
413   d |= ((*eip++) << 8);
414   return extend_s_16to32(d);
415}
416
417static UInt getSDisp ( Int size, Int delta )
418{
419   switch (size) {
420      case 4: return getUDisp32(delta);
421      case 2: return getSDisp16(delta);
422      case 1: return getSDisp8(delta);
423      default: vpanic("getSDisp(x86)");
   }
   return 0; /*notreached*/
426}
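
/* Note that the 4-byte case simply reuses getUDisp32, since
   sign-extending a 32-bit value to 32 bits is the identity. */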
427
428
429/*------------------------------------------------------------*/
430/*--- Helpers for constructing IR.                         ---*/
431/*------------------------------------------------------------*/
432
/* Create a 1/2/4 byte read of an x86 integer register.  For 16/8 bit
434   register references, we need to take the host endianness into
435   account.  Supplied value is 0 .. 7 and in the Intel instruction
436   encoding. */
437
438static IRType szToITy ( Int n )
439{
440   switch (n) {
441      case 1: return Ity_I8;
442      case 2: return Ity_I16;
443      case 4: return Ity_I32;
444      default: vpanic("szToITy(x86)");
445   }
446}
447
/* On a little-endian host, the less significant bits of the guest
   registers are at lower addresses.  Therefore, a reference to the low
   half of a register has the same guest state offset as a reference to
   the full register.
*/
453static Int integerGuestRegOffset ( Int sz, UInt archreg )
454{
455   vassert(archreg < 8);
456
457   /* Correct for little-endian host only. */
458   vassert(!host_is_bigendian);
459
460   if (sz == 4 || sz == 2 || (sz == 1 && archreg < 4)) {
461      switch (archreg) {
462         case R_EAX: return OFFB_EAX;
463         case R_EBX: return OFFB_EBX;
464         case R_ECX: return OFFB_ECX;
465         case R_EDX: return OFFB_EDX;
466         case R_ESI: return OFFB_ESI;
467         case R_EDI: return OFFB_EDI;
468         case R_ESP: return OFFB_ESP;
469         case R_EBP: return OFFB_EBP;
470         default: vpanic("integerGuestRegOffset(x86,le)(4,2)");
471      }
472   }
473
474   vassert(archreg >= 4 && archreg < 8 && sz == 1);
475   switch (archreg-4) {
476      case R_EAX: return 1+ OFFB_EAX;
477      case R_EBX: return 1+ OFFB_EBX;
478      case R_ECX: return 1+ OFFB_ECX;
479      case R_EDX: return 1+ OFFB_EDX;
480      default: vpanic("integerGuestRegOffset(x86,le)(1h)");
481   }
482
483   /* NOTREACHED */
484   vpanic("integerGuestRegOffset(x86,le)");
485}
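
/* Example: integerGuestRegOffset(1, R_AH) (sz == 1, archreg == 4)
   returns 1 + OFFB_EAX, i.e. the second-lowest byte of guest_EAX,
   which is where %ah lives on a little-endian host. */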
486
487static Int segmentGuestRegOffset ( UInt sreg )
488{
489   switch (sreg) {
490      case R_ES: return OFFB_ES;
491      case R_CS: return OFFB_CS;
492      case R_SS: return OFFB_SS;
493      case R_DS: return OFFB_DS;
494      case R_FS: return OFFB_FS;
495      case R_GS: return OFFB_GS;
496      default: vpanic("segmentGuestRegOffset(x86)");
497   }
498}
499
500static Int xmmGuestRegOffset ( UInt xmmreg )
501{
502   switch (xmmreg) {
503      case 0: return OFFB_XMM0;
504      case 1: return OFFB_XMM1;
505      case 2: return OFFB_XMM2;
506      case 3: return OFFB_XMM3;
507      case 4: return OFFB_XMM4;
508      case 5: return OFFB_XMM5;
509      case 6: return OFFB_XMM6;
510      case 7: return OFFB_XMM7;
511      default: vpanic("xmmGuestRegOffset");
512   }
513}
514
/* Lanes of vector registers are always numbered from zero, with lane
   zero being the least significant (rightmost in the register).  */
517
518static Int xmmGuestRegLane16offset ( UInt xmmreg, Int laneno )
519{
520   /* Correct for little-endian host only. */
521   vassert(!host_is_bigendian);
522   vassert(laneno >= 0 && laneno < 8);
523   return xmmGuestRegOffset( xmmreg ) + 2 * laneno;
524}
525
526static Int xmmGuestRegLane32offset ( UInt xmmreg, Int laneno )
527{
528   /* Correct for little-endian host only. */
529   vassert(!host_is_bigendian);
530   vassert(laneno >= 0 && laneno < 4);
531   return xmmGuestRegOffset( xmmreg ) + 4 * laneno;
532}
533
534static Int xmmGuestRegLane64offset ( UInt xmmreg, Int laneno )
535{
536   /* Correct for little-endian host only. */
537   vassert(!host_is_bigendian);
538   vassert(laneno >= 0 && laneno < 2);
539   return xmmGuestRegOffset( xmmreg ) + 8 * laneno;
540}
541
542static IRExpr* getIReg ( Int sz, UInt archreg )
543{
544   vassert(sz == 1 || sz == 2 || sz == 4);
545   vassert(archreg < 8);
546   return IRExpr_Get( integerGuestRegOffset(sz,archreg),
547                      szToITy(sz) );
548}
549
550/* Ditto, but write to a reg instead. */
551static void putIReg ( Int sz, UInt archreg, IRExpr* e )
552{
553   IRType ty = typeOfIRExpr(irsb->tyenv, e);
554   switch (sz) {
555      case 1: vassert(ty == Ity_I8); break;
556      case 2: vassert(ty == Ity_I16); break;
557      case 4: vassert(ty == Ity_I32); break;
558      default: vpanic("putIReg(x86)");
559   }
560   vassert(archreg < 8);
561   stmt( IRStmt_Put(integerGuestRegOffset(sz,archreg), e) );
562}
563
564static IRExpr* getSReg ( UInt sreg )
565{
566   return IRExpr_Get( segmentGuestRegOffset(sreg), Ity_I16 );
567}
568
569static void putSReg ( UInt sreg, IRExpr* e )
570{
571   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
572   stmt( IRStmt_Put( segmentGuestRegOffset(sreg), e ) );
573}
574
575static IRExpr* getXMMReg ( UInt xmmreg )
576{
577   return IRExpr_Get( xmmGuestRegOffset(xmmreg), Ity_V128 );
578}
579
580static IRExpr* getXMMRegLane64 ( UInt xmmreg, Int laneno )
581{
582   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_I64 );
583}
584
585static IRExpr* getXMMRegLane64F ( UInt xmmreg, Int laneno )
586{
587   return IRExpr_Get( xmmGuestRegLane64offset(xmmreg,laneno), Ity_F64 );
588}
589
590static IRExpr* getXMMRegLane32 ( UInt xmmreg, Int laneno )
591{
592   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_I32 );
593}
594
595static IRExpr* getXMMRegLane32F ( UInt xmmreg, Int laneno )
596{
597   return IRExpr_Get( xmmGuestRegLane32offset(xmmreg,laneno), Ity_F32 );
598}
599
600static void putXMMReg ( UInt xmmreg, IRExpr* e )
601{
602   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_V128);
603   stmt( IRStmt_Put( xmmGuestRegOffset(xmmreg), e ) );
604}
605
606static void putXMMRegLane64 ( UInt xmmreg, Int laneno, IRExpr* e )
607{
608   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
609   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
610}
611
612static void putXMMRegLane64F ( UInt xmmreg, Int laneno, IRExpr* e )
613{
614   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F64);
615   stmt( IRStmt_Put( xmmGuestRegLane64offset(xmmreg,laneno), e ) );
616}
617
618static void putXMMRegLane32F ( UInt xmmreg, Int laneno, IRExpr* e )
619{
620   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_F32);
621   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
622}
623
624static void putXMMRegLane32 ( UInt xmmreg, Int laneno, IRExpr* e )
625{
626   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I32);
627   stmt( IRStmt_Put( xmmGuestRegLane32offset(xmmreg,laneno), e ) );
628}
629
630static void putXMMRegLane16 ( UInt xmmreg, Int laneno, IRExpr* e )
631{
632   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I16);
633   stmt( IRStmt_Put( xmmGuestRegLane16offset(xmmreg,laneno), e ) );
634}
635
636static void assign ( IRTemp dst, IRExpr* e )
637{
638   stmt( IRStmt_WrTmp(dst, e) );
639}
640
641static void storeLE ( IRExpr* addr, IRExpr* data )
642{
643   stmt( IRStmt_Store(Iend_LE, addr, data) );
644}
645
646static IRExpr* unop ( IROp op, IRExpr* a )
647{
648   return IRExpr_Unop(op, a);
649}
650
651static IRExpr* binop ( IROp op, IRExpr* a1, IRExpr* a2 )
652{
653   return IRExpr_Binop(op, a1, a2);
654}
655
656static IRExpr* triop ( IROp op, IRExpr* a1, IRExpr* a2, IRExpr* a3 )
657{
658   return IRExpr_Triop(op, a1, a2, a3);
659}
660
661static IRExpr* mkexpr ( IRTemp tmp )
662{
663   return IRExpr_RdTmp(tmp);
664}
665
666static IRExpr* mkU8 ( UInt i )
667{
668   vassert(i < 256);
669   return IRExpr_Const(IRConst_U8( (UChar)i ));
670}
671
672static IRExpr* mkU16 ( UInt i )
673{
674   vassert(i < 65536);
675   return IRExpr_Const(IRConst_U16( (UShort)i ));
676}
677
678static IRExpr* mkU32 ( UInt i )
679{
680   return IRExpr_Const(IRConst_U32(i));
681}
682
683static IRExpr* mkU64 ( ULong i )
684{
685   return IRExpr_Const(IRConst_U64(i));
686}
687
688static IRExpr* mkU ( IRType ty, UInt i )
689{
690   if (ty == Ity_I8)  return mkU8(i);
691   if (ty == Ity_I16) return mkU16(i);
692   if (ty == Ity_I32) return mkU32(i);
693   /* If this panics, it usually means you passed a size (1,2,4)
694      value as the IRType, rather than a real IRType. */
695   vpanic("mkU(x86)");
696}
697
698static IRExpr* mkV128 ( UShort mask )
699{
700   return IRExpr_Const(IRConst_V128(mask));
701}
702
703static IRExpr* loadLE ( IRType ty, IRExpr* addr )
704{
705   return IRExpr_Load(Iend_LE, ty, addr);
706}
707
708static IROp mkSizedOp ( IRType ty, IROp op8 )
709{
710   Int adj;
711   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
712   vassert(op8 == Iop_Add8 || op8 == Iop_Sub8
713           || op8 == Iop_Mul8
714           || op8 == Iop_Or8 || op8 == Iop_And8 || op8 == Iop_Xor8
715           || op8 == Iop_Shl8 || op8 == Iop_Shr8 || op8 == Iop_Sar8
716           || op8 == Iop_CmpEQ8 || op8 == Iop_CmpNE8
717           || op8 == Iop_CasCmpNE8
718           || op8 == Iop_Not8);
719   adj = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
720   return adj + op8;
721}
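
/* mkSizedOp relies on the 8/16/32/64-bit variants of each operation
   being laid out consecutively in the IROp enumeration (libvex_ir.h),
   so that, for instance, mkSizedOp(Ity_I16, Iop_Add8) == Iop_Add16. */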
722
723static IROp mkWidenOp ( Int szSmall, Int szBig, Bool signd )
724{
725   if (szSmall == 1 && szBig == 4) {
726      return signd ? Iop_8Sto32 : Iop_8Uto32;
727   }
728   if (szSmall == 1 && szBig == 2) {
729      return signd ? Iop_8Sto16 : Iop_8Uto16;
730   }
731   if (szSmall == 2 && szBig == 4) {
732      return signd ? Iop_16Sto32 : Iop_16Uto32;
733   }
734   vpanic("mkWidenOp(x86,guest)");
735}
736
737static IRExpr* mkAnd1 ( IRExpr* x, IRExpr* y )
738{
739   vassert(typeOfIRExpr(irsb->tyenv,x) == Ity_I1);
740   vassert(typeOfIRExpr(irsb->tyenv,y) == Ity_I1);
741   return unop(Iop_32to1,
742               binop(Iop_And32,
743                     unop(Iop_1Uto32,x),
744                     unop(Iop_1Uto32,y)));
745}
746
747/* Generate a compare-and-swap operation, operating on memory at
748   'addr'.  The expected value is 'expVal' and the new value is
749   'newVal'.  If the operation fails, then transfer control (with a
750   no-redir jump (XXX no -- see comment at top of this file)) to
751   'restart_point', which is presumably the address of the guest
752   instruction again -- retrying, essentially. */
753static void casLE ( IRExpr* addr, IRExpr* expVal, IRExpr* newVal,
754                    Addr32 restart_point )
755{
756   IRCAS* cas;
757   IRType tyE    = typeOfIRExpr(irsb->tyenv, expVal);
758   IRType tyN    = typeOfIRExpr(irsb->tyenv, newVal);
759   IRTemp oldTmp = newTemp(tyE);
760   IRTemp expTmp = newTemp(tyE);
761   vassert(tyE == tyN);
762   vassert(tyE == Ity_I32 || tyE == Ity_I16 || tyE == Ity_I8);
763   assign(expTmp, expVal);
764   cas = mkIRCAS( IRTemp_INVALID, oldTmp, Iend_LE, addr,
765                  NULL, mkexpr(expTmp), NULL, newVal );
766   stmt( IRStmt_CAS(cas) );
767   stmt( IRStmt_Exit(
768            binop( mkSizedOp(tyE,Iop_CasCmpNE8),
769                   mkexpr(oldTmp), mkexpr(expTmp) ),
770            Ijk_Boring, /*Ijk_NoRedir*/
771            IRConst_U32( restart_point ),
772            OFFB_EIP
773         ));
774}
775
776
777/*------------------------------------------------------------*/
778/*--- Helpers for %eflags.                                 ---*/
779/*------------------------------------------------------------*/
780
781/* -------------- Evaluating the flags-thunk. -------------- */
782
783/* Build IR to calculate all the eflags from stored
784   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
785   Ity_I32. */
786static IRExpr* mk_x86g_calculate_eflags_all ( void )
787{
788   IRExpr** args
789      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
790                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
791                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
792                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
793   IRExpr* call
794      = mkIRExprCCall(
795           Ity_I32,
796           0/*regparm*/,
797           "x86g_calculate_eflags_all", &x86g_calculate_eflags_all,
798           args
799        );
800   /* Exclude OP and NDEP from definedness checking.  We're only
801      interested in DEP1 and DEP2. */
802   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
803   return call;
804}
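
/* A note on mcx_mask, here and below: each set bit k tells Memcheck
   not to definedness-check argument k of the call.  Above, bits 0 and
   3 correspond to the CC_OP and CC_NDEP arguments respectively. */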
805
806/* Build IR to calculate some particular condition from stored
807   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression ::
808   Ity_Bit. */
809static IRExpr* mk_x86g_calculate_condition ( X86Condcode cond )
810{
811   IRExpr** args
812      = mkIRExprVec_5( mkU32(cond),
813                       IRExpr_Get(OFFB_CC_OP,  Ity_I32),
814                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
815                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
816                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
817   IRExpr* call
818      = mkIRExprCCall(
819           Ity_I32,
820           0/*regparm*/,
821           "x86g_calculate_condition", &x86g_calculate_condition,
822           args
823        );
824   /* Exclude the requested condition, OP and NDEP from definedness
825      checking.  We're only interested in DEP1 and DEP2. */
826   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<1) | (1<<4);
827   return unop(Iop_32to1, call);
828}
829
830/* Build IR to calculate just the carry flag from stored
831   CC_OP/CC_DEP1/CC_DEP2/CC_NDEP.  Returns an expression :: Ity_I32. */
832static IRExpr* mk_x86g_calculate_eflags_c ( void )
833{
834   IRExpr** args
835      = mkIRExprVec_4( IRExpr_Get(OFFB_CC_OP,   Ity_I32),
836                       IRExpr_Get(OFFB_CC_DEP1, Ity_I32),
837                       IRExpr_Get(OFFB_CC_DEP2, Ity_I32),
838                       IRExpr_Get(OFFB_CC_NDEP, Ity_I32) );
839   IRExpr* call
840      = mkIRExprCCall(
841           Ity_I32,
842           3/*regparm*/,
843           "x86g_calculate_eflags_c", &x86g_calculate_eflags_c,
844           args
845        );
846   /* Exclude OP and NDEP from definedness checking.  We're only
847      interested in DEP1 and DEP2. */
848   call->Iex.CCall.cee->mcx_mask = (1<<0) | (1<<3);
849   return call;
850}
851
852
853/* -------------- Building the flags-thunk. -------------- */
854
855/* The machinery in this section builds the flag-thunk following a
856   flag-setting operation.  Hence the various setFlags_* functions.
857*/
858
859static Bool isAddSub ( IROp op8 )
860{
861   return toBool(op8 == Iop_Add8 || op8 == Iop_Sub8);
862}
863
864static Bool isLogic ( IROp op8 )
865{
866   return toBool(op8 == Iop_And8 || op8 == Iop_Or8 || op8 == Iop_Xor8);
867}
868
869/* U-widen 8/16/32 bit int expr to 32. */
870static IRExpr* widenUto32 ( IRExpr* e )
871{
872   switch (typeOfIRExpr(irsb->tyenv,e)) {
873      case Ity_I32: return e;
874      case Ity_I16: return unop(Iop_16Uto32,e);
875      case Ity_I8:  return unop(Iop_8Uto32,e);
876      default: vpanic("widenUto32");
877   }
878}
879
880/* S-widen 8/16/32 bit int expr to 32. */
881static IRExpr* widenSto32 ( IRExpr* e )
882{
883   switch (typeOfIRExpr(irsb->tyenv,e)) {
884      case Ity_I32: return e;
885      case Ity_I16: return unop(Iop_16Sto32,e);
886      case Ity_I8:  return unop(Iop_8Sto32,e);
887      default: vpanic("widenSto32");
888   }
889}
890
891/* Narrow 8/16/32 bit int expr to 8/16/32.  Clearly only some
892   of these combinations make sense. */
893static IRExpr* narrowTo ( IRType dst_ty, IRExpr* e )
894{
895   IRType src_ty = typeOfIRExpr(irsb->tyenv,e);
896   if (src_ty == dst_ty)
897      return e;
898   if (src_ty == Ity_I32 && dst_ty == Ity_I16)
899      return unop(Iop_32to16, e);
900   if (src_ty == Ity_I32 && dst_ty == Ity_I8)
901      return unop(Iop_32to8, e);
902
903   vex_printf("\nsrc, dst tys are: ");
904   ppIRType(src_ty);
905   vex_printf(", ");
906   ppIRType(dst_ty);
907   vex_printf("\n");
908   vpanic("narrowTo(x86)");
909}
910
911
912/* Set the flags thunk OP, DEP1 and DEP2 fields.  The supplied op is
913   auto-sized up to the real op. */
914
915static
916void setFlags_DEP1_DEP2 ( IROp op8, IRTemp dep1, IRTemp dep2, IRType ty )
917{
918   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
919
920   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
921
922   switch (op8) {
923      case Iop_Add8: ccOp += X86G_CC_OP_ADDB;   break;
924      case Iop_Sub8: ccOp += X86G_CC_OP_SUBB;   break;
925      default:       ppIROp(op8);
926                     vpanic("setFlags_DEP1_DEP2(x86)");
927   }
928   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
929   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
930   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(dep2))) );
931   /* Set NDEP even though it isn't used.  This makes redundant-PUT
932      elimination of previous stores to this field work better. */
933   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
934}
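
/* For instance (illustrative), after "addl %ebx,%eax" the thunk would
   hold CC_OP = X86G_CC_OP_ADDL, CC_DEP1 = the old value of %eax and
   CC_DEP2 = %ebx; the actual %eflags bits are only computed later, on
   demand, by the x86g_calculate_eflags_* helpers. */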
935
936
937/* Set the OP and DEP1 fields only, and write zero to DEP2. */
938
939static
940void setFlags_DEP1 ( IROp op8, IRTemp dep1, IRType ty )
941{
942   Int ccOp = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
943
944   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
945
946   switch (op8) {
947      case Iop_Or8:
948      case Iop_And8:
949      case Iop_Xor8: ccOp += X86G_CC_OP_LOGICB; break;
950      default:       ppIROp(op8);
951                     vpanic("setFlags_DEP1(x86)");
952   }
953   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
954   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(dep1))) );
955   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
956   /* Set NDEP even though it isn't used.  This makes redundant-PUT
957      elimination of previous stores to this field work better. */
958   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
959}
960
961
/* For shift operations, we store both the result and the undershifted
   result.  However, if the shift amount is zero, the thunk is left
   unchanged. */
965
966static void setFlags_DEP1_DEP2_shift ( IROp    op32,
967                                       IRTemp  res,
968                                       IRTemp  resUS,
969                                       IRType  ty,
970                                       IRTemp  guard )
971{
972   Int ccOp = ty==Ity_I8 ? 2 : (ty==Ity_I16 ? 1 : 0);
973
974   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
975   vassert(guard);
976
977   /* Both kinds of right shifts are handled by the same thunk
978      operation. */
979   switch (op32) {
980      case Iop_Shr32:
981      case Iop_Sar32: ccOp = X86G_CC_OP_SHRL - ccOp; break;
982      case Iop_Shl32: ccOp = X86G_CC_OP_SHLL - ccOp; break;
983      default:        ppIROp(op32);
984                      vpanic("setFlags_DEP1_DEP2_shift(x86)");
985   }
986
987   /* DEP1 contains the result, DEP2 contains the undershifted value. */
988   stmt( IRStmt_Put( OFFB_CC_OP,
989                     IRExpr_Mux0X( mkexpr(guard),
990                                   IRExpr_Get(OFFB_CC_OP,Ity_I32),
991                                   mkU32(ccOp))) );
992   stmt( IRStmt_Put( OFFB_CC_DEP1,
993                     IRExpr_Mux0X( mkexpr(guard),
994                                   IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
995                                   widenUto32(mkexpr(res)))) );
996   stmt( IRStmt_Put( OFFB_CC_DEP2,
997                     IRExpr_Mux0X( mkexpr(guard),
998                                   IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
999                                   widenUto32(mkexpr(resUS)))) );
1000   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1001      elimination of previous stores to this field work better. */
   stmt( IRStmt_Put( OFFB_CC_NDEP,
                     IRExpr_Mux0X( mkexpr(guard),
                                   IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
                                   mkU32(0) )));
1006}
1007
1008
1009/* For the inc/dec case, we store in DEP1 the result value and in NDEP
1010   the former value of the carry flag, which unfortunately we have to
1011   compute. */
1012
1013static void setFlags_INC_DEC ( Bool inc, IRTemp res, IRType ty )
1014{
1015   Int ccOp = inc ? X86G_CC_OP_INCB : X86G_CC_OP_DECB;
1016
1017   ccOp += ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
1018   vassert(ty == Ity_I8 || ty == Ity_I16 || ty == Ity_I32);
1019
1020   /* This has to come first, because calculating the C flag
1021      may require reading all four thunk fields. */
1022   stmt( IRStmt_Put( OFFB_CC_NDEP, mk_x86g_calculate_eflags_c()) );
1023   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(ccOp)) );
1024   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(res))) );
1025   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0)) );
1026}
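
/* The old carry has to be saved because INC and DEC leave the carry
   flag unchanged; the flag-evaluation helpers recover it from NDEP
   when the flags are eventually needed. */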
1027
1028
1029/* Multiplies are pretty much like add and sub: DEP1 and DEP2 hold the
1030   two arguments. */
1031
1032static
1033void setFlags_MUL ( IRType ty, IRTemp arg1, IRTemp arg2, UInt base_op )
1034{
1035   switch (ty) {
1036      case Ity_I8:
1037         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+0) ) );
1038         break;
1039      case Ity_I16:
1040         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+1) ) );
1041         break;
1042      case Ity_I32:
1043         stmt( IRStmt_Put( OFFB_CC_OP, mkU32(base_op+2) ) );
1044         break;
1045      default:
1046         vpanic("setFlags_MUL(x86)");
1047   }
1048   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(arg1)) ));
1049   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(mkexpr(arg2)) ));
1050   /* Set NDEP even though it isn't used.  This makes redundant-PUT
1051      elimination of previous stores to this field work better. */
1052   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
1053}
1054
1055
1056/* -------------- Condition codes. -------------- */
1057
1058/* Condition codes, using the Intel encoding.  */
1059
1060static HChar* name_X86Condcode ( X86Condcode cond )
1061{
1062   switch (cond) {
1063      case X86CondO:      return "o";
1064      case X86CondNO:     return "no";
1065      case X86CondB:      return "b";
1066      case X86CondNB:     return "nb";
1067      case X86CondZ:      return "z";
1068      case X86CondNZ:     return "nz";
1069      case X86CondBE:     return "be";
1070      case X86CondNBE:    return "nbe";
1071      case X86CondS:      return "s";
1072      case X86CondNS:     return "ns";
1073      case X86CondP:      return "p";
1074      case X86CondNP:     return "np";
1075      case X86CondL:      return "l";
1076      case X86CondNL:     return "nl";
1077      case X86CondLE:     return "le";
1078      case X86CondNLE:    return "nle";
1079      case X86CondAlways: return "ALWAYS";
1080      default: vpanic("name_X86Condcode");
1081   }
1082}
1083
1084static
1085X86Condcode positiveIse_X86Condcode ( X86Condcode  cond,
1086                                      Bool*        needInvert )
1087{
1088   vassert(cond >= X86CondO && cond <= X86CondNLE);
1089   if (cond & 1) {
1090      *needInvert = True;
1091      return cond-1;
1092   } else {
1093      *needInvert = False;
1094      return cond;
1095   }
1096}
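
/* Example: positiveIse_X86Condcode(X86CondNZ, &inv) returns X86CondZ
   with inv == True.  This works because the Intel encoding places each
   condition and its negation in adjacent even/odd code pairs. */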
1097
1098
1099/* -------------- Helpers for ADD/SUB with carry. -------------- */
1100
1101/* Given ta1, ta2 and tres, compute tres = ADC(ta1,ta2) and set flags
1102   appropriately.
1103
1104   Optionally, generate a store for the 'tres' value.  This can either
1105   be a normal store, or it can be a cas-with-possible-failure style
1106   store:
1107
1108   if taddr is IRTemp_INVALID, then no store is generated.
1109
1110   if taddr is not IRTemp_INVALID, then a store (using taddr as
1111   the address) is generated:
1112
1113     if texpVal is IRTemp_INVALID then a normal store is
1114     generated, and restart_point must be zero (it is irrelevant).
1115
1116     if texpVal is not IRTemp_INVALID then a cas-style store is
1117     generated.  texpVal is the expected value, restart_point
1118     is the restart point if the store fails, and texpVal must
1119     have the same type as tres.
1120*/
1121static void helper_ADC ( Int sz,
1122                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1123                         /* info about optional store: */
1124                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1125{
1126   UInt    thunkOp;
1127   IRType  ty    = szToITy(sz);
1128   IRTemp  oldc  = newTemp(Ity_I32);
1129   IRTemp  oldcn = newTemp(ty);
1130   IROp    plus  = mkSizedOp(ty, Iop_Add8);
1131   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1132
1133   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1134   vassert(sz == 1 || sz == 2 || sz == 4);
1135   thunkOp = sz==4 ? X86G_CC_OP_ADCL
1136                   : (sz==2 ? X86G_CC_OP_ADCW : X86G_CC_OP_ADCB);
1137
1138   /* oldc = old carry flag, 0 or 1 */
1139   assign( oldc,  binop(Iop_And32,
1140                        mk_x86g_calculate_eflags_c(),
1141                        mkU32(1)) );
1142
1143   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1144
1145   assign( tres, binop(plus,
1146                       binop(plus,mkexpr(ta1),mkexpr(ta2)),
1147                       mkexpr(oldcn)) );
1148
1149   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1150      start of this function. */
1151   if (taddr != IRTemp_INVALID) {
1152      if (texpVal == IRTemp_INVALID) {
1153         vassert(restart_point == 0);
1154         storeLE( mkexpr(taddr), mkexpr(tres) );
1155      } else {
1156         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1157         /* .. and hence 'texpVal' has the same type as 'tres'. */
1158         casLE( mkexpr(taddr),
1159                mkexpr(texpVal), mkexpr(tres), restart_point );
1160      }
1161   }
1162
1163   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
1164   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1)) ));
1165   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
1166                                                         mkexpr(oldcn)) )) );
1167   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1168}
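
/* Note that DEP2 is stored as (argR ^ oldC) rather than argR itself;
   this is the convention the ADC flag helpers expect (see the CC_OP
   definitions in guest_x86_defs.h), letting them recover argR by
   xoring DEP2 with the old carry held in NDEP. */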
1169
1170
1171/* Given ta1, ta2 and tres, compute tres = SBB(ta1,ta2) and set flags
1172   appropriately.  As with helper_ADC, possibly generate a store of
1173   the result -- see comments on helper_ADC for details.
1174*/
1175static void helper_SBB ( Int sz,
1176                         IRTemp tres, IRTemp ta1, IRTemp ta2,
1177                         /* info about optional store: */
1178                         IRTemp taddr, IRTemp texpVal, Addr32 restart_point )
1179{
1180   UInt    thunkOp;
1181   IRType  ty    = szToITy(sz);
1182   IRTemp  oldc  = newTemp(Ity_I32);
1183   IRTemp  oldcn = newTemp(ty);
1184   IROp    minus = mkSizedOp(ty, Iop_Sub8);
1185   IROp    xor   = mkSizedOp(ty, Iop_Xor8);
1186
1187   vassert(typeOfIRTemp(irsb->tyenv, tres) == ty);
1188   vassert(sz == 1 || sz == 2 || sz == 4);
1189   thunkOp = sz==4 ? X86G_CC_OP_SBBL
1190                   : (sz==2 ? X86G_CC_OP_SBBW : X86G_CC_OP_SBBB);
1191
1192   /* oldc = old carry flag, 0 or 1 */
1193   assign( oldc, binop(Iop_And32,
1194                       mk_x86g_calculate_eflags_c(),
1195                       mkU32(1)) );
1196
1197   assign( oldcn, narrowTo(ty, mkexpr(oldc)) );
1198
1199   assign( tres, binop(minus,
1200                       binop(minus,mkexpr(ta1),mkexpr(ta2)),
1201                       mkexpr(oldcn)) );
1202
1203   /* Possibly generate a store of 'tres' to 'taddr'.  See comment at
1204      start of this function. */
1205   if (taddr != IRTemp_INVALID) {
1206      if (texpVal == IRTemp_INVALID) {
1207         vassert(restart_point == 0);
1208         storeLE( mkexpr(taddr), mkexpr(tres) );
1209      } else {
1210         vassert(typeOfIRTemp(irsb->tyenv, texpVal) == ty);
1211         /* .. and hence 'texpVal' has the same type as 'tres'. */
1212         casLE( mkexpr(taddr),
1213                mkexpr(texpVal), mkexpr(tres), restart_point );
1214      }
1215   }
1216
1217   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(thunkOp) ) );
1218   stmt( IRStmt_Put( OFFB_CC_DEP1, widenUto32(mkexpr(ta1) )) );
1219   stmt( IRStmt_Put( OFFB_CC_DEP2, widenUto32(binop(xor, mkexpr(ta2),
1220                                                         mkexpr(oldcn)) )) );
1221   stmt( IRStmt_Put( OFFB_CC_NDEP, mkexpr(oldc) ) );
1222}
1223
1224
1225/* -------------- Helpers for disassembly printing. -------------- */
1226
1227static HChar* nameGrp1 ( Int opc_aux )
1228{
1229   static HChar* grp1_names[8]
1230     = { "add", "or", "adc", "sbb", "and", "sub", "xor", "cmp" };
1231   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp1(x86)");
1232   return grp1_names[opc_aux];
1233}
1234
1235static HChar* nameGrp2 ( Int opc_aux )
1236{
1237   static HChar* grp2_names[8]
1238     = { "rol", "ror", "rcl", "rcr", "shl", "shr", "shl", "sar" };
1239   if (opc_aux < 0 || opc_aux > 7) vpanic("nameGrp2(x86)");
1240   return grp2_names[opc_aux];
1241}
1242
1243static HChar* nameGrp4 ( Int opc_aux )
1244{
1245   static HChar* grp4_names[8]
1246     = { "inc", "dec", "???", "???", "???", "???", "???", "???" };
1247   if (opc_aux < 0 || opc_aux > 1) vpanic("nameGrp4(x86)");
1248   return grp4_names[opc_aux];
1249}
1250
1251static HChar* nameGrp5 ( Int opc_aux )
1252{
1253   static HChar* grp5_names[8]
1254     = { "inc", "dec", "call*", "call*", "jmp*", "jmp*", "push", "???" };
1255   if (opc_aux < 0 || opc_aux > 6) vpanic("nameGrp5(x86)");
1256   return grp5_names[opc_aux];
1257}
1258
1259static HChar* nameGrp8 ( Int opc_aux )
1260{
1261   static HChar* grp8_names[8]
1262     = { "???", "???", "???", "???", "bt", "bts", "btr", "btc" };
1263   if (opc_aux < 4 || opc_aux > 7) vpanic("nameGrp8(x86)");
1264   return grp8_names[opc_aux];
1265}
1266
1267static HChar* nameIReg ( Int size, Int reg )
1268{
1269   static HChar* ireg32_names[8]
1270     = { "%eax", "%ecx", "%edx", "%ebx",
1271         "%esp", "%ebp", "%esi", "%edi" };
1272   static HChar* ireg16_names[8]
1273     = { "%ax", "%cx", "%dx", "%bx", "%sp", "%bp", "%si", "%di" };
1274   static HChar* ireg8_names[8]
1275     = { "%al", "%cl", "%dl", "%bl",
1276         "%ah{sp}", "%ch{bp}", "%dh{si}", "%bh{di}" };
1277   if (reg < 0 || reg > 7) goto bad;
1278   switch (size) {
1279      case 4: return ireg32_names[reg];
1280      case 2: return ireg16_names[reg];
1281      case 1: return ireg8_names[reg];
1282   }
1283  bad:
1284   vpanic("nameIReg(X86)");
1285   return NULL; /*notreached*/
1286}
1287
1288static HChar* nameSReg ( UInt sreg )
1289{
1290   switch (sreg) {
1291      case R_ES: return "%es";
1292      case R_CS: return "%cs";
1293      case R_SS: return "%ss";
1294      case R_DS: return "%ds";
1295      case R_FS: return "%fs";
1296      case R_GS: return "%gs";
1297      default: vpanic("nameSReg(x86)");
1298   }
1299}
1300
1301static HChar* nameMMXReg ( Int mmxreg )
1302{
1303   static HChar* mmx_names[8]
1304     = { "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7" };
1305   if (mmxreg < 0 || mmxreg > 7) vpanic("nameMMXReg(x86,guest)");
1306   return mmx_names[mmxreg];
1307}
1308
1309static HChar* nameXMMReg ( Int xmmreg )
1310{
1311   static HChar* xmm_names[8]
1312     = { "%xmm0", "%xmm1", "%xmm2", "%xmm3",
1313         "%xmm4", "%xmm5", "%xmm6", "%xmm7" };
1314   if (xmmreg < 0 || xmmreg > 7) vpanic("name_of_xmm_reg");
1315   return xmm_names[xmmreg];
1316}
1317
1318static HChar* nameMMXGran ( Int gran )
1319{
1320   switch (gran) {
1321      case 0: return "b";
1322      case 1: return "w";
1323      case 2: return "d";
1324      case 3: return "q";
1325      default: vpanic("nameMMXGran(x86,guest)");
1326   }
1327}
1328
1329static HChar nameISize ( Int size )
1330{
1331   switch (size) {
1332      case 4: return 'l';
1333      case 2: return 'w';
1334      case 1: return 'b';
1335      default: vpanic("nameISize(x86)");
1336   }
1337}
1338
1339
1340/*------------------------------------------------------------*/
1341/*--- JMP helpers                                          ---*/
1342/*------------------------------------------------------------*/
1343
1344static void jmp_lit( /*MOD*/DisResult* dres,
1345                     IRJumpKind kind, Addr32 d32 )
1346{
1347   vassert(dres->whatNext    == Dis_Continue);
1348   vassert(dres->len         == 0);
1349   vassert(dres->continueAt  == 0);
1350   vassert(dres->jk_StopHere == Ijk_INVALID);
1351   dres->whatNext    = Dis_StopHere;
1352   dres->jk_StopHere = kind;
1353   stmt( IRStmt_Put( OFFB_EIP, mkU32(d32) ) );
1354}
1355
1356static void jmp_treg( /*MOD*/DisResult* dres,
1357                      IRJumpKind kind, IRTemp t )
1358{
1359   vassert(dres->whatNext    == Dis_Continue);
1360   vassert(dres->len         == 0);
1361   vassert(dres->continueAt  == 0);
1362   vassert(dres->jk_StopHere == Ijk_INVALID);
1363   dres->whatNext    = Dis_StopHere;
1364   dres->jk_StopHere = kind;
1365   stmt( IRStmt_Put( OFFB_EIP, mkexpr(t) ) );
1366}
1367
1368static
1369void jcc_01( /*MOD*/DisResult* dres,
1370             X86Condcode cond, Addr32 d32_false, Addr32 d32_true )
1371{
1372   Bool        invert;
1373   X86Condcode condPos;
1374   vassert(dres->whatNext    == Dis_Continue);
1375   vassert(dres->len         == 0);
1376   vassert(dres->continueAt  == 0);
1377   vassert(dres->jk_StopHere == Ijk_INVALID);
1378   dres->whatNext    = Dis_StopHere;
1379   dres->jk_StopHere = Ijk_Boring;
1380   condPos = positiveIse_X86Condcode ( cond, &invert );
1381   if (invert) {
1382      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1383                         Ijk_Boring,
1384                         IRConst_U32(d32_false),
1385                         OFFB_EIP ) );
1386      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_true) ) );
1387   } else {
1388      stmt( IRStmt_Exit( mk_x86g_calculate_condition(condPos),
1389                         Ijk_Boring,
1390                         IRConst_U32(d32_true),
1391                         OFFB_EIP ) );
1392      stmt( IRStmt_Put( OFFB_EIP, mkU32(d32_false) ) );
1393   }
1394}
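
/* Illustrative use (addresses made up): translating "jz 0x8049000" at
   a point where the fall-through address is 0x8048ff0 would call
   jcc_01(dres, X86CondZ, 0x8048ff0, 0x8049000).  X86CondZ is an even
   code, so no inversion is needed: the block exits to 0x8049000 when Z
   holds and otherwise falls through to 0x8048ff0. */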
1395
1396
1397/*------------------------------------------------------------*/
1398/*--- Disassembling addressing modes                       ---*/
1399/*------------------------------------------------------------*/
1400
1401static
1402HChar* sorbTxt ( UChar sorb )
1403{
1404   switch (sorb) {
1405      case 0:    return ""; /* no override */
      case 0x3E: return "%ds:";
1407      case 0x26: return "%es:";
1408      case 0x64: return "%fs:";
1409      case 0x65: return "%gs:";
1410      default: vpanic("sorbTxt(x86,guest)");
1411   }
1412}
1413
1414
1415/* 'virtual' is an IRExpr* holding a virtual address.  Convert it to a
1416   linear address by adding any required segment override as indicated
1417   by sorb. */
1418static
1419IRExpr* handleSegOverride ( UChar sorb, IRExpr* virtual )
1420{
1421   Int    sreg;
1422   IRType hWordTy;
1423   IRTemp ldt_ptr, gdt_ptr, seg_selector, r64;
1424
1425   if (sorb == 0)
1426      /* the common case - no override */
1427      return virtual;
1428
1429   switch (sorb) {
1430      case 0x3E: sreg = R_DS; break;
1431      case 0x26: sreg = R_ES; break;
1432      case 0x64: sreg = R_FS; break;
1433      case 0x65: sreg = R_GS; break;
1434      default: vpanic("handleSegOverride(x86,guest)");
1435   }
1436
1437   hWordTy = sizeof(HWord)==4 ? Ity_I32 : Ity_I64;
1438
1439   seg_selector = newTemp(Ity_I32);
1440   ldt_ptr      = newTemp(hWordTy);
1441   gdt_ptr      = newTemp(hWordTy);
1442   r64          = newTemp(Ity_I64);
1443
1444   assign( seg_selector, unop(Iop_16Uto32, getSReg(sreg)) );
1445   assign( ldt_ptr, IRExpr_Get( OFFB_LDT, hWordTy ));
1446   assign( gdt_ptr, IRExpr_Get( OFFB_GDT, hWordTy ));
1447
1448   /*
1449   Call this to do the translation and limit checks:
1450   ULong x86g_use_seg_selector ( HWord ldt, HWord gdt,
1451                                 UInt seg_selector, UInt virtual_addr )
1452   */
1453   assign(
1454      r64,
1455      mkIRExprCCall(
1456         Ity_I64,
1457         0/*regparms*/,
1458         "x86g_use_seg_selector",
1459         &x86g_use_seg_selector,
1460         mkIRExprVec_4( mkexpr(ldt_ptr), mkexpr(gdt_ptr),
1461                        mkexpr(seg_selector), virtual)
1462      )
1463   );
1464
1465   /* If the high 32 of the result are non-zero, there was a
1466      failure in address translation.  In which case, make a
1467      quick exit.
1468   */
1469   stmt(
1470      IRStmt_Exit(
1471         binop(Iop_CmpNE32, unop(Iop_64HIto32, mkexpr(r64)), mkU32(0)),
1472         Ijk_MapFail,
1473         IRConst_U32( guest_EIP_curr_instr ),
1474         OFFB_EIP
1475      )
1476   );
1477
1478   /* otherwise, here's the translated result. */
1479   return unop(Iop_64to32, mkexpr(r64));
1480}
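
/* The 64-bit result of x86g_use_seg_selector packs a failure flag into
   the upper 32 bits (non-zero means the translation failed) and the
   linear address into the lower 32 bits; hence the exit above on a
   non-zero upper half, and the Iop_64to32 of the result otherwise. */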
1481
1482
1483/* Generate IR to calculate an address indicated by a ModRM and
1484   following SIB bytes.  The expression, and the number of bytes in
1485   the address mode, are returned.  Note that this fn should not be
1486   called if the R/M part of the address denotes a register instead of
1487   memory.  If print_codegen is true, text of the addressing mode is
1488   placed in buf.
1489
1490   The computed address is stored in a new tempreg, and the
1491   identity of the tempreg is returned.  */
1492
1493static IRTemp disAMode_copy2tmp ( IRExpr* addr32 )
1494{
1495   IRTemp tmp = newTemp(Ity_I32);
1496   assign( tmp, addr32 );
1497   return tmp;
1498}
1499
1500static
1501IRTemp disAMode ( Int* len, UChar sorb, Int delta, HChar* buf )
1502{
1503   UChar mod_reg_rm = getIByte(delta);
1504   delta++;
1505
1506   buf[0] = (UChar)0;
1507
1508   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1509      jump table seems a bit excessive.
1510   */
1511   mod_reg_rm &= 0xC7;                      /* is now XX000YYY */
1512   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1513                                            /* is now XX0XXYYY */
1514   mod_reg_rm &= 0x1F;                      /* is now 000XXYYY */
1515   switch (mod_reg_rm) {
1516
1517      /* (%eax) .. (%edi), not including (%esp) or (%ebp).
1518         --> GET %reg, t
1519      */
1520      case 0x00: case 0x01: case 0x02: case 0x03:
1521      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1522         { UChar rm = mod_reg_rm;
1523           DIS(buf, "%s(%s)", sorbTxt(sorb), nameIReg(4,rm));
1524           *len = 1;
1525           return disAMode_copy2tmp(
1526                  handleSegOverride(sorb, getIReg(4,rm)));
1527         }
1528
1529      /* d8(%eax) ... d8(%edi), not including d8(%esp)
1530         --> GET %reg, t ; ADDL d8, t
1531      */
1532      case 0x08: case 0x09: case 0x0A: case 0x0B:
1533      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1534         { UChar rm = toUChar(mod_reg_rm & 7);
1535           UInt  d  = getSDisp8(delta);
1536           DIS(buf, "%s%d(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
1537           *len = 2;
1538           return disAMode_copy2tmp(
1539                  handleSegOverride(sorb,
1540                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1541         }
1542
      /* d32(%eax) ... d32(%edi), not including d32(%esp)
         --> GET %reg, t ; ADDL d32, t
      */
1546      case 0x10: case 0x11: case 0x12: case 0x13:
1547      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1548         { UChar rm = toUChar(mod_reg_rm & 7);
1549           UInt  d  = getUDisp32(delta);
1550           DIS(buf, "%s0x%x(%s)", sorbTxt(sorb), (Int)d, nameIReg(4,rm));
1551           *len = 5;
1552           return disAMode_copy2tmp(
1553                  handleSegOverride(sorb,
1554                     binop(Iop_Add32,getIReg(4,rm),mkU32(d))));
1555         }
1556
1557      /* a register, %eax .. %edi.  This shouldn't happen. */
1558      case 0x18: case 0x19: case 0x1A: case 0x1B:
1559      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1560         vpanic("disAMode(x86): not an addr!");
1561
1562      /* a 32-bit literal address
1563         --> MOV d32, tmp
1564      */
1565      case 0x05:
1566         { UInt d = getUDisp32(delta);
1567           *len = 5;
1568           DIS(buf, "%s(0x%x)", sorbTxt(sorb), d);
1569           return disAMode_copy2tmp(
1570                     handleSegOverride(sorb, mkU32(d)));
1571         }
1572
1573      case 0x04: {
1574         /* SIB, with no displacement.  Special cases:
1575            -- %esp cannot act as an index value.
1576               If index_r indicates %esp, zero is used for the index.
1577            -- when mod is zero and base indicates EBP, base is instead
1578               a 32-bit literal.
1579            It's all madness, I tell you.  Extract %index, %base and
1580            scale from the SIB byte.  The value denoted is then:
1581               | %index == %ESP && %base == %EBP
1582               = d32 following SIB byte
1583               | %index == %ESP && %base != %EBP
1584               = %base
1585               | %index != %ESP && %base == %EBP
1586               = d32 following SIB byte + (%index << scale)
               | %index != %ESP && %base != %EBP
1588               = %base + (%index << scale)
1589
1590            What happens to the souls of CPU architects who dream up such
1591            horrendous schemes, do you suppose?
1592         */
1593         UChar sib     = getIByte(delta);
1594         UChar scale   = toUChar((sib >> 6) & 3);
1595         UChar index_r = toUChar((sib >> 3) & 7);
1596         UChar base_r  = toUChar(sib & 7);
1597         delta++;
1598
1599         if (index_r != R_ESP && base_r != R_EBP) {
1600            DIS(buf, "%s(%s,%s,%d)", sorbTxt(sorb),
1601                      nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1602            *len = 2;
1603            return
1604               disAMode_copy2tmp(
1605               handleSegOverride(sorb,
1606                  binop(Iop_Add32,
1607                        getIReg(4,base_r),
1608                        binop(Iop_Shl32, getIReg(4,index_r),
1609                              mkU8(scale)))));
1610         }
1611
1612         if (index_r != R_ESP && base_r == R_EBP) {
1613            UInt d = getUDisp32(delta);
1614            DIS(buf, "%s0x%x(,%s,%d)", sorbTxt(sorb), d,
1615                      nameIReg(4,index_r), 1<<scale);
1616            *len = 6;
1617            return
1618               disAMode_copy2tmp(
1619               handleSegOverride(sorb,
1620                  binop(Iop_Add32,
1621                        binop(Iop_Shl32, getIReg(4,index_r), mkU8(scale)),
1622                        mkU32(d))));
1623         }
1624
1625         if (index_r == R_ESP && base_r != R_EBP) {
1626            DIS(buf, "%s(%s,,)", sorbTxt(sorb), nameIReg(4,base_r));
1627            *len = 2;
1628            return disAMode_copy2tmp(
1629                   handleSegOverride(sorb, getIReg(4,base_r)));
1630         }
1631
1632         if (index_r == R_ESP && base_r == R_EBP) {
1633            UInt d = getUDisp32(delta);
1634            DIS(buf, "%s0x%x(,,)", sorbTxt(sorb), d);
1635            *len = 6;
1636            return disAMode_copy2tmp(
1637                   handleSegOverride(sorb, mkU32(d)));
1638         }
1639         /*NOTREACHED*/
1640         vassert(0);
1641      }
1642
1643      /* SIB, with 8-bit displacement.  Special cases:
1644         -- %esp cannot act as an index value.
1645            If index_r indicates %esp, zero is used for the index.
1646         Denoted value is:
1647            | %index == %ESP
1648            = d8 + %base
1649            | %index != %ESP
1650            = d8 + %base + (%index << scale)
1651      */
1652      case 0x0C: {
1653         UChar sib     = getIByte(delta);
1654         UChar scale   = toUChar((sib >> 6) & 3);
1655         UChar index_r = toUChar((sib >> 3) & 7);
1656         UChar base_r  = toUChar(sib & 7);
1657         UInt  d       = getSDisp8(delta+1);
1658
1659         if (index_r == R_ESP) {
1660            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1661                                   (Int)d, nameIReg(4,base_r));
1662            *len = 3;
1663            return disAMode_copy2tmp(
1664                   handleSegOverride(sorb,
1665                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1666         } else {
1667            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1668                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1669            *len = 3;
1670            return
1671                disAMode_copy2tmp(
1672                handleSegOverride(sorb,
1673                  binop(Iop_Add32,
1674                        binop(Iop_Add32,
1675                              getIReg(4,base_r),
1676                              binop(Iop_Shl32,
1677                                    getIReg(4,index_r), mkU8(scale))),
1678                        mkU32(d))));
1679         }
1680         /*NOTREACHED*/
1681         vassert(0);
1682      }
1683
1684      /* SIB, with 32-bit displacement.  Special cases:
1685         -- %esp cannot act as an index value.
1686            If index_r indicates %esp, zero is used for the index.
1687         Denoted value is:
1688            | %index == %ESP
1689            = d32 + %base
1690            | %index != %ESP
1691            = d32 + %base + (%index << scale)
1692      */
1693      case 0x14: {
1694         UChar sib     = getIByte(delta);
1695         UChar scale   = toUChar((sib >> 6) & 3);
1696         UChar index_r = toUChar((sib >> 3) & 7);
1697         UChar base_r  = toUChar(sib & 7);
1698         UInt d        = getUDisp32(delta+1);
1699
1700         if (index_r == R_ESP) {
1701            DIS(buf, "%s%d(%s,,)", sorbTxt(sorb),
1702                                   (Int)d, nameIReg(4,base_r));
1703            *len = 6;
1704            return disAMode_copy2tmp(
1705                   handleSegOverride(sorb,
1706                      binop(Iop_Add32, getIReg(4,base_r), mkU32(d)) ));
1707         } else {
1708            DIS(buf, "%s%d(%s,%s,%d)", sorbTxt(sorb), (Int)d,
1709                     nameIReg(4,base_r), nameIReg(4,index_r), 1<<scale);
1710            *len = 6;
1711            return
1712                disAMode_copy2tmp(
1713                handleSegOverride(sorb,
1714                  binop(Iop_Add32,
1715                        binop(Iop_Add32,
1716                              getIReg(4,base_r),
1717                              binop(Iop_Shl32,
1718                                    getIReg(4,index_r), mkU8(scale))),
1719                        mkU32(d))));
1720         }
1721         /*NOTREACHED*/
1722         vassert(0);
1723      }
1724
1725      default:
1726         vpanic("disAMode(x86)");
1727         return 0; /*notreached*/
1728   }
1729}
1730
1731
1732/* Figure out the number of (insn-stream) bytes constituting the amode
1733   beginning at delta.  Is useful for getting hold of literals beyond
1734   the end of the amode before it has been disassembled.  */
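    /* For example, the amode d32(%eax) -- modrm byte 0x80 -- occupies
       5 bytes: the modrm byte itself plus a 4-byte displacement. */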
1735
1736static UInt lengthAMode ( Int delta )
1737{
1738   UChar mod_reg_rm = getIByte(delta); delta++;
1739
1740   /* squeeze out the reg field from mod_reg_rm, since a 256-entry
1741      jump table seems a bit excessive.
1742   */
1743   mod_reg_rm &= 0xC7;               /* is now XX000YYY */
1744   mod_reg_rm  = toUChar(mod_reg_rm | (mod_reg_rm >> 3));
1745                                     /* is now XX0XXYYY */
1746   mod_reg_rm &= 0x1F;               /* is now 000XXYYY */
1747   switch (mod_reg_rm) {
1748
1749      /* (%eax) .. (%edi), not including (%esp) or (%ebp). */
1750      case 0x00: case 0x01: case 0x02: case 0x03:
1751      /* ! 04 */ /* ! 05 */ case 0x06: case 0x07:
1752         return 1;
1753
1754      /* d8(%eax) ... d8(%edi), not including d8(%esp). */
1755      case 0x08: case 0x09: case 0x0A: case 0x0B:
1756      /* ! 0C */ case 0x0D: case 0x0E: case 0x0F:
1757         return 2;
1758
1759      /* d32(%eax) ... d32(%edi), not including d32(%esp). */
1760      case 0x10: case 0x11: case 0x12: case 0x13:
1761      /* ! 14 */ case 0x15: case 0x16: case 0x17:
1762         return 5;
1763
1764      /* a register, %eax .. %edi.  (Not an addr, but still handled.) */
1765      case 0x18: case 0x19: case 0x1A: case 0x1B:
1766      case 0x1C: case 0x1D: case 0x1E: case 0x1F:
1767         return 1;
1768
1769      /* a 32-bit literal address. */
1770      case 0x05: return 5;
1771
1772      /* SIB, no displacement.  */
1773      case 0x04: {
1774         UChar sib    = getIByte(delta);
1775         UChar base_r = toUChar(sib & 7);
1776         if (base_r == R_EBP) return 6; else return 2;
1777      }
1778      /* SIB, with 8-bit displacement.  */
1779      case 0x0C: return 3;
1780
1781      /* SIB, with 32-bit displacement.  */
1782      case 0x14: return 6;
1783
1784      default:
1785         vpanic("lengthAMode");
1786         return 0; /*notreached*/
1787   }
1788}
1789
1790/*------------------------------------------------------------*/
1791/*--- Disassembling common idioms                          ---*/
1792/*------------------------------------------------------------*/
1793
1794/* Handle binary integer instructions of the form
1795      op E, G  meaning
1796      op reg-or-mem, reg
1797   Is passed a pointer to the modRM byte, the actual operation, and the
1798   data size.  Returns the address advanced completely over this
1799   instruction.
1800
1801   E(src) is reg-or-mem
1802   G(dst) is reg.
1803
1804   If E is reg, -->    GET %G,  tmp
1805                       OP %E,   tmp
1806                       PUT tmp, %G
1807
1808   If E is mem and OP is not reversible,
1809                -->    (getAddr E) -> tmpa
1810                       LD (tmpa), tmpa
1811                       GET %G, tmp2
1812                       OP tmpa, tmp2
1813                       PUT tmp2, %G
1814
1815   If E is mem and OP is reversible
1816                -->    (getAddr E) -> tmpa
1817                       LD (tmpa), tmpa
1818                       OP %G, tmpa
1819                       PUT tmpa, %G
1820*/
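    /* Example: "addl (%ebx),%ecx" takes the E-is-mem path, so the IR
       produced is roughly
          (getAddr (%ebx)) -> tmpa
          LD (tmpa), tmpa
          GET %ecx, tmp2
          ADD tmpa, tmp2
          PUT tmp2, %ecx
       plus the usual eflags thunk update. */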
1821static
1822UInt dis_op2_E_G ( UChar       sorb,
1823                   Bool        addSubCarry,
1824                   IROp        op8,
1825                   Bool        keep,
1826                   Int         size,
1827                   Int         delta0,
1828                   HChar*      t_x86opc )
1829{
1830   HChar   dis_buf[50];
1831   Int     len;
1832   IRType  ty   = szToITy(size);
1833   IRTemp  dst1 = newTemp(ty);
1834   IRTemp  src  = newTemp(ty);
1835   IRTemp  dst0 = newTemp(ty);
1836   UChar   rm   = getUChar(delta0);
1837   IRTemp  addr = IRTemp_INVALID;
1838
1839   /* addSubCarry == True indicates the intended operation is
1840      add-with-carry or subtract-with-borrow. */
1841   if (addSubCarry) {
1842      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
1843      vassert(keep);
1844   }
1845
1846   if (epartIsReg(rm)) {
1847      /* Specially handle XOR reg,reg, because that doesn't really
1848         depend on reg, and doing the obvious thing potentially
1849         generates a spurious value check failure due to the bogus
1850         dependency.  Ditto SBB reg,reg. */
1851      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
1852          && gregOfRM(rm) == eregOfRM(rm)) {
1853         putIReg(size, gregOfRM(rm), mkU(ty,0));
1854      }
1855      assign( dst0, getIReg(size,gregOfRM(rm)) );
1856      assign( src,  getIReg(size,eregOfRM(rm)) );
1857
1858      if (addSubCarry && op8 == Iop_Add8) {
1859         helper_ADC( size, dst1, dst0, src,
1860                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1861         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1862      } else
1863      if (addSubCarry && op8 == Iop_Sub8) {
1864         helper_SBB( size, dst1, dst0, src,
1865                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1866         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1867      } else {
1868         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
1869         if (isAddSub(op8))
1870            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1871         else
1872            setFlags_DEP1(op8, dst1, ty);
1873         if (keep)
1874            putIReg(size, gregOfRM(rm), mkexpr(dst1));
1875      }
1876
1877      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1878                          nameIReg(size,eregOfRM(rm)),
1879                          nameIReg(size,gregOfRM(rm)));
1880      return 1+delta0;
1881   } else {
1882      /* E refers to memory */
1883      addr = disAMode ( &len, sorb, delta0, dis_buf);
1884      assign( dst0, getIReg(size,gregOfRM(rm)) );
1885      assign( src,  loadLE(szToITy(size), mkexpr(addr)) );
1886
1887      if (addSubCarry && op8 == Iop_Add8) {
1888         helper_ADC( size, dst1, dst0, src,
1889                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1890         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1891      } else
1892      if (addSubCarry && op8 == Iop_Sub8) {
1893         helper_SBB( size, dst1, dst0, src,
1894                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1895         putIReg(size, gregOfRM(rm), mkexpr(dst1));
1896      } else {
1897         assign( dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
1898         if (isAddSub(op8))
1899            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1900         else
1901            setFlags_DEP1(op8, dst1, ty);
1902         if (keep)
1903            putIReg(size, gregOfRM(rm), mkexpr(dst1));
1904      }
1905
1906      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1907                          dis_buf,nameIReg(size,gregOfRM(rm)));
1908      return len+delta0;
1909   }
1910}
1911
1912
1913
1914/* Handle binary integer instructions of the form
1915      op G, E  meaning
1916      op reg, reg-or-mem
1917   Is passed a pointer to the modRM byte, the actual operation, and the
1918   data size.  Returns the address advanced completely over this
1919   instruction.
1920
1921   G(src) is reg.
1922   E(dst) is reg-or-mem
1923
1924   If E is reg, -->    GET %E,  tmp
1925                       OP %G,   tmp
1926                       PUT tmp, %E
1927
1928   If E is mem, -->    (getAddr E) -> tmpa
1929                       LD (tmpa), tmpv
1930                       OP %G, tmpv
1931                       ST tmpv, (tmpa)
1932*/
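    /* Example: "addl %ecx,(%ebx)" takes the E-is-mem path, so the IR
       produced is roughly
          (getAddr (%ebx)) -> tmpa
          LD (tmpa), tmpv
          ADD %ecx, tmpv
          ST tmpv, (tmpa)
       plus the eflags thunk update; with a LOCK prefix the store is
       instead expressed as a compare-and-swap (casLE). */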
1933static
1934UInt dis_op2_G_E ( UChar       sorb,
1935                   Bool        locked,
1936                   Bool        addSubCarry,
1937                   IROp        op8,
1938                   Bool        keep,
1939                   Int         size,
1940                   Int         delta0,
1941                   HChar*      t_x86opc )
1942{
1943   HChar   dis_buf[50];
1944   Int     len;
1945   IRType  ty   = szToITy(size);
1946   IRTemp  dst1 = newTemp(ty);
1947   IRTemp  src  = newTemp(ty);
1948   IRTemp  dst0 = newTemp(ty);
1949   UChar   rm   = getIByte(delta0);
1950   IRTemp  addr = IRTemp_INVALID;
1951
1952   /* addSubCarry == True indicates the intended operation is
1953      add-with-carry or subtract-with-borrow. */
1954   if (addSubCarry) {
1955      vassert(op8 == Iop_Add8 || op8 == Iop_Sub8);
1956      vassert(keep);
1957   }
1958
1959   if (epartIsReg(rm)) {
1960      /* Specially handle XOR reg,reg, because that doesn't really
1961         depend on reg, and doing the obvious thing potentially
1962         generates a spurious value check failure due to the bogus
1963         dependency.  Ditto SBB reg,reg.*/
1964      if ((op8 == Iop_Xor8 || (op8 == Iop_Sub8 && addSubCarry))
1965          && gregOfRM(rm) == eregOfRM(rm)) {
1966         putIReg(size, eregOfRM(rm), mkU(ty,0));
1967      }
1968      assign(dst0, getIReg(size,eregOfRM(rm)));
1969      assign(src,  getIReg(size,gregOfRM(rm)));
1970
1971      if (addSubCarry && op8 == Iop_Add8) {
1972         helper_ADC( size, dst1, dst0, src,
1973                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1974         putIReg(size, eregOfRM(rm), mkexpr(dst1));
1975      } else
1976      if (addSubCarry && op8 == Iop_Sub8) {
1977         helper_SBB( size, dst1, dst0, src,
1978                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
1979         putIReg(size, eregOfRM(rm), mkexpr(dst1));
1980      } else {
1981         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
1982         if (isAddSub(op8))
1983            setFlags_DEP1_DEP2(op8, dst0, src, ty);
1984         else
1985            setFlags_DEP1(op8, dst1, ty);
1986         if (keep)
1987            putIReg(size, eregOfRM(rm), mkexpr(dst1));
1988      }
1989
1990      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
1991                          nameIReg(size,gregOfRM(rm)),
1992                          nameIReg(size,eregOfRM(rm)));
1993      return 1+delta0;
1994   }
1995
1996   /* E refers to memory */
1997   {
1998      addr = disAMode ( &len, sorb, delta0, dis_buf);
1999      assign(dst0, loadLE(ty,mkexpr(addr)));
2000      assign(src,  getIReg(size,gregOfRM(rm)));
2001
2002      if (addSubCarry && op8 == Iop_Add8) {
2003         if (locked) {
2004            /* cas-style store */
2005            helper_ADC( size, dst1, dst0, src,
2006                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2007         } else {
2008            /* normal store */
2009            helper_ADC( size, dst1, dst0, src,
2010                        /*store*/addr, IRTemp_INVALID, 0 );
2011         }
2012      } else
2013      if (addSubCarry && op8 == Iop_Sub8) {
2014         if (locked) {
2015            /* cas-style store */
2016            helper_SBB( size, dst1, dst0, src,
2017                        /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2018         } else {
2019            /* normal store */
2020            helper_SBB( size, dst1, dst0, src,
2021                        /*store*/addr, IRTemp_INVALID, 0 );
2022         }
2023      } else {
2024         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2025         if (keep) {
2026            if (locked) {
2027               if (0) vex_printf("locked case\n" );
2028               casLE( mkexpr(addr),
2029                      mkexpr(dst0)/*expval*/,
2030                      mkexpr(dst1)/*newval*/, guest_EIP_curr_instr );
2031            } else {
2032               if (0) vex_printf("nonlocked case\n");
2033               storeLE(mkexpr(addr), mkexpr(dst1));
2034            }
2035         }
2036         if (isAddSub(op8))
2037            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2038         else
2039            setFlags_DEP1(op8, dst1, ty);
2040      }
2041
2042      DIP("%s%c %s,%s\n", t_x86opc, nameISize(size),
2043                          nameIReg(size,gregOfRM(rm)), dis_buf);
2044      return len+delta0;
2045   }
2046}
2047
2048
2049/* Handle move instructions of the form
2050      mov E, G  meaning
2051      mov reg-or-mem, reg
2052   Is passed a pointer to the modRM byte, and the data size.  Returns
2053   the address advanced completely over this instruction.
2054
2055   E(src) is reg-or-mem
2056   G(dst) is reg.
2057
2058   If E is reg, -->    GET %E,  tmpv
2059                       PUT tmpv, %G
2060
2061   If E is mem  -->    (getAddr E) -> tmpa
2062                       LD (tmpa), tmpb
2063                       PUT tmpb, %G
2064*/
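    /* Example: "movl 8(%ebp),%eax" takes the E-is-mem path, roughly:
          (getAddr 8(%ebp)) -> tmpa
          LD (tmpa), tmpb
          PUT tmpb, %eax
    */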
2065static
2066UInt dis_mov_E_G ( UChar       sorb,
2067                   Int         size,
2068                   Int         delta0 )
2069{
2070   Int len;
2071   UChar rm = getIByte(delta0);
2072   HChar dis_buf[50];
2073
2074   if (epartIsReg(rm)) {
2075      putIReg(size, gregOfRM(rm), getIReg(size, eregOfRM(rm)));
2076      DIP("mov%c %s,%s\n", nameISize(size),
2077                           nameIReg(size,eregOfRM(rm)),
2078                           nameIReg(size,gregOfRM(rm)));
2079      return 1+delta0;
2080   }
2081
2082   /* E refers to memory */
2083   {
2084      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
2085      putIReg(size, gregOfRM(rm), loadLE(szToITy(size), mkexpr(addr)));
2086      DIP("mov%c %s,%s\n", nameISize(size),
2087                           dis_buf,nameIReg(size,gregOfRM(rm)));
2088      return delta0+len;
2089   }
2090}
2091
2092
2093/* Handle move instructions of the form
2094      mov G, E  meaning
2095      mov reg, reg-or-mem
2096   Is passed a pointer to the modRM byte, and the data size.  Returns
2097   the address advanced completely over this instruction.
2098
2099   G(src) is reg.
2100   E(dst) is reg-or-mem
2101
2102   If E is reg, -->    GET %G,  tmp
2103                       PUT tmp, %E
2104
2105   If E is mem, -->    (getAddr E) -> tmpa
2106                       GET %G, tmpv
2107                       ST tmpv, (tmpa)
2108*/
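    /* Example: "movl %eax,8(%ebp)" takes the E-is-mem path, roughly:
          (getAddr 8(%ebp)) -> tmpa
          GET %eax, tmpv
          ST tmpv, (tmpa)
    */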
2109static
2110UInt dis_mov_G_E ( UChar       sorb,
2111                   Int         size,
2112                   Int         delta0 )
2113{
2114   Int len;
2115   UChar rm = getIByte(delta0);
2116   HChar dis_buf[50];
2117
2118   if (epartIsReg(rm)) {
2119      putIReg(size, eregOfRM(rm), getIReg(size, gregOfRM(rm)));
2120      DIP("mov%c %s,%s\n", nameISize(size),
2121                           nameIReg(size,gregOfRM(rm)),
2122                           nameIReg(size,eregOfRM(rm)));
2123      return 1+delta0;
2124   }
2125
2126   /* E refers to memory */
2127   {
2128      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf);
2129      storeLE( mkexpr(addr), getIReg(size, gregOfRM(rm)) );
2130      DIP("mov%c %s,%s\n", nameISize(size),
2131                           nameIReg(size,gregOfRM(rm)), dis_buf);
2132      return len+delta0;
2133   }
2134}
2135
2136
2137/* op $immediate, AL/AX/EAX. */
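    /* For example, called with size == 4, op8 == Iop_Add8 and
       keep == True this implements "addl $imm32,%eax" (opcode 05);
       with keep == False the result is discarded and only the flags
       thunk is updated, as for the CMP form. */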
2138static
2139UInt dis_op_imm_A ( Int    size,
2140                    Bool   carrying,
2141                    IROp   op8,
2142                    Bool   keep,
2143                    Int    delta,
2144                    HChar* t_x86opc )
2145{
2146   IRType ty   = szToITy(size);
2147   IRTemp dst0 = newTemp(ty);
2148   IRTemp src  = newTemp(ty);
2149   IRTemp dst1 = newTemp(ty);
2150   UInt lit    = getUDisp(size,delta);
2151   assign(dst0, getIReg(size,R_EAX));
2152   assign(src,  mkU(ty,lit));
2153
2154   if (isAddSub(op8) && !carrying) {
2155      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2156      setFlags_DEP1_DEP2(op8, dst0, src, ty);
2157   }
2158   else
2159   if (isLogic(op8)) {
2160      vassert(!carrying);
2161      assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)) );
2162      setFlags_DEP1(op8, dst1, ty);
2163   }
2164   else
2165   if (op8 == Iop_Add8 && carrying) {
2166      helper_ADC( size, dst1, dst0, src,
2167                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2168   }
2169   else
2170   if (op8 == Iop_Sub8 && carrying) {
2171      helper_SBB( size, dst1, dst0, src,
2172                  /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2173   }
2174   else
2175      vpanic("dis_op_imm_A(x86,guest)");
2176
2177   if (keep)
2178      putIReg(size, R_EAX, mkexpr(dst1));
2179
2180   DIP("%s%c $0x%x, %s\n", t_x86opc, nameISize(size),
2181                           lit, nameIReg(size,R_EAX));
2182   return delta+size;
2183}
2184
2185
2186/* Sign- and Zero-extending moves. */
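    /* Example: "movzbl %cl,%eax" (opcode 0F B6) corresponds to
       szs == 1, szd == 4, sign_extend == False: %cl is fetched,
       widened via mkWidenOp(1,4,False) and written to %eax.
       "movsbl" (0F BE) is the sign-extending counterpart. */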
2187static
2188UInt dis_movx_E_G ( UChar      sorb,
2189                    Int delta, Int szs, Int szd, Bool sign_extend )
2190{
2191   UChar rm = getIByte(delta);
2192   if (epartIsReg(rm)) {
2193      if (szd == szs) {
2194         // mutant case.  See #250799
2195         putIReg(szd, gregOfRM(rm),
2196                           getIReg(szs,eregOfRM(rm)));
2197      } else {
2198         // normal case
2199         putIReg(szd, gregOfRM(rm),
2200                      unop(mkWidenOp(szs,szd,sign_extend),
2201                           getIReg(szs,eregOfRM(rm))));
2202      }
2203      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2204                               nameISize(szs), nameISize(szd),
2205                               nameIReg(szs,eregOfRM(rm)),
2206                               nameIReg(szd,gregOfRM(rm)));
2207      return 1+delta;
2208   }
2209
2210   /* E refers to memory */
2211   {
2212      Int    len;
2213      HChar  dis_buf[50];
2214      IRTemp addr = disAMode ( &len, sorb, delta, dis_buf );
2215      if (szd == szs) {
2216         // mutant case.  See #250799
2217         putIReg(szd, gregOfRM(rm),
2218                           loadLE(szToITy(szs),mkexpr(addr)));
2219      } else {
2220         // normal case
2221         putIReg(szd, gregOfRM(rm),
2222                      unop(mkWidenOp(szs,szd,sign_extend),
2223                           loadLE(szToITy(szs),mkexpr(addr))));
2224      }
2225      DIP("mov%c%c%c %s,%s\n", sign_extend ? 's' : 'z',
2226                               nameISize(szs), nameISize(szd),
2227                               dis_buf, nameIReg(szd,gregOfRM(rm)));
2228      return len+delta;
2229   }
2230}
2231
2232
2233/* Generate code to divide ArchRegs EDX:EAX / DX:AX / AX by the 32 /
2234   16 / 8 bit quantity in the given IRTemp.  */
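    /* Example: for sz == 4 and an unsigned divide, EDX:EAX forms the
       64-bit dividend; with EDX == 0, EAX == 7 and a divisor of 2,
       the quotient 3 is written to EAX and the remainder 1 to EDX,
       as for the hardware DIV instruction. */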
2235static
2236void codegen_div ( Int sz, IRTemp t, Bool signed_divide )
2237{
2238   IROp   op    = signed_divide ? Iop_DivModS64to32 : Iop_DivModU64to32;
2239   IRTemp src64 = newTemp(Ity_I64);
2240   IRTemp dst64 = newTemp(Ity_I64);
2241   switch (sz) {
2242      case 4:
2243         assign( src64, binop(Iop_32HLto64,
2244                              getIReg(4,R_EDX), getIReg(4,R_EAX)) );
2245         assign( dst64, binop(op, mkexpr(src64), mkexpr(t)) );
2246         putIReg( 4, R_EAX, unop(Iop_64to32,mkexpr(dst64)) );
2247         putIReg( 4, R_EDX, unop(Iop_64HIto32,mkexpr(dst64)) );
2248         break;
2249      case 2: {
2250         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2251         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2252         assign( src64, unop(widen3264,
2253                             binop(Iop_16HLto32,
2254                                   getIReg(2,R_EDX), getIReg(2,R_EAX))) );
2255         assign( dst64, binop(op, mkexpr(src64), unop(widen1632,mkexpr(t))) );
2256         putIReg( 2, R_EAX, unop(Iop_32to16,unop(Iop_64to32,mkexpr(dst64))) );
2257         putIReg( 2, R_EDX, unop(Iop_32to16,unop(Iop_64HIto32,mkexpr(dst64))) );
2258         break;
2259      }
2260      case 1: {
2261         IROp widen3264 = signed_divide ? Iop_32Sto64 : Iop_32Uto64;
2262         IROp widen1632 = signed_divide ? Iop_16Sto32 : Iop_16Uto32;
2263         IROp widen816  = signed_divide ? Iop_8Sto16  : Iop_8Uto16;
2264         assign( src64, unop(widen3264, unop(widen1632, getIReg(2,R_EAX))) );
2265         assign( dst64,
2266                 binop(op, mkexpr(src64),
2267                           unop(widen1632, unop(widen816, mkexpr(t)))) );
2268         putIReg( 1, R_AL, unop(Iop_16to8, unop(Iop_32to16,
2269                           unop(Iop_64to32,mkexpr(dst64)))) );
2270         putIReg( 1, R_AH, unop(Iop_16to8, unop(Iop_32to16,
2271                           unop(Iop_64HIto32,mkexpr(dst64)))) );
2272         break;
2273      }
2274      default: vpanic("codegen_div(x86)");
2275   }
2276}
2277
2278
2279static
2280UInt dis_Grp1 ( UChar sorb, Bool locked,
2281                Int delta, UChar modrm,
2282                Int am_sz, Int d_sz, Int sz, UInt d32 )
2283{
2284   Int     len;
2285   HChar   dis_buf[50];
2286   IRType  ty   = szToITy(sz);
2287   IRTemp  dst1 = newTemp(ty);
2288   IRTemp  src  = newTemp(ty);
2289   IRTemp  dst0 = newTemp(ty);
2290   IRTemp  addr = IRTemp_INVALID;
2291   IROp    op8  = Iop_INVALID;
2292   UInt    mask = sz==1 ? 0xFF : (sz==2 ? 0xFFFF : 0xFFFFFFFF);
2293
2294   switch (gregOfRM(modrm)) {
2295      case 0: op8 = Iop_Add8; break;  case 1: op8 = Iop_Or8;  break;
2296      case 2: break;  // ADC
2297      case 3: break;  // SBB
2298      case 4: op8 = Iop_And8; break;  case 5: op8 = Iop_Sub8; break;
2299      case 6: op8 = Iop_Xor8; break;  case 7: op8 = Iop_Sub8; break;
2300      /*NOTREACHED*/
2301      default: vpanic("dis_Grp1: unhandled case");
2302   }
2303
2304   if (epartIsReg(modrm)) {
2305      vassert(am_sz == 1);
2306
2307      assign(dst0, getIReg(sz,eregOfRM(modrm)));
2308      assign(src,  mkU(ty,d32 & mask));
2309
2310      if (gregOfRM(modrm) == 2 /* ADC */) {
2311         helper_ADC( sz, dst1, dst0, src,
2312                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2313      } else
2314      if (gregOfRM(modrm) == 3 /* SBB */) {
2315         helper_SBB( sz, dst1, dst0, src,
2316                     /*no store*/IRTemp_INVALID, IRTemp_INVALID, 0 );
2317      } else {
2318         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2319         if (isAddSub(op8))
2320            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2321         else
2322            setFlags_DEP1(op8, dst1, ty);
2323      }
2324
2325      if (gregOfRM(modrm) < 7)
2326         putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2327
2328      delta += (am_sz + d_sz);
2329      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz), d32,
2330                              nameIReg(sz,eregOfRM(modrm)));
2331   } else {
2332      addr = disAMode ( &len, sorb, delta, dis_buf);
2333
2334      assign(dst0, loadLE(ty,mkexpr(addr)));
2335      assign(src, mkU(ty,d32 & mask));
2336
2337      if (gregOfRM(modrm) == 2 /* ADC */) {
2338         if (locked) {
2339            /* cas-style store */
2340            helper_ADC( sz, dst1, dst0, src,
2341                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2342         } else {
2343            /* normal store */
2344            helper_ADC( sz, dst1, dst0, src,
2345                        /*store*/addr, IRTemp_INVALID, 0 );
2346         }
2347      } else
2348      if (gregOfRM(modrm) == 3 /* SBB */) {
2349         if (locked) {
2350            /* cas-style store */
2351            helper_SBB( sz, dst1, dst0, src,
2352                       /*store*/addr, dst0/*expVal*/, guest_EIP_curr_instr );
2353         } else {
2354            /* normal store */
2355            helper_SBB( sz, dst1, dst0, src,
2356                        /*store*/addr, IRTemp_INVALID, 0 );
2357         }
2358      } else {
2359         assign(dst1, binop(mkSizedOp(ty,op8), mkexpr(dst0), mkexpr(src)));
2360         if (gregOfRM(modrm) < 7) {
2361            if (locked) {
2362               casLE( mkexpr(addr), mkexpr(dst0)/*expVal*/,
2363                                    mkexpr(dst1)/*newVal*/,
2364                                    guest_EIP_curr_instr );
2365            } else {
2366               storeLE(mkexpr(addr), mkexpr(dst1));
2367            }
2368         }
2369         if (isAddSub(op8))
2370            setFlags_DEP1_DEP2(op8, dst0, src, ty);
2371         else
2372            setFlags_DEP1(op8, dst1, ty);
2373      }
2374
2375      delta += (len+d_sz);
2376      DIP("%s%c $0x%x, %s\n", nameGrp1(gregOfRM(modrm)), nameISize(sz),
2377                              d32, dis_buf);
2378   }
2379   return delta;
2380}
2381
2382
2383/* Group 2 extended opcodes.  shift_expr must be an 8-bit typed
2384   expression. */
2385
2386static
2387UInt dis_Grp2 ( UChar sorb,
2388                Int delta, UChar modrm,
2389                Int am_sz, Int d_sz, Int sz, IRExpr* shift_expr,
2390                HChar* shift_expr_txt, Bool* decode_OK )
2391{
2392   /* delta on entry points at the modrm byte. */
2393   HChar  dis_buf[50];
2394   Int    len;
2395   Bool   isShift, isRotate, isRotateC;
2396   IRType ty    = szToITy(sz);
2397   IRTemp dst0  = newTemp(ty);
2398   IRTemp dst1  = newTemp(ty);
2399   IRTemp addr  = IRTemp_INVALID;
2400
2401   *decode_OK = True;
2402
2403   vassert(sz == 1 || sz == 2 || sz == 4);
2404
2405   /* Put value to shift/rotate in dst0. */
2406   if (epartIsReg(modrm)) {
2407      assign(dst0, getIReg(sz, eregOfRM(modrm)));
2408      delta += (am_sz + d_sz);
2409   } else {
2410      addr = disAMode ( &len, sorb, delta, dis_buf);
2411      assign(dst0, loadLE(ty,mkexpr(addr)));
2412      delta += len + d_sz;
2413   }
2414
2415   isShift = False;
2416   switch (gregOfRM(modrm)) { case 4: case 5: case 6: case 7: isShift = True; }
2417
2418   isRotate = False;
2419   switch (gregOfRM(modrm)) { case 0: case 1: isRotate = True; }
2420
2421   isRotateC = False;
2422   switch (gregOfRM(modrm)) { case 2: case 3: isRotateC = True; }
2423
2424   if (!isShift && !isRotate && !isRotateC) {
2425      /*NOTREACHED*/
2426      vpanic("dis_Grp2(Reg): unhandled case(x86)");
2427   }
2428
2429   if (isRotateC) {
2430      /* call a helper; these insns are so ridiculous they do not
2431         deserve better */
2432      Bool     left = toBool(gregOfRM(modrm) == 2);
2433      IRTemp   r64  = newTemp(Ity_I64);
2434      IRExpr** args
2435         = mkIRExprVec_4( widenUto32(mkexpr(dst0)), /* thing to rotate */
2436                          widenUto32(shift_expr),   /* rotate amount */
2437                          widenUto32(mk_x86g_calculate_eflags_all()),
2438                          mkU32(sz) );
2439      assign( r64, mkIRExprCCall(
2440                      Ity_I64,
2441                      0/*regparm*/,
2442                      left ? "x86g_calculate_RCL" : "x86g_calculate_RCR",
2443                      left ? &x86g_calculate_RCL  : &x86g_calculate_RCR,
2444                      args
2445                   )
2446            );
2447      /* new eflags in hi half r64; new value in lo half r64 */
2448      assign( dst1, narrowTo(ty, unop(Iop_64to32, mkexpr(r64))) );
2449      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2450      stmt( IRStmt_Put( OFFB_CC_DEP1, unop(Iop_64HIto32, mkexpr(r64)) ));
2451      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2452      /* Set NDEP even though it isn't used.  This makes redundant-PUT
2453         elimination of previous stores to this field work better. */
2454      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2455   }
2456
2457   if (isShift) {
2458
2459      IRTemp pre32     = newTemp(Ity_I32);
2460      IRTemp res32     = newTemp(Ity_I32);
2461      IRTemp res32ss   = newTemp(Ity_I32);
2462      IRTemp shift_amt = newTemp(Ity_I8);
2463      IROp   op32;
2464
2465      switch (gregOfRM(modrm)) {
2466         case 4: op32 = Iop_Shl32; break;
2467         case 5: op32 = Iop_Shr32; break;
2468         case 6: op32 = Iop_Shl32; break;
2469         case 7: op32 = Iop_Sar32; break;
2470         /*NOTREACHED*/
2471         default: vpanic("dis_Grp2:shift"); break;
2472      }
2473
2474      /* Widen the value to be shifted to 32 bits, do the shift, and
2475         narrow back down.  This seems surprisingly long-winded, but
2476         unfortunately the Intel semantics requires that 8/16-bit
2477         shifts give defined results for shift values all the way up
2478         to 31, and this seems the simplest way to do it.  It has the
2479         advantage that the only IR level shifts generated are of 32
2480         bit values, and the shift amount is guaranteed to be in the
2481         range 0 .. 31, thereby observing the IR semantics requiring
2482         all shift values to be in the range 0 .. word_size-1. */
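          /* For example, "shrw $20, %ax" zero-widens %ax to 32 bits
             and shifts right by 20 & 31 == 20, giving 0, which is
             what the hardware produces; a direct 16-bit IR shift by
             20 would fall outside the permitted range. */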
2483
2484      /* shift_amt = shift_expr & 31, regardless of operation size */
2485      assign( shift_amt, binop(Iop_And8, shift_expr, mkU8(31)) );
2486
2487      /* suitably widen the value to be shifted to 32 bits. */
2488      assign( pre32, op32==Iop_Sar32 ? widenSto32(mkexpr(dst0))
2489                                     : widenUto32(mkexpr(dst0)) );
2490
2491      /* res32 = pre32 `shift` shift_amt */
2492      assign( res32, binop(op32, mkexpr(pre32), mkexpr(shift_amt)) );
2493
2494      /* res32ss = pre32 `shift` ((shift_amt - 1) & 31) */
2495      assign( res32ss,
2496              binop(op32,
2497                    mkexpr(pre32),
2498                    binop(Iop_And8,
2499                          binop(Iop_Sub8,
2500                                mkexpr(shift_amt), mkU8(1)),
2501                          mkU8(31))) );
2502
2503      /* Build the flags thunk. */
2504      setFlags_DEP1_DEP2_shift(op32, res32, res32ss, ty, shift_amt);
2505
2506      /* Narrow the result back down. */
2507      assign( dst1, narrowTo(ty, mkexpr(res32)) );
2508
2509   } /* if (isShift) */
2510
2511   else
2512   if (isRotate) {
2513      Int    ccOp      = ty==Ity_I8 ? 0 : (ty==Ity_I16 ? 1 : 2);
2514      Bool   left      = toBool(gregOfRM(modrm) == 0);
2515      IRTemp rot_amt   = newTemp(Ity_I8);
2516      IRTemp rot_amt32 = newTemp(Ity_I8);
2517      IRTemp oldFlags  = newTemp(Ity_I32);
2518
2519      /* rot_amt = shift_expr & mask */
2520      /* By masking the rotate amount thusly, the IR-level Shl/Shr
2521         expressions never shift beyond the word size and thus remain
2522         well defined. */
2523      assign(rot_amt32, binop(Iop_And8, shift_expr, mkU8(31)));
2524
2525      if (ty == Ity_I32)
2526         assign(rot_amt, mkexpr(rot_amt32));
2527      else
2528         assign(rot_amt, binop(Iop_And8, mkexpr(rot_amt32), mkU8(8*sz-1)));
2529
2530      if (left) {
2531
2532         /* dst1 = (dst0 << rot_amt) | (dst0 >>u (wordsize-rot_amt)) */
2533         assign(dst1,
2534            binop( mkSizedOp(ty,Iop_Or8),
2535                   binop( mkSizedOp(ty,Iop_Shl8),
2536                          mkexpr(dst0),
2537                          mkexpr(rot_amt)
2538                   ),
2539                   binop( mkSizedOp(ty,Iop_Shr8),
2540                          mkexpr(dst0),
2541                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2542                   )
2543            )
2544         );
2545         ccOp += X86G_CC_OP_ROLB;
2546
2547      } else { /* right */
2548
2549         /* dst1 = (dst0 >>u rot_amt) | (dst0 << (wordsize-rot_amt)) */
2550         assign(dst1,
2551            binop( mkSizedOp(ty,Iop_Or8),
2552                   binop( mkSizedOp(ty,Iop_Shr8),
2553                          mkexpr(dst0),
2554                          mkexpr(rot_amt)
2555                   ),
2556                   binop( mkSizedOp(ty,Iop_Shl8),
2557                          mkexpr(dst0),
2558                          binop(Iop_Sub8,mkU8(8*sz), mkexpr(rot_amt))
2559                   )
2560            )
2561         );
2562         ccOp += X86G_CC_OP_RORB;
2563
2564      }
2565
2566      /* dst1 now holds the rotated value.  Build flag thunk.  We
2567         need the resulting value for this, and the previous flags.
2568         Except don't set it if the rotate count is zero. */
2569
2570      assign(oldFlags, mk_x86g_calculate_eflags_all());
2571
2572      /* CC_DEP1 is the rotated value.  CC_NDEP is flags before. */
2573      stmt( IRStmt_Put( OFFB_CC_OP,
2574                        IRExpr_Mux0X( mkexpr(rot_amt32),
2575                                      IRExpr_Get(OFFB_CC_OP,Ity_I32),
2576                                      mkU32(ccOp))) );
2577      stmt( IRStmt_Put( OFFB_CC_DEP1,
2578                        IRExpr_Mux0X( mkexpr(rot_amt32),
2579                                      IRExpr_Get(OFFB_CC_DEP1,Ity_I32),
2580                                      widenUto32(mkexpr(dst1)))) );
2581      stmt( IRStmt_Put( OFFB_CC_DEP2,
2582                        IRExpr_Mux0X( mkexpr(rot_amt32),
2583                                      IRExpr_Get(OFFB_CC_DEP2,Ity_I32),
2584                                      mkU32(0))) );
2585      stmt( IRStmt_Put( OFFB_CC_NDEP,
2586                        IRExpr_Mux0X( mkexpr(rot_amt32),
2587                                      IRExpr_Get(OFFB_CC_NDEP,Ity_I32),
2588                                      mkexpr(oldFlags))) );
2589   } /* if (isRotate) */
2590
2591   /* Save result, and finish up. */
2592   if (epartIsReg(modrm)) {
2593      putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2594      if (vex_traceflags & VEX_TRACE_FE) {
2595         vex_printf("%s%c ",
2596                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2597         if (shift_expr_txt)
2598            vex_printf("%s", shift_expr_txt);
2599         else
2600            ppIRExpr(shift_expr);
2601         vex_printf(", %s\n", nameIReg(sz,eregOfRM(modrm)));
2602      }
2603   } else {
2604      storeLE(mkexpr(addr), mkexpr(dst1));
2605      if (vex_traceflags & VEX_TRACE_FE) {
2606         vex_printf("%s%c ",
2607                    nameGrp2(gregOfRM(modrm)), nameISize(sz) );
2608         if (shift_expr_txt)
2609            vex_printf("%s", shift_expr_txt);
2610         else
2611            ppIRExpr(shift_expr);
2612         vex_printf(", %s\n", dis_buf);
2613      }
2614   }
2615   return delta;
2616}
2617
2618
2619/* Group 8 extended opcodes (but BT/BTS/BTC/BTR only). */
2620static
2621UInt dis_Grp8_Imm ( UChar sorb,
2622                    Bool locked,
2623                    Int delta, UChar modrm,
2624                    Int am_sz, Int sz, UInt src_val,
2625                    Bool* decode_OK )
2626{
2627   /* src_val denotes a d8.
2628      And delta on entry points at the modrm byte. */
2629
2630   IRType ty     = szToITy(sz);
2631   IRTemp t2     = newTemp(Ity_I32);
2632   IRTemp t2m    = newTemp(Ity_I32);
2633   IRTemp t_addr = IRTemp_INVALID;
2634   HChar  dis_buf[50];
2635   UInt   mask;
2636
2637   /* we're optimists :-) */
2638   *decode_OK = True;
2639
2640   /* Limit src_val -- the bit offset -- to something within a word.
2641      The Intel docs say that literal offsets larger than a word are
2642      masked in this way. */
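       /* For example, "btl $35, %eax" behaves like "btl $3, %eax":
          with sz == 4 the literal offset 35 is masked with 31, so
          bit 3 of the operand is selected. */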
2643   switch (sz) {
2644      case 2:  src_val &= 15; break;
2645      case 4:  src_val &= 31; break;
2646      default: *decode_OK = False; return delta;
2647   }
2648
2649   /* Invent a mask suitable for the operation. */
2650   switch (gregOfRM(modrm)) {
2651      case 4: /* BT */  mask = 0;               break;
2652      case 5: /* BTS */ mask = 1 << src_val;    break;
2653      case 6: /* BTR */ mask = ~(1 << src_val); break;
2654      case 7: /* BTC */ mask = 1 << src_val;    break;
2655         /* If this needs to be extended, probably simplest to make a
2656            new function to handle the other cases (0 .. 3).  The
2657            Intel docs do not, however, indicate any use for 0 .. 3, so
2658            we don't expect this to happen. */
2659      default: *decode_OK = False; return delta;
2660   }
2661
2662   /* Fetch the value to be tested and modified into t2, which is
2663      32-bits wide regardless of sz. */
2664   if (epartIsReg(modrm)) {
2665      vassert(am_sz == 1);
2666      assign( t2, widenUto32(getIReg(sz, eregOfRM(modrm))) );
2667      delta += (am_sz + 1);
2668      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2669                              src_val, nameIReg(sz,eregOfRM(modrm)));
2670   } else {
2671      Int len;
2672      t_addr = disAMode ( &len, sorb, delta, dis_buf);
2673      delta  += (len+1);
2674      assign( t2, widenUto32(loadLE(ty, mkexpr(t_addr))) );
2675      DIP("%s%c $0x%x, %s\n", nameGrp8(gregOfRM(modrm)), nameISize(sz),
2676                              src_val, dis_buf);
2677   }
2678
2679   /* Compute the new value into t2m, if non-BT. */
2680   switch (gregOfRM(modrm)) {
2681      case 4: /* BT */
2682         break;
2683      case 5: /* BTS */
2684         assign( t2m, binop(Iop_Or32, mkU32(mask), mkexpr(t2)) );
2685         break;
2686      case 6: /* BTR */
2687         assign( t2m, binop(Iop_And32, mkU32(mask), mkexpr(t2)) );
2688         break;
2689      case 7: /* BTC */
2690         assign( t2m, binop(Iop_Xor32, mkU32(mask), mkexpr(t2)) );
2691         break;
2692      default:
2693         /*NOTREACHED*/ /*the previous switch guards this*/
2694         vassert(0);
2695   }
2696
2697   /* Write the result back, if non-BT.  If the CAS fails then we
2698      side-exit from the trace at this point, and so the flag state is
2699      not affected.  This is of course as required. */
2700   if (gregOfRM(modrm) != 4 /* BT */) {
2701      if (epartIsReg(modrm)) {
2702         putIReg(sz, eregOfRM(modrm), narrowTo(ty, mkexpr(t2m)));
2703      } else {
2704         if (locked) {
2705            casLE( mkexpr(t_addr),
2706                   narrowTo(ty, mkexpr(t2))/*expd*/,
2707                   narrowTo(ty, mkexpr(t2m))/*new*/,
2708                   guest_EIP_curr_instr );
2709         } else {
2710            storeLE(mkexpr(t_addr), narrowTo(ty, mkexpr(t2m)));
2711         }
2712      }
2713   }
2714
2715   /* Copy relevant bit from t2 into the carry flag. */
2716   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
2717   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
2718   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
2719   stmt( IRStmt_Put(
2720            OFFB_CC_DEP1,
2721            binop(Iop_And32,
2722                  binop(Iop_Shr32, mkexpr(t2), mkU8(src_val)),
2723                  mkU32(1))
2724       ));
2725   /* Set NDEP even though it isn't used.  This makes redundant-PUT
2726      elimination of previous stores to this field work better. */
2727   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
2728
2729   return delta;
2730}
2731
2732
2733/* Signed/unsigned widening multiply.  Generate IR to multiply the
2734   value in EAX/AX/AL by the given IRTemp, and park the result in
2735   EDX:EAX/DX:AX/AX.
2736*/
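    /* Example: for an 8-bit "mulb %bl" with AL == 0x80 and BL == 0x02,
       the 16-bit product 0x0100 is written to AX; the 16- and 32-bit
       forms likewise spread the double-length product across DX:AX
       and EDX:EAX respectively. */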
2737static void codegen_mulL_A_D ( Int sz, Bool syned,
2738                               IRTemp tmp, HChar* tmp_txt )
2739{
2740   IRType ty = szToITy(sz);
2741   IRTemp t1 = newTemp(ty);
2742
2743   assign( t1, getIReg(sz, R_EAX) );
2744
2745   switch (ty) {
2746      case Ity_I32: {
2747         IRTemp res64   = newTemp(Ity_I64);
2748         IRTemp resHi   = newTemp(Ity_I32);
2749         IRTemp resLo   = newTemp(Ity_I32);
2750         IROp   mulOp   = syned ? Iop_MullS32 : Iop_MullU32;
2751         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2752         setFlags_MUL ( Ity_I32, t1, tmp, tBaseOp );
2753         assign( res64, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2754         assign( resHi, unop(Iop_64HIto32,mkexpr(res64)));
2755         assign( resLo, unop(Iop_64to32,mkexpr(res64)));
2756         putIReg(4, R_EDX, mkexpr(resHi));
2757         putIReg(4, R_EAX, mkexpr(resLo));
2758         break;
2759      }
2760      case Ity_I16: {
2761         IRTemp res32   = newTemp(Ity_I32);
2762         IRTemp resHi   = newTemp(Ity_I16);
2763         IRTemp resLo   = newTemp(Ity_I16);
2764         IROp   mulOp   = syned ? Iop_MullS16 : Iop_MullU16;
2765         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2766         setFlags_MUL ( Ity_I16, t1, tmp, tBaseOp );
2767         assign( res32, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2768         assign( resHi, unop(Iop_32HIto16,mkexpr(res32)));
2769         assign( resLo, unop(Iop_32to16,mkexpr(res32)));
2770         putIReg(2, R_EDX, mkexpr(resHi));
2771         putIReg(2, R_EAX, mkexpr(resLo));
2772         break;
2773      }
2774      case Ity_I8: {
2775         IRTemp res16   = newTemp(Ity_I16);
2776         IRTemp resHi   = newTemp(Ity_I8);
2777         IRTemp resLo   = newTemp(Ity_I8);
2778         IROp   mulOp   = syned ? Iop_MullS8 : Iop_MullU8;
2779         UInt   tBaseOp = syned ? X86G_CC_OP_SMULB : X86G_CC_OP_UMULB;
2780         setFlags_MUL ( Ity_I8, t1, tmp, tBaseOp );
2781         assign( res16, binop(mulOp, mkexpr(t1), mkexpr(tmp)) );
2782         assign( resHi, unop(Iop_16HIto8,mkexpr(res16)));
2783         assign( resLo, unop(Iop_16to8,mkexpr(res16)));
2784         putIReg(2, R_EAX, mkexpr(res16));
2785         break;
2786      }
2787      default:
2788         vpanic("codegen_mulL_A_D(x86)");
2789   }
2790   DIP("%s%c %s\n", syned ? "imul" : "mul", nameISize(sz), tmp_txt);
2791}
2792
2793
2794/* Group 3 extended opcodes. */
2795static
2796UInt dis_Grp3 ( UChar sorb, Bool locked, Int sz, Int delta, Bool* decode_OK )
2797{
2798   UInt    d32;
2799   UChar   modrm;
2800   HChar   dis_buf[50];
2801   Int     len;
2802   IRTemp  addr;
2803   IRType  ty = szToITy(sz);
2804   IRTemp  t1 = newTemp(ty);
2805   IRTemp dst1, src, dst0;
2806
2807   *decode_OK = True; /* may change this later */
2808
2809   modrm = getIByte(delta);
2810
2811   if (locked && (gregOfRM(modrm) != 2 && gregOfRM(modrm) != 3)) {
2812      /* LOCK prefix only allowed with not and neg subopcodes */
2813      *decode_OK = False;
2814      return delta;
2815   }
2816
2817   if (epartIsReg(modrm)) {
2818      switch (gregOfRM(modrm)) {
2819         case 0: { /* TEST */
2820            delta++; d32 = getUDisp(sz, delta); delta += sz;
2821            dst1 = newTemp(ty);
2822            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2823                               getIReg(sz,eregOfRM(modrm)),
2824                               mkU(ty,d32)));
2825            setFlags_DEP1( Iop_And8, dst1, ty );
2826            DIP("test%c $0x%x, %s\n", nameISize(sz), d32,
2827                                      nameIReg(sz, eregOfRM(modrm)));
2828            break;
2829         }
2830         case 1: /* UNDEFINED */
2831           /* The Intel docs imply this insn is undefined and binutils
2832              agrees.  Unfortunately Core 2 will run it (with who
2833              knows what result?).  sandpile.org reckons it's an alias
2834              for case 0.  We play safe. */
2835           *decode_OK = False;
2836           break;
2837         case 2: /* NOT */
2838            delta++;
2839            putIReg(sz, eregOfRM(modrm),
2840                        unop(mkSizedOp(ty,Iop_Not8),
2841                             getIReg(sz, eregOfRM(modrm))));
2842            DIP("not%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2843            break;
2844         case 3: /* NEG */
2845            delta++;
2846            dst0 = newTemp(ty);
2847            src  = newTemp(ty);
2848            dst1 = newTemp(ty);
2849            assign(dst0, mkU(ty,0));
2850            assign(src,  getIReg(sz,eregOfRM(modrm)));
2851            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8), mkexpr(dst0), mkexpr(src)));
2852            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
2853            putIReg(sz, eregOfRM(modrm), mkexpr(dst1));
2854            DIP("neg%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2855            break;
2856         case 4: /* MUL (unsigned widening) */
2857            delta++;
2858            src = newTemp(ty);
2859            assign(src, getIReg(sz,eregOfRM(modrm)));
2860            codegen_mulL_A_D ( sz, False, src, nameIReg(sz,eregOfRM(modrm)) );
2861            break;
2862         case 5: /* IMUL (signed widening) */
2863            delta++;
2864            src = newTemp(ty);
2865            assign(src, getIReg(sz,eregOfRM(modrm)));
2866            codegen_mulL_A_D ( sz, True, src, nameIReg(sz,eregOfRM(modrm)) );
2867            break;
2868         case 6: /* DIV */
2869            delta++;
2870            assign( t1, getIReg(sz, eregOfRM(modrm)) );
2871            codegen_div ( sz, t1, False );
2872            DIP("div%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2873            break;
2874         case 7: /* IDIV */
2875            delta++;
2876            assign( t1, getIReg(sz, eregOfRM(modrm)) );
2877            codegen_div ( sz, t1, True );
2878            DIP("idiv%c %s\n", nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
2879            break;
2880         default:
2881            /* This can't happen - gregOfRM should return 0 .. 7 only */
2882            vpanic("Grp3(x86)");
2883      }
2884   } else {
2885      addr = disAMode ( &len, sorb, delta, dis_buf );
2886      t1   = newTemp(ty);
2887      delta += len;
2888      assign(t1, loadLE(ty,mkexpr(addr)));
2889      switch (gregOfRM(modrm)) {
2890         case 0: { /* TEST */
2891            d32 = getUDisp(sz, delta); delta += sz;
2892            dst1 = newTemp(ty);
2893            assign(dst1, binop(mkSizedOp(ty,Iop_And8),
2894                               mkexpr(t1), mkU(ty,d32)));
2895            setFlags_DEP1( Iop_And8, dst1, ty );
2896            DIP("test%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
2897            break;
2898         }
2899         case 1: /* UNDEFINED */
2900           /* See comment above on R case */
2901           *decode_OK = False;
2902           break;
2903         case 2: /* NOT */
2904            dst1 = newTemp(ty);
2905            assign(dst1, unop(mkSizedOp(ty,Iop_Not8), mkexpr(t1)));
2906            if (locked) {
2907               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
2908                                    guest_EIP_curr_instr );
2909            } else {
2910               storeLE( mkexpr(addr), mkexpr(dst1) );
2911            }
2912            DIP("not%c %s\n", nameISize(sz), dis_buf);
2913            break;
2914         case 3: /* NEG */
2915            dst0 = newTemp(ty);
2916            src  = newTemp(ty);
2917            dst1 = newTemp(ty);
2918            assign(dst0, mkU(ty,0));
2919            assign(src,  mkexpr(t1));
2920            assign(dst1, binop(mkSizedOp(ty,Iop_Sub8),
2921                               mkexpr(dst0), mkexpr(src)));
2922            if (locked) {
2923               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(dst1)/*new*/,
2924                                    guest_EIP_curr_instr );
2925            } else {
2926               storeLE( mkexpr(addr), mkexpr(dst1) );
2927            }
2928            setFlags_DEP1_DEP2(Iop_Sub8, dst0, src, ty);
2929            DIP("neg%c %s\n", nameISize(sz), dis_buf);
2930            break;
2931         case 4: /* MUL */
2932            codegen_mulL_A_D ( sz, False, t1, dis_buf );
2933            break;
2934         case 5: /* IMUL */
2935            codegen_mulL_A_D ( sz, True, t1, dis_buf );
2936            break;
2937         case 6: /* DIV */
2938            codegen_div ( sz, t1, False );
2939            DIP("div%c %s\n", nameISize(sz), dis_buf);
2940            break;
2941         case 7: /* IDIV */
2942            codegen_div ( sz, t1, True );
2943            DIP("idiv%c %s\n", nameISize(sz), dis_buf);
2944            break;
2945         default:
2946            /* This can't happen - gregOfRM should return 0 .. 7 only */
2947            vpanic("Grp3(x86)");
2948      }
2949   }
2950   return delta;
2951}
2952
2953
2954/* Group 4 extended opcodes. */
2955static
2956UInt dis_Grp4 ( UChar sorb, Bool locked, Int delta, Bool* decode_OK )
2957{
2958   Int   alen;
2959   UChar modrm;
2960   HChar dis_buf[50];
2961   IRType ty = Ity_I8;
2962   IRTemp t1 = newTemp(ty);
2963   IRTemp t2 = newTemp(ty);
2964
2965   *decode_OK = True;
2966
2967   modrm = getIByte(delta);
2968
2969   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
2970      /* LOCK prefix only allowed with inc and dec subopcodes */
2971      *decode_OK = False;
2972      return delta;
2973   }
2974
2975   if (epartIsReg(modrm)) {
2976      assign(t1, getIReg(1, eregOfRM(modrm)));
2977      switch (gregOfRM(modrm)) {
2978         case 0: /* INC */
2979            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
2980            putIReg(1, eregOfRM(modrm), mkexpr(t2));
2981            setFlags_INC_DEC( True, t2, ty );
2982            break;
2983         case 1: /* DEC */
2984            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
2985            putIReg(1, eregOfRM(modrm), mkexpr(t2));
2986            setFlags_INC_DEC( False, t2, ty );
2987            break;
2988         default:
2989            *decode_OK = False;
2990            return delta;
2991      }
2992      delta++;
2993      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)),
2994                      nameIReg(1, eregOfRM(modrm)));
2995   } else {
2996      IRTemp addr = disAMode ( &alen, sorb, delta, dis_buf );
2997      assign( t1, loadLE(ty, mkexpr(addr)) );
2998      switch (gregOfRM(modrm)) {
2999         case 0: /* INC */
3000            assign(t2, binop(Iop_Add8, mkexpr(t1), mkU8(1)));
3001            if (locked) {
3002               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
3003                      guest_EIP_curr_instr );
3004            } else {
3005               storeLE( mkexpr(addr), mkexpr(t2) );
3006            }
3007            setFlags_INC_DEC( True, t2, ty );
3008            break;
3009         case 1: /* DEC */
3010            assign(t2, binop(Iop_Sub8, mkexpr(t1), mkU8(1)));
3011            if (locked) {
3012               casLE( mkexpr(addr), mkexpr(t1)/*expd*/, mkexpr(t2)/*new*/,
3013                      guest_EIP_curr_instr );
3014            } else {
3015               storeLE( mkexpr(addr), mkexpr(t2) );
3016            }
3017            setFlags_INC_DEC( False, t2, ty );
3018            break;
3019         default:
3020            *decode_OK = False;
3021            return delta;
3022      }
3023      delta += alen;
3024      DIP("%sb %s\n", nameGrp4(gregOfRM(modrm)), dis_buf);
3025   }
3026   return delta;
3027}
3028
3029
3030/* Group 5 extended opcodes. */
3031static
3032UInt dis_Grp5 ( UChar sorb, Bool locked, Int sz, Int delta,
3033                /*MOD*/DisResult* dres, /*OUT*/Bool* decode_OK )
3034{
3035   Int     len;
3036   UChar   modrm;
3037   HChar   dis_buf[50];
3038   IRTemp  addr = IRTemp_INVALID;
3039   IRType  ty = szToITy(sz);
3040   IRTemp  t1 = newTemp(ty);
3041   IRTemp  t2 = IRTemp_INVALID;
3042
3043   *decode_OK = True;
3044
3045   modrm = getIByte(delta);
3046
3047   if (locked && (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)) {
3048      /* LOCK prefix only allowed with inc and dec subopcodes */
3049      *decode_OK = False;
3050      return delta;
3051   }
3052
3053   if (epartIsReg(modrm)) {
3054      assign(t1, getIReg(sz,eregOfRM(modrm)));
3055      switch (gregOfRM(modrm)) {
3056         case 0: /* INC */
3057            vassert(sz == 2 || sz == 4);
3058            t2 = newTemp(ty);
3059            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3060                             mkexpr(t1), mkU(ty,1)));
3061            setFlags_INC_DEC( True, t2, ty );
3062            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3063            break;
3064         case 1: /* DEC */
3065            vassert(sz == 2 || sz == 4);
3066            t2 = newTemp(ty);
3067            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3068                             mkexpr(t1), mkU(ty,1)));
3069            setFlags_INC_DEC( False, t2, ty );
3070            putIReg(sz,eregOfRM(modrm),mkexpr(t2));
3071            break;
3072         case 2: /* call Ev */
3073            vassert(sz == 4);
3074            t2 = newTemp(Ity_I32);
3075            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3076            putIReg(4, R_ESP, mkexpr(t2));
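            /* Push the return address.  delta points at the modrm
               byte, which in the register form is the last byte of
               the insn, so the fall-through address is
               guest_EIP_bbstart+delta+1.  (The memory form below uses
               delta+len instead.) */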
3077            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+1));
3078            jmp_treg(dres, Ijk_Call, t1);
3079            vassert(dres->whatNext == Dis_StopHere);
3080            break;
3081         case 4: /* jmp Ev */
3082            vassert(sz == 4);
3083            jmp_treg(dres, Ijk_Boring, t1);
3084            vassert(dres->whatNext == Dis_StopHere);
3085            break;
3086         case 6: /* PUSH Ev */
3087            vassert(sz == 4 || sz == 2);
3088            t2 = newTemp(Ity_I32);
3089            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3090            putIReg(4, R_ESP, mkexpr(t2) );
3091            storeLE( mkexpr(t2), mkexpr(t1) );
3092            break;
3093         default:
3094            *decode_OK = False;
3095            return delta;
3096      }
3097      delta++;
3098      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3099                       nameISize(sz), nameIReg(sz, eregOfRM(modrm)));
3100   } else {
3101      addr = disAMode ( &len, sorb, delta, dis_buf );
3102      assign(t1, loadLE(ty,mkexpr(addr)));
3103      switch (gregOfRM(modrm)) {
3104         case 0: /* INC */
3105            t2 = newTemp(ty);
3106            assign(t2, binop(mkSizedOp(ty,Iop_Add8),
3107                             mkexpr(t1), mkU(ty,1)));
3108            if (locked) {
3109               casLE( mkexpr(addr),
3110                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3111            } else {
3112               storeLE(mkexpr(addr),mkexpr(t2));
3113            }
3114            setFlags_INC_DEC( True, t2, ty );
3115            break;
3116         case 1: /* DEC */
3117            t2 = newTemp(ty);
3118            assign(t2, binop(mkSizedOp(ty,Iop_Sub8),
3119                             mkexpr(t1), mkU(ty,1)));
3120            if (locked) {
3121               casLE( mkexpr(addr),
3122                      mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
3123            } else {
3124               storeLE(mkexpr(addr),mkexpr(t2));
3125            }
3126            setFlags_INC_DEC( False, t2, ty );
3127            break;
3128         case 2: /* call Ev */
3129            vassert(sz == 4);
3130            t2 = newTemp(Ity_I32);
3131            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
3132            putIReg(4, R_ESP, mkexpr(t2));
3133            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta+len));
3134            jmp_treg(dres, Ijk_Call, t1);
3135            vassert(dres->whatNext == Dis_StopHere);
3136            break;
3137         case 4: /* JMP Ev */
3138            vassert(sz == 4);
3139            jmp_treg(dres, Ijk_Boring, t1);
3140            vassert(dres->whatNext == Dis_StopHere);
3141            break;
3142         case 6: /* PUSH Ev */
3143            vassert(sz == 4 || sz == 2);
3144            t2 = newTemp(Ity_I32);
3145            assign( t2, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
3146            putIReg(4, R_ESP, mkexpr(t2) );
3147            storeLE( mkexpr(t2), mkexpr(t1) );
3148            break;
3149         default:
3150            *decode_OK = False;
3151            return delta;
3152      }
3153      delta += len;
3154      DIP("%s%c %s\n", nameGrp5(gregOfRM(modrm)),
3155                       nameISize(sz), dis_buf);
3156   }
3157   return delta;
3158}
3159
3160
3161/*------------------------------------------------------------*/
3162/*--- Disassembling string ops (including REP prefixes)    ---*/
3163/*------------------------------------------------------------*/
3164
3165/* Code shared by all the string ops */
3166static
3167void dis_string_op_increment(Int sz, IRTemp t_inc)
3168{
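   /* The guest DFLAG pseudo-register is assumed (as elsewhere in this
      file) to hold +1 or -1.  Scaling it by the operand size gives
      the per-iteration pointer step:
         sz == 1 :  DFLAG        ->  +1 / -1
         sz == 2 :  DFLAG << 1   ->  +2 / -2
         sz == 4 :  DFLAG << 2   ->  +4 / -4
   */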
3169   if (sz == 4 || sz == 2) {
3170      assign( t_inc,
3171              binop(Iop_Shl32, IRExpr_Get( OFFB_DFLAG, Ity_I32 ),
3172                               mkU8(sz/2) ) );
3173   } else {
3174      assign( t_inc,
3175              IRExpr_Get( OFFB_DFLAG, Ity_I32 ) );
3176   }
3177}
3178
3179static
3180void dis_string_op( void (*dis_OP)( Int, IRTemp ),
3181                    Int sz, HChar* name, UChar sorb )
3182{
3183   IRTemp t_inc = newTemp(Ity_I32);
3184   vassert(sorb == 0); /* hmm.  so what was the point of passing it in? */
3185   dis_string_op_increment(sz, t_inc);
3186   dis_OP( sz, t_inc );
3187   DIP("%s%c\n", name, nameISize(sz));
3188}
3189
3190static
3191void dis_MOVS ( Int sz, IRTemp t_inc )
3192{
3193   IRType ty = szToITy(sz);
3194   IRTemp td = newTemp(Ity_I32);   /* EDI */
3195   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3196
3197   assign( td, getIReg(4, R_EDI) );
3198   assign( ts, getIReg(4, R_ESI) );
3199
3200   storeLE( mkexpr(td), loadLE(ty,mkexpr(ts)) );
3201
3202   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3203   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3204}
3205
3206static
3207void dis_LODS ( Int sz, IRTemp t_inc )
3208{
3209   IRType ty = szToITy(sz);
3210   IRTemp ts = newTemp(Ity_I32);   /* ESI */
3211
3212   assign( ts, getIReg(4, R_ESI) );
3213
3214   putIReg( sz, R_EAX, loadLE(ty, mkexpr(ts)) );
3215
3216   putIReg( 4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3217}
3218
3219static
3220void dis_STOS ( Int sz, IRTemp t_inc )
3221{
3222   IRType ty = szToITy(sz);
3223   IRTemp ta = newTemp(ty);        /* EAX */
3224   IRTemp td = newTemp(Ity_I32);   /* EDI */
3225
3226   assign( ta, getIReg(sz, R_EAX) );
3227   assign( td, getIReg(4, R_EDI) );
3228
3229   storeLE( mkexpr(td), mkexpr(ta) );
3230
3231   putIReg( 4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3232}
3233
3234static
3235void dis_CMPS ( Int sz, IRTemp t_inc )
3236{
3237   IRType ty  = szToITy(sz);
3238   IRTemp tdv = newTemp(ty);      /* (EDI) */
3239   IRTemp tsv = newTemp(ty);      /* (ESI) */
3240   IRTemp td  = newTemp(Ity_I32); /*  EDI  */
3241   IRTemp ts  = newTemp(Ity_I32); /*  ESI  */
3242
3243   assign( td, getIReg(4, R_EDI) );
3244   assign( ts, getIReg(4, R_ESI) );
3245
3246   assign( tdv, loadLE(ty,mkexpr(td)) );
3247   assign( tsv, loadLE(ty,mkexpr(ts)) );
3248
3249   setFlags_DEP1_DEP2 ( Iop_Sub8, tsv, tdv, ty );
3250
3251   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3252   putIReg(4, R_ESI, binop(Iop_Add32, mkexpr(ts), mkexpr(t_inc)) );
3253}
3254
3255static
3256void dis_SCAS ( Int sz, IRTemp t_inc )
3257{
3258   IRType ty  = szToITy(sz);
3259   IRTemp ta  = newTemp(ty);       /*  EAX  */
3260   IRTemp td  = newTemp(Ity_I32);  /*  EDI  */
3261   IRTemp tdv = newTemp(ty);       /* (EDI) */
3262
3263   assign( ta, getIReg(sz, R_EAX) );
3264   assign( td, getIReg(4, R_EDI) );
3265
3266   assign( tdv, loadLE(ty,mkexpr(td)) );
3267   setFlags_DEP1_DEP2 ( Iop_Sub8, ta, tdv, ty );
3268
3269   putIReg(4, R_EDI, binop(Iop_Add32, mkexpr(td), mkexpr(t_inc)) );
3270}
3271
3272
3273/* Wrap the appropriate string op inside a REP/REPE/REPNE.
3274   We assume the insn is the last one in the basic block, and so emit a jump
3275   to the next insn, rather than just falling through. */
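/* Sketch of the IR emitted below (illustrative only):

      if (ECX == 0) goto eip_next;         -- nothing (more) to do
      ECX = ECX - 1;
      <one iteration of the string op>
      if (cond == Always)  goto eip;       -- plain REP: redo the insn
      else if (cond holds) goto eip;       -- REPE/REPNE continue case
      else                 goto eip_next;
*/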
3276static
3277void dis_REP_op ( /*MOD*/DisResult* dres,
3278                  X86Condcode cond,
3279                  void (*dis_OP)(Int, IRTemp),
3280                  Int sz, Addr32 eip, Addr32 eip_next, HChar* name )
3281{
3282   IRTemp t_inc = newTemp(Ity_I32);
3283   IRTemp tc    = newTemp(Ity_I32);  /*  ECX  */
3284
3285   assign( tc, getIReg(4,R_ECX) );
3286
3287   stmt( IRStmt_Exit( binop(Iop_CmpEQ32,mkexpr(tc),mkU32(0)),
3288                      Ijk_Boring,
3289                      IRConst_U32(eip_next), OFFB_EIP ) );
3290
3291   putIReg(4, R_ECX, binop(Iop_Sub32, mkexpr(tc), mkU32(1)) );
3292
3293   dis_string_op_increment(sz, t_inc);
3294   dis_OP (sz, t_inc);
3295
3296   if (cond == X86CondAlways) {
3297      jmp_lit(dres, Ijk_Boring, eip);
3298      vassert(dres->whatNext == Dis_StopHere);
3299   } else {
3300      stmt( IRStmt_Exit( mk_x86g_calculate_condition(cond),
3301                         Ijk_Boring,
3302                         IRConst_U32(eip), OFFB_EIP ) );
3303      jmp_lit(dres, Ijk_Boring, eip_next);
3304      vassert(dres->whatNext == Dis_StopHere);
3305   }
3306   DIP("%s%c\n", name, nameISize(sz));
3307}
3308
3309
3310/*------------------------------------------------------------*/
3311/*--- Arithmetic, etc.                                     ---*/
3312/*------------------------------------------------------------*/
3313
3314/* IMUL E, G.  Supplied delta0 points at the modR/M byte. */
3315static
3316UInt dis_mul_E_G ( UChar       sorb,
3317                   Int         size,
3318                   Int         delta0 )
3319{
3320   Int    alen;
3321   HChar  dis_buf[50];
3322   UChar  rm = getIByte(delta0);
3323   IRType ty = szToITy(size);
3324   IRTemp te = newTemp(ty);
3325   IRTemp tg = newTemp(ty);
3326   IRTemp resLo = newTemp(ty);
3327
3328   assign( tg, getIReg(size, gregOfRM(rm)) );
3329   if (epartIsReg(rm)) {
3330      assign( te, getIReg(size, eregOfRM(rm)) );
3331   } else {
3332      IRTemp addr = disAMode( &alen, sorb, delta0, dis_buf );
3333      assign( te, loadLE(ty,mkexpr(addr)) );
3334   }
3335
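   /* (Assumption: setFlags_MUL, defined earlier in this file, picks
      the width-appropriate variant of the X86G_CC_OP_SMULB thunk
      opcode based on 'ty', in the same way that mkSizedOp widens
      Iop_Mul8 below.) */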
3336   setFlags_MUL ( ty, te, tg, X86G_CC_OP_SMULB );
3337
3338   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tg) ) );
3339
3340   putIReg(size, gregOfRM(rm), mkexpr(resLo) );
3341
3342   if (epartIsReg(rm)) {
3343      DIP("imul%c %s, %s\n", nameISize(size),
3344                             nameIReg(size,eregOfRM(rm)),
3345                             nameIReg(size,gregOfRM(rm)));
3346      return 1+delta0;
3347   } else {
3348      DIP("imul%c %s, %s\n", nameISize(size),
3349                             dis_buf, nameIReg(size,gregOfRM(rm)));
3350      return alen+delta0;
3351   }
3352}
3353
3354
3355/* IMUL I * E -> G.  Supplied delta points at the modR/M byte. */
3356static
3357UInt dis_imul_I_E_G ( UChar       sorb,
3358                      Int         size,
3359                      Int         delta,
3360                      Int         litsize )
3361{
3362   Int    d32, alen;
3363   HChar  dis_buf[50];
3364   UChar  rm = getIByte(delta);
3365   IRType ty = szToITy(size);
3366   IRTemp te = newTemp(ty);
3367   IRTemp tl = newTemp(ty);
3368   IRTemp resLo = newTemp(ty);
3369
3370   vassert(size == 1 || size == 2 || size == 4);
3371
3372   if (epartIsReg(rm)) {
3373      assign(te, getIReg(size, eregOfRM(rm)));
3374      delta++;
3375   } else {
3376      IRTemp addr = disAMode( &alen, sorb, delta, dis_buf );
3377      assign(te, loadLE(ty, mkexpr(addr)));
3378      delta += alen;
3379   }
3380   d32 = getSDisp(litsize,delta);
3381   delta += litsize;
3382
3383   if (size == 1) d32 &= 0xFF;
3384   if (size == 2) d32 &= 0xFFFF;
3385
3386   assign(tl, mkU(ty,d32));
3387
3388   assign( resLo, binop( mkSizedOp(ty, Iop_Mul8), mkexpr(te), mkexpr(tl) ));
3389
3390   setFlags_MUL ( ty, te, tl, X86G_CC_OP_SMULB );
3391
3392   putIReg(size, gregOfRM(rm), mkexpr(resLo));
3393
3394   DIP("imul %d, %s, %s\n", d32,
3395       ( epartIsReg(rm) ? nameIReg(size,eregOfRM(rm)) : dis_buf ),
3396       nameIReg(size,gregOfRM(rm)) );
3397   return delta;
3398}
3399
3400
3401/* Generate an IR sequence to do a count-leading-zeroes operation on
3402   the supplied IRTemp, and return a new IRTemp holding the result.
3403   'ty' may be Ity_I16 or Ity_I32 only.  In the case where the
3404   argument is zero, return the number of bits in the word (the
3405   natural semantics). */
3406static IRTemp gen_LZCNT ( IRType ty, IRTemp src )
3407{
3408   vassert(ty == Ity_I32 || ty == Ity_I16);
3409
3410   IRTemp src32 = newTemp(Ity_I32);
3411   assign(src32, widenUto32( mkexpr(src) ));
3412
3413   IRTemp src32x = newTemp(Ity_I32);
3414   assign(src32x,
3415          binop(Iop_Shl32, mkexpr(src32),
3416                           mkU8(32 - 8 * sizeofIRType(ty))));
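   // Worked example: for ty == Ity_I16 and src == 0x0001, src32x is
   // 0x00010000, so Clz32 yields 15 -- the expected 16-bit
   // leading-zero count.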
3417
3418   // Clz32 has undefined semantics when its input is zero, so
3419   // special-case around that.
3420   IRTemp res32 = newTemp(Ity_I32);
3421   assign(res32,
3422          IRExpr_Mux0X(
3423             unop(Iop_1Uto8,
3424                  binop(Iop_CmpEQ32, mkexpr(src32x), mkU32(0))),
3425             unop(Iop_Clz32, mkexpr(src32x)),
3426             mkU32(8 * sizeofIRType(ty))
3427   ));
3428
3429   IRTemp res = newTemp(ty);
3430   assign(res, narrowTo(ty, mkexpr(res32)));
3431   return res;
3432}
3433
3434
3435/*------------------------------------------------------------*/
3436/*---                                                      ---*/
3437/*--- x87 FLOATING POINT INSTRUCTIONS                      ---*/
3438/*---                                                      ---*/
3439/*------------------------------------------------------------*/
3440
3441/* --- Helper functions for dealing with the register stack. --- */
3442
3443/* --- Set the emulation-warning pseudo-register. --- */
3444
3445static void put_emwarn ( IRExpr* e /* :: Ity_I32 */ )
3446{
3447   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3448   stmt( IRStmt_Put( OFFB_EMWARN, e ) );
3449}
3450
3451/* --- Produce an IRExpr* denoting a 64-bit QNaN. --- */
3452
3453static IRExpr* mkQNaN64 ( void )
3454{
3455  /* QNaN is 0 2047 1 0(51times)
3456     == 0b 11111111111b 1 0(51times)
3457     == 0x7FF8 0000 0000 0000
3458   */
3459   return IRExpr_Const(IRConst_F64i(0x7FF8000000000000ULL));
3460}
3461
3462/* --------- Get/put the top-of-stack pointer. --------- */
3463
3464static IRExpr* get_ftop ( void )
3465{
3466   return IRExpr_Get( OFFB_FTOP, Ity_I32 );
3467}
3468
3469static void put_ftop ( IRExpr* e )
3470{
3471   vassert(typeOfIRExpr(irsb->tyenv, e) == Ity_I32);
3472   stmt( IRStmt_Put( OFFB_FTOP, e ) );
3473}
3474
3475/* --------- Get/put the C3210 bits. --------- */
3476
3477static IRExpr* get_C3210 ( void )
3478{
3479   return IRExpr_Get( OFFB_FC3210, Ity_I32 );
3480}
3481
3482static void put_C3210 ( IRExpr* e )
3483{
3484   stmt( IRStmt_Put( OFFB_FC3210, e ) );
3485}
3486
3487/* --------- Get/put the FPU rounding mode. --------- */
3488static IRExpr* /* :: Ity_I32 */ get_fpround ( void )
3489{
3490   return IRExpr_Get( OFFB_FPROUND, Ity_I32 );
3491}
3492
3493static void put_fpround ( IRExpr* /* :: Ity_I32 */ e )
3494{
3495   stmt( IRStmt_Put( OFFB_FPROUND, e ) );
3496}
3497
3498
3499/* --------- Synthesise a 2-bit FPU rounding mode. --------- */
3500/* Produces a value in 0 .. 3, which is encoded as per the type
3501   IRRoundingMode.  Since the guest_FPROUND value is also encoded as
3502   per IRRoundingMode, we merely need to get it and mask it for
3503   safety.
3504*/
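/* (For reference: IRRoundingMode uses the same 2-bit encoding as the
   x87 RC field -- 0 = to nearest, 1 = towards -infinity, 2 = towards
   +infinity, 3 = towards zero.) */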
3505static IRExpr* /* :: Ity_I32 */ get_roundingmode ( void )
3506{
3507   return binop( Iop_And32, get_fpround(), mkU32(3) );
3508}
3509
3510static IRExpr* /* :: Ity_I32 */ get_FAKE_roundingmode ( void )
3511{
3512   return mkU32(Irrm_NEAREST);
3513}
3514
3515
3516/* --------- Get/set FP register tag bytes. --------- */
3517
3518/* Given i, and some expression e, generate 'ST_TAG(i) = e'. */
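/* (The PutI/GetI used here index the 8-entry array circularly, so the
   element actually accessed is (FTOP + i) % 8; that is what makes the
   stack-relative ST(i) addressing work.) */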
3519
3520static void put_ST_TAG ( Int i, IRExpr* value )
3521{
3522   IRRegArray* descr;
3523   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_I8);
3524   descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3525   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
3526}
3527
3528/* Given i, generate an expression yielding 'ST_TAG(i)'.  This will be
3529   zero to indicate "Empty" and nonzero to indicate "NonEmpty".  */
3530
3531static IRExpr* get_ST_TAG ( Int i )
3532{
3533   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
3534   return IRExpr_GetI( descr, get_ftop(), i );
3535}
3536
3537
3538/* --------- Get/set FP registers. --------- */
3539
3540/* Given i, and some expression e, emit 'ST(i) = e' and set the
3541   register's tag to indicate the register is full.  The previous
3542   state of the register is not checked. */
3543
3544static void put_ST_UNCHECKED ( Int i, IRExpr* value )
3545{
3546   IRRegArray* descr;
3547   vassert(typeOfIRExpr(irsb->tyenv, value) == Ity_F64);
3548   descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3549   stmt( IRStmt_PutI( mkIRPutI(descr, get_ftop(), i, value) ) );
3550   /* Mark the register as in-use. */
3551   put_ST_TAG(i, mkU8(1));
3552}
3553
3554/* Given i, and some expression e, emit
3555      ST(i) = is_full(i) ? NaN : e
3556   and set the tag accordingly.
3557*/
3558
3559static void put_ST ( Int i, IRExpr* value )
3560{
3561   put_ST_UNCHECKED( i,
3562                     IRExpr_Mux0X( get_ST_TAG(i),
3563                                   /* 0 means empty */
3564                                   value,
3565                                   /* non-0 means full */
3566                                   mkQNaN64()
3567                   )
3568   );
3569}
3570
3571
3572/* Given i, generate an expression yielding 'ST(i)'. */
3573
3574static IRExpr* get_ST_UNCHECKED ( Int i )
3575{
3576   IRRegArray* descr = mkIRRegArray( OFFB_FPREGS, Ity_F64, 8 );
3577   return IRExpr_GetI( descr, get_ftop(), i );
3578}
3579
3580
3581/* Given i, generate an expression yielding
3582  is_full(i) ? ST(i) : NaN
3583*/
3584
3585static IRExpr* get_ST ( Int i )
3586{
3587   return
3588      IRExpr_Mux0X( get_ST_TAG(i),
3589                    /* 0 means empty */
3590                    mkQNaN64(),
3591                    /* non-0 means full */
3592                    get_ST_UNCHECKED(i));
3593}
3594
3595
3596/* Adjust FTOP downwards by one register. */
3597
3598static void fp_push ( void )
3599{
3600   put_ftop( binop(Iop_Sub32, get_ftop(), mkU32(1)) );
3601}
3602
3603/* Adjust FTOP upwards by one register, and mark the vacated register
3604   as empty.  */
3605
3606static void fp_pop ( void )
3607{
3608   put_ST_TAG(0, mkU8(0));
3609   put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
3610}
3611
3612/* Clear the C2 bit of the FPU status register, for
3613   sin/cos/tan/sincos. */
3614
3615static void clear_C2 ( void )
3616{
3617   put_C3210( binop(Iop_And32, get_C3210(), mkU32(~X86G_FC_MASK_C2)) );
3618}
3619
3620/* Invent a plausible-looking FPU status word value:
3621      ((ftop & 7) << 11) | (c3210 & 0x4700)
3622 */
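/* (In the architected status word, TOP occupies bits 13..11, and the
   0x4700 mask covers C3 (bit 14) and C2..C0 (bits 10..8).) */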
3623static IRExpr* get_FPU_sw ( void )
3624{
3625   return
3626      unop(Iop_32to16,
3627           binop(Iop_Or32,
3628                 binop(Iop_Shl32,
3629                       binop(Iop_And32, get_ftop(), mkU32(7)),
3630                             mkU8(11)),
3631                       binop(Iop_And32, get_C3210(), mkU32(0x4700))
3632      ));
3633}
3634
3635
3636/* ------------------------------------------------------- */
3637/* Given all that stack-mangling junk, we can now go ahead
3638   and describe FP instructions.
3639*/
3640
3641/* ST(0) = ST(0) `op` mem64/32(addr)
3642   Need to check ST(0)'s tag on read, but not on write.
3643*/
3644static
3645void fp_do_op_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
3646                         IROp op, Bool dbl )
3647{
3648   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3649   if (dbl) {
3650      put_ST_UNCHECKED(0,
3651         triop( op,
3652                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3653                get_ST(0),
3654                loadLE(Ity_F64,mkexpr(addr))
3655         ));
3656   } else {
3657      put_ST_UNCHECKED(0,
3658         triop( op,
3659                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3660                get_ST(0),
3661                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr)))
3662         ));
3663   }
3664}
3665
3666
3667/* ST(0) = mem64/32(addr) `op` ST(0)
3668   Need to check ST(0)'s tag on read, but not on write.
3669*/
3670static
3671void fp_do_oprev_mem_ST_0 ( IRTemp addr, HChar* op_txt, HChar* dis_buf,
3672                            IROp op, Bool dbl )
3673{
3674   DIP("f%s%c %s\n", op_txt, dbl?'l':'s', dis_buf);
3675   if (dbl) {
3676      put_ST_UNCHECKED(0,
3677         triop( op,
3678                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3679                loadLE(Ity_F64,mkexpr(addr)),
3680                get_ST(0)
3681         ));
3682   } else {
3683      put_ST_UNCHECKED(0,
3684         triop( op,
3685                get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3686                unop(Iop_F32toF64, loadLE(Ity_F32,mkexpr(addr))),
3687                get_ST(0)
3688         ));
3689   }
3690}
3691
3692
3693/* ST(dst) = ST(dst) `op` ST(src).
3694   Check dst and src tags when reading but not on write.
3695*/
3696static
3697void fp_do_op_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3698                      Bool pop_after )
3699{
3700   DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
3701                                 (Int)st_src, (Int)st_dst );
3702   put_ST_UNCHECKED(
3703      st_dst,
3704      triop( op,
3705             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3706             get_ST(st_dst),
3707             get_ST(st_src) )
3708   );
3709   if (pop_after)
3710      fp_pop();
3711}
3712
3713/* ST(dst) = ST(src) `op` ST(dst).
3714   Check dst and src tags when reading but not on write.
3715*/
3716static
3717void fp_do_oprev_ST_ST ( HChar* op_txt, IROp op, UInt st_src, UInt st_dst,
3718                         Bool pop_after )
3719{
3720   DIP("f%s%s st(%d), st(%d)\n", op_txt, pop_after?"p":"",
3721                                 (Int)st_src, (Int)st_dst );
3722   put_ST_UNCHECKED(
3723      st_dst,
3724      triop( op,
3725             get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
3726             get_ST(st_src),
3727             get_ST(st_dst) )
3728   );
3729   if (pop_after)
3730      fp_pop();
3731}
3732
3733/* %eflags(Z,P,C) = UCOMI( st(0), st(i) ) */
3734static void fp_do_ucomi_ST0_STi ( UInt i, Bool pop_after )
3735{
3736   DIP("fucomi%s %%st(0),%%st(%d)\n", pop_after ? "p" : "", (Int)i );
3737   /* This is a bit of a hack (and isn't really right).  It sets
3738      Z,P,C,O correctly, but forces A and S to zero, whereas the Intel
3739      documentation implies A and S are unchanged.
3740   */
3741   /* It's also fishy in that it is used both for COMIP and
3742      UCOMIP, and they aren't the same (although similar). */
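   /* Note on the 0x45 mask below: the IRCmpF64Result encoding is laid
      out so that, after ANDing with 0x45, only the ZF (0x40), PF
      (0x04) and CF (0x01) positions can be set -- exactly the flags
      [U]COMI is defined to produce. */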
3743   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
3744   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
3745   stmt( IRStmt_Put( OFFB_CC_DEP1,
3746                     binop( Iop_And32,
3747                            binop(Iop_CmpF64, get_ST(0), get_ST(i)),
3748                            mkU32(0x45)
3749       )));
3750   /* Set NDEP even though it isn't used.  This makes redundant-PUT
3751      elimination of previous stores to this field work better. */
3752   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
3753   if (pop_after)
3754      fp_pop();
3755}
3756
3757
3758static
3759UInt dis_FPU ( Bool* decode_ok, UChar sorb, Int delta )
3760{
3761   Int    len;
3762   UInt   r_src, r_dst;
3763   HChar  dis_buf[50];
3764   IRTemp t1, t2;
3765
3766   /* On entry, delta points at the second byte of the insn (the modrm
3767      byte).*/
3768   UChar first_opcode = getIByte(delta-1);
3769   UChar modrm        = getIByte(delta+0);
3770
3771   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD8 opcodes +-+-+-+-+-+-+-+ */
3772
3773   if (first_opcode == 0xD8) {
3774      if (modrm < 0xC0) {
3775
3776         /* bits 5,4,3 are an opcode extension, and the modRM also
3777           specifies an address. */
3778         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
3779         delta += len;
3780
3781         switch (gregOfRM(modrm)) {
3782
3783            case 0: /* FADD single-real */
3784               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, False );
3785               break;
3786
3787            case 1: /* FMUL single-real */
3788               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, False );
3789               break;
3790
3791            case 2: /* FCOM single-real */
3792               DIP("fcoms %s\n", dis_buf);
3793               /* This forces C1 to zero, which isn't right. */
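               /* (Shifting the CmpF64 result left by 8 and masking
                  with 0x4500 lands it in the C0 (bit 8), C2 (bit 10)
                  and C3 (bit 14) positions of the FPU status word,
                  with C1 (bit 9) cleared.) */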
3794               put_C3210(
3795                   binop( Iop_And32,
3796                          binop(Iop_Shl32,
3797                                binop(Iop_CmpF64,
3798                                      get_ST(0),
3799                                      unop(Iop_F32toF64,
3800                                           loadLE(Ity_F32,mkexpr(addr)))),
3801                                mkU8(8)),
3802                          mkU32(0x4500)
3803                   ));
3804               break;
3805
3806            case 3: /* FCOMP single-real */
3807               DIP("fcomps %s\n", dis_buf);
3808               /* This forces C1 to zero, which isn't right. */
3809               put_C3210(
3810                   binop( Iop_And32,
3811                          binop(Iop_Shl32,
3812                                binop(Iop_CmpF64,
3813                                      get_ST(0),
3814                                      unop(Iop_F32toF64,
3815                                           loadLE(Ity_F32,mkexpr(addr)))),
3816                                mkU8(8)),
3817                          mkU32(0x4500)
3818                   ));
3819               fp_pop();
3820               break;
3821
3822            case 4: /* FSUB single-real */
3823               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, False );
3824               break;
3825
3826            case 5: /* FSUBR single-real */
3827               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, False );
3828               break;
3829
3830            case 6: /* FDIV single-real */
3831               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, False );
3832               break;
3833
3834            case 7: /* FDIVR single-real */
3835               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, False );
3836               break;
3837
3838            default:
3839               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
3840               vex_printf("first_opcode == 0xD8\n");
3841               goto decode_fail;
3842         }
3843      } else {
3844         delta++;
3845         switch (modrm) {
3846
3847            case 0xC0 ... 0xC7: /* FADD %st(?),%st(0) */
3848               fp_do_op_ST_ST ( "add", Iop_AddF64, modrm - 0xC0, 0, False );
3849               break;
3850
3851            case 0xC8 ... 0xCF: /* FMUL %st(?),%st(0) */
3852               fp_do_op_ST_ST ( "mul", Iop_MulF64, modrm - 0xC8, 0, False );
3853               break;
3854
3855            /* Dunno if this is right */
3856            case 0xD0 ... 0xD7: /* FCOM %st(?),%st(0) */
3857               r_dst = (UInt)modrm - 0xD0;
3858               DIP("fcom %%st(0),%%st(%d)\n", (Int)r_dst);
3859               /* This forces C1 to zero, which isn't right. */
3860               put_C3210(
3861                   binop( Iop_And32,
3862                          binop(Iop_Shl32,
3863                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3864                                mkU8(8)),
3865                          mkU32(0x4500)
3866                   ));
3867               break;
3868
3869            /* Dunno if this is right */
3870            case 0xD8 ... 0xDF: /* FCOMP %st(?),%st(0) */
3871               r_dst = (UInt)modrm - 0xD8;
3872               DIP("fcomp %%st(0),%%st(%d)\n", (Int)r_dst);
3873               /* This forces C1 to zero, which isn't right. */
3874               put_C3210(
3875                   binop( Iop_And32,
3876                          binop(Iop_Shl32,
3877                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
3878                                mkU8(8)),
3879                          mkU32(0x4500)
3880                   ));
3881               fp_pop();
3882               break;
3883
3884            case 0xE0 ... 0xE7: /* FSUB %st(?),%st(0) */
3885               fp_do_op_ST_ST ( "sub", Iop_SubF64, modrm - 0xE0, 0, False );
3886               break;
3887
3888            case 0xE8 ... 0xEF: /* FSUBR %st(?),%st(0) */
3889               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, modrm - 0xE8, 0, False );
3890               break;
3891
3892            case 0xF0 ... 0xF7: /* FDIV %st(?),%st(0) */
3893               fp_do_op_ST_ST ( "div", Iop_DivF64, modrm - 0xF0, 0, False );
3894               break;
3895
3896            case 0xF8 ... 0xFF: /* FDIVR %st(?),%st(0) */
3897               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, modrm - 0xF8, 0, False );
3898               break;
3899
3900            default:
3901               goto decode_fail;
3902         }
3903      }
3904   }
3905
3906   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xD9 opcodes +-+-+-+-+-+-+-+ */
3907   else
3908   if (first_opcode == 0xD9) {
3909      if (modrm < 0xC0) {
3910
3911         /* bits 5,4,3 are an opcode extension, and the modRM also
3912            specifies an address. */
3913         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
3914         delta += len;
3915
3916         switch (gregOfRM(modrm)) {
3917
3918            case 0: /* FLD single-real */
3919               DIP("flds %s\n", dis_buf);
3920               fp_push();
3921               put_ST(0, unop(Iop_F32toF64,
3922                              loadLE(Ity_F32, mkexpr(addr))));
3923               break;
3924
3925            case 2: /* FST single-real */
3926               DIP("fsts %s\n", dis_buf);
3927               storeLE(mkexpr(addr),
3928                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
3929               break;
3930
3931            case 3: /* FSTP single-real */
3932               DIP("fstps %s\n", dis_buf);
3933               storeLE(mkexpr(addr),
3934                       binop(Iop_F64toF32, get_roundingmode(), get_ST(0)));
3935               fp_pop();
3936               break;
3937
3938            case 4: { /* FLDENV m28 */
3939               /* Uses dirty helper:
3940                     VexEmWarn x86g_dirtyhelper_FLDENV ( VexGuestX86State*, HWord ) */
3941               IRTemp   ew = newTemp(Ity_I32);
3942               IRDirty* d  = unsafeIRDirty_0_N (
3943                                0/*regparms*/,
3944                                "x86g_dirtyhelper_FLDENV",
3945                                &x86g_dirtyhelper_FLDENV,
3946                                mkIRExprVec_1( mkexpr(addr) )
3947                             );
3948               d->needsBBP = True;
3949               d->tmp      = ew;
3950               /* declare we're reading memory */
3951               d->mFx   = Ifx_Read;
3952               d->mAddr = mkexpr(addr);
3953               d->mSize = 28;
3954
3955               /* declare we're writing guest state */
3956               d->nFxState = 4;
3957               vex_bzero(&d->fxState, sizeof(d->fxState));
3958
3959               d->fxState[0].fx     = Ifx_Write;
3960               d->fxState[0].offset = OFFB_FTOP;
3961               d->fxState[0].size   = sizeof(UInt);
3962
3963               d->fxState[1].fx     = Ifx_Write;
3964               d->fxState[1].offset = OFFB_FPTAGS;
3965               d->fxState[1].size   = 8 * sizeof(UChar);
3966
3967               d->fxState[2].fx     = Ifx_Write;
3968               d->fxState[2].offset = OFFB_FPROUND;
3969               d->fxState[2].size   = sizeof(UInt);
3970
3971               d->fxState[3].fx     = Ifx_Write;
3972               d->fxState[3].offset = OFFB_FC3210;
3973               d->fxState[3].size   = sizeof(UInt);
3974
3975               stmt( IRStmt_Dirty(d) );
3976
3977               /* ew contains any emulation warning we may need to
3978                  issue.  If needed, side-exit to the next insn,
3979                  reporting the warning, so that Valgrind's dispatcher
3980                  sees the warning. */
3981               put_emwarn( mkexpr(ew) );
3982               stmt(
3983                  IRStmt_Exit(
3984                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
3985                     Ijk_EmWarn,
3986                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
3987                     OFFB_EIP
3988                  )
3989               );
3990
3991               DIP("fldenv %s\n", dis_buf);
3992               break;
3993            }
3994
3995            case 5: {/* FLDCW */
3996               /* The only thing we observe in the control word is the
3997                  rounding mode.  Therefore, pass the 16-bit value
3998                  (x87 native-format control word) to a clean helper,
3999                  getting back a 64-bit value, the lower half of which
4000                  is the FPROUND value to store, and the upper half of
4001                  which is the emulation-warning token which may be
4002                  generated.
4003               */
4004               /* ULong x86g_check_fldcw ( UInt ); */
4005               IRTemp t64 = newTemp(Ity_I64);
4006               IRTemp ew = newTemp(Ity_I32);
4007               DIP("fldcw %s\n", dis_buf);
4008               assign( t64, mkIRExprCCall(
4009                               Ity_I64, 0/*regparms*/,
4010                               "x86g_check_fldcw",
4011                               &x86g_check_fldcw,
4012                               mkIRExprVec_1(
4013                                  unop( Iop_16Uto32,
4014                                        loadLE(Ity_I16, mkexpr(addr)))
4015                               )
4016                            )
4017                     );
4018
4019               put_fpround( unop(Iop_64to32, mkexpr(t64)) );
4020               assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
4021               put_emwarn( mkexpr(ew) );
4022               /* Finally, if an emulation warning was reported,
4023                  side-exit to the next insn, reporting the warning,
4024                  so that Valgrind's dispatcher sees the warning. */
4025               stmt(
4026                  IRStmt_Exit(
4027                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4028                     Ijk_EmWarn,
4029                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
4030                     OFFB_EIP
4031                  )
4032               );
4033               break;
4034            }
4035
4036            case 6: { /* FNSTENV m28 */
4037               /* Uses dirty helper:
4038                     void x86g_dirtyhelper_FSTENV ( VexGuestX86State*, HWord ) */
4039               IRDirty* d = unsafeIRDirty_0_N (
4040                               0/*regparms*/,
4041                               "x86g_dirtyhelper_FSTENV",
4042                               &x86g_dirtyhelper_FSTENV,
4043                               mkIRExprVec_1( mkexpr(addr) )
4044                            );
4045               d->needsBBP = True;
4046               /* declare we're writing memory */
4047               d->mFx   = Ifx_Write;
4048               d->mAddr = mkexpr(addr);
4049               d->mSize = 28;
4050
4051               /* declare we're reading guest state */
4052               d->nFxState = 4;
4053               vex_bzero(&d->fxState, sizeof(d->fxState));
4054
4055               d->fxState[0].fx     = Ifx_Read;
4056               d->fxState[0].offset = OFFB_FTOP;
4057               d->fxState[0].size   = sizeof(UInt);
4058
4059               d->fxState[1].fx     = Ifx_Read;
4060               d->fxState[1].offset = OFFB_FPTAGS;
4061               d->fxState[1].size   = 8 * sizeof(UChar);
4062
4063               d->fxState[2].fx     = Ifx_Read;
4064               d->fxState[2].offset = OFFB_FPROUND;
4065               d->fxState[2].size   = sizeof(UInt);
4066
4067               d->fxState[3].fx     = Ifx_Read;
4068               d->fxState[3].offset = OFFB_FC3210;
4069               d->fxState[3].size   = sizeof(UInt);
4070
4071               stmt( IRStmt_Dirty(d) );
4072
4073               DIP("fnstenv %s\n", dis_buf);
4074               break;
4075            }
4076
4077            case 7: /* FNSTCW */
4078              /* Fake up a native x87 FPU control word.  The only
4079                 thing it depends on is FPROUND[1:0], so call a clean
4080                 helper to cook it up. */
4081               /* UInt x86g_create_fpucw ( UInt fpround ) */
4082               DIP("fnstcw %s\n", dis_buf);
4083               storeLE(
4084                  mkexpr(addr),
4085                  unop( Iop_32to16,
4086                        mkIRExprCCall(
4087                           Ity_I32, 0/*regp*/,
4088                           "x86g_create_fpucw", &x86g_create_fpucw,
4089                           mkIRExprVec_1( get_fpround() )
4090                        )
4091                  )
4092               );
4093               break;
4094
4095            default:
4096               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4097               vex_printf("first_opcode == 0xD9\n");
4098               goto decode_fail;
4099         }
4100
4101      } else {
4102         delta++;
4103         switch (modrm) {
4104
4105            case 0xC0 ... 0xC7: /* FLD %st(?) */
4106               r_src = (UInt)modrm - 0xC0;
4107               DIP("fld %%st(%d)\n", (Int)r_src);
4108               t1 = newTemp(Ity_F64);
4109               assign(t1, get_ST(r_src));
4110               fp_push();
4111               put_ST(0, mkexpr(t1));
4112               break;
4113
4114            case 0xC8 ... 0xCF: /* FXCH %st(?) */
4115               r_src = (UInt)modrm - 0xC8;
4116               DIP("fxch %%st(%d)\n", (Int)r_src);
4117               t1 = newTemp(Ity_F64);
4118               t2 = newTemp(Ity_F64);
4119               assign(t1, get_ST(0));
4120               assign(t2, get_ST(r_src));
4121               put_ST_UNCHECKED(0, mkexpr(t2));
4122               put_ST_UNCHECKED(r_src, mkexpr(t1));
4123               break;
4124
4125            case 0xE0: /* FCHS */
4126               DIP("fchs\n");
4127               put_ST_UNCHECKED(0, unop(Iop_NegF64, get_ST(0)));
4128               break;
4129
4130            case 0xE1: /* FABS */
4131               DIP("fabs\n");
4132               put_ST_UNCHECKED(0, unop(Iop_AbsF64, get_ST(0)));
4133               break;
4134
4135            case 0xE4: /* FTST */
4136               DIP("ftst\n");
4137               /* This forces C1 to zero, which isn't right. */
4138               /* Well, in fact the Intel docs say (bizarrely): "C1 is
4139                  set to 0 if stack underflow occurred; otherwise, set
4140                  to 0" which is pretty nonsensical.  I guess it's a
4141                   typo. */
4142               put_C3210(
4143                   binop( Iop_And32,
4144                          binop(Iop_Shl32,
4145                                binop(Iop_CmpF64,
4146                                      get_ST(0),
4147                                      IRExpr_Const(IRConst_F64i(0x0ULL))),
4148                                mkU8(8)),
4149                          mkU32(0x4500)
4150                   ));
4151               break;
4152
4153            case 0xE5: { /* FXAM */
4154               /* This is an interesting one.  It examines %st(0),
4155                  regardless of whether the tag says it's empty or not.
4156                  Here, just pass both the tag (in our format) and the
4157                  value (as a double, actually a ULong) to a helper
4158                  function. */
4159               IRExpr** args
4160                  = mkIRExprVec_2( unop(Iop_8Uto32, get_ST_TAG(0)),
4161                                   unop(Iop_ReinterpF64asI64,
4162                                        get_ST_UNCHECKED(0)) );
4163               put_C3210(mkIRExprCCall(
4164                            Ity_I32,
4165                            0/*regparm*/,
4166                            "x86g_calculate_FXAM", &x86g_calculate_FXAM,
4167                            args
4168                        ));
4169               DIP("fxam\n");
4170               break;
4171            }
4172
4173            case 0xE8: /* FLD1 */
4174               DIP("fld1\n");
4175               fp_push();
4176               /* put_ST(0, IRExpr_Const(IRConst_F64(1.0))); */
4177               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff0000000000000ULL)));
4178               break;
4179
4180            case 0xE9: /* FLDL2T */
4181               DIP("fldl2t\n");
4182               fp_push();
4183               /* put_ST(0, IRExpr_Const(IRConst_F64(3.32192809488736234781))); */
4184               put_ST(0, IRExpr_Const(IRConst_F64i(0x400a934f0979a371ULL)));
4185               break;
4186
4187            case 0xEA: /* FLDL2E */
4188               DIP("fldl2e\n");
4189               fp_push();
4190               /* put_ST(0, IRExpr_Const(IRConst_F64(1.44269504088896340739))); */
4191               put_ST(0, IRExpr_Const(IRConst_F64i(0x3ff71547652b82feULL)));
4192               break;
4193
4194            case 0xEB: /* FLDPI */
4195               DIP("fldpi\n");
4196               fp_push();
4197               /* put_ST(0, IRExpr_Const(IRConst_F64(3.14159265358979323851))); */
4198               put_ST(0, IRExpr_Const(IRConst_F64i(0x400921fb54442d18ULL)));
4199               break;
4200
4201            case 0xEC: /* FLDLG2 */
4202               DIP("fldlg2\n");
4203               fp_push();
4204               /* put_ST(0, IRExpr_Const(IRConst_F64(0.301029995663981143))); */
4205               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fd34413509f79ffULL)));
4206               break;
4207
4208            case 0xED: /* FLDLN2 */
4209               DIP("fldln2\n");
4210               fp_push();
4211               /* put_ST(0, IRExpr_Const(IRConst_F64(0.69314718055994530942))); */
4212               put_ST(0, IRExpr_Const(IRConst_F64i(0x3fe62e42fefa39efULL)));
4213               break;
4214
4215            case 0xEE: /* FLDZ */
4216               DIP("fldz\n");
4217               fp_push();
4218               /* put_ST(0, IRExpr_Const(IRConst_F64(0.0))); */
4219               put_ST(0, IRExpr_Const(IRConst_F64i(0x0000000000000000ULL)));
4220               break;
4221
4222            case 0xF0: /* F2XM1 */
4223               DIP("f2xm1\n");
4224               put_ST_UNCHECKED(0,
4225                  binop(Iop_2xm1F64,
4226                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4227                        get_ST(0)));
4228               break;
4229
4230            case 0xF1: /* FYL2X */
4231               DIP("fyl2x\n");
4232               put_ST_UNCHECKED(1,
4233                  triop(Iop_Yl2xF64,
4234                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4235                        get_ST(1),
4236                        get_ST(0)));
4237               fp_pop();
4238               break;
4239
4240            case 0xF2: /* FPTAN */
4241               DIP("ftan\n");
4242               put_ST_UNCHECKED(0,
4243                  binop(Iop_TanF64,
4244                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4245                        get_ST(0)));
4246               fp_push();
4247               put_ST(0, IRExpr_Const(IRConst_F64(1.0)));
4248               clear_C2(); /* HACK */
4249               break;
4250
4251            case 0xF3: /* FPATAN */
4252               DIP("fpatan\n");
4253               put_ST_UNCHECKED(1,
4254                  triop(Iop_AtanF64,
4255                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4256                        get_ST(1),
4257                        get_ST(0)));
4258               fp_pop();
4259               break;
4260
4261            case 0xF4: { /* FXTRACT */
4262               IRTemp argF = newTemp(Ity_F64);
4263               IRTemp sigF = newTemp(Ity_F64);
4264               IRTemp expF = newTemp(Ity_F64);
4265               IRTemp argI = newTemp(Ity_I64);
4266               IRTemp sigI = newTemp(Ity_I64);
4267               IRTemp expI = newTemp(Ity_I64);
4268               DIP("fxtract\n");
4269               assign( argF, get_ST(0) );
4270               assign( argI, unop(Iop_ReinterpF64asI64, mkexpr(argF)));
4271               assign( sigI,
4272                       mkIRExprCCall(
4273                          Ity_I64, 0/*regparms*/,
4274                          "x86amd64g_calculate_FXTRACT",
4275                          &x86amd64g_calculate_FXTRACT,
4276                          mkIRExprVec_2( mkexpr(argI),
4277                                         mkIRExpr_HWord(0)/*sig*/ ))
4278               );
4279               assign( expI,
4280                       mkIRExprCCall(
4281                          Ity_I64, 0/*regparms*/,
4282                          "x86amd64g_calculate_FXTRACT",
4283                          &x86amd64g_calculate_FXTRACT,
4284                          mkIRExprVec_2( mkexpr(argI),
4285                                         mkIRExpr_HWord(1)/*exp*/ ))
4286               );
4287               assign( sigF, unop(Iop_ReinterpI64asF64, mkexpr(sigI)) );
4288               assign( expF, unop(Iop_ReinterpI64asF64, mkexpr(expI)) );
4289               /* exponent */
4290               put_ST_UNCHECKED(0, mkexpr(expF) );
4291               fp_push();
4292               /* significand */
4293               put_ST(0, mkexpr(sigF) );
4294               break;
4295            }
4296
4297            case 0xF5: { /* FPREM1 -- IEEE compliant */
4298               IRTemp a1 = newTemp(Ity_F64);
4299               IRTemp a2 = newTemp(Ity_F64);
4300               DIP("fprem1\n");
4301               /* Do FPREM1 twice, once to get the remainder, and once
4302                  to get the C3210 flag values. */
4303               assign( a1, get_ST(0) );
4304               assign( a2, get_ST(1) );
4305               put_ST_UNCHECKED(0,
4306                  triop(Iop_PRem1F64,
4307                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4308                        mkexpr(a1),
4309                        mkexpr(a2)));
4310               put_C3210(
4311                  triop(Iop_PRem1C3210F64,
4312                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4313                        mkexpr(a1),
4314                        mkexpr(a2)) );
4315               break;
4316            }
4317
4318            case 0xF7: /* FINCSTP */
4319               DIP("fprem\n");
4320               put_ftop( binop(Iop_Add32, get_ftop(), mkU32(1)) );
4321               break;
4322
4323            case 0xF8: { /* FPREM -- not IEEE compliant */
4324               IRTemp a1 = newTemp(Ity_F64);
4325               IRTemp a2 = newTemp(Ity_F64);
4326               DIP("fprem\n");
4327               /* Do FPREM twice, once to get the remainder, and once
4328                  to get the C3210 flag values. */
4329               assign( a1, get_ST(0) );
4330               assign( a2, get_ST(1) );
4331               put_ST_UNCHECKED(0,
4332                  triop(Iop_PRemF64,
4333                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4334                        mkexpr(a1),
4335                        mkexpr(a2)));
4336               put_C3210(
4337                  triop(Iop_PRemC3210F64,
4338                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4339                        mkexpr(a1),
4340                        mkexpr(a2)) );
4341               break;
4342            }
4343
4344            case 0xF9: /* FYL2XP1 */
4345               DIP("fyl2xp1\n");
4346               put_ST_UNCHECKED(1,
4347                  triop(Iop_Yl2xp1F64,
4348                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4349                        get_ST(1),
4350                        get_ST(0)));
4351               fp_pop();
4352               break;
4353
4354            case 0xFA: /* FSQRT */
4355               DIP("fsqrt\n");
4356               put_ST_UNCHECKED(0,
4357                  binop(Iop_SqrtF64,
4358                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4359                        get_ST(0)));
4360               break;
4361
4362            case 0xFB: { /* FSINCOS */
4363               IRTemp a1 = newTemp(Ity_F64);
4364               assign( a1, get_ST(0) );
4365               DIP("fsincos\n");
4366               put_ST_UNCHECKED(0,
4367                  binop(Iop_SinF64,
4368                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4369                        mkexpr(a1)));
4370               fp_push();
4371               put_ST(0,
4372                  binop(Iop_CosF64,
4373                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4374                        mkexpr(a1)));
4375               clear_C2(); /* HACK */
4376               break;
4377            }
4378
4379            case 0xFC: /* FRNDINT */
4380               DIP("frndint\n");
4381               put_ST_UNCHECKED(0,
4382                  binop(Iop_RoundF64toInt, get_roundingmode(), get_ST(0)) );
4383               break;
4384
4385            case 0xFD: /* FSCALE */
4386               DIP("fscale\n");
4387               put_ST_UNCHECKED(0,
4388                  triop(Iop_ScaleF64,
4389                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4390                        get_ST(0),
4391                        get_ST(1)));
4392               break;
4393
4394            case 0xFE: /* FSIN */
4395               DIP("fsin\n");
4396               put_ST_UNCHECKED(0,
4397                  binop(Iop_SinF64,
4398                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4399                        get_ST(0)));
4400               clear_C2(); /* HACK */
4401               break;
4402
4403            case 0xFF: /* FCOS */
4404               DIP("fcos\n");
4405               put_ST_UNCHECKED(0,
4406                  binop(Iop_CosF64,
4407                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4408                        get_ST(0)));
4409               clear_C2(); /* HACK */
4410               break;
4411
4412            default:
4413               goto decode_fail;
4414         }
4415      }
4416   }
4417
4418   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDA opcodes +-+-+-+-+-+-+-+ */
4419   else
4420   if (first_opcode == 0xDA) {
4421
4422      if (modrm < 0xC0) {
4423
4424         /* bits 5,4,3 are an opcode extension, and the modRM also
4425            specifies an address. */
4426         IROp   fop;
4427         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4428         delta += len;
4429         switch (gregOfRM(modrm)) {
4430
4431            case 0: /* FIADD m32int */ /* ST(0) += m32int */
4432               DIP("fiaddl %s\n", dis_buf);
4433               fop = Iop_AddF64;
4434               goto do_fop_m32;
4435
4436            case 1: /* FIMUL m32int */ /* ST(0) *= m32int */
4437               DIP("fimull %s\n", dis_buf);
4438               fop = Iop_MulF64;
4439               goto do_fop_m32;
4440
4441            case 2: /* FICOM m32int */
4442               DIP("ficoml %s\n", dis_buf);
4443               /* This forces C1 to zero, which isn't right. */
4444               put_C3210(
4445                   binop( Iop_And32,
4446                          binop(Iop_Shl32,
4447                                binop(Iop_CmpF64,
4448                                      get_ST(0),
4449                                      unop(Iop_I32StoF64,
4450                                           loadLE(Ity_I32,mkexpr(addr)))),
4451                                mkU8(8)),
4452                          mkU32(0x4500)
4453                   ));
4454               break;
4455
4456            case 3: /* FICOMP m32int */
4457               DIP("ficompl %s\n", dis_buf);
4458               /* This forces C1 to zero, which isn't right. */
4459               put_C3210(
4460                   binop( Iop_And32,
4461                          binop(Iop_Shl32,
4462                                binop(Iop_CmpF64,
4463                                      get_ST(0),
4464                                      unop(Iop_I32StoF64,
4465                                           loadLE(Ity_I32,mkexpr(addr)))),
4466                                mkU8(8)),
4467                          mkU32(0x4500)
4468                   ));
4469               fp_pop();
4470               break;
4471
4472            case 4: /* FISUB m32int */ /* ST(0) -= m32int */
4473               DIP("fisubl %s\n", dis_buf);
4474               fop = Iop_SubF64;
4475               goto do_fop_m32;
4476
4477            case 5: /* FISUBR m32int */ /* ST(0) = m32int - ST(0) */
4478               DIP("fisubrl %s\n", dis_buf);
4479               fop = Iop_SubF64;
4480               goto do_foprev_m32;
4481
4482            case 6: /* FIDIV m32int */ /* ST(0) /= m32int */
4483               DIP("fidivl %s\n", dis_buf);
4484               fop = Iop_DivF64;
4485               goto do_fop_m32;
4486
4487            case 7: /* FIDIVR m32int */ /* ST(0) = m32int / ST(0) */
4488               DIP("fidivrl %s\n", dis_buf);
4489               fop = Iop_DivF64;
4490               goto do_foprev_m32;
4491
4492            do_fop_m32:
4493               put_ST_UNCHECKED(0,
4494                  triop(fop,
4495                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4496                        get_ST(0),
4497                        unop(Iop_I32StoF64,
4498                             loadLE(Ity_I32, mkexpr(addr)))));
4499               break;
4500
4501            do_foprev_m32:
4502               put_ST_UNCHECKED(0,
4503                  triop(fop,
4504                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
4505                        unop(Iop_I32StoF64,
4506                             loadLE(Ity_I32, mkexpr(addr))),
4507                        get_ST(0)));
4508               break;
4509
4510            default:
4511               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4512               vex_printf("first_opcode == 0xDA\n");
4513               goto decode_fail;
4514         }
4515
4516      } else {
4517
4518         delta++;
4519         switch (modrm) {
4520
4521            case 0xC0 ... 0xC7: /* FCMOVB ST(i), ST(0) */
4522               r_src = (UInt)modrm - 0xC0;
4523               DIP("fcmovb %%st(%d), %%st(0)\n", (Int)r_src);
4524               put_ST_UNCHECKED(0,
4525                                IRExpr_Mux0X(
4526                                    unop(Iop_1Uto8,
4527                                         mk_x86g_calculate_condition(X86CondB)),
4528                                    get_ST(0), get_ST(r_src)) );
4529               break;
4530
4531            case 0xC8 ... 0xCF: /* FCMOVE(Z) ST(i), ST(0) */
4532               r_src = (UInt)modrm - 0xC8;
4533               DIP("fcmovz %%st(%d), %%st(0)\n", (Int)r_src);
4534               put_ST_UNCHECKED(0,
4535                                IRExpr_Mux0X(
4536                                    unop(Iop_1Uto8,
4537                                         mk_x86g_calculate_condition(X86CondZ)),
4538                                    get_ST(0), get_ST(r_src)) );
4539               break;
4540
4541            case 0xD0 ... 0xD7: /* FCMOVBE ST(i), ST(0) */
4542               r_src = (UInt)modrm - 0xD0;
4543               DIP("fcmovbe %%st(%d), %%st(0)\n", (Int)r_src);
4544               put_ST_UNCHECKED(0,
4545                                IRExpr_Mux0X(
4546                                    unop(Iop_1Uto8,
4547                                         mk_x86g_calculate_condition(X86CondBE)),
4548                                    get_ST(0), get_ST(r_src)) );
4549               break;
4550
4551            case 0xD8 ... 0xDF: /* FCMOVU ST(i), ST(0) */
4552               r_src = (UInt)modrm - 0xD8;
4553               DIP("fcmovu %%st(%d), %%st(0)\n", (Int)r_src);
4554               put_ST_UNCHECKED(0,
4555                                IRExpr_Mux0X(
4556                                    unop(Iop_1Uto8,
4557                                         mk_x86g_calculate_condition(X86CondP)),
4558                                    get_ST(0), get_ST(r_src)) );
4559               break;
4560
4561            case 0xE9: /* FUCOMPP %st(0),%st(1) */
4562               DIP("fucompp %%st(0),%%st(1)\n");
4563               /* This forces C1 to zero, which isn't right. */
4564               put_C3210(
4565                   binop( Iop_And32,
4566                          binop(Iop_Shl32,
4567                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
4568                                mkU8(8)),
4569                          mkU32(0x4500)
4570                   ));
4571               fp_pop();
4572               fp_pop();
4573               break;
4574
4575            default:
4576               goto decode_fail;
4577         }
4578
4579      }
4580   }
4581
4582   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDB opcodes +-+-+-+-+-+-+-+ */
4583   else
4584   if (first_opcode == 0xDB) {
4585      if (modrm < 0xC0) {
4586
4587         /* bits 5,4,3 are an opcode extension, and the modRM also
4588            specifies an address. */
4589         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4590         delta += len;
4591
4592         switch (gregOfRM(modrm)) {
4593
4594            case 0: /* FILD m32int */
4595               DIP("fildl %s\n", dis_buf);
4596               fp_push();
4597               put_ST(0, unop(Iop_I32StoF64,
4598                              loadLE(Ity_I32, mkexpr(addr))));
4599               break;
4600
4601            case 1: /* FISTTPL m32 (SSE3) */
4602               DIP("fisttpl %s\n", dis_buf);
4603               storeLE( mkexpr(addr),
4604                        binop(Iop_F64toI32S, mkU32(Irrm_ZERO), get_ST(0)) );
4605               fp_pop();
4606               break;
4607
4608            case 2: /* FIST m32 */
4609               DIP("fistl %s\n", dis_buf);
4610               storeLE( mkexpr(addr),
4611                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4612               break;
4613
4614            case 3: /* FISTP m32 */
4615               DIP("fistpl %s\n", dis_buf);
4616               storeLE( mkexpr(addr),
4617                        binop(Iop_F64toI32S, get_roundingmode(), get_ST(0)) );
4618               fp_pop();
4619               break;
4620
4621            case 5: { /* FLD extended-real */
4622               /* Uses dirty helper:
                     ULong x86g_dirtyhelper_loadF80le ( UInt )
4624                  addr holds the address.  First, do a dirty call to
4625                  get hold of the data. */
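               /* The helper converts the 80-bit value in memory to a
                  64-bit double and hands back its bit pattern, which is
                  then reinterpreted as F64 and pushed onto the FP
                  stack. */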
4626               IRTemp   val  = newTemp(Ity_I64);
4627               IRExpr** args = mkIRExprVec_1 ( mkexpr(addr) );
4628
4629               IRDirty* d = unsafeIRDirty_1_N (
4630                               val,
4631                               0/*regparms*/,
4632                               "x86g_dirtyhelper_loadF80le",
4633                               &x86g_dirtyhelper_loadF80le,
4634                               args
4635                            );
4636               /* declare that we're reading memory */
4637               d->mFx   = Ifx_Read;
4638               d->mAddr = mkexpr(addr);
4639               d->mSize = 10;
4640
4641               /* execute the dirty call, dumping the result in val. */
4642               stmt( IRStmt_Dirty(d) );
4643               fp_push();
4644               put_ST(0, unop(Iop_ReinterpI64asF64, mkexpr(val)));
4645
4646               DIP("fldt %s\n", dis_buf);
4647               break;
4648            }
4649
4650            case 7: { /* FSTP extended-real */
               /* Uses dirty helper:
                     void x86g_dirtyhelper_storeF80le ( UInt, ULong ) */
4652               IRExpr** args
4653                  = mkIRExprVec_2( mkexpr(addr),
4654                                   unop(Iop_ReinterpF64asI64, get_ST(0)) );
4655
4656               IRDirty* d = unsafeIRDirty_0_N (
4657                               0/*regparms*/,
4658                               "x86g_dirtyhelper_storeF80le",
4659                               &x86g_dirtyhelper_storeF80le,
4660                               args
4661                            );
4662               /* declare we're writing memory */
4663               d->mFx   = Ifx_Write;
4664               d->mAddr = mkexpr(addr);
4665               d->mSize = 10;
4666
4667               /* execute the dirty call. */
4668               stmt( IRStmt_Dirty(d) );
4669               fp_pop();
4670
4671               DIP("fstpt\n %s", dis_buf);
4672               break;
4673            }
4674
4675            default:
4676               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4677               vex_printf("first_opcode == 0xDB\n");
4678               goto decode_fail;
4679         }
4680
4681      } else {
4682
4683         delta++;
4684         switch (modrm) {
4685
4686            case 0xC0 ... 0xC7: /* FCMOVNB ST(i), ST(0) */
4687               r_src = (UInt)modrm - 0xC0;
4688               DIP("fcmovnb %%st(%d), %%st(0)\n", (Int)r_src);
4689               put_ST_UNCHECKED(0,
4690                                IRExpr_Mux0X(
4691                                    unop(Iop_1Uto8,
4692                                         mk_x86g_calculate_condition(X86CondNB)),
4693                                    get_ST(0), get_ST(r_src)) );
4694               break;
4695
4696            case 0xC8 ... 0xCF: /* FCMOVNE(NZ) ST(i), ST(0) */
4697               r_src = (UInt)modrm - 0xC8;
4698               DIP("fcmovnz %%st(%d), %%st(0)\n", (Int)r_src);
4699               put_ST_UNCHECKED(0,
4700                                IRExpr_Mux0X(
4701                                    unop(Iop_1Uto8,
4702                                         mk_x86g_calculate_condition(X86CondNZ)),
4703                                    get_ST(0), get_ST(r_src)) );
4704               break;
4705
4706            case 0xD0 ... 0xD7: /* FCMOVNBE ST(i), ST(0) */
4707               r_src = (UInt)modrm - 0xD0;
4708               DIP("fcmovnbe %%st(%d), %%st(0)\n", (Int)r_src);
4709               put_ST_UNCHECKED(0,
4710                                IRExpr_Mux0X(
4711                                    unop(Iop_1Uto8,
4712                                         mk_x86g_calculate_condition(X86CondNBE)),
4713                                    get_ST(0), get_ST(r_src)) );
4714               break;
4715
4716            case 0xD8 ... 0xDF: /* FCMOVNU ST(i), ST(0) */
4717               r_src = (UInt)modrm - 0xD8;
4718               DIP("fcmovnu %%st(%d), %%st(0)\n", (Int)r_src);
4719               put_ST_UNCHECKED(0,
4720                                IRExpr_Mux0X(
4721                                    unop(Iop_1Uto8,
4722                                         mk_x86g_calculate_condition(X86CondNP)),
4723                                    get_ST(0), get_ST(r_src)) );
4724               break;
4725
            case 0xE2: /* FNCLEX */
               /* Treated as a no-op. */
               DIP("fnclex\n");
4728               break;
4729
4730            case 0xE3: {
4731               /* Uses dirty helper:
                     void x86g_dirtyhelper_FINIT ( VexGuestX86State* ) */
4733               IRDirty* d  = unsafeIRDirty_0_N (
4734                                0/*regparms*/,
4735                                "x86g_dirtyhelper_FINIT",
4736                                &x86g_dirtyhelper_FINIT,
4737                                mkIRExprVec_0()
4738                             );
4739               d->needsBBP = True;
4740
4741               /* declare we're writing guest state */
4742               d->nFxState = 5;
4743               vex_bzero(&d->fxState, sizeof(d->fxState));
4744
4745               d->fxState[0].fx     = Ifx_Write;
4746               d->fxState[0].offset = OFFB_FTOP;
4747               d->fxState[0].size   = sizeof(UInt);
4748
4749               d->fxState[1].fx     = Ifx_Write;
4750               d->fxState[1].offset = OFFB_FPREGS;
4751               d->fxState[1].size   = 8 * sizeof(ULong);
4752
4753               d->fxState[2].fx     = Ifx_Write;
4754               d->fxState[2].offset = OFFB_FPTAGS;
4755               d->fxState[2].size   = 8 * sizeof(UChar);
4756
4757               d->fxState[3].fx     = Ifx_Write;
4758               d->fxState[3].offset = OFFB_FPROUND;
4759               d->fxState[3].size   = sizeof(UInt);
4760
4761               d->fxState[4].fx     = Ifx_Write;
4762               d->fxState[4].offset = OFFB_FC3210;
4763               d->fxState[4].size   = sizeof(UInt);
4764
4765               stmt( IRStmt_Dirty(d) );
4766
4767               DIP("fninit\n");
4768               break;
4769            }
4770
4771            case 0xE8 ... 0xEF: /* FUCOMI %st(0),%st(?) */
4772               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, False );
4773               break;
4774
4775            case 0xF0 ... 0xF7: /* FCOMI %st(0),%st(?) */
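               /* not really right since COMI != UCOMI */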
4776               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, False );
4777               break;
4778
4779            default:
4780               goto decode_fail;
4781         }
4782      }
4783   }
4784
4785   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDC opcodes +-+-+-+-+-+-+-+ */
4786   else
4787   if (first_opcode == 0xDC) {
4788      if (modrm < 0xC0) {
4789
4790         /* bits 5,4,3 are an opcode extension, and the modRM also
4791            specifies an address. */
4792         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4793         delta += len;
4794
4795         switch (gregOfRM(modrm)) {
4796
4797            case 0: /* FADD double-real */
4798               fp_do_op_mem_ST_0 ( addr, "add", dis_buf, Iop_AddF64, True );
4799               break;
4800
4801            case 1: /* FMUL double-real */
4802               fp_do_op_mem_ST_0 ( addr, "mul", dis_buf, Iop_MulF64, True );
4803               break;
4804
4805            case 2: /* FCOM double-real */
4806               DIP("fcoml %s\n", dis_buf);
4807               /* This forces C1 to zero, which isn't right. */
4808               put_C3210(
4809                   binop( Iop_And32,
4810                          binop(Iop_Shl32,
4811                                binop(Iop_CmpF64,
4812                                      get_ST(0),
4813                                      loadLE(Ity_F64,mkexpr(addr))),
4814                                mkU8(8)),
4815                          mkU32(0x4500)
4816                   ));
4817               break;
4818
4819            case 3: /* FCOMP double-real */
4820               DIP("fcompl %s\n", dis_buf);
4821               /* This forces C1 to zero, which isn't right. */
4822               put_C3210(
4823                   binop( Iop_And32,
4824                          binop(Iop_Shl32,
4825                                binop(Iop_CmpF64,
4826                                      get_ST(0),
4827                                      loadLE(Ity_F64,mkexpr(addr))),
4828                                mkU8(8)),
4829                          mkU32(0x4500)
4830                   ));
4831               fp_pop();
4832               break;
4833
4834            case 4: /* FSUB double-real */
4835               fp_do_op_mem_ST_0 ( addr, "sub", dis_buf, Iop_SubF64, True );
4836               break;
4837
4838            case 5: /* FSUBR double-real */
4839               fp_do_oprev_mem_ST_0 ( addr, "subr", dis_buf, Iop_SubF64, True );
4840               break;
4841
4842            case 6: /* FDIV double-real */
4843               fp_do_op_mem_ST_0 ( addr, "div", dis_buf, Iop_DivF64, True );
4844               break;
4845
4846            case 7: /* FDIVR double-real */
4847               fp_do_oprev_mem_ST_0 ( addr, "divr", dis_buf, Iop_DivF64, True );
4848               break;
4849
4850            default:
4851               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
4852               vex_printf("first_opcode == 0xDC\n");
4853               goto decode_fail;
4854         }
4855
4856      } else {
4857
4858         delta++;
4859         switch (modrm) {
4860
4861            case 0xC0 ... 0xC7: /* FADD %st(0),%st(?) */
4862               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, False );
4863               break;
4864
4865            case 0xC8 ... 0xCF: /* FMUL %st(0),%st(?) */
4866               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, False );
4867               break;
4868
4869            case 0xE0 ... 0xE7: /* FSUBR %st(0),%st(?) */
4870               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0, modrm - 0xE0, False );
4871               break;
4872
4873            case 0xE8 ... 0xEF: /* FSUB %st(0),%st(?) */
4874               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0, modrm - 0xE8, False );
4875               break;
4876
4877            case 0xF0 ... 0xF7: /* FDIVR %st(0),%st(?) */
4878               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, False );
4879               break;
4880
4881            case 0xF8 ... 0xFF: /* FDIV %st(0),%st(?) */
4882               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, False );
4883               break;
4884
4885            default:
4886               goto decode_fail;
4887         }
4888
4889      }
4890   }
4891
4892   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDD opcodes +-+-+-+-+-+-+-+ */
4893   else
4894   if (first_opcode == 0xDD) {
4895
4896      if (modrm < 0xC0) {
4897
4898         /* bits 5,4,3 are an opcode extension, and the modRM also
4899            specifies an address. */
4900         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
4901         delta += len;
4902
4903         switch (gregOfRM(modrm)) {
4904
4905            case 0: /* FLD double-real */
4906               DIP("fldl %s\n", dis_buf);
4907               fp_push();
4908               put_ST(0, loadLE(Ity_F64, mkexpr(addr)));
4909               break;
4910
4911            case 1: /* FISTTPQ m64 (SSE3) */
4912               DIP("fistppll %s\n", dis_buf);
4913               storeLE( mkexpr(addr),
4914                        binop(Iop_F64toI64S, mkU32(Irrm_ZERO), get_ST(0)) );
4915               fp_pop();
4916               break;
4917
4918            case 2: /* FST double-real */
4919               DIP("fstl %s\n", dis_buf);
4920               storeLE(mkexpr(addr), get_ST(0));
4921               break;
4922
4923            case 3: /* FSTP double-real */
4924               DIP("fstpl %s\n", dis_buf);
4925               storeLE(mkexpr(addr), get_ST(0));
4926               fp_pop();
4927               break;
4928
4929            case 4: { /* FRSTOR m108 */
4930               /* Uses dirty helper:
                     VexEmWarn x86g_dirtyhelper_FRSTOR ( VexGuestX86State*, Addr32 ) */
4932               IRTemp   ew = newTemp(Ity_I32);
4933               IRDirty* d  = unsafeIRDirty_0_N (
4934                                0/*regparms*/,
4935                                "x86g_dirtyhelper_FRSTOR",
4936                                &x86g_dirtyhelper_FRSTOR,
4937                                mkIRExprVec_1( mkexpr(addr) )
4938                             );
4939               d->needsBBP = True;
4940               d->tmp      = ew;
4941               /* declare we're reading memory */
4942               d->mFx   = Ifx_Read;
4943               d->mAddr = mkexpr(addr);
4944               d->mSize = 108;
4945
4946               /* declare we're writing guest state */
4947               d->nFxState = 5;
4948               vex_bzero(&d->fxState, sizeof(d->fxState));
4949
4950               d->fxState[0].fx     = Ifx_Write;
4951               d->fxState[0].offset = OFFB_FTOP;
4952               d->fxState[0].size   = sizeof(UInt);
4953
4954               d->fxState[1].fx     = Ifx_Write;
4955               d->fxState[1].offset = OFFB_FPREGS;
4956               d->fxState[1].size   = 8 * sizeof(ULong);
4957
4958               d->fxState[2].fx     = Ifx_Write;
4959               d->fxState[2].offset = OFFB_FPTAGS;
4960               d->fxState[2].size   = 8 * sizeof(UChar);
4961
4962               d->fxState[3].fx     = Ifx_Write;
4963               d->fxState[3].offset = OFFB_FPROUND;
4964               d->fxState[3].size   = sizeof(UInt);
4965
4966               d->fxState[4].fx     = Ifx_Write;
4967               d->fxState[4].offset = OFFB_FC3210;
4968               d->fxState[4].size   = sizeof(UInt);
4969
4970               stmt( IRStmt_Dirty(d) );
4971
4972               /* ew contains any emulation warning we may need to
4973                  issue.  If needed, side-exit to the next insn,
4974                  reporting the warning, so that Valgrind's dispatcher
4975                  sees the warning. */
4976               put_emwarn( mkexpr(ew) );
4977               stmt(
4978                  IRStmt_Exit(
4979                     binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
4980                     Ijk_EmWarn,
4981                     IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
4982                     OFFB_EIP
4983                  )
4984               );
4985
4986               DIP("frstor %s\n", dis_buf);
4987               break;
4988            }
4989
4990            case 6: { /* FNSAVE m108 */
4991               /* Uses dirty helper:
                     void x86g_dirtyhelper_FSAVE ( VexGuestX86State*, UInt ) */
4993               IRDirty* d = unsafeIRDirty_0_N (
4994                               0/*regparms*/,
4995                               "x86g_dirtyhelper_FSAVE",
4996                               &x86g_dirtyhelper_FSAVE,
4997                               mkIRExprVec_1( mkexpr(addr) )
4998                            );
4999               d->needsBBP = True;
5000               /* declare we're writing memory */
5001               d->mFx   = Ifx_Write;
5002               d->mAddr = mkexpr(addr);
5003               d->mSize = 108;
5004
5005               /* declare we're reading guest state */
5006               d->nFxState = 5;
5007               vex_bzero(&d->fxState, sizeof(d->fxState));
5008
5009               d->fxState[0].fx     = Ifx_Read;
5010               d->fxState[0].offset = OFFB_FTOP;
5011               d->fxState[0].size   = sizeof(UInt);
5012
5013               d->fxState[1].fx     = Ifx_Read;
5014               d->fxState[1].offset = OFFB_FPREGS;
5015               d->fxState[1].size   = 8 * sizeof(ULong);
5016
5017               d->fxState[2].fx     = Ifx_Read;
5018               d->fxState[2].offset = OFFB_FPTAGS;
5019               d->fxState[2].size   = 8 * sizeof(UChar);
5020
5021               d->fxState[3].fx     = Ifx_Read;
5022               d->fxState[3].offset = OFFB_FPROUND;
5023               d->fxState[3].size   = sizeof(UInt);
5024
5025               d->fxState[4].fx     = Ifx_Read;
5026               d->fxState[4].offset = OFFB_FC3210;
5027               d->fxState[4].size   = sizeof(UInt);
5028
5029               stmt( IRStmt_Dirty(d) );
5030
5031               DIP("fnsave %s\n", dis_buf);
5032               break;
5033            }
5034
5035            case 7: { /* FNSTSW m16 */
5036               IRExpr* sw = get_FPU_sw();
5037               vassert(typeOfIRExpr(irsb->tyenv, sw) == Ity_I16);
5038               storeLE( mkexpr(addr), sw );
5039               DIP("fnstsw %s\n", dis_buf);
5040               break;
5041            }
5042
5043            default:
5044               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5045               vex_printf("first_opcode == 0xDD\n");
5046               goto decode_fail;
5047         }
5048      } else {
5049         delta++;
5050         switch (modrm) {
5051
5052            case 0xC0 ... 0xC7: /* FFREE %st(?) */
5053               r_dst = (UInt)modrm - 0xC0;
5054               DIP("ffree %%st(%d)\n", (Int)r_dst);
5055               put_ST_TAG ( r_dst, mkU8(0) );
5056               break;
5057
5058            case 0xD0 ... 0xD7: /* FST %st(0),%st(?) */
5059               r_dst = (UInt)modrm - 0xD0;
5060               DIP("fst %%st(0),%%st(%d)\n", (Int)r_dst);
5061               /* P4 manual says: "If the destination operand is a
5062                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
5064               put_ST_UNCHECKED(r_dst, get_ST(0));
5065               break;
5066
5067            case 0xD8 ... 0xDF: /* FSTP %st(0),%st(?) */
5068               r_dst = (UInt)modrm - 0xD8;
5069               DIP("fstp %%st(0),%%st(%d)\n", (Int)r_dst);
5070               /* P4 manual says: "If the destination operand is a
5071                  non-empty register, the invalid-operation exception
                  is not generated."  Hence put_ST_UNCHECKED. */
5073               put_ST_UNCHECKED(r_dst, get_ST(0));
5074               fp_pop();
5075               break;
5076
5077            case 0xE0 ... 0xE7: /* FUCOM %st(0),%st(?) */
5078               r_dst = (UInt)modrm - 0xE0;
5079               DIP("fucom %%st(0),%%st(%d)\n", (Int)r_dst);
5080               /* This forces C1 to zero, which isn't right. */
5081               put_C3210(
5082                   binop( Iop_And32,
5083                          binop(Iop_Shl32,
5084                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5085                                mkU8(8)),
5086                          mkU32(0x4500)
5087                   ));
5088               break;
5089
5090            case 0xE8 ... 0xEF: /* FUCOMP %st(0),%st(?) */
5091               r_dst = (UInt)modrm - 0xE8;
5092               DIP("fucomp %%st(0),%%st(%d)\n", (Int)r_dst);
5093               /* This forces C1 to zero, which isn't right. */
5094               put_C3210(
5095                   binop( Iop_And32,
5096                          binop(Iop_Shl32,
5097                                binop(Iop_CmpF64, get_ST(0), get_ST(r_dst)),
5098                                mkU8(8)),
5099                          mkU32(0x4500)
5100                   ));
5101               fp_pop();
5102               break;
5103
5104            default:
5105               goto decode_fail;
5106         }
5107      }
5108   }
5109
5110   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDE opcodes +-+-+-+-+-+-+-+ */
5111   else
5112   if (first_opcode == 0xDE) {
5113
5114      if (modrm < 0xC0) {
5115
5116         /* bits 5,4,3 are an opcode extension, and the modRM also
5117            specifies an address. */
5118         IROp   fop;
5119         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5120         delta += len;
5121
5122         switch (gregOfRM(modrm)) {
5123
5124            case 0: /* FIADD m16int */ /* ST(0) += m16int */
5125               DIP("fiaddw %s\n", dis_buf);
5126               fop = Iop_AddF64;
5127               goto do_fop_m16;
5128
5129            case 1: /* FIMUL m16int */ /* ST(0) *= m16int */
5130               DIP("fimulw %s\n", dis_buf);
5131               fop = Iop_MulF64;
5132               goto do_fop_m16;
5133
5134            case 2: /* FICOM m16int */
5135               DIP("ficomw %s\n", dis_buf);
5136               /* This forces C1 to zero, which isn't right. */
5137               put_C3210(
5138                   binop( Iop_And32,
5139                          binop(Iop_Shl32,
5140                                binop(Iop_CmpF64,
5141                                      get_ST(0),
5142                                      unop(Iop_I32StoF64,
5143                                         unop(Iop_16Sto32,
5144                                           loadLE(Ity_I16,mkexpr(addr))))),
5145                                mkU8(8)),
5146                          mkU32(0x4500)
5147                   ));
5148               break;
5149
5150            case 3: /* FICOMP m16int */
5151               DIP("ficompw %s\n", dis_buf);
5152               /* This forces C1 to zero, which isn't right. */
5153               put_C3210(
5154                   binop( Iop_And32,
5155                          binop(Iop_Shl32,
5156                                binop(Iop_CmpF64,
5157                                      get_ST(0),
5158                                      unop(Iop_I32StoF64,
5159                                         unop(Iop_16Sto32,
5160                                              loadLE(Ity_I16,mkexpr(addr))))),
5161                                mkU8(8)),
5162                          mkU32(0x4500)
5163                   ));
5164               fp_pop();
5165               break;
5166
5167            case 4: /* FISUB m16int */ /* ST(0) -= m16int */
5168               DIP("fisubw %s\n", dis_buf);
5169               fop = Iop_SubF64;
5170               goto do_fop_m16;
5171
5172            case 5: /* FISUBR m16int */ /* ST(0) = m16int - ST(0) */
5173               DIP("fisubrw %s\n", dis_buf);
5174               fop = Iop_SubF64;
5175               goto do_foprev_m16;
5176
5177            case 6: /* FIDIV m16int */ /* ST(0) /= m16int */
5178               DIP("fisubw %s\n", dis_buf);
5179               fop = Iop_DivF64;
5180               goto do_fop_m16;
5181
5182            case 7: /* FIDIVR m16int */ /* ST(0) = m16int / ST(0) */
5183               DIP("fidivrw %s\n", dis_buf);
5184               fop = Iop_DivF64;
5185               goto do_foprev_m16;
5186
5187            do_fop_m16:
5188               put_ST_UNCHECKED(0,
5189                  triop(fop,
5190                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5191                        get_ST(0),
5192                        unop(Iop_I32StoF64,
5193                             unop(Iop_16Sto32,
5194                                  loadLE(Ity_I16, mkexpr(addr))))));
5195               break;
5196
5197            do_foprev_m16:
5198               put_ST_UNCHECKED(0,
5199                  triop(fop,
5200                        get_FAKE_roundingmode(), /* XXXROUNDINGFIXME */
5201                        unop(Iop_I32StoF64,
5202                             unop(Iop_16Sto32,
5203                                  loadLE(Ity_I16, mkexpr(addr)))),
5204                        get_ST(0)));
5205               break;
5206
5207            default:
5208               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5209               vex_printf("first_opcode == 0xDE\n");
5210               goto decode_fail;
5211         }
5212
5213      } else {
5214
5215         delta++;
5216         switch (modrm) {
5217
5218            case 0xC0 ... 0xC7: /* FADDP %st(0),%st(?) */
5219               fp_do_op_ST_ST ( "add", Iop_AddF64, 0, modrm - 0xC0, True );
5220               break;
5221
5222            case 0xC8 ... 0xCF: /* FMULP %st(0),%st(?) */
5223               fp_do_op_ST_ST ( "mul", Iop_MulF64, 0, modrm - 0xC8, True );
5224               break;
5225
5226            case 0xD9: /* FCOMPP %st(0),%st(1) */
5227               DIP("fuompp %%st(0),%%st(1)\n");
5228               /* This forces C1 to zero, which isn't right. */
5229               put_C3210(
5230                   binop( Iop_And32,
5231                          binop(Iop_Shl32,
5232                                binop(Iop_CmpF64, get_ST(0), get_ST(1)),
5233                                mkU8(8)),
5234                          mkU32(0x4500)
5235                   ));
5236               fp_pop();
5237               fp_pop();
5238               break;
5239
5240            case 0xE0 ... 0xE7: /* FSUBRP %st(0),%st(?) */
5241               fp_do_oprev_ST_ST ( "subr", Iop_SubF64, 0,  modrm - 0xE0, True );
5242               break;
5243
5244            case 0xE8 ... 0xEF: /* FSUBP %st(0),%st(?) */
5245               fp_do_op_ST_ST ( "sub", Iop_SubF64, 0,  modrm - 0xE8, True );
5246               break;
5247
5248            case 0xF0 ... 0xF7: /* FDIVRP %st(0),%st(?) */
5249               fp_do_oprev_ST_ST ( "divr", Iop_DivF64, 0, modrm - 0xF0, True );
5250               break;
5251
5252            case 0xF8 ... 0xFF: /* FDIVP %st(0),%st(?) */
5253               fp_do_op_ST_ST ( "div", Iop_DivF64, 0, modrm - 0xF8, True );
5254               break;
5255
5256            default:
5257               goto decode_fail;
5258         }
5259
5260      }
5261   }
5262
5263   /* -+-+-+-+-+-+-+-+-+-+-+-+ 0xDF opcodes +-+-+-+-+-+-+-+ */
5264   else
5265   if (first_opcode == 0xDF) {
5266
5267      if (modrm < 0xC0) {
5268
5269         /* bits 5,4,3 are an opcode extension, and the modRM also
5270            specifies an address. */
5271         IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5272         delta += len;
5273
5274         switch (gregOfRM(modrm)) {
5275
5276            case 0: /* FILD m16int */
5277               DIP("fildw %s\n", dis_buf);
5278               fp_push();
5279               put_ST(0, unop(Iop_I32StoF64,
5280                              unop(Iop_16Sto32,
5281                                   loadLE(Ity_I16, mkexpr(addr)))));
5282               break;
5283
5284            case 1: /* FISTTPS m16 (SSE3) */
5285               DIP("fisttps %s\n", dis_buf);
5286               storeLE( mkexpr(addr),
5287                        binop(Iop_F64toI16S, mkU32(Irrm_ZERO), get_ST(0)) );
5288               fp_pop();
5289               break;
5290
5291            case 2: /* FIST m16 */
5292               DIP("fistp %s\n", dis_buf);
5293               storeLE( mkexpr(addr),
5294                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5295               break;
5296
5297            case 3: /* FISTP m16 */
5298               DIP("fistps %s\n", dis_buf);
5299               storeLE( mkexpr(addr),
5300                        binop(Iop_F64toI16S, get_roundingmode(), get_ST(0)) );
5301               fp_pop();
5302               break;
5303
5304            case 5: /* FILD m64 */
5305               DIP("fildll %s\n", dis_buf);
5306               fp_push();
5307               put_ST(0, binop(Iop_I64StoF64,
5308                               get_roundingmode(),
5309                               loadLE(Ity_I64, mkexpr(addr))));
5310               break;
5311
5312            case 7: /* FISTP m64 */
5313               DIP("fistpll %s\n", dis_buf);
5314               storeLE( mkexpr(addr),
5315                        binop(Iop_F64toI64S, get_roundingmode(), get_ST(0)) );
5316               fp_pop();
5317               break;
5318
5319            default:
5320               vex_printf("unhandled opc_aux = 0x%2x\n", gregOfRM(modrm));
5321               vex_printf("first_opcode == 0xDF\n");
5322               goto decode_fail;
5323         }
5324
5325      } else {
5326
5327         delta++;
5328         switch (modrm) {
5329
5330            case 0xC0: /* FFREEP %st(0) */
5331               DIP("ffreep %%st(%d)\n", 0);
5332               put_ST_TAG ( 0, mkU8(0) );
5333               fp_pop();
5334               break;
5335
5336            case 0xE0: /* FNSTSW %ax */
5337               DIP("fnstsw %%ax\n");
5338               /* Get the FPU status word value and dump it in %AX. */
5339               if (0) {
5340                  /* The obvious thing to do is simply dump the 16-bit
5341                     status word value in %AX.  However, due to a
5342                     limitation in Memcheck's origin tracking
5343                     machinery, this causes Memcheck not to track the
5344                     origin of any undefinedness into %AH (only into
5345                     %AL/%AX/%EAX), which means origins are lost in
5346                     the sequence "fnstsw %ax; test $M,%ah; jcond .." */
5347                  putIReg(2, R_EAX, get_FPU_sw());
5348               } else {
5349                  /* So a somewhat lame kludge is to make it very
5350                     clear to Memcheck that the value is written to
5351                     both %AH and %AL.  This generates marginally
5352                     worse code, but I don't think it matters much. */
5353                  IRTemp t16 = newTemp(Ity_I16);
5354                  assign(t16, get_FPU_sw());
5355                  putIReg( 1, R_AL, unop(Iop_16to8, mkexpr(t16)) );
5356                  putIReg( 1, R_AH, unop(Iop_16HIto8, mkexpr(t16)) );
5357               }
5358               break;
5359
5360            case 0xE8 ... 0xEF: /* FUCOMIP %st(0),%st(?) */
5361               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xE8, True );
5362               break;
5363
5364            case 0xF0 ... 0xF7: /* FCOMIP %st(0),%st(?) */
5365               /* not really right since COMIP != UCOMIP */
5366               fp_do_ucomi_ST0_STi( (UInt)modrm - 0xF0, True );
5367               break;
5368
5369            default:
5370               goto decode_fail;
5371         }
5372      }
5373
5374   }
5375
5376   else
5377   vpanic("dis_FPU(x86): invalid primary opcode");
5378
5379   *decode_ok = True;
5380   return delta;
5381
5382  decode_fail:
5383   *decode_ok = False;
5384   return delta;
5385}
5386
5387
5388/*------------------------------------------------------------*/
5389/*---                                                      ---*/
5390/*--- MMX INSTRUCTIONS                                     ---*/
5391/*---                                                      ---*/
5392/*------------------------------------------------------------*/
5393
5394/* Effect of MMX insns on x87 FPU state (table 11-2 of
5395   IA32 arch manual, volume 3):
5396
5397   Read from, or write to MMX register (viz, any insn except EMMS):
5398   * All tags set to Valid (non-empty) -- FPTAGS[i] := nonzero
5399   * FP stack pointer set to zero
5400
5401   EMMS:
5402   * All tags set to Invalid (empty) -- FPTAGS[i] := zero
5403   * FP stack pointer set to zero
5404*/
5405
5406static void do_MMX_preamble ( void )
5407{
5408   Int         i;
5409   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5410   IRExpr*     zero  = mkU32(0);
5411   IRExpr*     tag1  = mkU8(1);
5412   put_ftop(zero);
5413   for (i = 0; i < 8; i++)
5414      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag1) ) );
5415}
5416
5417static void do_EMMS_preamble ( void )
5418{
5419   Int         i;
5420   IRRegArray* descr = mkIRRegArray( OFFB_FPTAGS, Ity_I8, 8 );
5421   IRExpr*     zero  = mkU32(0);
5422   IRExpr*     tag0  = mkU8(0);
5423   put_ftop(zero);
5424   for (i = 0; i < 8; i++)
5425      stmt( IRStmt_PutI( mkIRPutI(descr, zero, i, tag0) ) );
5426}
5427
5428
5429static IRExpr* getMMXReg ( UInt archreg )
5430{
5431   vassert(archreg < 8);
5432   return IRExpr_Get( OFFB_FPREGS + 8 * archreg, Ity_I64 );
5433}
5434
5435
5436static void putMMXReg ( UInt archreg, IRExpr* e )
5437{
5438   vassert(archreg < 8);
5439   vassert(typeOfIRExpr(irsb->tyenv,e) == Ity_I64);
5440   stmt( IRStmt_Put( OFFB_FPREGS + 8 * archreg, e ) );
5441}
5442
5443
5444/* Helper for non-shift MMX insns.  Note this is incomplete in the
5445   sense that it does not first call do_MMX_preamble() -- that is the
5446   responsibility of its caller. */
5447
5448static
5449UInt dis_MMXop_regmem_to_reg ( UChar  sorb,
5450                               Int    delta,
5451                               UChar  opc,
5452                               HChar* name,
5453                               Bool   show_granularity )
5454{
5455   HChar   dis_buf[50];
5456   UChar   modrm = getIByte(delta);
5457   Bool    isReg = epartIsReg(modrm);
5458   IRExpr* argL  = NULL;
5459   IRExpr* argR  = NULL;
5460   IRExpr* argG  = NULL;
5461   IRExpr* argE  = NULL;
5462   IRTemp  res   = newTemp(Ity_I64);
5463
5464   Bool    invG  = False;
5465   IROp    op    = Iop_INVALID;
5466   void*   hAddr = NULL;
5467   HChar*  hName = NULL;
5468   Bool    eLeft = False;
5469
5470#  define XXX(_name) do { hAddr = &_name; hName = #_name; } while (0)
5471
5472   switch (opc) {
5473      /* Original MMX ones */
5474      case 0xFC: op = Iop_Add8x8; break;
5475      case 0xFD: op = Iop_Add16x4; break;
5476      case 0xFE: op = Iop_Add32x2; break;
5477
5478      case 0xEC: op = Iop_QAdd8Sx8; break;
5479      case 0xED: op = Iop_QAdd16Sx4; break;
5480
5481      case 0xDC: op = Iop_QAdd8Ux8; break;
5482      case 0xDD: op = Iop_QAdd16Ux4; break;
5483
5484      case 0xF8: op = Iop_Sub8x8;  break;
5485      case 0xF9: op = Iop_Sub16x4; break;
5486      case 0xFA: op = Iop_Sub32x2; break;
5487
5488      case 0xE8: op = Iop_QSub8Sx8; break;
5489      case 0xE9: op = Iop_QSub16Sx4; break;
5490
5491      case 0xD8: op = Iop_QSub8Ux8; break;
5492      case 0xD9: op = Iop_QSub16Ux4; break;
5493
5494      case 0xE5: op = Iop_MulHi16Sx4; break;
5495      case 0xD5: op = Iop_Mul16x4; break;
5496      case 0xF5: XXX(x86g_calculate_mmx_pmaddwd); break;
5497
5498      case 0x74: op = Iop_CmpEQ8x8; break;
5499      case 0x75: op = Iop_CmpEQ16x4; break;
5500      case 0x76: op = Iop_CmpEQ32x2; break;
5501
5502      case 0x64: op = Iop_CmpGT8Sx8; break;
5503      case 0x65: op = Iop_CmpGT16Sx4; break;
5504      case 0x66: op = Iop_CmpGT32Sx2; break;
5505
5506      case 0x6B: op = Iop_QNarrowBin32Sto16Sx4; eLeft = True; break;
5507      case 0x63: op = Iop_QNarrowBin16Sto8Sx8;  eLeft = True; break;
5508      case 0x67: op = Iop_QNarrowBin16Sto8Ux8;  eLeft = True; break;
5509
5510      case 0x68: op = Iop_InterleaveHI8x8;  eLeft = True; break;
5511      case 0x69: op = Iop_InterleaveHI16x4; eLeft = True; break;
5512      case 0x6A: op = Iop_InterleaveHI32x2; eLeft = True; break;
5513
5514      case 0x60: op = Iop_InterleaveLO8x8;  eLeft = True; break;
5515      case 0x61: op = Iop_InterleaveLO16x4; eLeft = True; break;
5516      case 0x62: op = Iop_InterleaveLO32x2; eLeft = True; break;
5517
5518      case 0xDB: op = Iop_And64; break;
5519      case 0xDF: op = Iop_And64; invG = True; break;
5520      case 0xEB: op = Iop_Or64; break;
5521      case 0xEF: /* Possibly do better here if argL and argR are the
5522                    same reg */
5523                 op = Iop_Xor64; break;
5524
5525      /* Introduced in SSE1 */
5526      case 0xE0: op = Iop_Avg8Ux8;    break;
5527      case 0xE3: op = Iop_Avg16Ux4;   break;
5528      case 0xEE: op = Iop_Max16Sx4;   break;
5529      case 0xDE: op = Iop_Max8Ux8;    break;
5530      case 0xEA: op = Iop_Min16Sx4;   break;
5531      case 0xDA: op = Iop_Min8Ux8;    break;
5532      case 0xE4: op = Iop_MulHi16Ux4; break;
5533      case 0xF6: XXX(x86g_calculate_mmx_psadbw); break;
5534
5535      /* Introduced in SSE2 */
5536      case 0xD4: op = Iop_Add64; break;
5537      case 0xFB: op = Iop_Sub64; break;
5538
5539      default:
5540         vex_printf("\n0x%x\n", (Int)opc);
5541         vpanic("dis_MMXop_regmem_to_reg");
5542   }
5543
5544#  undef XXX
5545
5546   argG = getMMXReg(gregOfRM(modrm));
5547   if (invG)
5548      argG = unop(Iop_Not64, argG);
5549
5550   if (isReg) {
5551      delta++;
5552      argE = getMMXReg(eregOfRM(modrm));
5553   } else {
5554      Int    len;
5555      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5556      delta += len;
5557      argE = loadLE(Ity_I64, mkexpr(addr));
5558   }
5559
   /* The ops flagged eLeft (the packs and interleaves) take the E
      operand as the left argument of the IR primop and G as the right;
      all other ops take G on the left. */
   if (eLeft) {
5561      argL = argE;
5562      argR = argG;
5563   } else {
5564      argL = argG;
5565      argR = argE;
5566   }
5567
5568   if (op != Iop_INVALID) {
5569      vassert(hName == NULL);
5570      vassert(hAddr == NULL);
5571      assign(res, binop(op, argL, argR));
5572   } else {
5573      vassert(hName != NULL);
5574      vassert(hAddr != NULL);
5575      assign( res,
5576              mkIRExprCCall(
5577                 Ity_I64,
5578                 0/*regparms*/, hName, hAddr,
5579                 mkIRExprVec_2( argL, argR )
5580              )
5581            );
5582   }
5583
5584   putMMXReg( gregOfRM(modrm), mkexpr(res) );
5585
5586   DIP("%s%s %s, %s\n",
5587       name, show_granularity ? nameMMXGran(opc & 3) : "",
5588       ( isReg ? nameMMXReg(eregOfRM(modrm)) : dis_buf ),
5589       nameMMXReg(gregOfRM(modrm)) );
5590
5591   return delta;
5592}
5593
5594
5595/* Vector by scalar shift of G by the amount specified at the bottom
5596   of E.  This is a straight copy of dis_SSE_shiftG_byE. */
5597
5598static UInt dis_MMX_shiftG_byE ( UChar sorb, Int delta,
5599                                 HChar* opname, IROp op )
5600{
5601   HChar   dis_buf[50];
5602   Int     alen, size;
5603   IRTemp  addr;
5604   Bool    shl, shr, sar;
5605   UChar   rm   = getIByte(delta);
5606   IRTemp  g0   = newTemp(Ity_I64);
5607   IRTemp  g1   = newTemp(Ity_I64);
5608   IRTemp  amt  = newTemp(Ity_I32);
5609   IRTemp  amt8 = newTemp(Ity_I8);
5610
5611   if (epartIsReg(rm)) {
5612      assign( amt, unop(Iop_64to32, getMMXReg(eregOfRM(rm))) );
5613      DIP("%s %s,%s\n", opname,
5614                        nameMMXReg(eregOfRM(rm)),
5615                        nameMMXReg(gregOfRM(rm)) );
5616      delta++;
5617   } else {
5618      addr = disAMode ( &alen, sorb, delta, dis_buf );
5619      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
5620      DIP("%s %s,%s\n", opname,
5621                        dis_buf,
5622                        nameMMXReg(gregOfRM(rm)) );
5623      delta += alen;
5624   }
5625   assign( g0,   getMMXReg(gregOfRM(rm)) );
5626   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
5627
5628   shl = shr = sar = False;
5629   size = 0;
5630   switch (op) {
      case Iop_ShlN16x4: shl = True; size = 16; break;
5632      case Iop_ShlN32x2: shl = True; size = 32; break;
5633      case Iop_Shl64:    shl = True; size = 64; break;
5634      case Iop_ShrN16x4: shr = True; size = 16; break;
5635      case Iop_ShrN32x2: shr = True; size = 32; break;
5636      case Iop_Shr64:    shr = True; size = 64; break;
5637      case Iop_SarN16x4: sar = True; size = 16; break;
5638      case Iop_SarN32x2: sar = True; size = 32; break;
5639      default: vassert(0);
5640   }
5641
   /* x86 semantics: a logical (shl/shr) shift by an amount >= the lane
      width gives zero, and an arithmetic (sar) shift by such an amount
      behaves as a shift by (lane width - 1), i.e. it replicates the
      sign bit.  The Mux0X picks the real shift when amt < size and the
      saturated value otherwise. */
   if (shl || shr) {
5643     assign(
5644        g1,
5645        IRExpr_Mux0X(
5646           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
5647           mkU64(0),
5648           binop(op, mkexpr(g0), mkexpr(amt8))
5649        )
5650     );
5651   } else
5652   if (sar) {
5653     assign(
5654        g1,
5655        IRExpr_Mux0X(
5656           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
5657           binop(op, mkexpr(g0), mkU8(size-1)),
5658           binop(op, mkexpr(g0), mkexpr(amt8))
5659        )
5660     );
5661   } else {
5662      /*NOTREACHED*/
5663      vassert(0);
5664   }
5665
5666   putMMXReg( gregOfRM(rm), mkexpr(g1) );
5667   return delta;
5668}
5669
5670
5671/* Vector by scalar shift of E by an immediate byte.  This is a
5672   straight copy of dis_SSE_shiftE_imm. */
5673
5674static
5675UInt dis_MMX_shiftE_imm ( Int delta, HChar* opname, IROp op )
5676{
5677   Bool    shl, shr, sar;
5678   UChar   rm   = getIByte(delta);
5679   IRTemp  e0   = newTemp(Ity_I64);
5680   IRTemp  e1   = newTemp(Ity_I64);
5681   UChar   amt, size;
5682   vassert(epartIsReg(rm));
5683   vassert(gregOfRM(rm) == 2
5684           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
5685   amt = getIByte(delta+1);
5686   delta += 2;
5687   DIP("%s $%d,%s\n", opname,
5688                      (Int)amt,
5689                      nameMMXReg(eregOfRM(rm)) );
5690
5691   assign( e0, getMMXReg(eregOfRM(rm)) );
5692
5693   shl = shr = sar = False;
5694   size = 0;
5695   switch (op) {
5696      case Iop_ShlN16x4: shl = True; size = 16; break;
5697      case Iop_ShlN32x2: shl = True; size = 32; break;
5698      case Iop_Shl64:    shl = True; size = 64; break;
5699      case Iop_SarN16x4: sar = True; size = 16; break;
5700      case Iop_SarN32x2: sar = True; size = 32; break;
5701      case Iop_ShrN16x4: shr = True; size = 16; break;
5702      case Iop_ShrN32x2: shr = True; size = 32; break;
5703      case Iop_Shr64:    shr = True; size = 64; break;
5704      default: vassert(0);
5705   }
5706
   /* Same out-of-range handling as dis_MMX_shiftG_byE, except that the
      amount is an immediate, so the check is resolved at translation
      time rather than with a Mux0X. */
   if (shl || shr) {
5708      assign( e1, amt >= size
5709                     ? mkU64(0)
5710                     : binop(op, mkexpr(e0), mkU8(amt))
5711      );
5712   } else
5713   if (sar) {
5714      assign( e1, amt >= size
5715                     ? binop(op, mkexpr(e0), mkU8(size-1))
5716                     : binop(op, mkexpr(e0), mkU8(amt))
5717      );
5718   } else {
5719      /*NOTREACHED*/
5720      vassert(0);
5721   }
5722
5723   putMMXReg( eregOfRM(rm), mkexpr(e1) );
5724   return delta;
5725}
5726
5727
5728/* Completely handle all MMX instructions except emms. */
5729
5730static
5731UInt dis_MMX ( Bool* decode_ok, UChar sorb, Int sz, Int delta )
5732{
5733   Int   len;
5734   UChar modrm;
5735   HChar dis_buf[50];
5736   UChar opc = getIByte(delta);
5737   delta++;
5738
   /* All the insns handled here are non-EMMS, so do the MMX preamble
      unconditionally. */
5740   do_MMX_preamble();
5741
5742   switch (opc) {
5743
5744      case 0x6E:
         /* MOVD (src)ireg-or-mem (E), (dst)mmxreg (G) */
5746         if (sz != 4)
5747            goto mmx_decode_failure;
5748         modrm = getIByte(delta);
5749         if (epartIsReg(modrm)) {
5750            delta++;
5751            putMMXReg(
5752               gregOfRM(modrm),
5753               binop( Iop_32HLto64,
5754                      mkU32(0),
5755                      getIReg(4, eregOfRM(modrm)) ) );
5756            DIP("movd %s, %s\n",
5757                nameIReg(4,eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5758         } else {
5759            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5760            delta += len;
5761            putMMXReg(
5762               gregOfRM(modrm),
5763               binop( Iop_32HLto64,
5764                      mkU32(0),
5765                      loadLE(Ity_I32, mkexpr(addr)) ) );
5766            DIP("movd %s, %s\n", dis_buf, nameMMXReg(gregOfRM(modrm)));
5767         }
5768         break;
5769
5770      case 0x7E: /* MOVD (src)mmxreg (G), (dst)ireg-or-mem (E) */
5771         if (sz != 4)
5772            goto mmx_decode_failure;
5773         modrm = getIByte(delta);
5774         if (epartIsReg(modrm)) {
5775            delta++;
5776            putIReg( 4, eregOfRM(modrm),
5777                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5778            DIP("movd %s, %s\n",
5779                nameMMXReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
5780         } else {
5781            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5782            delta += len;
5783            storeLE( mkexpr(addr),
5784                     unop(Iop_64to32, getMMXReg(gregOfRM(modrm)) ) );
5785            DIP("movd %s, %s\n", nameMMXReg(gregOfRM(modrm)), dis_buf);
5786         }
5787         break;
5788
5789      case 0x6F:
5790         /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
5791         if (sz != 4)
5792            goto mmx_decode_failure;
5793         modrm = getIByte(delta);
5794         if (epartIsReg(modrm)) {
5795            delta++;
5796            putMMXReg( gregOfRM(modrm), getMMXReg(eregOfRM(modrm)) );
5797            DIP("movq %s, %s\n",
5798                nameMMXReg(eregOfRM(modrm)), nameMMXReg(gregOfRM(modrm)));
5799         } else {
5800            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5801            delta += len;
5802            putMMXReg( gregOfRM(modrm), loadLE(Ity_I64, mkexpr(addr)) );
5803            DIP("movq %s, %s\n",
5804                dis_buf, nameMMXReg(gregOfRM(modrm)));
5805         }
5806         break;
5807
5808      case 0x7F:
5809         /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
5810         if (sz != 4)
5811            goto mmx_decode_failure;
5812         modrm = getIByte(delta);
5813         if (epartIsReg(modrm)) {
5814            delta++;
5815            putMMXReg( eregOfRM(modrm), getMMXReg(gregOfRM(modrm)) );
5816            DIP("movq %s, %s\n",
5817                nameMMXReg(gregOfRM(modrm)), nameMMXReg(eregOfRM(modrm)));
5818         } else {
5819            IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
5820            delta += len;
5821            storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
5822            DIP("mov(nt)q %s, %s\n",
5823                nameMMXReg(gregOfRM(modrm)), dis_buf);
5824         }
5825         break;
5826
5827      case 0xFC:
5828      case 0xFD:
5829      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
5830         if (sz != 4)
5831            goto mmx_decode_failure;
5832         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padd", True );
5833         break;
5834
5835      case 0xEC:
5836      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
5837         if (sz != 4)
5838            goto mmx_decode_failure;
5839         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "padds", True );
5840         break;
5841
5842      case 0xDC:
5843      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5844         if (sz != 4)
5845            goto mmx_decode_failure;
5846         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "paddus", True );
5847         break;
5848
5849      case 0xF8:
5850      case 0xF9:
5851      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
5852         if (sz != 4)
5853            goto mmx_decode_failure;
5854         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psub", True );
5855         break;
5856
5857      case 0xE8:
5858      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
5859         if (sz != 4)
5860            goto mmx_decode_failure;
5861         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubs", True );
5862         break;
5863
5864      case 0xD8:
5865      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
5866         if (sz != 4)
5867            goto mmx_decode_failure;
5868         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "psubus", True );
5869         break;
5870
5871      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
5872         if (sz != 4)
5873            goto mmx_decode_failure;
5874         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmulhw", False );
5875         break;
5876
5877      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
5878         if (sz != 4)
5879            goto mmx_decode_failure;
5880         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmullw", False );
5881         break;
5882
5883      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
5884         vassert(sz == 4);
5885         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pmaddwd", False );
5886         break;
5887
5888      case 0x74:
5889      case 0x75:
5890      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
5891         if (sz != 4)
5892            goto mmx_decode_failure;
5893         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpeq", True );
5894         break;
5895
5896      case 0x64:
5897      case 0x65:
5898      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
5899         if (sz != 4)
5900            goto mmx_decode_failure;
5901         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pcmpgt", True );
5902         break;
5903
5904      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
5905         if (sz != 4)
5906            goto mmx_decode_failure;
5907         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packssdw", False );
5908         break;
5909
5910      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
5911         if (sz != 4)
5912            goto mmx_decode_failure;
5913         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packsswb", False );
5914         break;
5915
5916      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
5917         if (sz != 4)
5918            goto mmx_decode_failure;
5919         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "packuswb", False );
5920         break;
5921
5922      case 0x68:
5923      case 0x69:
5924      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
5925         if (sz != 4)
5926            goto mmx_decode_failure;
5927         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckh", True );
5928         break;
5929
5930      case 0x60:
5931      case 0x61:
5932      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
5933         if (sz != 4)
5934            goto mmx_decode_failure;
5935         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "punpckl", True );
5936         break;
5937
5938      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
5939         if (sz != 4)
5940            goto mmx_decode_failure;
5941         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pand", False );
5942         break;
5943
5944      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
5945         if (sz != 4)
5946            goto mmx_decode_failure;
5947         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pandn", False );
5948         break;
5949
5950      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
5951         if (sz != 4)
5952            goto mmx_decode_failure;
5953         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "por", False );
5954         break;
5955
5956      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
5957         if (sz != 4)
5958            goto mmx_decode_failure;
5959         delta = dis_MMXop_regmem_to_reg ( sorb, delta, opc, "pxor", False );
5960         break;
5961
5962#     define SHIFT_BY_REG(_name,_op)                                 \
5963                delta = dis_MMX_shiftG_byE(sorb, delta, _name, _op); \
5964                break;
5965
5966      /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
5967      case 0xF1: SHIFT_BY_REG("psllw", Iop_ShlN16x4);
5968      case 0xF2: SHIFT_BY_REG("pslld", Iop_ShlN32x2);
5969      case 0xF3: SHIFT_BY_REG("psllq", Iop_Shl64);
5970
5971      /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
5972      case 0xD1: SHIFT_BY_REG("psrlw", Iop_ShrN16x4);
5973      case 0xD2: SHIFT_BY_REG("psrld", Iop_ShrN32x2);
5974      case 0xD3: SHIFT_BY_REG("psrlq", Iop_Shr64);
5975
5976      /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
5977      case 0xE1: SHIFT_BY_REG("psraw", Iop_SarN16x4);
5978      case 0xE2: SHIFT_BY_REG("psrad", Iop_SarN32x2);
5979
5980#     undef SHIFT_BY_REG
5981
5982      case 0x71:
5983      case 0x72:
5984      case 0x73: {
5985         /* (sz==4): PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
5986         UChar byte2, subopc;
5987         if (sz != 4)
5988            goto mmx_decode_failure;
5989         byte2  = getIByte(delta);           /* amode / sub-opcode */
5990         subopc = toUChar( (byte2 >> 3) & 7 );
5991
5992#        define SHIFT_BY_IMM(_name,_op)                         \
5993             do { delta = dis_MMX_shiftE_imm(delta,_name,_op);  \
5994             } while (0)
5995
5996              if (subopc == 2 /*SRL*/ && opc == 0x71)
5997                 SHIFT_BY_IMM("psrlw", Iop_ShrN16x4);
5998         else if (subopc == 2 /*SRL*/ && opc == 0x72)
5999                 SHIFT_BY_IMM("psrld", Iop_ShrN32x2);
6000         else if (subopc == 2 /*SRL*/ && opc == 0x73)
6001                 SHIFT_BY_IMM("psrlq", Iop_Shr64);
6002
6003         else if (subopc == 4 /*SAR*/ && opc == 0x71)
6004                 SHIFT_BY_IMM("psraw", Iop_SarN16x4);
6005         else if (subopc == 4 /*SAR*/ && opc == 0x72)
6006                 SHIFT_BY_IMM("psrad", Iop_SarN32x2);
6007
6008         else if (subopc == 6 /*SHL*/ && opc == 0x71)
6009                 SHIFT_BY_IMM("psllw", Iop_ShlN16x4);
6010         else if (subopc == 6 /*SHL*/ && opc == 0x72)
6011                 SHIFT_BY_IMM("pslld", Iop_ShlN32x2);
6012         else if (subopc == 6 /*SHL*/ && opc == 0x73)
6013                 SHIFT_BY_IMM("psllq", Iop_Shl64);
6014
6015         else goto mmx_decode_failure;
6016
6017#        undef SHIFT_BY_IMM
6018         break;
6019      }
6020
6021      case 0xF7: {
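         /* MASKMOVQ: store the bytes of the G register to [EDI] for
            which the corresponding byte of the E register has its top
            bit set; the other bytes in memory are left unchanged.  The
            per-byte mask is built by arithmetically shifting E right
            by 7 in each lane. */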
6022         IRTemp addr    = newTemp(Ity_I32);
6023         IRTemp regD    = newTemp(Ity_I64);
6024         IRTemp regM    = newTemp(Ity_I64);
6025         IRTemp mask    = newTemp(Ity_I64);
6026         IRTemp olddata = newTemp(Ity_I64);
6027         IRTemp newdata = newTemp(Ity_I64);
6028
6029         modrm = getIByte(delta);
6030         if (sz != 4 || (!epartIsReg(modrm)))
6031            goto mmx_decode_failure;
6032         delta++;
6033
6034         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
6035         assign( regM, getMMXReg( eregOfRM(modrm) ));
6036         assign( regD, getMMXReg( gregOfRM(modrm) ));
6037         assign( mask, binop(Iop_SarN8x8, mkexpr(regM), mkU8(7)) );
6038         assign( olddata, loadLE( Ity_I64, mkexpr(addr) ));
6039         assign( newdata,
6040                 binop(Iop_Or64,
6041                       binop(Iop_And64,
6042                             mkexpr(regD),
6043                             mkexpr(mask) ),
6044                       binop(Iop_And64,
6045                             mkexpr(olddata),
6046                             unop(Iop_Not64, mkexpr(mask)))) );
6047         storeLE( mkexpr(addr), mkexpr(newdata) );
6048         DIP("maskmovq %s,%s\n", nameMMXReg( eregOfRM(modrm) ),
6049                                 nameMMXReg( gregOfRM(modrm) ) );
6050         break;
6051      }
6052
6053      /* --- MMX decode failure --- */
6054      default:
6055      mmx_decode_failure:
6056         *decode_ok = False;
6057         return delta; /* ignored */
6058
6059   }
6060
6061   *decode_ok = True;
6062   return delta;
6063}
6064
6065
6066/*------------------------------------------------------------*/
6067/*--- More misc arithmetic and other obscure insns.        ---*/
6068/*------------------------------------------------------------*/
6069
6070/* Double length left and right shifts.  Apparently only required in
6071   v-size (no b- variant). */
6072static
6073UInt dis_SHLRD_Gv_Ev ( UChar sorb,
6074                       Int delta, UChar modrm,
6075                       Int sz,
6076                       IRExpr* shift_amt,
6077                       Bool amt_is_literal,
6078                       HChar* shift_amt_txt,
6079                       Bool left_shift )
6080{
   /* shift_amt :: Ity_I8 is the amount to shift.  shift_amt_txt is used
      for printing it.  On entry, eip (i.e. delta) points at the modrm
      byte. */
6083   Int len;
6084   HChar dis_buf[50];
6085
6086   IRType ty       = szToITy(sz);
6087   IRTemp gsrc     = newTemp(ty);
6088   IRTemp esrc     = newTemp(ty);
6089   IRTemp addr     = IRTemp_INVALID;
6090   IRTemp tmpSH    = newTemp(Ity_I8);
6091   IRTemp tmpL     = IRTemp_INVALID;
6092   IRTemp tmpRes   = IRTemp_INVALID;
6093   IRTemp tmpSubSh = IRTemp_INVALID;
6094   IROp   mkpair;
6095   IROp   getres;
6096   IROp   shift;
6097   IRExpr* mask = NULL;
6098
6099   vassert(sz == 2 || sz == 4);
6100
6101   /* The E-part is the destination; this is shifted.  The G-part
6102      supplies bits to be shifted into the E-part, but is not
6103      changed.
6104
6105      If shifting left, form a double-length word with E at the top
6106      and G at the bottom, and shift this left.  The result is then in
6107      the high part.
6108
6109      If shifting right, form a double-length word with G at the top
6110      and E at the bottom, and shift this right.  The result is then
6111      at the bottom.  */
6112
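   /* Illustrative example (assumed values, not from the original
      commentary): for SHLD with sz == 4, E = 0xAABBCCDD,
      G = 0x11223344 and a shift amount of 8, the 64-bit pair is
      0xAABBCCDD11223344; shifting it left by 8 gives
      0xBBCCDD1122334400, and the top 32 bits, 0xBBCCDD11, are written
      back to E.  For SHRD the pair is formed the other way round and
      the low half of the shifted value is kept. */
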
6113   /* Fetch the operands. */
6114
6115   assign( gsrc, getIReg(sz, gregOfRM(modrm)) );
6116
6117   if (epartIsReg(modrm)) {
6118      delta++;
6119      assign( esrc, getIReg(sz, eregOfRM(modrm)) );
6120      DIP("sh%cd%c %s, %s, %s\n",
6121          ( left_shift ? 'l' : 'r' ), nameISize(sz),
6122          shift_amt_txt,
6123          nameIReg(sz, gregOfRM(modrm)), nameIReg(sz, eregOfRM(modrm)));
6124   } else {
6125      addr = disAMode ( &len, sorb, delta, dis_buf );
6126      delta += len;
6127      assign( esrc, loadLE(ty, mkexpr(addr)) );
6128      DIP("sh%cd%c %s, %s, %s\n",
6129          ( left_shift ? 'l' : 'r' ), nameISize(sz),
6130          shift_amt_txt,
6131          nameIReg(sz, gregOfRM(modrm)), dis_buf);
6132   }
6133
6134   /* Round up the relevant primops. */
6135
6136   if (sz == 4) {
6137      tmpL     = newTemp(Ity_I64);
6138      tmpRes   = newTemp(Ity_I32);
6139      tmpSubSh = newTemp(Ity_I32);
6140      mkpair   = Iop_32HLto64;
6141      getres   = left_shift ? Iop_64HIto32 : Iop_64to32;
6142      shift    = left_shift ? Iop_Shl64 : Iop_Shr64;
6143      mask     = mkU8(31);
6144   } else {
6145      /* sz == 2 */
6146      tmpL     = newTemp(Ity_I32);
6147      tmpRes   = newTemp(Ity_I16);
6148      tmpSubSh = newTemp(Ity_I16);
6149      mkpair   = Iop_16HLto32;
6150      getres   = left_shift ? Iop_32HIto16 : Iop_32to16;
6151      shift    = left_shift ? Iop_Shl32 : Iop_Shr32;
6152      mask     = mkU8(15);
6153   }
6154
6155   /* Do the shift, calculate the subshift value, and set
6156      the flag thunk. */
6157
6158   assign( tmpSH, binop(Iop_And8, shift_amt, mask) );
6159
6160   if (left_shift)
6161      assign( tmpL, binop(mkpair, mkexpr(esrc), mkexpr(gsrc)) );
6162   else
6163      assign( tmpL, binop(mkpair, mkexpr(gsrc), mkexpr(esrc)) );
6164
6165   assign( tmpRes, unop(getres, binop(shift, mkexpr(tmpL), mkexpr(tmpSH)) ) );
6166   assign( tmpSubSh,
6167           unop(getres,
6168                binop(shift,
6169                      mkexpr(tmpL),
6170                      binop(Iop_And8,
6171                            binop(Iop_Sub8, mkexpr(tmpSH), mkU8(1) ),
6172                            mask))) );
6173
6174   setFlags_DEP1_DEP2_shift ( left_shift ? Iop_Shl32 : Iop_Sar32,
6175                              tmpRes, tmpSubSh, ty, tmpSH );
6176
6177   /* Put result back. */
6178
6179   if (epartIsReg(modrm)) {
6180      putIReg(sz, eregOfRM(modrm), mkexpr(tmpRes));
6181   } else {
6182      storeLE( mkexpr(addr), mkexpr(tmpRes) );
6183   }
6184
6185   if (amt_is_literal) delta++;
6186   return delta;
6187}
6188
6189
6190/* Handle BT/BTS/BTR/BTC Gv, Ev.  Apparently b-size is not
6191   required. */
6192
6193typedef enum { BtOpNone, BtOpSet, BtOpReset, BtOpComp } BtOp;
6194
6195static HChar* nameBtOp ( BtOp op )
6196{
6197   switch (op) {
6198      case BtOpNone:  return "";
6199      case BtOpSet:   return "s";
6200      case BtOpReset: return "r";
6201      case BtOpComp:  return "c";
6202      default: vpanic("nameBtOp(x86)");
6203   }
6204}
6205
6206
6207static
6208UInt dis_bt_G_E ( VexAbiInfo* vbi,
6209                  UChar sorb, Bool locked, Int sz, Int delta, BtOp op )
6210{
6211   HChar  dis_buf[50];
6212   UChar  modrm;
6213   Int    len;
6214   IRTemp t_fetched, t_bitno0, t_bitno1, t_bitno2, t_addr0,
6215          t_addr1, t_esp, t_mask, t_new;
6216
6217   vassert(sz == 2 || sz == 4);
6218
6219   t_fetched = t_bitno0 = t_bitno1 = t_bitno2
6220             = t_addr0 = t_addr1 = t_esp
6221             = t_mask = t_new = IRTemp_INVALID;
6222
6223   t_fetched = newTemp(Ity_I8);
6224   t_new     = newTemp(Ity_I8);
6225   t_bitno0  = newTemp(Ity_I32);
6226   t_bitno1  = newTemp(Ity_I32);
6227   t_bitno2  = newTemp(Ity_I8);
6228   t_addr1   = newTemp(Ity_I32);
6229   modrm     = getIByte(delta);
6230
6231   assign( t_bitno0, widenSto32(getIReg(sz, gregOfRM(modrm))) );
6232
6233   if (epartIsReg(modrm)) {
6234      delta++;
6235      /* Get it onto the client's stack. */
6236      t_esp = newTemp(Ity_I32);
6237      t_addr0 = newTemp(Ity_I32);
6238
6239      /* For the choice of the value 128, see comment in dis_bt_G_E in
6240         guest_amd64_toIR.c.  We point out here only that 128 is
6241         fast-cased in Memcheck and is > 0, so seems like a good
6242         choice. */
6243      vassert(vbi->guest_stack_redzone_size == 0);
6244      assign( t_esp, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(128)) );
6245      putIReg(4, R_ESP, mkexpr(t_esp));
6246
6247      storeLE( mkexpr(t_esp), getIReg(sz, eregOfRM(modrm)) );
6248
6249      /* Make t_addr0 point at it. */
6250      assign( t_addr0, mkexpr(t_esp) );
6251
6252      /* Mask out upper bits of the shift amount, since we're doing a
6253         reg. */
6254      assign( t_bitno1, binop(Iop_And32,
6255                              mkexpr(t_bitno0),
6256                              mkU32(sz == 4 ? 31 : 15)) );
6257
6258   } else {
6259      t_addr0 = disAMode ( &len, sorb, delta, dis_buf );
6260      delta += len;
6261      assign( t_bitno1, mkexpr(t_bitno0) );
6262   }
6263
6264   /* At this point: t_addr0 is the address being operated on.  If it
6265      was a reg, we will have pushed it onto the client's stack.
6266      t_bitno1 is the bit number, suitably masked in the case of a
6267      reg.  */
6268
6269   /* Now the main sequence. */
6270   assign( t_addr1,
6271           binop(Iop_Add32,
6272                 mkexpr(t_addr0),
6273                 binop(Iop_Sar32, mkexpr(t_bitno1), mkU8(3))) );
6274
6275   /* t_addr1 now holds effective address */
6276
6277   assign( t_bitno2,
6278           unop(Iop_32to8,
6279                binop(Iop_And32, mkexpr(t_bitno1), mkU32(7))) );
6280
6281   /* t_bitno2 contains offset of bit within byte */
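   /* Worked example (illustrative only): for "btl %eax, (%ebx)" with
      %eax == 19, t_bitno1 is 19, so t_addr1 = t_addr0 + (19 >> 3)
      = t_addr0 + 2 and t_bitno2 = 19 & 7 = 3, i.e. bit 3 of the byte
      at offset 2 is selected. */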
6282
6283   if (op != BtOpNone) {
6284      t_mask = newTemp(Ity_I8);
6285      assign( t_mask, binop(Iop_Shl8, mkU8(1), mkexpr(t_bitno2)) );
6286   }
6287
6288   /* If an op was requested, t_mask is now a suitable byte mask */
6289
6290   assign( t_fetched, loadLE(Ity_I8, mkexpr(t_addr1)) );
6291
6292   if (op != BtOpNone) {
6293      switch (op) {
6294         case BtOpSet:
6295            assign( t_new,
6296                    binop(Iop_Or8, mkexpr(t_fetched), mkexpr(t_mask)) );
6297            break;
6298         case BtOpComp:
6299            assign( t_new,
6300                    binop(Iop_Xor8, mkexpr(t_fetched), mkexpr(t_mask)) );
6301            break;
6302         case BtOpReset:
6303            assign( t_new,
6304                    binop(Iop_And8, mkexpr(t_fetched),
6305                                    unop(Iop_Not8, mkexpr(t_mask))) );
6306            break;
6307         default:
6308            vpanic("dis_bt_G_E(x86)");
6309      }
6310      if (locked && !epartIsReg(modrm)) {
6311         casLE( mkexpr(t_addr1), mkexpr(t_fetched)/*expd*/,
6312                                 mkexpr(t_new)/*new*/,
6313                                 guest_EIP_curr_instr );
6314      } else {
6315         storeLE( mkexpr(t_addr1), mkexpr(t_new) );
6316      }
6317   }
6318
6319   /* Side effect done; now get selected bit into Carry flag */
6320   /* Flags: C=selected bit, O,S,Z,A,P undefined, so are set to zero. */
6321   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6322   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6323   stmt( IRStmt_Put(
6324            OFFB_CC_DEP1,
6325            binop(Iop_And32,
6326                  binop(Iop_Shr32,
6327                        unop(Iop_8Uto32, mkexpr(t_fetched)),
6328                        mkexpr(t_bitno2)),
6329                  mkU32(1)))
6330       );
6331   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6332      elimination of previous stores to this field work better. */
6333   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6334
6335   /* Move reg operand from stack back to reg */
6336   if (epartIsReg(modrm)) {
6337      /* t_esp still points at it. */
6338      putIReg(sz, eregOfRM(modrm), loadLE(szToITy(sz), mkexpr(t_esp)) );
6339      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t_esp), mkU32(128)) );
6340   }
6341
6342   DIP("bt%s%c %s, %s\n",
6343       nameBtOp(op), nameISize(sz), nameIReg(sz, gregOfRM(modrm)),
6344       ( epartIsReg(modrm) ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ) );
6345
6346   return delta;
6347}
6348
6349
6350
6351/* Handle BSF/BSR.  Only v-size seems necessary. */
6352static
6353UInt dis_bs_E_G ( UChar sorb, Int sz, Int delta, Bool fwds )
6354{
6355   Bool   isReg;
6356   UChar  modrm;
6357   HChar  dis_buf[50];
6358
6359   IRType ty  = szToITy(sz);
6360   IRTemp src = newTemp(ty);
6361   IRTemp dst = newTemp(ty);
6362
6363   IRTemp src32 = newTemp(Ity_I32);
6364   IRTemp dst32 = newTemp(Ity_I32);
6365   IRTemp src8  = newTemp(Ity_I8);
6366
6367   vassert(sz == 4 || sz == 2);
6368
6369   modrm = getIByte(delta);
6370
6371   isReg = epartIsReg(modrm);
6372   if (isReg) {
6373      delta++;
6374      assign( src, getIReg(sz, eregOfRM(modrm)) );
6375   } else {
6376      Int    len;
6377      IRTemp addr = disAMode( &len, sorb, delta, dis_buf );
6378      delta += len;
6379      assign( src, loadLE(ty, mkexpr(addr)) );
6380   }
6381
6382   DIP("bs%c%c %s, %s\n",
6383       fwds ? 'f' : 'r', nameISize(sz),
6384       ( isReg ? nameIReg(sz, eregOfRM(modrm)) : dis_buf ),
6385       nameIReg(sz, gregOfRM(modrm)));
6386
6387   /* Generate an 8-bit expression which is zero iff the
6388      original is zero, and nonzero otherwise */
6389   assign( src8,
6390           unop(Iop_1Uto8, binop(mkSizedOp(ty,Iop_CmpNE8),
6391                           mkexpr(src), mkU(ty,0))) );
6392
6393   /* Flags: Z is 1 iff source value is zero.  All others
6394      are undefined -- we force them to zero. */
6395   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6396   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6397   stmt( IRStmt_Put(
6398            OFFB_CC_DEP1,
6399            IRExpr_Mux0X( mkexpr(src8),
6400                          /* src==0 */
6401                          mkU32(X86G_CC_MASK_Z),
6402                          /* src!=0 */
6403                          mkU32(0)
6404                        )
6405       ));
6406   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6407      elimination of previous stores to this field work better. */
6408   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6409
6410   /* Result: iff source value is zero, we can't use
6411      Iop_Clz32/Iop_Ctz32 as they have no defined result in that case.
6412      But anyway, Intel x86 semantics say the result is undefined in
6413      such situations.  Hence handle the zero case specially. */
6414
6415   /* Bleh.  What we compute:
6416
6417          bsf32:  if src == 0 then 0 else  Ctz32(src)
6418          bsr32:  if src == 0 then 0 else  31 - Clz32(src)
6419
6420          bsf16:  if src == 0 then 0 else  Ctz32(16Uto32(src))
6421          bsr16:  if src == 0 then 0 else  31 - Clz32(16Uto32(src))
6422
6423      First, widen src to 32 bits if it is not already.
6424
6425      Postscript 15 Oct 04: it seems that at least VIA Nehemiah leaves the
6426      dst register unchanged when src == 0.  Hence change accordingly.
6427   */
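   /* For instance (illustrative only): src == 0x00008000 gives
      Ctz32 == 15 and Clz32 == 16, so both bsf and bsr yield 15, the
      index of the only set bit. */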
6428   if (sz == 2)
6429      assign( src32, unop(Iop_16Uto32, mkexpr(src)) );
6430   else
6431      assign( src32, mkexpr(src) );
6432
6433   /* The main computation, guarding against zero. */
6434   assign( dst32,
6435           IRExpr_Mux0X(
6436              mkexpr(src8),
6437              /* src == 0 -- leave dst unchanged */
6438              widenUto32( getIReg( sz, gregOfRM(modrm) ) ),
6439              /* src != 0 */
6440              fwds ? unop(Iop_Ctz32, mkexpr(src32))
6441                   : binop(Iop_Sub32,
6442                           mkU32(31),
6443                           unop(Iop_Clz32, mkexpr(src32)))
6444           )
6445         );
6446
6447   if (sz == 2)
6448      assign( dst, unop(Iop_32to16, mkexpr(dst32)) );
6449   else
6450      assign( dst, mkexpr(dst32) );
6451
6452   /* dump result back */
6453   putIReg( sz, gregOfRM(modrm), mkexpr(dst) );
6454
6455   return delta;
6456}
6457
6458
6459static
6460void codegen_xchg_eAX_Reg ( Int sz, Int reg )
6461{
6462   IRType ty = szToITy(sz);
6463   IRTemp t1 = newTemp(ty);
6464   IRTemp t2 = newTemp(ty);
6465   vassert(sz == 2 || sz == 4);
6466   assign( t1, getIReg(sz, R_EAX) );
6467   assign( t2, getIReg(sz, reg) );
6468   putIReg( sz, R_EAX, mkexpr(t2) );
6469   putIReg( sz, reg, mkexpr(t1) );
6470   DIP("xchg%c %s, %s\n",
6471       nameISize(sz), nameIReg(sz, R_EAX), nameIReg(sz, reg));
6472}
6473
6474
6475static
6476void codegen_SAHF ( void )
6477{
6478   /* Set the flags to:
6479      (x86g_calculate_flags_all() & X86G_CC_MASK_O)  -- retain the old O flag
6480      | (%AH & (X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6481                |X86G_CC_MASK_P|X86G_CC_MASK_C))
6482   */
6483   UInt   mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6484                       |X86G_CC_MASK_C|X86G_CC_MASK_P;
6485   IRTemp oldflags   = newTemp(Ity_I32);
6486   assign( oldflags, mk_x86g_calculate_eflags_all() );
6487   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
6488   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6489   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
6490   stmt( IRStmt_Put( OFFB_CC_DEP1,
6491         binop(Iop_Or32,
6492               binop(Iop_And32, mkexpr(oldflags), mkU32(X86G_CC_MASK_O)),
6493               binop(Iop_And32,
6494                     binop(Iop_Shr32, getIReg(4, R_EAX), mkU8(8)),
6495                     mkU32(mask_SZACP))
6496              )
6497   ));
6498   /* Set NDEP even though it isn't used.  This makes redundant-PUT
6499      elimination of previous stores to this field work better. */
6500   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
6501}
6502
6503
6504static
6505void codegen_LAHF ( void  )
6506{
6507   /* AH <- EFLAGS(SF:ZF:0:AF:0:PF:1:CF) */
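   /* So, for example (illustrative only), if just the zero and carry
      flags are set, AH becomes 0x43: ZF in bit 6, the always-one bit
      in bit 1 and CF in bit 0, with SF, AF and PF clear. */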
6508   IRExpr* eax_with_hole;
6509   IRExpr* new_byte;
6510   IRExpr* new_eax;
6511   UInt    mask_SZACP = X86G_CC_MASK_S|X86G_CC_MASK_Z|X86G_CC_MASK_A
6512                        |X86G_CC_MASK_C|X86G_CC_MASK_P;
6513
6514   IRTemp  flags = newTemp(Ity_I32);
6515   assign( flags, mk_x86g_calculate_eflags_all() );
6516
6517   eax_with_hole
6518      = binop(Iop_And32, getIReg(4, R_EAX), mkU32(0xFFFF00FF));
6519   new_byte
6520      = binop(Iop_Or32, binop(Iop_And32, mkexpr(flags), mkU32(mask_SZACP)),
6521                        mkU32(1<<1));
6522   new_eax
6523      = binop(Iop_Or32, eax_with_hole,
6524                        binop(Iop_Shl32, new_byte, mkU8(8)));
6525   putIReg(4, R_EAX, new_eax);
6526}
6527
6528
6529static
6530UInt dis_cmpxchg_G_E ( UChar       sorb,
6531                       Bool        locked,
6532                       Int         size,
6533                       Int         delta0 )
6534{
6535   HChar dis_buf[50];
6536   Int   len;
6537
6538   IRType ty    = szToITy(size);
6539   IRTemp acc   = newTemp(ty);
6540   IRTemp src   = newTemp(ty);
6541   IRTemp dest  = newTemp(ty);
6542   IRTemp dest2 = newTemp(ty);
6543   IRTemp acc2  = newTemp(ty);
6544   IRTemp cond8 = newTemp(Ity_I8);
6545   IRTemp addr  = IRTemp_INVALID;
6546   UChar  rm    = getUChar(delta0);
6547
6548   /* There are 3 cases to consider:
6549
6550      reg-reg: ignore any lock prefix, generate sequence based
6551               on Mux0X
6552
6553      reg-mem, not locked: ignore any lock prefix, generate sequence
6554                           based on Mux0X
6555
6556      reg-mem, locked: use IRCAS
6557   */
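   /* In all three cases the underlying operation is the same; as
      illustrative pseudo-C only (all arithmetic flags, not just ZF,
      come from the comparison):

         flags = flags_of( EAX - dest );
         if (EAX == dest) dest = src; else EAX = dest;
   */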
6558   if (epartIsReg(rm)) {
6559      /* case 1 */
6560      assign( dest, getIReg(size, eregOfRM(rm)) );
6561      delta0++;
6562      assign( src, getIReg(size, gregOfRM(rm)) );
6563      assign( acc, getIReg(size, R_EAX) );
6564      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6565      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6566      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
6567      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6568      putIReg(size, R_EAX, mkexpr(acc2));
6569      putIReg(size, eregOfRM(rm), mkexpr(dest2));
6570      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6571                               nameIReg(size,gregOfRM(rm)),
6572                               nameIReg(size,eregOfRM(rm)) );
6573   }
6574   else if (!epartIsReg(rm) && !locked) {
6575      /* case 2 */
6576      addr = disAMode ( &len, sorb, delta0, dis_buf );
6577      assign( dest, loadLE(ty, mkexpr(addr)) );
6578      delta0 += len;
6579      assign( src, getIReg(size, gregOfRM(rm)) );
6580      assign( acc, getIReg(size, R_EAX) );
6581      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6582      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6583      assign( dest2, IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(src)) );
6584      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6585      putIReg(size, R_EAX, mkexpr(acc2));
6586      storeLE( mkexpr(addr), mkexpr(dest2) );
6587      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6588                               nameIReg(size,gregOfRM(rm)), dis_buf);
6589   }
6590   else if (!epartIsReg(rm) && locked) {
6591      /* case 3 */
6592      /* src is new value.  acc is expected value.  dest is old value.
6593         Compute success from the output of the IRCAS, and steer the
6594         new value for EAX accordingly: in case of success, EAX is
6595         unchanged. */
6596      addr = disAMode ( &len, sorb, delta0, dis_buf );
6597      delta0 += len;
6598      assign( src, getIReg(size, gregOfRM(rm)) );
6599      assign( acc, getIReg(size, R_EAX) );
6600      stmt( IRStmt_CAS(
6601         mkIRCAS( IRTemp_INVALID, dest, Iend_LE, mkexpr(addr),
6602                  NULL, mkexpr(acc), NULL, mkexpr(src) )
6603      ));
6604      setFlags_DEP1_DEP2(Iop_Sub8, acc, dest, ty);
6605      assign( cond8, unop(Iop_1Uto8, mk_x86g_calculate_condition(X86CondZ)) );
6606      assign( acc2,  IRExpr_Mux0X(mkexpr(cond8), mkexpr(dest), mkexpr(acc)) );
6607      putIReg(size, R_EAX, mkexpr(acc2));
6608      DIP("cmpxchg%c %s,%s\n", nameISize(size),
6609                               nameIReg(size,gregOfRM(rm)), dis_buf);
6610   }
6611   else vassert(0);
6612
6613   return delta0;
6614}
6615
6616
6617/* Handle conditional move instructions of the form
6618      cmovcc E(reg-or-mem), G(reg)
6619
6620   E(src) is reg-or-mem
6621   G(dst) is reg.
6622
6623   If E is reg, -->    GET %E, tmps
6624                       GET %G, tmpd
6625                       CMOVcc tmps, tmpd
6626                       PUT tmpd, %G
6627
6628   If E is mem  -->    (getAddr E) -> tmpa
6629                       LD (tmpa), tmps
6630                       GET %G, tmpd
6631                       CMOVcc tmps, tmpd
6632                       PUT tmpd, %G
6633*/
6634static
6635UInt dis_cmov_E_G ( UChar       sorb,
6636                    Int         sz,
6637                    X86Condcode cond,
6638                    Int         delta0 )
6639{
6640   UChar rm  = getIByte(delta0);
6641   HChar dis_buf[50];
6642   Int   len;
6643
6644   IRType ty   = szToITy(sz);
6645   IRTemp tmps = newTemp(ty);
6646   IRTemp tmpd = newTemp(ty);
6647
6648   if (epartIsReg(rm)) {
6649      assign( tmps, getIReg(sz, eregOfRM(rm)) );
6650      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6651
6652      putIReg(sz, gregOfRM(rm),
6653                  IRExpr_Mux0X( unop(Iop_1Uto8,
6654                                     mk_x86g_calculate_condition(cond)),
6655                                mkexpr(tmpd),
6656                                mkexpr(tmps) )
6657             );
6658      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6659                              name_X86Condcode(cond),
6660                              nameIReg(sz,eregOfRM(rm)),
6661                              nameIReg(sz,gregOfRM(rm)));
6662      return 1+delta0;
6663   }
6664
6665   /* E refers to memory */
6666   {
6667      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6668      assign( tmps, loadLE(ty, mkexpr(addr)) );
6669      assign( tmpd, getIReg(sz, gregOfRM(rm)) );
6670
6671      putIReg(sz, gregOfRM(rm),
6672                  IRExpr_Mux0X( unop(Iop_1Uto8,
6673                                     mk_x86g_calculate_condition(cond)),
6674                                mkexpr(tmpd),
6675                                mkexpr(tmps) )
6676             );
6677
6678      DIP("cmov%c%s %s,%s\n", nameISize(sz),
6679                              name_X86Condcode(cond),
6680                              dis_buf,
6681                              nameIReg(sz,gregOfRM(rm)));
6682      return len+delta0;
6683   }
6684}
6685
6686
6687static
6688UInt dis_xadd_G_E ( UChar sorb, Bool locked, Int sz, Int delta0,
6689                    Bool* decodeOK )
6690{
6691   Int   len;
6692   UChar rm = getIByte(delta0);
6693   HChar dis_buf[50];
6694
6695   IRType ty    = szToITy(sz);
6696   IRTemp tmpd  = newTemp(ty);
6697   IRTemp tmpt0 = newTemp(ty);
6698   IRTemp tmpt1 = newTemp(ty);
6699
6700   /* There are 3 cases to consider:
6701
6702      reg-reg: ignore any lock prefix,
6703               generate 'naive' (non-atomic) sequence
6704
6705      reg-mem, not locked: ignore any lock prefix, generate 'naive'
6706                           (non-atomic) sequence
6707
6708      reg-mem, locked: use IRCAS
6709   */
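   /* The operation itself, as illustrative pseudo-C (src is the G
      register, dest the E operand):

         tmp = dest + src;  src = dest;  dest = tmp;

      with the flags set as for an ordinary ADD of the two original
      values. */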
6710
6711   if (epartIsReg(rm)) {
6712      /* case 1 */
6713      assign( tmpd,  getIReg(sz, eregOfRM(rm)));
6714      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6715      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6716                           mkexpr(tmpd), mkexpr(tmpt0)) );
6717      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6718      putIReg(sz, eregOfRM(rm), mkexpr(tmpt1));
6719      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6720      DIP("xadd%c %s, %s\n",
6721          nameISize(sz), nameIReg(sz,gregOfRM(rm)),
6722          nameIReg(sz,eregOfRM(rm)));
6723      *decodeOK = True;
6724      return 1+delta0;
6725   }
6726   else if (!epartIsReg(rm) && !locked) {
6727      /* case 2 */
6728      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6729      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
6730      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6731      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6732                           mkexpr(tmpd), mkexpr(tmpt0)) );
6733      storeLE( mkexpr(addr), mkexpr(tmpt1) );
6734      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6735      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6736      DIP("xadd%c %s, %s\n",
6737          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
6738      *decodeOK = True;
6739      return len+delta0;
6740   }
6741   else if (!epartIsReg(rm) && locked) {
6742      /* case 3 */
6743      IRTemp addr = disAMode ( &len, sorb, delta0, dis_buf );
6744      assign( tmpd,  loadLE(ty, mkexpr(addr)) );
6745      assign( tmpt0, getIReg(sz, gregOfRM(rm)) );
6746      assign( tmpt1, binop(mkSizedOp(ty,Iop_Add8),
6747                           mkexpr(tmpd), mkexpr(tmpt0)) );
6748      casLE( mkexpr(addr), mkexpr(tmpd)/*expVal*/,
6749                           mkexpr(tmpt1)/*newVal*/, guest_EIP_curr_instr );
6750      setFlags_DEP1_DEP2( Iop_Add8, tmpd, tmpt0, ty );
6751      putIReg(sz, gregOfRM(rm), mkexpr(tmpd));
6752      DIP("xadd%c %s, %s\n",
6753          nameISize(sz), nameIReg(sz,gregOfRM(rm)), dis_buf);
6754      *decodeOK = True;
6755      return len+delta0;
6756   }
6757   /*UNREACHED*/
6758   vassert(0);
6759}
6760
6761/* Move 16 bits from Ew (ireg or mem) to G (a segment register). */
6762
6763static
6764UInt dis_mov_Ew_Sw ( UChar sorb, Int delta0 )
6765{
6766   Int    len;
6767   IRTemp addr;
6768   UChar  rm  = getIByte(delta0);
6769   HChar  dis_buf[50];
6770
6771   if (epartIsReg(rm)) {
6772      putSReg( gregOfRM(rm), getIReg(2, eregOfRM(rm)) );
6773      DIP("movw %s,%s\n", nameIReg(2,eregOfRM(rm)), nameSReg(gregOfRM(rm)));
6774      return 1+delta0;
6775   } else {
6776      addr = disAMode ( &len, sorb, delta0, dis_buf );
6777      putSReg( gregOfRM(rm), loadLE(Ity_I16, mkexpr(addr)) );
6778      DIP("movw %s,%s\n", dis_buf, nameSReg(gregOfRM(rm)));
6779      return len+delta0;
6780   }
6781}
6782
6783/* Move 16 bits from G (a segment register) to Ew (ireg or mem).  If
6784   dst is ireg and sz==4, zero out top half of it.  */
6785
6786static
6787UInt dis_mov_Sw_Ew ( UChar sorb,
6788                     Int   sz,
6789                     Int   delta0 )
6790{
6791   Int    len;
6792   IRTemp addr;
6793   UChar  rm  = getIByte(delta0);
6794   HChar  dis_buf[50];
6795
6796   vassert(sz == 2 || sz == 4);
6797
6798   if (epartIsReg(rm)) {
6799      if (sz == 4)
6800         putIReg(4, eregOfRM(rm), unop(Iop_16Uto32, getSReg(gregOfRM(rm))));
6801      else
6802         putIReg(2, eregOfRM(rm), getSReg(gregOfRM(rm)));
6803
6804      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), nameIReg(sz,eregOfRM(rm)));
6805      return 1+delta0;
6806   } else {
6807      addr = disAMode ( &len, sorb, delta0, dis_buf );
6808      storeLE( mkexpr(addr), getSReg(gregOfRM(rm)) );
6809      DIP("mov %s,%s\n", nameSReg(gregOfRM(rm)), dis_buf);
6810      return len+delta0;
6811   }
6812}
6813
6814
6815static
6816void dis_push_segreg ( UInt sreg, Int sz )
6817{
6818    IRTemp t1 = newTemp(Ity_I16);
6819    IRTemp ta = newTemp(Ity_I32);
6820    vassert(sz == 2 || sz == 4);
6821
6822    assign( t1, getSReg(sreg) );
6823    assign( ta, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)) );
6824    putIReg(4, R_ESP, mkexpr(ta));
6825    storeLE( mkexpr(ta), mkexpr(t1) );
6826
6827    DIP("push%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6828}
6829
6830static
6831void dis_pop_segreg ( UInt sreg, Int sz )
6832{
6833    IRTemp t1 = newTemp(Ity_I16);
6834    IRTemp ta = newTemp(Ity_I32);
6835    vassert(sz == 2 || sz == 4);
6836
6837    assign( ta, getIReg(4, R_ESP) );
6838    assign( t1, loadLE(Ity_I16, mkexpr(ta)) );
6839
6840    putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(ta), mkU32(sz)) );
6841    putSReg( sreg, mkexpr(t1) );
6842    DIP("pop%c %s\n", sz==2 ? 'w' : 'l', nameSReg(sreg));
6843}
6844
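/* Handle 'ret' and 'ret $d32': pop the return address, add a further
   d32 bytes to %esp (zero for a plain 'ret') and jump to the popped
   address.  (Descriptive note only.) */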
6845static
6846void dis_ret ( /*MOD*/DisResult* dres, UInt d32 )
6847{
6848   IRTemp t1 = newTemp(Ity_I32);
6849   IRTemp t2 = newTemp(Ity_I32);
6850   assign(t1, getIReg(4,R_ESP));
6851   assign(t2, loadLE(Ity_I32,mkexpr(t1)));
6852   putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(4+d32)));
6853   jmp_treg(dres, Ijk_Ret, t2);
6854   vassert(dres->whatNext == Dis_StopHere);
6855}
6856
6857/*------------------------------------------------------------*/
6858/*--- SSE/SSE2/SSE3 helpers                                ---*/
6859/*------------------------------------------------------------*/
6860
6861/* Worker function; do not call directly.
6862   Handles full width G = G `op` E   and   G = (not G) `op` E.
6863*/
6864
6865static UInt dis_SSE_E_to_G_all_wrk (
6866               UChar sorb, Int delta,
6867               HChar* opname, IROp op,
6868               Bool   invertG
6869            )
6870{
6871   HChar   dis_buf[50];
6872   Int     alen;
6873   IRTemp  addr;
6874   UChar   rm = getIByte(delta);
6875   IRExpr* gpart
6876      = invertG ? unop(Iop_NotV128, getXMMReg(gregOfRM(rm)))
6877                : getXMMReg(gregOfRM(rm));
6878   if (epartIsReg(rm)) {
6879      putXMMReg( gregOfRM(rm),
6880                 binop(op, gpart,
6881                           getXMMReg(eregOfRM(rm))) );
6882      DIP("%s %s,%s\n", opname,
6883                        nameXMMReg(eregOfRM(rm)),
6884                        nameXMMReg(gregOfRM(rm)) );
6885      return delta+1;
6886   } else {
6887      addr = disAMode ( &alen, sorb, delta, dis_buf );
6888      putXMMReg( gregOfRM(rm),
6889                 binop(op, gpart,
6890                           loadLE(Ity_V128, mkexpr(addr))) );
6891      DIP("%s %s,%s\n", opname,
6892                        dis_buf,
6893                        nameXMMReg(gregOfRM(rm)) );
6894      return delta+alen;
6895   }
6896}
6897
6898
6899/* All lanes SSE binary operation, G = G `op` E. */
6900
6901static
6902UInt dis_SSE_E_to_G_all ( UChar sorb, Int delta, HChar* opname, IROp op )
6903{
6904   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, False );
6905}
6906
6907/* All lanes SSE binary operation, G = (not G) `op` E. */
6908
6909static
6910UInt dis_SSE_E_to_G_all_invG ( UChar sorb, Int delta,
6911                               HChar* opname, IROp op )
6912{
6913   return dis_SSE_E_to_G_all_wrk( sorb, delta, opname, op, True );
6914}
6915
6916
6917/* Lowest 32-bit lane only SSE binary operation, G = G `op` E. */
6918
6919static UInt dis_SSE_E_to_G_lo32 ( UChar sorb, Int delta,
6920                                  HChar* opname, IROp op )
6921{
6922   HChar   dis_buf[50];
6923   Int     alen;
6924   IRTemp  addr;
6925   UChar   rm = getIByte(delta);
6926   IRExpr* gpart = getXMMReg(gregOfRM(rm));
6927   if (epartIsReg(rm)) {
6928      putXMMReg( gregOfRM(rm),
6929                 binop(op, gpart,
6930                           getXMMReg(eregOfRM(rm))) );
6931      DIP("%s %s,%s\n", opname,
6932                        nameXMMReg(eregOfRM(rm)),
6933                        nameXMMReg(gregOfRM(rm)) );
6934      return delta+1;
6935   } else {
6936      /* We can only do a 32-bit memory read, so the upper 3/4 of the
6937         E operand needs to be made simply of zeroes. */
6938      IRTemp epart = newTemp(Ity_V128);
6939      addr = disAMode ( &alen, sorb, delta, dis_buf );
6940      assign( epart, unop( Iop_32UtoV128,
6941                           loadLE(Ity_I32, mkexpr(addr))) );
6942      putXMMReg( gregOfRM(rm),
6943                 binop(op, gpart, mkexpr(epart)) );
6944      DIP("%s %s,%s\n", opname,
6945                        dis_buf,
6946                        nameXMMReg(gregOfRM(rm)) );
6947      return delta+alen;
6948   }
6949}
6950
6951
6952/* Lower 64-bit lane only SSE binary operation, G = G `op` E. */
6953
6954static UInt dis_SSE_E_to_G_lo64 ( UChar sorb, Int delta,
6955                                  HChar* opname, IROp op )
6956{
6957   HChar   dis_buf[50];
6958   Int     alen;
6959   IRTemp  addr;
6960   UChar   rm = getIByte(delta);
6961   IRExpr* gpart = getXMMReg(gregOfRM(rm));
6962   if (epartIsReg(rm)) {
6963      putXMMReg( gregOfRM(rm),
6964                 binop(op, gpart,
6965                           getXMMReg(eregOfRM(rm))) );
6966      DIP("%s %s,%s\n", opname,
6967                        nameXMMReg(eregOfRM(rm)),
6968                        nameXMMReg(gregOfRM(rm)) );
6969      return delta+1;
6970   } else {
6971      /* We can only do a 64-bit memory read, so the upper half of the
6972         E operand needs to be made simply of zeroes. */
6973      IRTemp epart = newTemp(Ity_V128);
6974      addr = disAMode ( &alen, sorb, delta, dis_buf );
6975      assign( epart, unop( Iop_64UtoV128,
6976                           loadLE(Ity_I64, mkexpr(addr))) );
6977      putXMMReg( gregOfRM(rm),
6978                 binop(op, gpart, mkexpr(epart)) );
6979      DIP("%s %s,%s\n", opname,
6980                        dis_buf,
6981                        nameXMMReg(gregOfRM(rm)) );
6982      return delta+alen;
6983   }
6984}
6985
6986
6987/* All lanes unary SSE operation, G = op(E). */
6988
6989static UInt dis_SSE_E_to_G_unary_all (
6990               UChar sorb, Int delta,
6991               HChar* opname, IROp op
6992            )
6993{
6994   HChar   dis_buf[50];
6995   Int     alen;
6996   IRTemp  addr;
6997   UChar   rm = getIByte(delta);
6998   if (epartIsReg(rm)) {
6999      putXMMReg( gregOfRM(rm),
7000                 unop(op, getXMMReg(eregOfRM(rm))) );
7001      DIP("%s %s,%s\n", opname,
7002                        nameXMMReg(eregOfRM(rm)),
7003                        nameXMMReg(gregOfRM(rm)) );
7004      return delta+1;
7005   } else {
7006      addr = disAMode ( &alen, sorb, delta, dis_buf );
7007      putXMMReg( gregOfRM(rm),
7008                 unop(op, loadLE(Ity_V128, mkexpr(addr))) );
7009      DIP("%s %s,%s\n", opname,
7010                        dis_buf,
7011                        nameXMMReg(gregOfRM(rm)) );
7012      return delta+alen;
7013   }
7014}
7015
7016
7017/* Lowest 32-bit lane only unary SSE operation, G = op(E). */
7018
7019static UInt dis_SSE_E_to_G_unary_lo32 (
7020               UChar sorb, Int delta,
7021               HChar* opname, IROp op
7022            )
7023{
7024   /* First we need to get the old G value and patch the low 32 bits
7025      of the E operand into it.  Then apply op and write back to G. */
7026   HChar   dis_buf[50];
7027   Int     alen;
7028   IRTemp  addr;
7029   UChar   rm = getIByte(delta);
7030   IRTemp  oldG0 = newTemp(Ity_V128);
7031   IRTemp  oldG1 = newTemp(Ity_V128);
7032
7033   assign( oldG0, getXMMReg(gregOfRM(rm)) );
7034
7035   if (epartIsReg(rm)) {
7036      assign( oldG1,
7037              binop( Iop_SetV128lo32,
7038                     mkexpr(oldG0),
7039                     getXMMRegLane32(eregOfRM(rm), 0)) );
7040      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7041      DIP("%s %s,%s\n", opname,
7042                        nameXMMReg(eregOfRM(rm)),
7043                        nameXMMReg(gregOfRM(rm)) );
7044      return delta+1;
7045   } else {
7046      addr = disAMode ( &alen, sorb, delta, dis_buf );
7047      assign( oldG1,
7048              binop( Iop_SetV128lo32,
7049                     mkexpr(oldG0),
7050                     loadLE(Ity_I32, mkexpr(addr)) ));
7051      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7052      DIP("%s %s,%s\n", opname,
7053                        dis_buf,
7054                        nameXMMReg(gregOfRM(rm)) );
7055      return delta+alen;
7056   }
7057}
7058
7059
7060/* Lowest 64-bit lane only unary SSE operation, G = op(E). */
7061
7062static UInt dis_SSE_E_to_G_unary_lo64 (
7063               UChar sorb, Int delta,
7064               HChar* opname, IROp op
7065            )
7066{
7067   /* First we need to get the old G value and patch the low 64 bits
7068      of the E operand into it.  Then apply op and write back to G. */
7069   HChar   dis_buf[50];
7070   Int     alen;
7071   IRTemp  addr;
7072   UChar   rm = getIByte(delta);
7073   IRTemp  oldG0 = newTemp(Ity_V128);
7074   IRTemp  oldG1 = newTemp(Ity_V128);
7075
7076   assign( oldG0, getXMMReg(gregOfRM(rm)) );
7077
7078   if (epartIsReg(rm)) {
7079      assign( oldG1,
7080              binop( Iop_SetV128lo64,
7081                     mkexpr(oldG0),
7082                     getXMMRegLane64(eregOfRM(rm), 0)) );
7083      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7084      DIP("%s %s,%s\n", opname,
7085                        nameXMMReg(eregOfRM(rm)),
7086                        nameXMMReg(gregOfRM(rm)) );
7087      return delta+1;
7088   } else {
7089      addr = disAMode ( &alen, sorb, delta, dis_buf );
7090      assign( oldG1,
7091              binop( Iop_SetV128lo64,
7092                     mkexpr(oldG0),
7093                     loadLE(Ity_I64, mkexpr(addr)) ));
7094      putXMMReg( gregOfRM(rm), unop(op, mkexpr(oldG1)) );
7095      DIP("%s %s,%s\n", opname,
7096                        dis_buf,
7097                        nameXMMReg(gregOfRM(rm)) );
7098      return delta+alen;
7099   }
7100}
7101
7102
7103/* SSE integer binary operation:
7104      G = G `op` E   (eLeft == False)
7105      G = E `op` G   (eLeft == True)
7106*/
7107static UInt dis_SSEint_E_to_G(
7108               UChar sorb, Int delta,
7109               HChar* opname, IROp op,
7110               Bool   eLeft
7111            )
7112{
7113   HChar   dis_buf[50];
7114   Int     alen;
7115   IRTemp  addr;
7116   UChar   rm = getIByte(delta);
7117   IRExpr* gpart = getXMMReg(gregOfRM(rm));
7118   IRExpr* epart = NULL;
7119   if (epartIsReg(rm)) {
7120      epart = getXMMReg(eregOfRM(rm));
7121      DIP("%s %s,%s\n", opname,
7122                        nameXMMReg(eregOfRM(rm)),
7123                        nameXMMReg(gregOfRM(rm)) );
7124      delta += 1;
7125   } else {
7126      addr  = disAMode ( &alen, sorb, delta, dis_buf );
7127      epart = loadLE(Ity_V128, mkexpr(addr));
7128      DIP("%s %s,%s\n", opname,
7129                        dis_buf,
7130                        nameXMMReg(gregOfRM(rm)) );
7131      delta += alen;
7132   }
7133   putXMMReg( gregOfRM(rm),
7134              eLeft ? binop(op, epart, gpart)
7135                    : binop(op, gpart, epart) );
7136   return delta;
7137}
7138
7139
7140/* Helper for doing SSE FP comparisons. */
7141
7142static void findSSECmpOp ( Bool* needNot, IROp* op,
7143                           Int imm8, Bool all_lanes, Int sz )
7144{
7145   imm8 &= 7;
7146   *needNot = False;
7147   *op      = Iop_INVALID;
7148   if (imm8 >= 4) {
7149      *needNot = True;
7150      imm8 -= 4;
7151   }
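   /* Hence, for example (illustrative only), imm8 == 5 (NLT) is
      handled as NOT(LT) and imm8 == 6 (NLE) as NOT(LE); only the four
      base predicates EQ/LT/LE/UNORD need real IROps. */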
7152
7153   if (sz == 4 && all_lanes) {
7154      switch (imm8) {
7155         case 0: *op = Iop_CmpEQ32Fx4; return;
7156         case 1: *op = Iop_CmpLT32Fx4; return;
7157         case 2: *op = Iop_CmpLE32Fx4; return;
7158         case 3: *op = Iop_CmpUN32Fx4; return;
7159         default: break;
7160      }
7161   }
7162   if (sz == 4 && !all_lanes) {
7163      switch (imm8) {
7164         case 0: *op = Iop_CmpEQ32F0x4; return;
7165         case 1: *op = Iop_CmpLT32F0x4; return;
7166         case 2: *op = Iop_CmpLE32F0x4; return;
7167         case 3: *op = Iop_CmpUN32F0x4; return;
7168         default: break;
7169      }
7170   }
7171   if (sz == 8 && all_lanes) {
7172      switch (imm8) {
7173         case 0: *op = Iop_CmpEQ64Fx2; return;
7174         case 1: *op = Iop_CmpLT64Fx2; return;
7175         case 2: *op = Iop_CmpLE64Fx2; return;
7176         case 3: *op = Iop_CmpUN64Fx2; return;
7177         default: break;
7178      }
7179   }
7180   if (sz == 8 && !all_lanes) {
7181      switch (imm8) {
7182         case 0: *op = Iop_CmpEQ64F0x2; return;
7183         case 1: *op = Iop_CmpLT64F0x2; return;
7184         case 2: *op = Iop_CmpLE64F0x2; return;
7185         case 3: *op = Iop_CmpUN64F0x2; return;
7186         default: break;
7187      }
7188   }
7189   vpanic("findSSECmpOp(x86,guest)");
7190}
7191
7192/* Handles SSE 32F/64F comparisons. */
7193
7194static UInt dis_SSEcmp_E_to_G ( UChar sorb, Int delta,
7195                                HChar* opname, Bool all_lanes, Int sz )
7196{
7197   HChar   dis_buf[50];
7198   Int     alen, imm8;
7199   IRTemp  addr;
7200   Bool    needNot = False;
7201   IROp    op      = Iop_INVALID;
7202   IRTemp  plain   = newTemp(Ity_V128);
7203   UChar   rm      = getIByte(delta);
7204   UShort  mask    = 0;
7205   vassert(sz == 4 || sz == 8);
7206   if (epartIsReg(rm)) {
7207      imm8 = getIByte(delta+1);
7208      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7209      assign( plain, binop(op, getXMMReg(gregOfRM(rm)),
7210                               getXMMReg(eregOfRM(rm))) );
7211      delta += 2;
7212      DIP("%s $%d,%s,%s\n", opname,
7213                            (Int)imm8,
7214                            nameXMMReg(eregOfRM(rm)),
7215                            nameXMMReg(gregOfRM(rm)) );
7216   } else {
7217      addr = disAMode ( &alen, sorb, delta, dis_buf );
7218      imm8 = getIByte(delta+alen);
7219      findSSECmpOp(&needNot, &op, imm8, all_lanes, sz);
7220      assign( plain,
7221              binop(
7222                 op,
7223                 getXMMReg(gregOfRM(rm)),
7224                   all_lanes  ? loadLE(Ity_V128, mkexpr(addr))
7225                 : sz == 8    ? unop( Iop_64UtoV128, loadLE(Ity_I64, mkexpr(addr)))
7226                 : /*sz==4*/    unop( Iop_32UtoV128, loadLE(Ity_I32, mkexpr(addr)))
7227             )
7228      );
7229      delta += alen+1;
7230      DIP("%s $%d,%s,%s\n", opname,
7231                            (Int)imm8,
7232                            dis_buf,
7233                            nameXMMReg(gregOfRM(rm)) );
7234   }
7235
7236   if (needNot && all_lanes) {
7237      putXMMReg( gregOfRM(rm),
7238                 unop(Iop_NotV128, mkexpr(plain)) );
7239   }
7240   else
7241   if (needNot && !all_lanes) {
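      /* Invert only the low lane: mkV128 takes a 16-bit mask with one
         bit per byte, so (assuming that convention) 0x000F selects the
         low 4 bytes for the 32F0x4 case and 0x00FF the low 8 bytes for
         the 64F0x2 case, leaving the upper lanes untouched. */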
7242      mask = toUShort( sz==4 ? 0x000F : 0x00FF );
7243      putXMMReg( gregOfRM(rm),
7244                 binop(Iop_XorV128, mkexpr(plain), mkV128(mask)) );
7245   }
7246   else {
7247      putXMMReg( gregOfRM(rm), mkexpr(plain) );
7248   }
7249
7250   return delta;
7251}
7252
7253
7254/* Vector by scalar shift of G by the amount specified at the bottom
7255   of E. */
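/* Note (descriptive only): logical shifts by an amount >= the lane
   width produce an all-zeroes result, while arithmetic shifts clamp
   the amount to lanewidth-1 so that every lane fills with its sign
   bit; the Mux0X below selects between those two outcomes. */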
7256
7257static UInt dis_SSE_shiftG_byE ( UChar sorb, Int delta,
7258                                 HChar* opname, IROp op )
7259{
7260   HChar   dis_buf[50];
7261   Int     alen, size;
7262   IRTemp  addr;
7263   Bool    shl, shr, sar;
7264   UChar   rm   = getIByte(delta);
7265   IRTemp  g0   = newTemp(Ity_V128);
7266   IRTemp  g1   = newTemp(Ity_V128);
7267   IRTemp  amt  = newTemp(Ity_I32);
7268   IRTemp  amt8 = newTemp(Ity_I8);
7269   if (epartIsReg(rm)) {
7270      assign( amt, getXMMRegLane32(eregOfRM(rm), 0) );
7271      DIP("%s %s,%s\n", opname,
7272                        nameXMMReg(eregOfRM(rm)),
7273                        nameXMMReg(gregOfRM(rm)) );
7274      delta++;
7275   } else {
7276      addr = disAMode ( &alen, sorb, delta, dis_buf );
7277      assign( amt, loadLE(Ity_I32, mkexpr(addr)) );
7278      DIP("%s %s,%s\n", opname,
7279                        dis_buf,
7280                        nameXMMReg(gregOfRM(rm)) );
7281      delta += alen;
7282   }
7283   assign( g0,   getXMMReg(gregOfRM(rm)) );
7284   assign( amt8, unop(Iop_32to8, mkexpr(amt)) );
7285
7286   shl = shr = sar = False;
7287   size = 0;
7288   switch (op) {
7289      case Iop_ShlN16x8: shl = True; size = 16; break;
7290      case Iop_ShlN32x4: shl = True; size = 32; break;
7291      case Iop_ShlN64x2: shl = True; size = 64; break;
7292      case Iop_SarN16x8: sar = True; size = 16; break;
7293      case Iop_SarN32x4: sar = True; size = 32; break;
7294      case Iop_ShrN16x8: shr = True; size = 16; break;
7295      case Iop_ShrN32x4: shr = True; size = 32; break;
7296      case Iop_ShrN64x2: shr = True; size = 64; break;
7297      default: vassert(0);
7298   }
7299
7300   if (shl || shr) {
7301     assign(
7302        g1,
7303        IRExpr_Mux0X(
7304           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
7305           mkV128(0x0000),
7306           binop(op, mkexpr(g0), mkexpr(amt8))
7307        )
7308     );
7309   } else
7310   if (sar) {
7311     assign(
7312        g1,
7313        IRExpr_Mux0X(
7314           unop(Iop_1Uto8,binop(Iop_CmpLT32U,mkexpr(amt),mkU32(size))),
7315           binop(op, mkexpr(g0), mkU8(size-1)),
7316           binop(op, mkexpr(g0), mkexpr(amt8))
7317        )
7318     );
7319   } else {
7320      /*NOTREACHED*/
7321      vassert(0);
7322   }
7323
7324   putXMMReg( gregOfRM(rm), mkexpr(g1) );
7325   return delta;
7326}
7327
7328
7329/* Vector by scalar shift of E by an immediate byte. */
7330
7331static
7332UInt dis_SSE_shiftE_imm ( Int delta, HChar* opname, IROp op )
7333{
7334   Bool    shl, shr, sar;
7335   UChar   rm   = getIByte(delta);
7336   IRTemp  e0   = newTemp(Ity_V128);
7337   IRTemp  e1   = newTemp(Ity_V128);
7338   UChar   amt, size;
7339   vassert(epartIsReg(rm));
7340   vassert(gregOfRM(rm) == 2
7341           || gregOfRM(rm) == 4 || gregOfRM(rm) == 6);
7342   amt = getIByte(delta+1);
7343   delta += 2;
7344   DIP("%s $%d,%s\n", opname,
7345                      (Int)amt,
7346                      nameXMMReg(eregOfRM(rm)) );
7347   assign( e0, getXMMReg(eregOfRM(rm)) );
7348
7349   shl = shr = sar = False;
7350   size = 0;
7351   switch (op) {
7352      case Iop_ShlN16x8: shl = True; size = 16; break;
7353      case Iop_ShlN32x4: shl = True; size = 32; break;
7354      case Iop_ShlN64x2: shl = True; size = 64; break;
7355      case Iop_SarN16x8: sar = True; size = 16; break;
7356      case Iop_SarN32x4: sar = True; size = 32; break;
7357      case Iop_ShrN16x8: shr = True; size = 16; break;
7358      case Iop_ShrN32x4: shr = True; size = 32; break;
7359      case Iop_ShrN64x2: shr = True; size = 64; break;
7360      default: vassert(0);
7361   }
7362
7363   if (shl || shr) {
7364      assign( e1, amt >= size
7365                     ? mkV128(0x0000)
7366                     : binop(op, mkexpr(e0), mkU8(amt))
7367      );
7368   } else
7369   if (sar) {
7370      assign( e1, amt >= size
7371                     ? binop(op, mkexpr(e0), mkU8(size-1))
7372                     : binop(op, mkexpr(e0), mkU8(amt))
7373      );
7374   } else {
7375      /*NOTREACHED*/
7376      vassert(0);
7377   }
7378
7379   putXMMReg( eregOfRM(rm), mkexpr(e1) );
7380   return delta;
7381}
7382
7383
7384/* Get the current SSE rounding mode. */
7385
7386static IRExpr* /* :: Ity_I32 */ get_sse_roundingmode ( void )
7387{
7388   return binop( Iop_And32,
7389                 IRExpr_Get( OFFB_SSEROUND, Ity_I32 ),
7390                 mkU32(3) );
7391}
7392
7393static void put_sse_roundingmode ( IRExpr* sseround )
7394{
7395   vassert(typeOfIRExpr(irsb->tyenv, sseround) == Ity_I32);
7396   stmt( IRStmt_Put( OFFB_SSEROUND, sseround ) );
7397}
7398
7399/* Break a 128-bit value up into four 32-bit ints. */
7400
7401static void breakup128to32s ( IRTemp t128,
7402                              /*OUTs*/
7403                              IRTemp* t3, IRTemp* t2,
7404                              IRTemp* t1, IRTemp* t0 )
7405{
7406   IRTemp hi64 = newTemp(Ity_I64);
7407   IRTemp lo64 = newTemp(Ity_I64);
7408   assign( hi64, unop(Iop_V128HIto64, mkexpr(t128)) );
7409   assign( lo64, unop(Iop_V128to64,   mkexpr(t128)) );
7410
7411   vassert(t0 && *t0 == IRTemp_INVALID);
7412   vassert(t1 && *t1 == IRTemp_INVALID);
7413   vassert(t2 && *t2 == IRTemp_INVALID);
7414   vassert(t3 && *t3 == IRTemp_INVALID);
7415
7416   *t0 = newTemp(Ity_I32);
7417   *t1 = newTemp(Ity_I32);
7418   *t2 = newTemp(Ity_I32);
7419   *t3 = newTemp(Ity_I32);
7420   assign( *t0, unop(Iop_64to32,   mkexpr(lo64)) );
7421   assign( *t1, unop(Iop_64HIto32, mkexpr(lo64)) );
7422   assign( *t2, unop(Iop_64to32,   mkexpr(hi64)) );
7423   assign( *t3, unop(Iop_64HIto32, mkexpr(hi64)) );
7424}
7425
7426/* Construct a 128-bit value from four 32-bit ints. */
7427
7428static IRExpr* mk128from32s ( IRTemp t3, IRTemp t2,
7429                              IRTemp t1, IRTemp t0 )
7430{
7431   return
7432      binop( Iop_64HLtoV128,
7433             binop(Iop_32HLto64, mkexpr(t3), mkexpr(t2)),
7434             binop(Iop_32HLto64, mkexpr(t1), mkexpr(t0))
7435   );
7436}
7437
7438/* Break a 64-bit value up into four 16-bit ints. */
7439
7440static void breakup64to16s ( IRTemp t64,
7441                             /*OUTs*/
7442                             IRTemp* t3, IRTemp* t2,
7443                             IRTemp* t1, IRTemp* t0 )
7444{
7445   IRTemp hi32 = newTemp(Ity_I32);
7446   IRTemp lo32 = newTemp(Ity_I32);
7447   assign( hi32, unop(Iop_64HIto32, mkexpr(t64)) );
7448   assign( lo32, unop(Iop_64to32,   mkexpr(t64)) );
7449
7450   vassert(t0 && *t0 == IRTemp_INVALID);
7451   vassert(t1 && *t1 == IRTemp_INVALID);
7452   vassert(t2 && *t2 == IRTemp_INVALID);
7453   vassert(t3 && *t3 == IRTemp_INVALID);
7454
7455   *t0 = newTemp(Ity_I16);
7456   *t1 = newTemp(Ity_I16);
7457   *t2 = newTemp(Ity_I16);
7458   *t3 = newTemp(Ity_I16);
7459   assign( *t0, unop(Iop_32to16,   mkexpr(lo32)) );
7460   assign( *t1, unop(Iop_32HIto16, mkexpr(lo32)) );
7461   assign( *t2, unop(Iop_32to16,   mkexpr(hi32)) );
7462   assign( *t3, unop(Iop_32HIto16, mkexpr(hi32)) );
7463}
7464
7465/* Construct a 64-bit value from four 16-bit ints. */
7466
7467static IRExpr* mk64from16s ( IRTemp t3, IRTemp t2,
7468                             IRTemp t1, IRTemp t0 )
7469{
7470   return
7471      binop( Iop_32HLto64,
7472             binop(Iop_16HLto32, mkexpr(t3), mkexpr(t2)),
7473             binop(Iop_16HLto32, mkexpr(t1), mkexpr(t0))
7474   );
7475}
7476
7477/* Generate IR to set the guest %EFLAGS from the pushfl-format image
7478   in the given 32-bit temporary.  The flags that are set are: O S Z A
7479   C P D ID AC.
7480
7481   In all cases, code to set AC is generated.  However, VEX actually
7482   ignores the AC value and so can optionally emit an emulation
7483   warning when it is enabled.  In this routine, an emulation warning
7484   is only emitted if emit_AC_emwarn is True, in which case
7485   next_insn_EIP must be correct (this allows for correct code
7486   generation for popfl/popfw).  If emit_AC_emwarn is False,
7487   next_insn_EIP is unimportant (this allows for easy, if kludgey, code
7488   generation for IRET). */
7489
7490static
7491void set_EFLAGS_from_value ( IRTemp t1,
7492                             Bool   emit_AC_emwarn,
7493                             Addr32 next_insn_EIP )
7494{
7495   vassert(typeOfIRTemp(irsb->tyenv,t1) == Ity_I32);
7496
7497   /* t1 is the flag word.  Mask out everything except OSZACP and set
7498      the flags thunk to X86G_CC_OP_COPY. */
7499   stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
7500   stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
7501   stmt( IRStmt_Put( OFFB_CC_DEP1,
7502                     binop(Iop_And32,
7503                           mkexpr(t1),
7504                           mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
7505                                  | X86G_CC_MASK_A | X86G_CC_MASK_Z
7506                                  | X86G_CC_MASK_S| X86G_CC_MASK_O )
7507                          )
7508                    )
7509       );
7510   /* Set NDEP even though it isn't used.  This makes redundant-PUT
7511      elimination of previous stores to this field work better. */
7512   stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
7513
7514   /* Also need to set the D flag, which is held in bit 10 of t1.
7515      If zero, put 1 in OFFB_DFLAG, else -1 in OFFB_DFLAG. */
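   /* (The guest state keeps D as a ready-to-use stride, +1 or -1, for
      the string instructions, rather than as a single bit --
      descriptive note only.) */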
7516   stmt( IRStmt_Put(
7517            OFFB_DFLAG,
7518            IRExpr_Mux0X(
7519               unop(Iop_32to8,
7520                    binop(Iop_And32,
7521                          binop(Iop_Shr32, mkexpr(t1), mkU8(10)),
7522                          mkU32(1))),
7523               mkU32(1),
7524               mkU32(0xFFFFFFFF)))
7525       );
7526
7527   /* Set the ID flag */
7528   stmt( IRStmt_Put(
7529            OFFB_IDFLAG,
7530            IRExpr_Mux0X(
7531               unop(Iop_32to8,
7532                    binop(Iop_And32,
7533                          binop(Iop_Shr32, mkexpr(t1), mkU8(21)),
7534                          mkU32(1))),
7535               mkU32(0),
7536               mkU32(1)))
7537       );
7538
7539   /* And set the AC flag.  If setting it to 1, possibly emit an
7540      emulation warning. */
7541   stmt( IRStmt_Put(
7542            OFFB_ACFLAG,
7543            IRExpr_Mux0X(
7544               unop(Iop_32to8,
7545                    binop(Iop_And32,
7546                          binop(Iop_Shr32, mkexpr(t1), mkU8(18)),
7547                          mkU32(1))),
7548               mkU32(0),
7549               mkU32(1)))
7550       );
7551
7552   if (emit_AC_emwarn) {
7553      put_emwarn( mkU32(EmWarn_X86_acFlag) );
7554      stmt(
7555         IRStmt_Exit(
7556            binop( Iop_CmpNE32,
7557                   binop(Iop_And32, mkexpr(t1), mkU32(1<<18)),
7558                   mkU32(0) ),
7559            Ijk_EmWarn,
7560            IRConst_U32( next_insn_EIP ),
7561            OFFB_EIP
7562         )
7563      );
7564   }
7565}
7566
7567
7568/* Helper for the SSSE3 (not SSE3) PMULHRSW insns.  Given two 64-bit
7569   values (aa,bb), computes, for each of the 4 16-bit lanes:
7570
7571   (((aa_lane *s32 bb_lane) >>u 14) + 1) >>u 1
7572*/
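/* An equivalent per-lane scalar computation, as a sketch only (the
   helper name is invented for illustration and used nowhere else):

      static UShort pmulhrsw_lane ( Short a, Short b ) {
         UInt t = ((UInt)((Int)a * (Int)b)) >> 14;
         return (UShort)((t + 1) >> 1);
      }
*/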
7573static IRExpr* dis_PMULHRSW_helper ( IRExpr* aax, IRExpr* bbx )
7574{
7575   IRTemp aa      = newTemp(Ity_I64);
7576   IRTemp bb      = newTemp(Ity_I64);
7577   IRTemp aahi32s = newTemp(Ity_I64);
7578   IRTemp aalo32s = newTemp(Ity_I64);
7579   IRTemp bbhi32s = newTemp(Ity_I64);
7580   IRTemp bblo32s = newTemp(Ity_I64);
7581   IRTemp rHi     = newTemp(Ity_I64);
7582   IRTemp rLo     = newTemp(Ity_I64);
7583   IRTemp one32x2 = newTemp(Ity_I64);
7584   assign(aa, aax);
7585   assign(bb, bbx);
7586   assign( aahi32s,
7587           binop(Iop_SarN32x2,
7588                 binop(Iop_InterleaveHI16x4, mkexpr(aa), mkexpr(aa)),
7589                 mkU8(16) ));
7590   assign( aalo32s,
7591           binop(Iop_SarN32x2,
7592                 binop(Iop_InterleaveLO16x4, mkexpr(aa), mkexpr(aa)),
7593                 mkU8(16) ));
7594   assign( bbhi32s,
7595           binop(Iop_SarN32x2,
7596                 binop(Iop_InterleaveHI16x4, mkexpr(bb), mkexpr(bb)),
7597                 mkU8(16) ));
7598   assign( bblo32s,
7599           binop(Iop_SarN32x2,
7600                 binop(Iop_InterleaveLO16x4, mkexpr(bb), mkexpr(bb)),
7601                 mkU8(16) ));
7602   assign(one32x2, mkU64( (1ULL << 32) + 1 ));
7603   assign(
7604      rHi,
7605      binop(
7606         Iop_ShrN32x2,
7607         binop(
7608            Iop_Add32x2,
7609            binop(
7610               Iop_ShrN32x2,
7611               binop(Iop_Mul32x2, mkexpr(aahi32s), mkexpr(bbhi32s)),
7612               mkU8(14)
7613            ),
7614            mkexpr(one32x2)
7615         ),
7616         mkU8(1)
7617      )
7618   );
7619   assign(
7620      rLo,
7621      binop(
7622         Iop_ShrN32x2,
7623         binop(
7624            Iop_Add32x2,
7625            binop(
7626               Iop_ShrN32x2,
7627               binop(Iop_Mul32x2, mkexpr(aalo32s), mkexpr(bblo32s)),
7628               mkU8(14)
7629            ),
7630            mkexpr(one32x2)
7631         ),
7632         mkU8(1)
7633      )
7634   );
7635   return
7636      binop(Iop_CatEvenLanes16x4, mkexpr(rHi), mkexpr(rLo));
7637}
7638
7639/* Helper for the SSSE3 (not SSE3) PSIGN{B,W,D} insns.  Given two 64-bit
7640   values (aa,bb), computes, for each lane:
7641
7642          if aa_lane < 0 then - bb_lane
7643     else if aa_lane > 0 then bb_lane
7644     else 0
7645*/
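/* For example (illustrative only), with laneszB == 2 and lanes
   aa == -3, bb == 7: aa < 0, so the result lane is -7; with aa == 0
   the result lane is 0 regardless of bb. */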
7646static IRExpr* dis_PSIGN_helper ( IRExpr* aax, IRExpr* bbx, Int laneszB )
7647{
7648   IRTemp aa       = newTemp(Ity_I64);
7649   IRTemp bb       = newTemp(Ity_I64);
7650   IRTemp zero     = newTemp(Ity_I64);
7651   IRTemp bbNeg    = newTemp(Ity_I64);
7652   IRTemp negMask  = newTemp(Ity_I64);
7653   IRTemp posMask  = newTemp(Ity_I64);
7654   IROp   opSub    = Iop_INVALID;
7655   IROp   opCmpGTS = Iop_INVALID;
7656
7657   switch (laneszB) {
7658      case 1: opSub = Iop_Sub8x8;  opCmpGTS = Iop_CmpGT8Sx8;  break;
7659      case 2: opSub = Iop_Sub16x4; opCmpGTS = Iop_CmpGT16Sx4; break;
7660      case 4: opSub = Iop_Sub32x2; opCmpGTS = Iop_CmpGT32Sx2; break;
7661      default: vassert(0);
7662   }
7663
7664   assign( aa,      aax );
7665   assign( bb,      bbx );
7666   assign( zero,    mkU64(0) );
7667   assign( bbNeg,   binop(opSub,    mkexpr(zero), mkexpr(bb)) );
7668   assign( negMask, binop(opCmpGTS, mkexpr(zero), mkexpr(aa)) );
7669   assign( posMask, binop(opCmpGTS, mkexpr(aa),   mkexpr(zero)) );
7670
7671   return
7672      binop(Iop_Or64,
7673            binop(Iop_And64, mkexpr(bb),    mkexpr(posMask)),
7674            binop(Iop_And64, mkexpr(bbNeg), mkexpr(negMask)) );
7675
7676}
7677
7678/* Helper for the SSSE3 (not SSE3) PABS{B,W,D} insns.  Given a 64-bit
7679   value aa, computes, for each lane
7680
7681   if aa < 0 then -aa else aa
7682
7683   Note that the result is interpreted as unsigned, so that the
7684   absolute value of the most negative signed input can be
7685   represented.
7686*/
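/* For example (illustrative only), with laneszB == 1: an input lane of
   0x80 (-128) yields 0x80, which read as unsigned is the correct
   absolute value 128; an input of 0xFF (-1) yields 0x01. */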
7687static IRExpr* dis_PABS_helper ( IRExpr* aax, Int laneszB )
7688{
7689   IRTemp aa      = newTemp(Ity_I64);
7690   IRTemp zero    = newTemp(Ity_I64);
7691   IRTemp aaNeg   = newTemp(Ity_I64);
7692   IRTemp negMask = newTemp(Ity_I64);
7693   IRTemp posMask = newTemp(Ity_I64);
7694   IROp   opSub   = Iop_INVALID;
7695   IROp   opSarN  = Iop_INVALID;
7696
7697   switch (laneszB) {
7698      case 1: opSub = Iop_Sub8x8;  opSarN = Iop_SarN8x8;  break;
7699      case 2: opSub = Iop_Sub16x4; opSarN = Iop_SarN16x4; break;
7700      case 4: opSub = Iop_Sub32x2; opSarN = Iop_SarN32x2; break;
7701      default: vassert(0);
7702   }
7703
7704   assign( aa,      aax );
7705   assign( negMask, binop(opSarN, mkexpr(aa), mkU8(8*laneszB-1)) );
7706   assign( posMask, unop(Iop_Not64, mkexpr(negMask)) );
7707   assign( zero,    mkU64(0) );
7708   assign( aaNeg,   binop(opSub, mkexpr(zero), mkexpr(aa)) );
7709   return
7710      binop(Iop_Or64,
7711            binop(Iop_And64, mkexpr(aa),    mkexpr(posMask)),
7712            binop(Iop_And64, mkexpr(aaNeg), mkexpr(negMask)) );
7713}
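/* For example, with laneszB == 1: an aa lane of 0x7F gives negMask
   0x00 and posMask 0xFF, so the lane passes through unchanged,
   whereas an aa lane of 0x80 (-128) gives negMask 0xFF, selecting
   the negated value 0x00 - 0x80 = 0x80 -- which, read as unsigned,
   is the required absolute value 128. */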
7714
7715static IRExpr* dis_PALIGNR_XMM_helper ( IRTemp hi64,
7716                                        IRTemp lo64, Int byteShift )
7717{
7718   vassert(byteShift >= 1 && byteShift <= 7);
7719   return
7720      binop(Iop_Or64,
7721            binop(Iop_Shl64, mkexpr(hi64), mkU8(8*(8-byteShift))),
7722            binop(Iop_Shr64, mkexpr(lo64), mkU8(8*byteShift))
7723      );
7724}
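/* For example, with byteShift == 3, hi64 == 0x8877665544332211 and
   lo64 == 0xF0DEBC9A78563412, the result is
   (hi64 << 40) | (lo64 >> 24) == 0x332211F0DEBC9A78: the three
   lowest bytes of hi64 are glued on top of the five highest bytes
   of lo64. */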
7725
7726/* Generate a SIGSEGV followed by a restart of the current instruction
7727   if effective_addr is not 16-aligned.  This is required behaviour
7728   for some SSE3 instructions and all 128-bit SSSE3 instructions.
7729   This assumes that guest_EIP_curr_instr is set correctly! */
7730static void gen_SEGV_if_not_16_aligned ( IRTemp effective_addr )
7731{
7732   stmt(
7733      IRStmt_Exit(
7734         binop(Iop_CmpNE32,
7735               binop(Iop_And32,mkexpr(effective_addr),mkU32(0xF)),
7736               mkU32(0)),
7737         Ijk_SigSEGV,
7738         IRConst_U32(guest_EIP_curr_instr),
7739         OFFB_EIP
7740      )
7741   );
7742}
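/* For example, an effective_addr of 0x10008 has (addr & 0xF) == 8,
   so the CmpNE32 holds and the exit is taken, raising SIGSEGV with
   the guest EIP still at the faulting instruction; an effective_addr
   of 0x10010 passes the check and execution falls through. */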
7743
7744
7745/* Helper for deciding whether a given insn (starting at the opcode
7746   byte) may validly be used with a LOCK prefix.  The following insns
7747   may be used with LOCK when their destination operand is in memory.
7748   AFAICS this is exactly the same for both 32-bit and 64-bit mode.
7749
7750   ADD        80 /0,  81 /0,  82 /0,  83 /0,  00,  01
7751   OR         80 /1,  81 /1,  82 /x,  83 /1,  08,  09
7752   ADC        80 /2,  81 /2,  82 /2,  83 /2,  10,  11
7753   SBB        80 /3,  81 /3,  82 /x,  83 /3,  18,  19
7754   AND        80 /4,  81 /4,  82 /x,  83 /4,  20,  21
7755   SUB        80 /5,  81 /5,  82 /x,  83 /5,  28,  29
7756   XOR        80 /6,  81 /6,  82 /x,  83 /6,  30,  31
7757
7758   DEC        FE /1,  FF /1
7759   INC        FE /0,  FF /0
7760
7761   NEG        F6 /3,  F7 /3
7762   NOT        F6 /2,  F7 /2
7763
7764   XCHG       86, 87
7765
7766   BTC        0F BB,  0F BA /7
7767   BTR        0F B3,  0F BA /6
7768   BTS        0F AB,  0F BA /5
7769
7770   CMPXCHG    0F B0,  0F B1
7771   CMPXCHG8B  0F C7 /1
7772
7773   XADD       0F C0,  0F C1
7774
7775   ------------------------------
7776
7777   80 /0  =  addb $imm8,  rm8
7778   81 /0  =  addl $imm32, rm32  and  addw $imm16, rm16
7779   82 /0  =  addb $imm8,  rm8
7780   83 /0  =  addl $simm8, rm32  and  addw $simm8, rm16
7781
7782   00     =  addb r8,  rm8
7783   01     =  addl r32, rm32  and  addw r16, rm16
7784
7785   Same for ADD OR ADC SBB AND SUB XOR
7786
7787   FE /1  = dec rm8
7788   FF /1  = dec rm32  and  dec rm16
7789
7790   FE /0  = inc rm8
7791   FF /0  = inc rm32  and  inc rm16
7792
7793   F6 /3  = neg rm8
7794   F7 /3  = neg rm32  and  neg rm16
7795
7796   F6 /2  = not rm8
7797   F7 /2  = not rm32  and  not rm16
7798
7799   0F BB     = btcw r16, rm16    and  btcl r32, rm32
7800   0F BA /7  = btcw $imm8, rm16  and  btcl $imm8, rm32
7801
7802   Same for BTS, BTR
7803*/
7804static Bool can_be_used_with_LOCK_prefix ( UChar* opc )
7805{
7806   switch (opc[0]) {
7807      case 0x00: case 0x01: case 0x08: case 0x09:
7808      case 0x10: case 0x11: case 0x18: case 0x19:
7809      case 0x20: case 0x21: case 0x28: case 0x29:
7810      case 0x30: case 0x31:
7811         if (!epartIsReg(opc[1]))
7812            return True;
7813         break;
7814
7815      case 0x80: case 0x81: case 0x82: case 0x83:
7816         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 6
7817             && !epartIsReg(opc[1]))
7818            return True;
7819         break;
7820
7821      case 0xFE: case 0xFF:
7822         if (gregOfRM(opc[1]) >= 0 && gregOfRM(opc[1]) <= 1
7823             && !epartIsReg(opc[1]))
7824            return True;
7825         break;
7826
7827      case 0xF6: case 0xF7:
7828         if (gregOfRM(opc[1]) >= 2 && gregOfRM(opc[1]) <= 3
7829             && !epartIsReg(opc[1]))
7830            return True;
7831         break;
7832
7833      case 0x86: case 0x87:
7834         if (!epartIsReg(opc[1]))
7835            return True;
7836         break;
7837
7838      case 0x0F: {
7839         switch (opc[1]) {
7840            case 0xBB: case 0xB3: case 0xAB:
7841               if (!epartIsReg(opc[2]))
7842                  return True;
7843               break;
7844            case 0xBA:
7845               if (gregOfRM(opc[2]) >= 5 && gregOfRM(opc[2]) <= 7
7846                   && !epartIsReg(opc[2]))
7847                  return True;
7848               break;
7849            case 0xB0: case 0xB1:
7850               if (!epartIsReg(opc[2]))
7851                  return True;
7852               break;
7853            case 0xC7:
7854               if (gregOfRM(opc[2]) == 1 && !epartIsReg(opc[2]) )
7855                  return True;
7856               break;
7857            case 0xC0: case 0xC1:
7858               if (!epartIsReg(opc[2]))
7859                  return True;
7860               break;
7861            default:
7862               break;
7863         } /* switch (opc[1]) */
7864         break;
7865      }
7866
7867      default:
7868         break;
7869   } /* switch (opc[0]) */
7870
7871   return False;
7872}
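/* For example, given opc pointing at 83 2D ... (subl of an immediate
   to a memory operand: gregOfRM is 5 and epartIsReg is false), the
   function returns True.  Given 89 ... (mov r32 to memory) it returns
   False, since 0x89 is not in the table above, and given FF C0
   (inc %eax) it returns False because the destination is a
   register. */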
7873
7874static IRTemp math_BSWAP ( IRTemp t1, IRType ty )
7875{
7876   IRTemp t2 = newTemp(ty);
7877   if (ty == Ity_I32) {
7878      assign( t2,
7879         binop(
7880            Iop_Or32,
7881            binop(Iop_Shl32, mkexpr(t1), mkU8(24)),
7882            binop(
7883               Iop_Or32,
7884               binop(Iop_And32, binop(Iop_Shl32, mkexpr(t1), mkU8(8)),
7885                                mkU32(0x00FF0000)),
7886               binop(Iop_Or32,
7887                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(8)),
7888                                      mkU32(0x0000FF00)),
7889                     binop(Iop_And32, binop(Iop_Shr32, mkexpr(t1), mkU8(24)),
7890                                      mkU32(0x000000FF) )
7891            )))
7892      );
7893      return t2;
7894   }
7895   if (ty == Ity_I16) {
7896      assign(t2,
7897             binop(Iop_Or16,
7898                   binop(Iop_Shl16, mkexpr(t1), mkU8(8)),
7899                   binop(Iop_Shr16, mkexpr(t1), mkU8(8)) ));
7900      return t2;
7901   }
7902   vassert(0);
7903   /*NOTREACHED*/
7904   return IRTemp_INVALID;
7905}
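/* For example, for Ity_I32 an input of 0x12345678 produces
   0x78000000 | 0x00560000 | 0x00003400 | 0x00000012 == 0x78563412,
   and for Ity_I16 an input of 0x1234 produces 0x3412. */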
7906
7907/*------------------------------------------------------------*/
7908/*--- Disassemble a single instruction                     ---*/
7909/*------------------------------------------------------------*/
7910
7911/* Disassemble a single instruction into IR.  The instruction is
7912   located in host memory at &guest_code[delta].  *expect_CAS is set
7913   to True if the resulting IR is expected to contain an IRCAS
7914   statement, and False if it's not expected to.  This makes it
7915   possible for the caller of disInstr_X86_WRK to check that
7916   LOCK-prefixed instructions are at least plausibly translated, in
7917   that it becomes possible to check that a (validly) LOCK-prefixed
7918   instruction generates a translation containing an IRCAS, and
7919   instructions without LOCK prefixes don't generate translations
7920   containing an IRCAS.
7921*/
7922static
7923DisResult disInstr_X86_WRK (
7924             /*OUT*/Bool* expect_CAS,
7925             Bool         (*resteerOkFn) ( /*opaque*/void*, Addr64 ),
7926             Bool         resteerCisOk,
7927             void*        callback_opaque,
7928             Long         delta64,
7929             VexArchInfo* archinfo,
7930             VexAbiInfo*  vbi
7931          )
7932{
7933   IRType    ty;
7934   IRTemp    addr, t0, t1, t2, t3, t4, t5, t6;
7935   Int       alen;
7936   UChar     opc, modrm, abyte, pre;
7937   UInt      d32;
7938   HChar     dis_buf[50];
7939   Int       am_sz, d_sz, n_prefixes;
7940   DisResult dres;
7941   UChar*    insn; /* used in SSE decoders */
7942
7943   /* The running delta */
7944   Int delta = (Int)delta64;
7945
7946   /* Holds the delta at the start of the insn, so that we can print
7947      consistent error messages for unimplemented insns. */
7948   Int delta_start = delta;
7949
7950   /* sz denotes the nominal data-op size of the insn; we change it to
7951      2 if an 0x66 prefix is seen */
7952   Int sz = 4;
7953
7954   /* sorb holds the segment-override-prefix byte, if any.  Zero if no
7955      prefix has been seen, else one of {0x26, 0x3E, 0x64, 0x65}
7956      indicating the prefix.  */
7957   UChar sorb = 0;
7958
7959   /* Gets set to True if a LOCK prefix is seen. */
7960   Bool pfx_lock = False;
7961
7962   /* Set result defaults. */
7963   dres.whatNext    = Dis_Continue;
7964   dres.len         = 0;
7965   dres.continueAt  = 0;
7966   dres.jk_StopHere = Ijk_INVALID;
7967
7968   *expect_CAS = False;
7969
7970   addr = t0 = t1 = t2 = t3 = t4 = t5 = t6 = IRTemp_INVALID;
7971
7972   vassert(guest_EIP_bbstart + delta == guest_EIP_curr_instr);
7973   DIP("\t0x%x:  ", guest_EIP_bbstart+delta);
7974
7975   /* Spot "Special" instructions (see comment at top of file). */
7976   {
7977      UChar* code = (UChar*)(guest_code + delta);
7978      /* Spot the 12-byte preamble:
7979         C1C703   roll $3,  %edi
7980         C1C70D   roll $13, %edi
7981         C1C71D   roll $29, %edi
7982         C1C713   roll $19, %edi
7983      */
7984      if (code[ 0] == 0xC1 && code[ 1] == 0xC7 && code[ 2] == 0x03 &&
7985          code[ 3] == 0xC1 && code[ 4] == 0xC7 && code[ 5] == 0x0D &&
7986          code[ 6] == 0xC1 && code[ 7] == 0xC7 && code[ 8] == 0x1D &&
7987          code[ 9] == 0xC1 && code[10] == 0xC7 && code[11] == 0x13) {
7988         /* Got a "Special" instruction preamble.  Which one is it? */
7989         if (code[12] == 0x87 && code[13] == 0xDB /* xchgl %ebx,%ebx */) {
7990            /* %EDX = client_request ( %EAX ) */
7991            DIP("%%edx = client_request ( %%eax )\n");
7992            delta += 14;
7993            jmp_lit(&dres, Ijk_ClientReq, guest_EIP_bbstart+delta);
7994            vassert(dres.whatNext == Dis_StopHere);
7995            goto decode_success;
7996         }
7997         else
7998         if (code[12] == 0x87 && code[13] == 0xC9 /* xchgl %ecx,%ecx */) {
7999            /* %EAX = guest_NRADDR */
8000            DIP("%%eax = guest_NRADDR\n");
8001            delta += 14;
8002            putIReg(4, R_EAX, IRExpr_Get( OFFB_NRADDR, Ity_I32 ));
8003            goto decode_success;
8004         }
8005         else
8006         if (code[12] == 0x87 && code[13] == 0xD2 /* xchgl %edx,%edx */) {
8007            /* call-noredir *%EAX */
8008            DIP("call-noredir *%%eax\n");
8009            delta += 14;
8010            t1 = newTemp(Ity_I32);
8011            assign(t1, getIReg(4,R_EAX));
8012            t2 = newTemp(Ity_I32);
8013            assign(t2, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
8014            putIReg(4, R_ESP, mkexpr(t2));
8015            storeLE( mkexpr(t2), mkU32(guest_EIP_bbstart+delta));
8016            jmp_treg(&dres, Ijk_NoRedir, t1);
8017            vassert(dres.whatNext == Dis_StopHere);
8018            goto decode_success;
8019         }
8020         /* We don't know what it is. */
8021         goto decode_failure;
8022         /*NOTREACHED*/
8023      }
8024   }
8025
8026   /* Handle a couple of weird-ass NOPs that have been observed in the
8027      wild. */
8028   {
8029      UChar* code = (UChar*)(guest_code + delta);
8030      /* Sun's JVM 1.5.0 uses the following as a NOP:
8031         26 2E 64 65 90  %es:%cs:%fs:%gs:nop */
8032      if (code[0] == 0x26 && code[1] == 0x2E && code[2] == 0x64
8033          && code[3] == 0x65 && code[4] == 0x90) {
8034         DIP("%%es:%%cs:%%fs:%%gs:nop\n");
8035         delta += 5;
8036         goto decode_success;
8037      }
8038      /* Don't barf on recent binutils padding,
8039         all variants of which are: nopw %cs:0x0(%eax,%eax,1)
8040         66 2e 0f 1f 84 00 00 00 00 00
8041         66 66 2e 0f 1f 84 00 00 00 00 00
8042         66 66 66 2e 0f 1f 84 00 00 00 00 00
8043         66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8044         66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8045         66 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00
8046      */
8047      if (code[0] == 0x66) {
8048         Int data16_cnt;
8049         for (data16_cnt = 1; data16_cnt < 6; data16_cnt++)
8050            if (code[data16_cnt] != 0x66)
8051               break;
8052         if (code[data16_cnt] == 0x2E && code[data16_cnt + 1] == 0x0F
8053             && code[data16_cnt + 2] == 0x1F && code[data16_cnt + 3] == 0x84
8054             && code[data16_cnt + 4] == 0x00 && code[data16_cnt + 5] == 0x00
8055             && code[data16_cnt + 6] == 0x00 && code[data16_cnt + 7] == 0x00
8056             && code[data16_cnt + 8] == 0x00 ) {
8057            DIP("nopw %%cs:0x0(%%eax,%%eax,1)\n");
8058            delta += 9 + data16_cnt;
8059            goto decode_success;
8060         }
8061      }
8062   }
8063
8064   /* Normal instruction handling starts here. */
8065
8066   /* Deal with some but not all prefixes:
8067         66(oso)
8068         F0(lock)
8069         2E(cs:) 3E(ds:) 26(es:) 64(fs:) 65(gs:) 36(ss:)
8070      Not dealt with (left in place):
8071         F2 F3
8072   */
8073   n_prefixes = 0;
8074   while (True) {
8075      if (n_prefixes > 7) goto decode_failure;
8076      pre = getUChar(delta);
8077      switch (pre) {
8078         case 0x66:
8079            sz = 2;
8080            break;
8081         case 0xF0:
8082            pfx_lock = True;
8083            *expect_CAS = True;
8084            break;
8085         case 0x3E: /* %DS: */
8086         case 0x26: /* %ES: */
8087         case 0x64: /* %FS: */
8088         case 0x65: /* %GS: */
8089            if (sorb != 0)
8090               goto decode_failure; /* only one seg override allowed */
8091            sorb = pre;
8092            break;
8093         case 0x2E: { /* %CS: */
8094            /* 2E prefix on a conditional branch instruction is a
8095               branch-prediction hint, which can safely be ignored.  */
8096            UChar op1 = getIByte(delta+1);
8097            UChar op2 = getIByte(delta+2);
8098            if ((op1 >= 0x70 && op1 <= 0x7F)
8099                || (op1 == 0xE3)
8100                || (op1 == 0x0F && op2 >= 0x80 && op2 <= 0x8F)) {
8101               if (0) vex_printf("vex x86->IR: ignoring branch hint\n");
8102            } else {
8103               /* All other CS override cases are not handled */
8104               goto decode_failure;
8105            }
8106            break;
8107         }
8108         case 0x36: /* %SS: */
8109            /* SS override cases are not handled */
8110            goto decode_failure;
8111         default:
8112            goto not_a_prefix;
8113      }
8114      n_prefixes++;
8115      delta++;
8116   }
8117
8118   not_a_prefix:
8119
8120   /* Now we should be looking at the primary opcode byte or the
8121      leading F2 or F3.  Check that any LOCK prefix is actually
8122      allowed. */
8123
8124   if (pfx_lock) {
8125      if (can_be_used_with_LOCK_prefix( (UChar*)&guest_code[delta] )) {
8126         DIP("lock ");
8127      } else {
8128         *expect_CAS = False;
8129         goto decode_failure;
8130      }
8131   }
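   /* For example, for the byte sequence 66 F0 83 ... the loop above
      consumes 66 (sz becomes 2) and F0 (pfx_lock and *expect_CAS get
      set), leaving delta pointing at the 0x83 opcode byte, which the
      LOCK check just above then vets with
      can_be_used_with_LOCK_prefix. */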
8132
8133
8134   /* ---------------------------------------------------- */
8135   /* --- The SSE decoder.                             --- */
8136   /* ---------------------------------------------------- */
8137
8138   /* What did I do to deserve SSE ?  Perhaps I was really bad in a
8139      previous life? */
8140
8141   /* Note, this doesn't handle SSE2 or SSE3.  That is handled in a
8142      later section, further on. */
8143
8144   insn = (UChar*)&guest_code[delta];
8145
8146   /* Treat fxsave specially.  It should be doable even on an SSE0
8147      (Pentium-II class) CPU.  Hence be prepared to handle it on
8148      any subarchitecture variant.
8149   */
8150
8151   /* 0F AE /0 = FXSAVE m512 -- write x87 and SSE state to memory */
8152   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8153       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 0) {
8154      IRDirty* d;
8155      modrm = getIByte(delta+2);
8156      vassert(sz == 4);
8157      vassert(!epartIsReg(modrm));
8158
8159      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8160      delta += 2+alen;
8161      gen_SEGV_if_not_16_aligned(addr);
8162
8163      DIP("fxsave %s\n", dis_buf);
8164
8165      /* Uses dirty helper:
8166            void x86g_dirtyhelper_FXSAVE ( VexGuestX86State*, UInt ) */
8167      d = unsafeIRDirty_0_N (
8168             0/*regparms*/,
8169             "x86g_dirtyhelper_FXSAVE",
8170             &x86g_dirtyhelper_FXSAVE,
8171             mkIRExprVec_1( mkexpr(addr) )
8172          );
8173      d->needsBBP = True;
8174
8175      /* declare we're writing memory */
8176      d->mFx   = Ifx_Write;
8177      d->mAddr = mkexpr(addr);
8178      d->mSize = 464; /* according to recent Intel docs */
8179
8180      /* declare we're reading guest state */
8181      d->nFxState = 7;
8182      vex_bzero(&d->fxState, sizeof(d->fxState));
8183
8184      d->fxState[0].fx     = Ifx_Read;
8185      d->fxState[0].offset = OFFB_FTOP;
8186      d->fxState[0].size   = sizeof(UInt);
8187
8188      d->fxState[1].fx     = Ifx_Read;
8189      d->fxState[1].offset = OFFB_FPREGS;
8190      d->fxState[1].size   = 8 * sizeof(ULong);
8191
8192      d->fxState[2].fx     = Ifx_Read;
8193      d->fxState[2].offset = OFFB_FPTAGS;
8194      d->fxState[2].size   = 8 * sizeof(UChar);
8195
8196      d->fxState[3].fx     = Ifx_Read;
8197      d->fxState[3].offset = OFFB_FPROUND;
8198      d->fxState[3].size   = sizeof(UInt);
8199
8200      d->fxState[4].fx     = Ifx_Read;
8201      d->fxState[4].offset = OFFB_FC3210;
8202      d->fxState[4].size   = sizeof(UInt);
8203
8204      d->fxState[5].fx     = Ifx_Read;
8205      d->fxState[5].offset = OFFB_XMM0;
8206      d->fxState[5].size   = 8 * sizeof(U128);
8207
8208      d->fxState[6].fx     = Ifx_Read;
8209      d->fxState[6].offset = OFFB_SSEROUND;
8210      d->fxState[6].size   = sizeof(UInt);
8211
8212      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8213	 images are packed back-to-back.  If not, the value of
8214	 d->fxState[5].size is wrong. */
8215      vassert(16 == sizeof(U128));
8216      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8217
8218      stmt( IRStmt_Dirty(d) );
8219
8220      goto decode_success;
8221   }
8222
8223   /* 0F AE /1 = FXRSTOR m512 -- read x87 and SSE state from memory */
8224   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
8225       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 1) {
8226      IRDirty* d;
8227      modrm = getIByte(delta+2);
8228      vassert(sz == 4);
8229      vassert(!epartIsReg(modrm));
8230
8231      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8232      delta += 2+alen;
8233      gen_SEGV_if_not_16_aligned(addr);
8234
8235      DIP("fxrstor %s\n", dis_buf);
8236
8237      /* Uses dirty helper:
8238            VexEmWarn x86g_dirtyhelper_FXRSTOR ( VexGuestX86State*, UInt )
8239         NOTE:
8240            the VexEmWarn value is simply ignored (unlike for FRSTOR)
8241      */
8242      d = unsafeIRDirty_0_N (
8243             0/*regparms*/,
8244             "x86g_dirtyhelper_FXRSTOR",
8245             &x86g_dirtyhelper_FXRSTOR,
8246             mkIRExprVec_1( mkexpr(addr) )
8247          );
8248      d->needsBBP = True;
8249
8250      /* declare we're reading memory */
8251      d->mFx   = Ifx_Read;
8252      d->mAddr = mkexpr(addr);
8253      d->mSize = 464; /* according to recent Intel docs */
8254
8255      /* declare we're writing guest state */
8256      d->nFxState = 7;
8257      vex_bzero(&d->fxState, sizeof(d->fxState));
8258
8259      d->fxState[0].fx     = Ifx_Write;
8260      d->fxState[0].offset = OFFB_FTOP;
8261      d->fxState[0].size   = sizeof(UInt);
8262
8263      d->fxState[1].fx     = Ifx_Write;
8264      d->fxState[1].offset = OFFB_FPREGS;
8265      d->fxState[1].size   = 8 * sizeof(ULong);
8266
8267      d->fxState[2].fx     = Ifx_Write;
8268      d->fxState[2].offset = OFFB_FPTAGS;
8269      d->fxState[2].size   = 8 * sizeof(UChar);
8270
8271      d->fxState[3].fx     = Ifx_Write;
8272      d->fxState[3].offset = OFFB_FPROUND;
8273      d->fxState[3].size   = sizeof(UInt);
8274
8275      d->fxState[4].fx     = Ifx_Write;
8276      d->fxState[4].offset = OFFB_FC3210;
8277      d->fxState[4].size   = sizeof(UInt);
8278
8279      d->fxState[5].fx     = Ifx_Write;
8280      d->fxState[5].offset = OFFB_XMM0;
8281      d->fxState[5].size   = 8 * sizeof(U128);
8282
8283      d->fxState[6].fx     = Ifx_Write;
8284      d->fxState[6].offset = OFFB_SSEROUND;
8285      d->fxState[6].size   = sizeof(UInt);
8286
8287      /* Be paranoid ... this assertion tries to ensure the 8 %xmm
8288	 images are packed back-to-back.  If not, the value of
8289	 d->fxState[5].size is wrong. */
8290      vassert(16 == sizeof(U128));
8291      vassert(OFFB_XMM7 == (OFFB_XMM0 + 7 * 16));
8292
8293      stmt( IRStmt_Dirty(d) );
8294
8295      goto decode_success;
8296   }
8297
8298   /* ------ SSE decoder main ------ */
8299
8300   /* Skip parts of the decoder which don't apply given the stated
8301      guest subarchitecture. */
8302   if (archinfo->hwcaps == 0/*baseline, no sse at all*/)
8303      goto after_sse_decoders;
8304
8305   /* Otherwise we must be doing sse1 or sse2, so we can at least try
8306      for SSE1 here. */
8307
8308   /* 0F 58 = ADDPS -- add 32Fx4 from R/M to R */
8309   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x58) {
8310      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addps", Iop_Add32Fx4 );
8311      goto decode_success;
8312   }
8313
8314   /* F3 0F 58 = ADDSS -- add 32F0x4 from R/M to R */
8315   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x58) {
8316      vassert(sz == 4);
8317      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "addss", Iop_Add32F0x4 );
8318      goto decode_success;
8319   }
8320
8321   /* 0F 55 = ANDNPS -- G = (not G) and E */
8322   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x55) {
8323      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnps", Iop_AndV128 );
8324      goto decode_success;
8325   }
8326
8327   /* 0F 54 = ANDPS -- G = G and E */
8328   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x54) {
8329      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andps", Iop_AndV128 );
8330      goto decode_success;
8331   }
8332
8333   /* 0F C2 = CMPPS -- 32Fx4 comparison from R/M to R */
8334   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC2) {
8335      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmpps", True, 4 );
8336      goto decode_success;
8337   }
8338
8339   /* F3 0F C2 = CMPSS -- 32F0x4 comparison from R/M to R */
8340   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xC2) {
8341      vassert(sz == 4);
8342      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpss", False, 4 );
8343      goto decode_success;
8344   }
8345
8346   /* 0F 2F = COMISS  -- 32F0x4 comparison G,E, and set ZCP */
8347   /* 0F 2E = UCOMISS -- 32F0x4 comparison G,E, and set ZCP */
8348   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
8349      IRTemp argL = newTemp(Ity_F32);
8350      IRTemp argR = newTemp(Ity_F32);
8351      modrm = getIByte(delta+2);
8352      if (epartIsReg(modrm)) {
8353         assign( argR, getXMMRegLane32F( eregOfRM(modrm), 0/*lowest lane*/ ) );
8354         delta += 2+1;
8355         DIP("[u]comiss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8356                                  nameXMMReg(gregOfRM(modrm)) );
8357      } else {
8358         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8359	 assign( argR, loadLE(Ity_F32, mkexpr(addr)) );
8360         delta += 2+alen;
8361         DIP("[u]comiss %s,%s\n", dis_buf,
8362                                  nameXMMReg(gregOfRM(modrm)) );
8363      }
8364      assign( argL, getXMMRegLane32F( gregOfRM(modrm), 0/*lowest lane*/ ) );
8365
8366      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
8367      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
8368      stmt( IRStmt_Put(
8369               OFFB_CC_DEP1,
8370               binop( Iop_And32,
8371                      binop(Iop_CmpF64,
8372                            unop(Iop_F32toF64,mkexpr(argL)),
8373                            unop(Iop_F32toF64,mkexpr(argR))),
8374                      mkU32(0x45)
8375          )));
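      /* The 0x45 mask keeps bits 0, 2 and 6 of the Iop_CmpF64 result,
         which with X86G_CC_OP_COPY are the CF, PF and ZF positions:
         unordered (0x45) sets all three, less-than (0x01) sets only
         CF, equal (0x40) sets only ZF, and greater-than (0x00) clears
         them, matching the documented [U]COMISS behaviour. */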
8376      /* Set NDEP even though it isn't used.  This makes redundant-PUT
8377         elimination of previous stores to this field work better. */
8378      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
8379      goto decode_success;
8380   }
8381
8382   /* 0F 2A = CVTPI2PS -- convert 2 x I32 in mem/mmx to 2 x F32 in low
8383      half xmm */
8384   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x2A) {
8385      IRTemp arg64 = newTemp(Ity_I64);
8386      IRTemp rmode = newTemp(Ity_I32);
8387      vassert(sz == 4);
8388
8389      modrm = getIByte(delta+2);
8390      do_MMX_preamble();
8391      if (epartIsReg(modrm)) {
8392         assign( arg64, getMMXReg(eregOfRM(modrm)) );
8393         delta += 2+1;
8394         DIP("cvtpi2ps %s,%s\n", nameMMXReg(eregOfRM(modrm)),
8395                                 nameXMMReg(gregOfRM(modrm)));
8396      } else {
8397         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8398	 assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
8399         delta += 2+alen;
8400         DIP("cvtpi2ps %s,%s\n", dis_buf,
8401                                 nameXMMReg(gregOfRM(modrm)) );
8402      }
8403
8404      assign( rmode, get_sse_roundingmode() );
8405
8406      putXMMRegLane32F(
8407         gregOfRM(modrm), 0,
8408         binop(Iop_F64toF32,
8409               mkexpr(rmode),
8410               unop(Iop_I32StoF64,
8411                    unop(Iop_64to32, mkexpr(arg64)) )) );
8412
8413      putXMMRegLane32F(
8414         gregOfRM(modrm), 1,
8415         binop(Iop_F64toF32,
8416               mkexpr(rmode),
8417               unop(Iop_I32StoF64,
8418                    unop(Iop_64HIto32, mkexpr(arg64)) )) );
8419
8420      goto decode_success;
8421   }
8422
8423   /* F3 0F 2A = CVTSI2SS -- convert I32 in mem/ireg to F32 in low
8424      quarter xmm */
8425   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x2A) {
8426      IRTemp arg32 = newTemp(Ity_I32);
8427      IRTemp rmode = newTemp(Ity_I32);
8428      vassert(sz == 4);
8429
8430      modrm = getIByte(delta+3);
8431      if (epartIsReg(modrm)) {
8432         assign( arg32, getIReg(4, eregOfRM(modrm)) );
8433         delta += 3+1;
8434         DIP("cvtsi2ss %s,%s\n", nameIReg(4, eregOfRM(modrm)),
8435                                 nameXMMReg(gregOfRM(modrm)));
8436      } else {
8437         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8438	 assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
8439         delta += 3+alen;
8440         DIP("cvtsi2ss %s,%s\n", dis_buf,
8441                                 nameXMMReg(gregOfRM(modrm)) );
8442      }
8443
8444      assign( rmode, get_sse_roundingmode() );
8445
8446      putXMMRegLane32F(
8447         gregOfRM(modrm), 0,
8448         binop(Iop_F64toF32,
8449               mkexpr(rmode),
8450               unop(Iop_I32StoF64, mkexpr(arg32)) ) );
8451
8452      goto decode_success;
8453   }
8454
8455   /* 0F 2D = CVTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
8456      I32 in mmx, according to prevailing SSE rounding mode */
8457   /* 0F 2C = CVTTPS2PI -- convert 2 x F32 in mem/low half xmm to 2 x
8458      I32 in mmx, rounding towards zero */
8459   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
8460      IRTemp dst64  = newTemp(Ity_I64);
8461      IRTemp rmode  = newTemp(Ity_I32);
8462      IRTemp f32lo  = newTemp(Ity_F32);
8463      IRTemp f32hi  = newTemp(Ity_F32);
8464      Bool   r2zero = toBool(insn[1] == 0x2C);
8465
8466      do_MMX_preamble();
8467      modrm = getIByte(delta+2);
8468
8469      if (epartIsReg(modrm)) {
8470         delta += 2+1;
8471	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
8472	 assign(f32hi, getXMMRegLane32F(eregOfRM(modrm), 1));
8473         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
8474                                   nameXMMReg(eregOfRM(modrm)),
8475                                   nameMMXReg(gregOfRM(modrm)));
8476      } else {
8477         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8478	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
8479	 assign(f32hi, loadLE(Ity_F32, binop( Iop_Add32,
8480                                              mkexpr(addr),
8481                                              mkU32(4) )));
8482         delta += 2+alen;
8483         DIP("cvt%sps2pi %s,%s\n", r2zero ? "t" : "",
8484                                   dis_buf,
8485                                   nameMMXReg(gregOfRM(modrm)));
8486      }
8487
8488      if (r2zero) {
8489         assign(rmode, mkU32((UInt)Irrm_ZERO) );
8490      } else {
8491         assign( rmode, get_sse_roundingmode() );
8492      }
8493
8494      assign(
8495         dst64,
8496         binop( Iop_32HLto64,
8497                binop( Iop_F64toI32S,
8498                       mkexpr(rmode),
8499                       unop( Iop_F32toF64, mkexpr(f32hi) ) ),
8500                binop( Iop_F64toI32S,
8501                       mkexpr(rmode),
8502                       unop( Iop_F32toF64, mkexpr(f32lo) ) )
8503              )
8504      );
8505
8506      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
8507      goto decode_success;
8508   }
8509
8510   /* F3 0F 2D = CVTSS2SI -- convert F32 in mem/low quarter xmm to
8511      I32 in ireg, according to prevailing SSE rounding mode */
8512   /* F3 0F 2C = CVTTSS2SI -- convert F32 in mem/low quarter xmm to
8513      I32 in ireg, rounding towards zero */
8514   if (insn[0] == 0xF3 && insn[1] == 0x0F
8515       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
8516      IRTemp rmode = newTemp(Ity_I32);
8517      IRTemp f32lo = newTemp(Ity_F32);
8518      Bool   r2zero = toBool(insn[2] == 0x2C);
8519      vassert(sz == 4);
8520
8521      modrm = getIByte(delta+3);
8522      if (epartIsReg(modrm)) {
8523         delta += 3+1;
8524	 assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
8525         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
8526                                   nameXMMReg(eregOfRM(modrm)),
8527                                   nameIReg(4, gregOfRM(modrm)));
8528      } else {
8529         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8530	 assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
8531         delta += 3+alen;
8532         DIP("cvt%sss2si %s,%s\n", r2zero ? "t" : "",
8533                                   dis_buf,
8534                                   nameIReg(4, gregOfRM(modrm)));
8535      }
8536
8537      if (r2zero) {
8538         assign( rmode, mkU32((UInt)Irrm_ZERO) );
8539      } else {
8540         assign( rmode, get_sse_roundingmode() );
8541      }
8542
8543      putIReg(4, gregOfRM(modrm),
8544                 binop( Iop_F64toI32S,
8545                        mkexpr(rmode),
8546                        unop( Iop_F32toF64, mkexpr(f32lo) ) )
8547      );
8548
8549      goto decode_success;
8550   }
8551
8552   /* 0F 5E = DIVPS -- div 32Fx4 from R/M to R */
8553   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5E) {
8554      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divps", Iop_Div32Fx4 );
8555      goto decode_success;
8556   }
8557
8558   /* F3 0F 5E = DIVSS -- div 32F0x4 from R/M to R */
8559   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5E) {
8560      vassert(sz == 4);
8561      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "divss", Iop_Div32F0x4 );
8562      goto decode_success;
8563   }
8564
8565   /* 0F AE /2 = LDMXCSR m32 -- load %mxcsr */
8566   if (insn[0] == 0x0F && insn[1] == 0xAE
8567       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 2) {
8568
8569      IRTemp t64 = newTemp(Ity_I64);
8570      IRTemp ew = newTemp(Ity_I32);
8571
8572      modrm = getIByte(delta+2);
8573      vassert(!epartIsReg(modrm));
8574      vassert(sz == 4);
8575
8576      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8577      delta += 2+alen;
8578      DIP("ldmxcsr %s\n", dis_buf);
8579
8580      /* The only thing we observe in %mxcsr is the rounding mode.
8581         Therefore, pass the 32-bit value (SSE native-format control
8582         word) to a clean helper, getting back a 64-bit value, the
8583         lower half of which is the SSEROUND value to store, and the
8584         upper half of which is the emulation-warning token which may
8585         be generated.
8586      */
8587      /* ULong x86g_check_ldmxcsr ( UInt ); */
8588      assign( t64, mkIRExprCCall(
8589                      Ity_I64, 0/*regparms*/,
8590                      "x86g_check_ldmxcsr",
8591                      &x86g_check_ldmxcsr,
8592                      mkIRExprVec_1( loadLE(Ity_I32, mkexpr(addr)) )
8593                   )
8594            );
8595
8596      put_sse_roundingmode( unop(Iop_64to32, mkexpr(t64)) );
8597      assign( ew, unop(Iop_64HIto32, mkexpr(t64) ) );
8598      put_emwarn( mkexpr(ew) );
8599      /* Finally, if an emulation warning was reported, side-exit to
8600         the next insn, reporting the warning, so that Valgrind's
8601         dispatcher sees the warning. */
8602      stmt(
8603         IRStmt_Exit(
8604            binop(Iop_CmpNE32, mkexpr(ew), mkU32(0)),
8605            Ijk_EmWarn,
8606            IRConst_U32( ((Addr32)guest_EIP_bbstart)+delta),
8607            OFFB_EIP
8608         )
8609      );
8610      goto decode_success;
8611   }
8612
8613   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8614   /* 0F F7 = MASKMOVQ -- 8x8 masked store */
8615   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF7) {
8616      Bool ok = False;
8617      delta = dis_MMX( &ok, sorb, sz, delta+1 );
8618      if (!ok)
8619         goto decode_failure;
8620      goto decode_success;
8621   }
8622
8623   /* 0F 5F = MAXPS -- max 32Fx4 from R/M to R */
8624   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5F) {
8625      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxps", Iop_Max32Fx4 );
8626      goto decode_success;
8627   }
8628
8629   /* F3 0F 5F = MAXSS -- max 32F0x4 from R/M to R */
8630   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5F) {
8631      vassert(sz == 4);
8632      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "maxss", Iop_Max32F0x4 );
8633      goto decode_success;
8634   }
8635
8636   /* 0F 5D = MINPS -- min 32Fx4 from R/M to R */
8637   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5D) {
8638      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minps", Iop_Min32Fx4 );
8639      goto decode_success;
8640   }
8641
8642   /* F3 0F 5D = MINSS -- min 32F0x4 from R/M to R */
8643   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5D) {
8644      vassert(sz == 4);
8645      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "minss", Iop_Min32F0x4 );
8646      goto decode_success;
8647   }
8648
8649   /* 0F 28 = MOVAPS -- move from E (mem or xmm) to G (xmm). */
8650   /* 0F 10 = MOVUPS -- move from E (mem or xmm) to G (xmm). */
8651   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x28 || insn[1] == 0x10)) {
8652      modrm = getIByte(delta+2);
8653      if (epartIsReg(modrm)) {
8654         putXMMReg( gregOfRM(modrm),
8655                    getXMMReg( eregOfRM(modrm) ));
8656         DIP("mov[ua]ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8657                                  nameXMMReg(gregOfRM(modrm)));
8658         delta += 2+1;
8659      } else {
8660         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8661         if (insn[1] == 0x28/*movaps*/)
8662            gen_SEGV_if_not_16_aligned( addr );
8663         putXMMReg( gregOfRM(modrm),
8664                    loadLE(Ity_V128, mkexpr(addr)) );
8665         DIP("mov[ua]ps %s,%s\n", dis_buf,
8666                                  nameXMMReg(gregOfRM(modrm)));
8667         delta += 2+alen;
8668      }
8669      goto decode_success;
8670   }
8671
8672   /* 0F 29 = MOVAPS -- move from G (xmm) to E (mem or xmm). */
8673   /* 0F 11 = MOVUPS -- move from G (xmm) to E (mem or xmm). */
8674   if (sz == 4 && insn[0] == 0x0F
8675       && (insn[1] == 0x29 || insn[1] == 0x11)) {
8676      modrm = getIByte(delta+2);
8677      if (epartIsReg(modrm)) {
8678         /* fall through; awaiting test case */
8679      } else {
8680         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8681         if (insn[1] == 0x29/*movaps*/)
8682            gen_SEGV_if_not_16_aligned( addr );
8683         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
8684         DIP("mov[ua]ps %s,%s\n", nameXMMReg(gregOfRM(modrm)),
8685                                  dis_buf );
8686         delta += 2+alen;
8687         goto decode_success;
8688      }
8689   }
8690
8691   /* 0F 16 = MOVHPS -- move from mem to high half of XMM. */
8692   /* 0F 16 = MOVLHPS -- move from lo half to hi half of XMM. */
8693   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x16) {
8694      modrm = getIByte(delta+2);
8695      if (epartIsReg(modrm)) {
8696         delta += 2+1;
8697         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
8698                          getXMMRegLane64( eregOfRM(modrm), 0 ) );
8699         DIP("movhps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8700                               nameXMMReg(gregOfRM(modrm)));
8701      } else {
8702         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8703         delta += 2+alen;
8704         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
8705                          loadLE(Ity_I64, mkexpr(addr)) );
8706         DIP("movhps %s,%s\n", dis_buf,
8707                               nameXMMReg( gregOfRM(modrm) ));
8708      }
8709      goto decode_success;
8710   }
8711
8712   /* 0F 17 = MOVHPS -- move from high half of XMM to mem. */
8713   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x17) {
8714      if (!epartIsReg(insn[2])) {
8715         delta += 2;
8716         addr = disAMode ( &alen, sorb, delta, dis_buf );
8717         delta += alen;
8718         storeLE( mkexpr(addr),
8719                  getXMMRegLane64( gregOfRM(insn[2]),
8720                                   1/*upper lane*/ ) );
8721         DIP("movhps %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
8722                               dis_buf);
8723         goto decode_success;
8724      }
8725      /* else fall through */
8726   }
8727
8728   /* 0F 12 = MOVLPS -- move from mem to low half of XMM. */
8729   /* 0F 12 = MOVHLPS -- move from hi half to lo half of XMM. */
8730   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x12) {
8731      modrm = getIByte(delta+2);
8732      if (epartIsReg(modrm)) {
8733         delta += 2+1;
8734         putXMMRegLane64( gregOfRM(modrm),
8735                          0/*lower lane*/,
8736                          getXMMRegLane64( eregOfRM(modrm), 1 ));
8737         DIP("movhlps %s, %s\n", nameXMMReg(eregOfRM(modrm)),
8738                                 nameXMMReg(gregOfRM(modrm)));
8739      } else {
8740         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8741         delta += 2+alen;
8742         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
8743                          loadLE(Ity_I64, mkexpr(addr)) );
8744         DIP("movlps %s, %s\n",
8745             dis_buf, nameXMMReg( gregOfRM(modrm) ));
8746      }
8747      goto decode_success;
8748   }
8749
8750   /* 0F 13 = MOVLPS -- move from low half of XMM to mem. */
8751   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x13) {
8752      if (!epartIsReg(insn[2])) {
8753         delta += 2;
8754         addr = disAMode ( &alen, sorb, delta, dis_buf );
8755         delta += alen;
8756         storeLE( mkexpr(addr),
8757                  getXMMRegLane64( gregOfRM(insn[2]),
8758                                   0/*lower lane*/ ) );
8759         DIP("movlps %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
8760                                dis_buf);
8761         goto decode_success;
8762      }
8763      /* else fall through */
8764   }
8765
8766   /* 0F 50 = MOVMSKPS - move 4 sign bits from 4 x F32 in xmm(E)
8767      to 4 lowest bits of ireg(G) */
8768   if (insn[0] == 0x0F && insn[1] == 0x50) {
8769      modrm = getIByte(delta+2);
8770      if (sz == 4 && epartIsReg(modrm)) {
8771         Int src;
8772         t0 = newTemp(Ity_I32);
8773         t1 = newTemp(Ity_I32);
8774         t2 = newTemp(Ity_I32);
8775         t3 = newTemp(Ity_I32);
8776         delta += 2+1;
8777         src = eregOfRM(modrm);
8778         assign( t0, binop( Iop_And32,
8779                            binop(Iop_Shr32, getXMMRegLane32(src,0), mkU8(31)),
8780                            mkU32(1) ));
8781         assign( t1, binop( Iop_And32,
8782                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(30)),
8783                            mkU32(2) ));
8784         assign( t2, binop( Iop_And32,
8785                            binop(Iop_Shr32, getXMMRegLane32(src,2), mkU8(29)),
8786                            mkU32(4) ));
8787         assign( t3, binop( Iop_And32,
8788                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(28)),
8789                            mkU32(8) ));
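         /* Each tN isolates the sign bit of lane N and leaves it at
            bit position N, so e.g. a source whose lanes 0 and 2 are
            negative and whose lanes 1 and 3 are not yields 0x5 in
            the destination. */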
8790         putIReg(4, gregOfRM(modrm),
8791                    binop(Iop_Or32,
8792                          binop(Iop_Or32, mkexpr(t0), mkexpr(t1)),
8793                          binop(Iop_Or32, mkexpr(t2), mkexpr(t3))
8794                         )
8795                 );
8796         DIP("movmskps %s,%s\n", nameXMMReg(src),
8797                                 nameIReg(4, gregOfRM(modrm)));
8798         goto decode_success;
8799      }
8800      /* else fall through */
8801   }
8802
8803   /* 0F 2B = MOVNTPS -- for us, just a plain SSE store. */
8804   /* 66 0F 2B = MOVNTPD -- for us, just a plain SSE store. */
8805   if (insn[0] == 0x0F && insn[1] == 0x2B) {
8806      modrm = getIByte(delta+2);
8807      if (!epartIsReg(modrm)) {
8808         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8809         gen_SEGV_if_not_16_aligned( addr );
8810         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
8811         DIP("movntp%s %s,%s\n", sz==2 ? "d" : "s",
8812                                 dis_buf,
8813                                 nameXMMReg(gregOfRM(modrm)));
8814         delta += 2+alen;
8815         goto decode_success;
8816      }
8817      /* else fall through */
8818   }
8819
8820   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8821   /* 0F E7 = MOVNTQ -- for us, just a plain MMX store.  Note, the
8822      Intel manual does not say anything about the usual business of
8823      the FP reg tags getting trashed whenever an MMX insn happens.
8824      So we just leave them alone.
8825   */
8826   if (insn[0] == 0x0F && insn[1] == 0xE7) {
8827      modrm = getIByte(delta+2);
8828      if (sz == 4 && !epartIsReg(modrm)) {
8829         /* do_MMX_preamble(); Intel docs don't specify this */
8830         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8831         storeLE( mkexpr(addr), getMMXReg(gregOfRM(modrm)) );
8832         DIP("movntq %s,%s\n", dis_buf,
8833                               nameMMXReg(gregOfRM(modrm)));
8834         delta += 2+alen;
8835         goto decode_success;
8836      }
8837      /* else fall through */
8838   }
8839
8840   /* F3 0F 10 = MOVSS -- move 32 bits from E (mem or lo 1/4 xmm) to G
8841      (lo 1/4 xmm).  If E is mem, upper 3/4 of G is zeroed out. */
8842   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x10) {
8843      vassert(sz == 4);
8844      modrm = getIByte(delta+3);
8845      if (epartIsReg(modrm)) {
8846         putXMMRegLane32( gregOfRM(modrm), 0,
8847                          getXMMRegLane32( eregOfRM(modrm), 0 ));
8848         DIP("movss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
8849                              nameXMMReg(gregOfRM(modrm)));
8850         delta += 3+1;
8851      } else {
8852         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8853         /* zero bits 127:64 */
8854         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
8855         /* zero bits 63:32 */
8856         putXMMRegLane32( gregOfRM(modrm), 1, mkU32(0) );
8857         /* write bits 31:0 */
8858         putXMMRegLane32( gregOfRM(modrm), 0,
8859                          loadLE(Ity_I32, mkexpr(addr)) );
8860         DIP("movss %s,%s\n", dis_buf,
8861                              nameXMMReg(gregOfRM(modrm)));
8862         delta += 3+alen;
8863      }
8864      goto decode_success;
8865   }
8866
8867   /* F3 0F 11 = MOVSS -- move 32 bits from G (lo 1/4 xmm) to E (mem
8868      or lo 1/4 xmm). */
8869   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x11) {
8870      vassert(sz == 4);
8871      modrm = getIByte(delta+3);
8872      if (epartIsReg(modrm)) {
8873         /* fall through, we don't yet have a test case */
8874      } else {
8875         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
8876         storeLE( mkexpr(addr),
8877                  getXMMRegLane32(gregOfRM(modrm), 0) );
8878         DIP("movss %s,%s\n", nameXMMReg(gregOfRM(modrm)),
8879                              dis_buf);
8880         delta += 3+alen;
8881         goto decode_success;
8882      }
8883   }
8884
8885   /* 0F 59 = MULPS -- mul 32Fx4 from R/M to R */
8886   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x59) {
8887      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulps", Iop_Mul32Fx4 );
8888      goto decode_success;
8889   }
8890
8891   /* F3 0F 59 = MULSS -- mul 32F0x4 from R/M to R */
8892   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x59) {
8893      vassert(sz == 4);
8894      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "mulss", Iop_Mul32F0x4 );
8895      goto decode_success;
8896   }
8897
8898   /* 0F 56 = ORPS -- G = G or E */
8899   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x56) {
8900      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orps", Iop_OrV128 );
8901      goto decode_success;
8902   }
8903
8904   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8905   /* 0F E0 = PAVGB -- 8x8 unsigned Packed Average, with rounding */
8906   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE0) {
8907      do_MMX_preamble();
8908      delta = dis_MMXop_regmem_to_reg (
8909                sorb, delta+2, insn[1], "pavgb", False );
8910      goto decode_success;
8911   }
8912
8913   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8914   /* 0F E3 = PAVGW -- 16x4 unsigned Packed Average, with rounding */
8915   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE3) {
8916      do_MMX_preamble();
8917      delta = dis_MMXop_regmem_to_reg (
8918                sorb, delta+2, insn[1], "pavgw", False );
8919      goto decode_success;
8920   }
8921
8922   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8923   /* 0F C5 = PEXTRW -- extract 16-bit field from mmx(E) and put
8924      zero-extend of it in ireg(G). */
8925   if (insn[0] == 0x0F && insn[1] == 0xC5) {
8926      modrm = insn[2];
8927      if (sz == 4 && epartIsReg(modrm)) {
8928         IRTemp sV = newTemp(Ity_I64);
8929         t5 = newTemp(Ity_I16);
8930         do_MMX_preamble();
8931         assign(sV, getMMXReg(eregOfRM(modrm)));
8932         breakup64to16s( sV, &t3, &t2, &t1, &t0 );
8933         switch (insn[3] & 3) {
8934            case 0:  assign(t5, mkexpr(t0)); break;
8935            case 1:  assign(t5, mkexpr(t1)); break;
8936            case 2:  assign(t5, mkexpr(t2)); break;
8937            case 3:  assign(t5, mkexpr(t3)); break;
8938            default: vassert(0); /*NOTREACHED*/
8939         }
8940         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t5)));
8941         DIP("pextrw $%d,%s,%s\n",
8942             (Int)insn[3], nameMMXReg(eregOfRM(modrm)),
8943                           nameIReg(4,gregOfRM(modrm)));
8944         delta += 4;
8945         goto decode_success;
8946      }
8947      /* else fall through */
8948   }
8949
8950   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8951   /* 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
8952      put it into the specified lane of mmx(G). */
8953   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC4) {
8954      /* Use t0 .. t3 to hold the 4 original 16-bit lanes of the
8955         mmx reg.  t4 is the new lane value.  t5 is the original
8956         mmx value. t6 is the new mmx value. */
8957      Int lane;
8958      t4 = newTemp(Ity_I16);
8959      t5 = newTemp(Ity_I64);
8960      t6 = newTemp(Ity_I64);
8961      modrm = insn[2];
8962      do_MMX_preamble();
8963
8964      assign(t5, getMMXReg(gregOfRM(modrm)));
8965      breakup64to16s( t5, &t3, &t2, &t1, &t0 );
8966
8967      if (epartIsReg(modrm)) {
8968         assign(t4, getIReg(2, eregOfRM(modrm)));
8969         delta += 3+1;
8970         lane = insn[3+1-1];
8971         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
8972                                   nameIReg(2,eregOfRM(modrm)),
8973                                   nameMMXReg(gregOfRM(modrm)));
8974      } else {
8975         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
8976         delta += 3+alen;
8977         lane = insn[3+alen-1];
8978         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
8979         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
8980                                   dis_buf,
8981                                   nameMMXReg(gregOfRM(modrm)));
8982      }
8983
8984      switch (lane & 3) {
8985         case 0:  assign(t6, mk64from16s(t3,t2,t1,t4)); break;
8986         case 1:  assign(t6, mk64from16s(t3,t2,t4,t0)); break;
8987         case 2:  assign(t6, mk64from16s(t3,t4,t1,t0)); break;
8988         case 3:  assign(t6, mk64from16s(t4,t2,t1,t0)); break;
8989         default: vassert(0); /*NOTREACHED*/
8990      }
8991      putMMXReg(gregOfRM(modrm), mkexpr(t6));
8992      goto decode_success;
8993   }
8994
8995   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
8996   /* 0F EE = PMAXSW -- 16x4 signed max */
8997   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEE) {
8998      do_MMX_preamble();
8999      delta = dis_MMXop_regmem_to_reg (
9000                sorb, delta+2, insn[1], "pmaxsw", False );
9001      goto decode_success;
9002   }
9003
9004   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9005   /* 0F DE = PMAXUB -- 8x8 unsigned max */
9006   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDE) {
9007      do_MMX_preamble();
9008      delta = dis_MMXop_regmem_to_reg (
9009                sorb, delta+2, insn[1], "pmaxub", False );
9010      goto decode_success;
9011   }
9012
9013   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9014   /* 0F EA = PMINSW -- 16x4 signed min */
9015   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xEA) {
9016      do_MMX_preamble();
9017      delta = dis_MMXop_regmem_to_reg (
9018                sorb, delta+2, insn[1], "pminsw", False );
9019      goto decode_success;
9020   }
9021
9022   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9023   /* 0F DA = PMINUB -- 8x8 unsigned min */
9024   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xDA) {
9025      do_MMX_preamble();
9026      delta = dis_MMXop_regmem_to_reg (
9027                sorb, delta+2, insn[1], "pminub", False );
9028      goto decode_success;
9029   }
9030
9031   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9032   /* 0F D7 = PMOVMSKB -- extract sign bits from each of 8 lanes in
9033      mmx(E), turn them into a byte, and put zero-extend of it in
9034      ireg(G). */
9035   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD7) {
9036      modrm = insn[2];
9037      if (epartIsReg(modrm)) {
9038         do_MMX_preamble();
9039         t0 = newTemp(Ity_I64);
9040         t1 = newTemp(Ity_I32);
9041         assign(t0, getMMXReg(eregOfRM(modrm)));
9042         assign(t1, mkIRExprCCall(
9043                       Ity_I32, 0/*regparms*/,
9044                       "x86g_calculate_mmx_pmovmskb",
9045                       &x86g_calculate_mmx_pmovmskb,
9046                       mkIRExprVec_1(mkexpr(t0))));
9047         putIReg(4, gregOfRM(modrm), mkexpr(t1));
9048         DIP("pmovmskb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
9049                                 nameIReg(4,gregOfRM(modrm)));
9050         delta += 3;
9051         goto decode_success;
9052      }
9053      /* else fall through */
9054   }
9055
9056   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9057   /* 0F E4 = PMULHUW -- 16x4 hi-half of unsigned widening multiply */
9058   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xE4) {
9059      do_MMX_preamble();
9060      delta = dis_MMXop_regmem_to_reg (
9061                sorb, delta+2, insn[1], "pmulhuw", False );
9062      goto decode_success;
9063   }
9064
9065   /* 0F 18 /0 = PREFETCHNTA -- prefetch into caches, */
9066   /* 0F 18 /1 = PREFETCHT0  -- with various different hints */
9067   /* 0F 18 /2 = PREFETCHT1 */
9068   /* 0F 18 /3 = PREFETCHT2 */
9069   if (insn[0] == 0x0F && insn[1] == 0x18
9070       && !epartIsReg(insn[2])
9071       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 3) {
9072      HChar* hintstr = "??";
9073
9074      modrm = getIByte(delta+2);
9075      vassert(!epartIsReg(modrm));
9076
9077      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9078      delta += 2+alen;
9079
9080      switch (gregOfRM(modrm)) {
9081         case 0: hintstr = "nta"; break;
9082         case 1: hintstr = "t0"; break;
9083         case 2: hintstr = "t1"; break;
9084         case 3: hintstr = "t2"; break;
9085         default: vassert(0); /*NOTREACHED*/
9086      }
9087
9088      DIP("prefetch%s %s\n", hintstr, dis_buf);
9089      goto decode_success;
9090   }
9091
9092   /* 0F 0D /0 = PREFETCH  m8 -- 3DNow! prefetch */
9093   /* 0F 0D /1 = PREFETCHW m8 -- ditto, with some other hint */
9094   if (insn[0] == 0x0F && insn[1] == 0x0D
9095       && !epartIsReg(insn[2])
9096       && gregOfRM(insn[2]) >= 0 && gregOfRM(insn[2]) <= 1) {
9097      HChar* hintstr = "??";
9098
9099      modrm = getIByte(delta+2);
9100      vassert(!epartIsReg(modrm));
9101
9102      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9103      delta += 2+alen;
9104
9105      switch (gregOfRM(modrm)) {
9106         case 0: hintstr = ""; break;
9107         case 1: hintstr = "w"; break;
9108         default: vassert(0); /*NOTREACHED*/
9109      }
9110
9111      DIP("prefetch%s %s\n", hintstr, dis_buf);
9112      goto decode_success;
9113   }
9114
9115   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9116   /* 0F F6 = PSADBW -- sum of 8Ux8 absolute differences */
9117   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF6) {
9118      do_MMX_preamble();
9119      delta = dis_MMXop_regmem_to_reg (
9120                 sorb, delta+2, insn[1], "psadbw", False );
9121      goto decode_success;
9122   }
9123
9124   /* ***--- this is an MMX class insn introduced in SSE1 ---*** */
9125   /* 0F 70 = PSHUFW -- rearrange 4x16 from E(mmx or mem) to G(mmx) */
9126   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x70) {
9127      Int order;
9128      IRTemp sV, dV, s3, s2, s1, s0;
9129      s3 = s2 = s1 = s0 = IRTemp_INVALID;
9130      sV = newTemp(Ity_I64);
9131      dV = newTemp(Ity_I64);
9132      do_MMX_preamble();
9133      modrm = insn[2];
9134      if (epartIsReg(modrm)) {
9135         assign( sV, getMMXReg(eregOfRM(modrm)) );
9136         order = (Int)insn[3];
9137         delta += 2+2;
9138         DIP("pshufw $%d,%s,%s\n", order,
9139                                   nameMMXReg(eregOfRM(modrm)),
9140                                   nameMMXReg(gregOfRM(modrm)));
9141      } else {
9142         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9143         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
9144	 order = (Int)insn[2+alen];
9145         delta += 3+alen;
9146         DIP("pshufw $%d,%s,%s\n", order,
9147                                   dis_buf,
9148                                   nameMMXReg(gregOfRM(modrm)));
9149      }
9150      breakup64to16s( sV, &s3, &s2, &s1, &s0 );
9151
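      /* Each 2-bit field of the order byte picks the source lane for
         one destination lane, lane 3 in the top two bits down to
         lane 0 in the bottom two.  For example, order == 0xE4
         (binary 11 10 01 00) is the identity permutation, and
         order == 0x1B reverses the four lanes. */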
9152#     define SEL(n) \
9153                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
9154      assign(dV,
9155	     mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
9156                          SEL((order>>2)&3), SEL((order>>0)&3) )
9157      );
9158      putMMXReg(gregOfRM(modrm), mkexpr(dV));
9159#     undef SEL
9160      goto decode_success;
9161   }
9162
9163   /* 0F 53 = RCPPS -- approx reciprocal 32Fx4 from R/M to R */
9164   if (insn[0] == 0x0F && insn[1] == 0x53) {
9165      vassert(sz == 4);
9166      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9167                                        "rcpps", Iop_Recip32Fx4 );
9168      goto decode_success;
9169   }
9170
9171   /* F3 0F 53 = RCPSS -- approx reciprocal 32F0x4 from R/M to R */
9172   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x53) {
9173      vassert(sz == 4);
9174      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9175                                         "rcpss", Iop_Recip32F0x4 );
9176      goto decode_success;
9177   }
9178
9179   /* 0F 52 = RSQRTPS -- approx reciprocal sqrt 32Fx4 from R/M to R */
9180   if (insn[0] == 0x0F && insn[1] == 0x52) {
9181      vassert(sz == 4);
9182      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9183                                        "rsqrtps", Iop_RSqrt32Fx4 );
9184      goto decode_success;
9185   }
9186
9187   /* F3 0F 52 = RSQRTSS -- approx reciprocal sqrt 32F0x4 from R/M to R */
9188   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x52) {
9189      vassert(sz == 4);
9190      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9191                                         "rsqrtss", Iop_RSqrt32F0x4 );
9192      goto decode_success;
9193   }
9194
9195   /* 0F AE /7 = SFENCE -- order pending stores to memory */
9196   if (insn[0] == 0x0F && insn[1] == 0xAE
9197       && epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
9198      vassert(sz == 4);
9199      delta += 3;
9200      /* Insert a memory fence.  It's sometimes important that these
9201         are carried through to the generated code. */
9202      stmt( IRStmt_MBE(Imbe_Fence) );
9203      DIP("sfence\n");
9204      goto decode_success;
9205   }
9206
9207   /* 0F C6 /r ib = SHUFPS -- shuffle packed F32s */
9208   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xC6) {
9209      Int    select;
9210      IRTemp sV, dV;
9211      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9212      sV = newTemp(Ity_V128);
9213      dV = newTemp(Ity_V128);
9214      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9215      modrm = insn[2];
9216      assign( dV, getXMMReg(gregOfRM(modrm)) );
9217
9218      if (epartIsReg(modrm)) {
9219         assign( sV, getXMMReg(eregOfRM(modrm)) );
9220         select = (Int)insn[3];
9221         delta += 2+2;
9222         DIP("shufps $%d,%s,%s\n", select,
9223                                   nameXMMReg(eregOfRM(modrm)),
9224                                   nameXMMReg(gregOfRM(modrm)));
9225      } else {
9226         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9227         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9228         select = (Int)insn[2+alen];
9229         delta += 3+alen;
9230         DIP("shufps $%d,%s,%s\n", select,
9231                                   dis_buf,
9232                                   nameXMMReg(gregOfRM(modrm)));
9233      }
9234
9235      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9236      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9237
9238#     define SELD(n) ((n)==0 ? d0 : ((n)==1 ? d1 : ((n)==2 ? d2 : d3)))
9239#     define SELS(n) ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
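      /* Per the Intel semantics, the two low result lanes are taken
         from the destination (G), selected by imm[1:0] and imm[3:2],
         and the two high result lanes from the source (E), selected by
         imm[5:4] and imm[7:6]; hence SELD for the low halves and SELS
         for the high halves below. */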
9240
9241      putXMMReg(
9242         gregOfRM(modrm),
9243         mk128from32s( SELS((select>>6)&3), SELS((select>>4)&3),
9244                       SELD((select>>2)&3), SELD((select>>0)&3) )
9245      );
9246
9247#     undef SELD
9248#     undef SELS
9249
9250      goto decode_success;
9251   }
9252
9253   /* 0F 51 = SQRTPS -- sqrt 32Fx4 from R/M to R */
9254   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x51) {
9255      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
9256                                        "sqrtps", Iop_Sqrt32Fx4 );
9257      goto decode_success;
9258   }
9259
9260   /* F3 0F 51 = SQRTSS -- sqrt 32F0x4 from R/M to R */
9261   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x51) {
9262      vassert(sz == 4);
9263      delta = dis_SSE_E_to_G_unary_lo32( sorb, delta+3,
9264                                         "sqrtss", Iop_Sqrt32F0x4 );
9265      goto decode_success;
9266   }
9267
9268   /* 0F AE /3 = STMXCSR m32 -- store %mxcsr */
9269   if (insn[0] == 0x0F && insn[1] == 0xAE
9270       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 3) {
9271      modrm = getIByte(delta+2);
9272      vassert(sz == 4);
9273      vassert(!epartIsReg(modrm));
9274
9275      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9276      delta += 2+alen;
9277
9278      /* Fake up a native SSE mxcsr word.  The only thing it depends
9279         on is SSEROUND[1:0], so call a clean helper to cook it up.
9280      */
9281      /* UInt x86g_create_mxcsr ( UInt sseround ) */
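      /* The helper (guest_x86_helpers.c) essentially returns a default
         mxcsr value -- all exceptions masked, no sticky flags set --
         with SSEROUND placed in the RC field (bits 14:13). */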
9282      DIP("stmxcsr %s\n", dis_buf);
9283      storeLE( mkexpr(addr),
9284               mkIRExprCCall(
9285                  Ity_I32, 0/*regp*/,
9286                  "x86g_create_mxcsr", &x86g_create_mxcsr,
9287                  mkIRExprVec_1( get_sse_roundingmode() )
9288               )
9289             );
9290      goto decode_success;
9291   }
9292
9293   /* 0F 5C = SUBPS -- sub 32Fx4 from R/M to R */
9294   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5C) {
9295      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subps", Iop_Sub32Fx4 );
9296      goto decode_success;
9297   }
9298
9299   /* F3 0F 5C = SUBSS -- sub 32F0x4 from R/M to R */
9300   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5C) {
9301      vassert(sz == 4);
9302      delta = dis_SSE_E_to_G_lo32( sorb, delta+3, "subss", Iop_Sub32F0x4 );
9303      goto decode_success;
9304   }
9305
9306   /* 0F 15 = UNPCKHPS -- unpack and interleave high part F32s */
9307   /* 0F 14 = UNPCKLPS -- unpack and interleave low part F32s */
9308   /* These just appear to be special cases of SHUFPS */
9309   if (sz == 4 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
9310      IRTemp sV, dV;
9311      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
9312      Bool hi = toBool(insn[1] == 0x15);
9313      sV = newTemp(Ity_V128);
9314      dV = newTemp(Ity_V128);
9315      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
9316      modrm = insn[2];
9317      assign( dV, getXMMReg(gregOfRM(modrm)) );
9318
9319      if (epartIsReg(modrm)) {
9320         assign( sV, getXMMReg(eregOfRM(modrm)) );
9321         delta += 2+1;
9322         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9323                                  nameXMMReg(eregOfRM(modrm)),
9324                                  nameXMMReg(gregOfRM(modrm)));
9325      } else {
9326         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9327         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
9328         delta += 2+alen;
9329         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
9330                                  dis_buf,
9331                                  nameXMMReg(gregOfRM(modrm)));
9332      }
9333
9334      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
9335      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
9336
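      /* Interleave as per the Intel docs: the "h" form merges the two
         upper F32 lanes of G and E (each E lane lands above the
         corresponding G lane), the "l" form the two lower lanes. */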
9337      if (hi) {
9338         putXMMReg( gregOfRM(modrm), mk128from32s( s3, d3, s2, d2 ) );
9339      } else {
9340         putXMMReg( gregOfRM(modrm), mk128from32s( s1, d1, s0, d0 ) );
9341      }
9342
9343      goto decode_success;
9344   }
9345
9346   /* 0F 57 = XORPS -- G = G xor E */
9347   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x57) {
9348      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorps", Iop_XorV128 );
9349      goto decode_success;
9350   }
9351
9352   /* ---------------------------------------------------- */
9353   /* --- end of the SSE decoder.                      --- */
9354   /* ---------------------------------------------------- */
9355
9356   /* ---------------------------------------------------- */
9357   /* --- start of the SSE2 decoder.                   --- */
9358   /* ---------------------------------------------------- */
9359
9360   /* Skip parts of the decoder which don't apply given the stated
9361      guest subarchitecture. */
9362   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
9363      goto after_sse_decoders; /* no SSE2 capabilities */
9364
9365   insn = (UChar*)&guest_code[delta];
9366
9367   /* 66 0F 58 = ADDPD -- add 64Fx2 from R/M to R */
9368   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x58) {
9369      delta = dis_SSE_E_to_G_all( sorb, delta+2, "addpd", Iop_Add64Fx2 );
9370      goto decode_success;
9371   }
9372
9373   /* F2 0F 58 = ADDSD -- add 64F0x2 from R/M to R */
9374   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x58) {
9375      vassert(sz == 4);
9376      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "addsd", Iop_Add64F0x2 );
9377      goto decode_success;
9378   }
9379
9380   /* 66 0F 55 = ANDNPD -- G = (not G) and E */
9381   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x55) {
9382      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "andnpd", Iop_AndV128 );
9383      goto decode_success;
9384   }
9385
9386   /* 66 0F 54 = ANDPD -- G = G and E */
9387   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x54) {
9388      delta = dis_SSE_E_to_G_all( sorb, delta+2, "andpd", Iop_AndV128 );
9389      goto decode_success;
9390   }
9391
9392   /* 66 0F C2 = CMPPD -- 64Fx2 comparison from R/M to R */
9393   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC2) {
9394      delta = dis_SSEcmp_E_to_G( sorb, delta+2, "cmppd", True, 8 );
9395      goto decode_success;
9396   }
9397
9398   /* F2 0F C2 = CMPSD -- 64F0x2 comparison from R/M to R */
9399   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xC2) {
9400      vassert(sz == 4);
9401      delta = dis_SSEcmp_E_to_G( sorb, delta+3, "cmpsd", False, 8 );
9402      goto decode_success;
9403   }
9404
9405   /* 66 0F 2F = COMISD  -- 64F0x2 comparison G,E, and set ZCP */
9406   /* 66 0F 2E = UCOMISD -- 64F0x2 comparison G,E, and set ZCP */
9407   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2F || insn[1] == 0x2E)) {
9408      IRTemp argL = newTemp(Ity_F64);
9409      IRTemp argR = newTemp(Ity_F64);
9410      modrm = getIByte(delta+2);
9411      if (epartIsReg(modrm)) {
9412         assign( argR, getXMMRegLane64F( eregOfRM(modrm), 0/*lowest lane*/ ) );
9413         delta += 2+1;
9414         DIP("[u]comisd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9415                                  nameXMMReg(gregOfRM(modrm)) );
9416      } else {
9417         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9418         assign( argR, loadLE(Ity_F64, mkexpr(addr)) );
9419         delta += 2+alen;
9420         DIP("[u]comisd %s,%s\n", dis_buf,
9421                                  nameXMMReg(gregOfRM(modrm)) );
9422      }
9423      assign( argL, getXMMRegLane64F( gregOfRM(modrm), 0/*lowest lane*/ ) );
9424
9425      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
9426      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
9427      stmt( IRStmt_Put(
9428               OFFB_CC_DEP1,
9429               binop( Iop_And32,
9430                      binop(Iop_CmpF64, mkexpr(argL), mkexpr(argR)),
9431                      mkU32(0x45)
9432          )));
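      /* The IRCmpF64Result encoding (libvex_ir.h) is arranged so that
         EQ = 0x40, LT = 0x01, UN = 0x45 and GT = 0x00, i.e. it already
         lines up with the x86 ZF (0x40), PF (0x04) and CF (0x01) bits.
         ANDing with 0x45 therefore leaves exactly the ZF/PF/CF pattern
         [U]COMISD must produce, with OF/SF/AF cleared. */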
9433      /* Set NDEP even though it isn't used.  This makes redundant-PUT
9434         elimination of previous stores to this field work better. */
9435      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
9436      goto decode_success;
9437   }
9438
9439   /* F3 0F E6 = CVTDQ2PD -- convert 2 x I32 in mem/lo half xmm to 2 x
9440      F64 in xmm(G) */
9441   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xE6) {
9442      IRTemp arg64 = newTemp(Ity_I64);
9443      vassert(sz == 4);
9444
9445      modrm = getIByte(delta+3);
9446      if (epartIsReg(modrm)) {
9447         assign( arg64, getXMMRegLane64(eregOfRM(modrm), 0) );
9448         delta += 3+1;
9449         DIP("cvtdq2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9450                                 nameXMMReg(gregOfRM(modrm)));
9451      } else {
9452         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9453         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9454         delta += 3+alen;
9455         DIP("cvtdq2pd %s,%s\n", dis_buf,
9456                                 nameXMMReg(gregOfRM(modrm)) );
9457      }
9458
9459      putXMMRegLane64F(
9460         gregOfRM(modrm), 0,
9461         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)))
9462      );
9463
9464      putXMMRegLane64F(
9465         gregOfRM(modrm), 1,
9466         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)))
9467      );
9468
9469      goto decode_success;
9470   }
9471
9472   /* 0F 5B = CVTDQ2PS -- convert 4 x I32 in mem/xmm to 4 x F32 in
9473      xmm(G) */
9474   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5B) {
9475      IRTemp argV  = newTemp(Ity_V128);
9476      IRTemp rmode = newTemp(Ity_I32);
9477
9478      modrm = getIByte(delta+2);
9479      if (epartIsReg(modrm)) {
9480         assign( argV, getXMMReg(eregOfRM(modrm)) );
9481         delta += 2+1;
9482         DIP("cvtdq2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9483                                 nameXMMReg(gregOfRM(modrm)));
9484      } else {
9485         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9486         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9487         delta += 2+alen;
9488         DIP("cvtdq2ps %s,%s\n", dis_buf,
9489                                 nameXMMReg(gregOfRM(modrm)) );
9490      }
9491
9492      assign( rmode, get_sse_roundingmode() );
9493      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9494
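      /* Convert each lane by widening exactly to F64 and then narrowing
         once to F32 under the prevailing SSE rounding mode; since
         I32->F64 is exact, the single F64->F32 rounding yields the
         correctly rounded result. */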
9495#     define CVT(_t)  binop( Iop_F64toF32,                    \
9496                             mkexpr(rmode),                   \
9497                             unop(Iop_I32StoF64,mkexpr(_t)))
9498
9499      putXMMRegLane32F( gregOfRM(modrm), 3, CVT(t3) );
9500      putXMMRegLane32F( gregOfRM(modrm), 2, CVT(t2) );
9501      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
9502      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
9503
9504#     undef CVT
9505
9506      goto decode_success;
9507   }
9508
9509   /* F2 0F E6 = CVTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
9510      lo half xmm(G), and zero upper half */
9511   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xE6) {
9512      IRTemp argV  = newTemp(Ity_V128);
9513      IRTemp rmode = newTemp(Ity_I32);
9514      vassert(sz == 4);
9515
9516      modrm = getIByte(delta+3);
9517      if (epartIsReg(modrm)) {
9518         assign( argV, getXMMReg(eregOfRM(modrm)) );
9519         delta += 3+1;
9520         DIP("cvtpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9521                                 nameXMMReg(gregOfRM(modrm)));
9522      } else {
9523         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9524         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9525         delta += 3+alen;
9526         DIP("cvtpd2dq %s,%s\n", dis_buf,
9527                                 nameXMMReg(gregOfRM(modrm)) );
9528      }
9529
9530      assign( rmode, get_sse_roundingmode() );
9531      t0 = newTemp(Ity_F64);
9532      t1 = newTemp(Ity_F64);
9533      assign( t0, unop(Iop_ReinterpI64asF64,
9534                       unop(Iop_V128to64, mkexpr(argV))) );
9535      assign( t1, unop(Iop_ReinterpI64asF64,
9536                       unop(Iop_V128HIto64, mkexpr(argV))) );
9537
9538#     define CVT(_t)  binop( Iop_F64toI32S,                   \
9539                             mkexpr(rmode),                   \
9540                             mkexpr(_t) )
9541
9542      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
9543      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
9544      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9545      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9546
9547#     undef CVT
9548
9549      goto decode_success;
9550   }
9551
9552   /* 66 0F 2D = CVTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9553      I32 in mmx, according to prevailing SSE rounding mode */
9554   /* 66 0F 2C = CVTTPD2PI -- convert 2 x F64 in mem/xmm to 2 x
9555      I32 in mmx, rounding towards zero */
9556   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x2D || insn[1] == 0x2C)) {
9557      IRTemp dst64  = newTemp(Ity_I64);
9558      IRTemp rmode  = newTemp(Ity_I32);
9559      IRTemp f64lo  = newTemp(Ity_F64);
9560      IRTemp f64hi  = newTemp(Ity_F64);
9561      Bool   r2zero = toBool(insn[1] == 0x2C);
9562
9563      do_MMX_preamble();
9564      modrm = getIByte(delta+2);
9565
9566      if (epartIsReg(modrm)) {
9567         delta += 2+1;
9568         assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9569         assign(f64hi, getXMMRegLane64F(eregOfRM(modrm), 1));
9570         DIP("cvt%spd2pi %s,%s\n", r2zero ? "t" : "",
9571                                   nameXMMReg(eregOfRM(modrm)),
9572                                   nameMMXReg(gregOfRM(modrm)));
9573      } else {
9574         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9575         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9576         assign(f64hi, loadLE(Ity_F64, binop( Iop_Add32,
9577                                              mkexpr(addr),
9578                                              mkU32(8) )));
9579         delta += 2+alen;
9580         DIP("cvt%spf2pi %s,%s\n", r2zero ? "t" : "",
9581                                   dis_buf,
9582                                   nameMMXReg(gregOfRM(modrm)));
9583      }
9584
9585      if (r2zero) {
9586         assign(rmode, mkU32((UInt)Irrm_ZERO) );
9587      } else {
9588         assign( rmode, get_sse_roundingmode() );
9589      }
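      /* CVTTPD2PI truncates (rounds towards zero) regardless of
         MXCSR.RC, whereas CVTPD2PI honours the prevailing SSE rounding
         mode; hence the forced Irrm_ZERO in the r2zero case. */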
9590
9591      assign(
9592         dst64,
9593         binop( Iop_32HLto64,
9594                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64hi) ),
9595                binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo) )
9596              )
9597      );
9598
9599      putMMXReg(gregOfRM(modrm), mkexpr(dst64));
9600      goto decode_success;
9601   }
9602
9603   /* 66 0F 5A = CVTPD2PS -- convert 2 x F64 in mem/xmm to 2 x F32 in
9604      lo half xmm(G), and zero upper half */
9605   /* Note, this is practically identical to CVTPD2DQ.  It would have
9606      been nicer to merge them together, but the insn[] offsets differ
9607      by one. */
9608   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5A) {
9609      IRTemp argV  = newTemp(Ity_V128);
9610      IRTemp rmode = newTemp(Ity_I32);
9611
9612      modrm = getIByte(delta+2);
9613      if (epartIsReg(modrm)) {
9614         assign( argV, getXMMReg(eregOfRM(modrm)) );
9615         delta += 2+1;
9616         DIP("cvtpd2ps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9617                                 nameXMMReg(gregOfRM(modrm)));
9618      } else {
9619         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9620         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9621         delta += 2+alen;
9622         DIP("cvtpd2ps %s,%s\n", dis_buf,
9623                                 nameXMMReg(gregOfRM(modrm)) );
9624      }
9625
9626      assign( rmode, get_sse_roundingmode() );
9627      t0 = newTemp(Ity_F64);
9628      t1 = newTemp(Ity_F64);
9629      assign( t0, unop(Iop_ReinterpI64asF64,
9630                       unop(Iop_V128to64, mkexpr(argV))) );
9631      assign( t1, unop(Iop_ReinterpI64asF64,
9632                       unop(Iop_V128HIto64, mkexpr(argV))) );
9633
9634#     define CVT(_t)  binop( Iop_F64toF32,                    \
9635                             mkexpr(rmode),                   \
9636                             mkexpr(_t) )
9637
9638      putXMMRegLane32(  gregOfRM(modrm), 3, mkU32(0) );
9639      putXMMRegLane32(  gregOfRM(modrm), 2, mkU32(0) );
9640      putXMMRegLane32F( gregOfRM(modrm), 1, CVT(t1) );
9641      putXMMRegLane32F( gregOfRM(modrm), 0, CVT(t0) );
9642
9643#     undef CVT
9644
9645      goto decode_success;
9646   }
9647
9648   /* 66 0F 2A = CVTPI2PD -- convert 2 x I32 in mem/mmx to 2 x F64 in
9649      xmm(G) */
9650   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x2A) {
9651      IRTemp arg64 = newTemp(Ity_I64);
9652
9653      modrm = getIByte(delta+2);
9654      if (epartIsReg(modrm)) {
9655         /* Only switch to MMX mode if the source is a MMX register.
9656            This is inconsistent with all other instructions which
9657            convert between XMM and (M64 or MMX), which always switch
9658            to MMX mode even if 64-bit operand is M64 and not MMX.  At
9659            least, that's what the Intel docs seem to me to say.
9660            Fixes #210264. */
9661         do_MMX_preamble();
9662         assign( arg64, getMMXReg(eregOfRM(modrm)) );
9663         delta += 2+1;
9664         DIP("cvtpi2pd %s,%s\n", nameMMXReg(eregOfRM(modrm)),
9665                                 nameXMMReg(gregOfRM(modrm)));
9666      } else {
9667         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9668         assign( arg64, loadLE(Ity_I64, mkexpr(addr)) );
9669         delta += 2+alen;
9670         DIP("cvtpi2pd %s,%s\n", dis_buf,
9671                                 nameXMMReg(gregOfRM(modrm)) );
9672      }
9673
9674      putXMMRegLane64F(
9675         gregOfRM(modrm), 0,
9676         unop(Iop_I32StoF64, unop(Iop_64to32, mkexpr(arg64)) )
9677      );
9678
9679      putXMMRegLane64F(
9680         gregOfRM(modrm), 1,
9681         unop(Iop_I32StoF64, unop(Iop_64HIto32, mkexpr(arg64)) )
9682      );
9683
9684      goto decode_success;
9685   }
9686
9687   /* 66 0F 5B = CVTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
9688      xmm(G) */
9689   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5B) {
9690      IRTemp argV  = newTemp(Ity_V128);
9691      IRTemp rmode = newTemp(Ity_I32);
9692
9693      modrm = getIByte(delta+2);
9694      if (epartIsReg(modrm)) {
9695         assign( argV, getXMMReg(eregOfRM(modrm)) );
9696         delta += 2+1;
9697         DIP("cvtps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9698                                 nameXMMReg(gregOfRM(modrm)));
9699      } else {
9700         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9701         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9702         delta += 2+alen;
9703         DIP("cvtps2dq %s,%s\n", dis_buf,
9704                                 nameXMMReg(gregOfRM(modrm)) );
9705      }
9706
9707      assign( rmode, get_sse_roundingmode() );
9708      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9709
9710      /* This is less than ideal.  If it turns out to be a performance
9711         bottleneck it can be improved. */
9712#     define CVT(_t)                            \
9713        binop( Iop_F64toI32S,                   \
9714               mkexpr(rmode),                   \
9715               unop( Iop_F32toF64,              \
9716                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
9717
9718      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
9719      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
9720      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9721      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9722
9723#     undef CVT
9724
9725      goto decode_success;
9726   }
9727
9728   /* 0F 5A = CVTPS2PD -- convert 2 x F32 in low half mem/xmm to 2 x
9729      F64 in xmm(G). */
9730   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0x5A) {
9731      IRTemp f32lo = newTemp(Ity_F32);
9732      IRTemp f32hi = newTemp(Ity_F32);
9733
9734      modrm = getIByte(delta+2);
9735      if (epartIsReg(modrm)) {
9736         assign( f32lo, getXMMRegLane32F(eregOfRM(modrm), 0) );
9737         assign( f32hi, getXMMRegLane32F(eregOfRM(modrm), 1) );
9738         delta += 2+1;
9739         DIP("cvtps2pd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9740                                 nameXMMReg(gregOfRM(modrm)));
9741      } else {
9742         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9743         assign( f32lo, loadLE(Ity_F32, mkexpr(addr)) );
9744         assign( f32hi, loadLE(Ity_F32,
9745                               binop(Iop_Add32,mkexpr(addr),mkU32(4))) );
9746         delta += 2+alen;
9747         DIP("cvtps2pd %s,%s\n", dis_buf,
9748                                 nameXMMReg(gregOfRM(modrm)) );
9749      }
9750
9751      putXMMRegLane64F( gregOfRM(modrm), 1,
9752                        unop(Iop_F32toF64, mkexpr(f32hi)) );
9753      putXMMRegLane64F( gregOfRM(modrm), 0,
9754                        unop(Iop_F32toF64, mkexpr(f32lo)) );
9755
9756      goto decode_success;
9757   }
9758
9759   /* F2 0F 2D = CVTSD2SI -- convert F64 in mem/low half xmm to
9760      I32 in ireg, according to prevailing SSE rounding mode */
9761   /* F2 0F 2C = CVTTSD2SI -- convert F64 in mem/low half xmm to
9762      I32 in ireg, rounding towards zero */
9763   if (insn[0] == 0xF2 && insn[1] == 0x0F
9764       && (insn[2] == 0x2D || insn[2] == 0x2C)) {
9765      IRTemp rmode = newTemp(Ity_I32);
9766      IRTemp f64lo = newTemp(Ity_F64);
9767      Bool   r2zero = toBool(insn[2] == 0x2C);
9768      vassert(sz == 4);
9769
9770      modrm = getIByte(delta+3);
9771      if (epartIsReg(modrm)) {
9772         delta += 3+1;
9773         assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9774         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
9775                                   nameXMMReg(eregOfRM(modrm)),
9776                                   nameIReg(4, gregOfRM(modrm)));
9777      } else {
9778         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9779         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9780         delta += 3+alen;
9781         DIP("cvt%ssd2si %s,%s\n", r2zero ? "t" : "",
9782                                   dis_buf,
9783                                   nameIReg(4, gregOfRM(modrm)));
9784      }
9785
9786      if (r2zero) {
9787         assign( rmode, mkU32((UInt)Irrm_ZERO) );
9788      } else {
9789         assign( rmode, get_sse_roundingmode() );
9790      }
9791
9792      putIReg(4, gregOfRM(modrm),
9793                 binop( Iop_F64toI32S, mkexpr(rmode), mkexpr(f64lo)) );
9794
9795      goto decode_success;
9796   }
9797
9798   /* F2 0F 5A = CVTSD2SS -- convert F64 in mem/low half xmm to F32 in
9799      low 1/4 xmm(G), according to prevailing SSE rounding mode */
9800   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5A) {
9801      IRTemp rmode = newTemp(Ity_I32);
9802      IRTemp f64lo = newTemp(Ity_F64);
9803      vassert(sz == 4);
9804
9805      modrm = getIByte(delta+3);
9806      if (epartIsReg(modrm)) {
9807         delta += 3+1;
9808         assign(f64lo, getXMMRegLane64F(eregOfRM(modrm), 0));
9809         DIP("cvtsd2ss %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9810                                 nameXMMReg(gregOfRM(modrm)));
9811      } else {
9812         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9813         assign(f64lo, loadLE(Ity_F64, mkexpr(addr)));
9814         delta += 3+alen;
9815         DIP("cvtsd2ss %s,%s\n", dis_buf,
9816                                 nameXMMReg(gregOfRM(modrm)));
9817      }
9818
9819      assign( rmode, get_sse_roundingmode() );
9820      putXMMRegLane32F(
9821         gregOfRM(modrm), 0,
9822         binop( Iop_F64toF32, mkexpr(rmode), mkexpr(f64lo) )
9823      );
9824
9825      goto decode_success;
9826   }
9827
9828   /* F2 0F 2A = CVTSI2SD -- convert I32 in mem/ireg to F64 in low
9829      half xmm */
9830   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x2A) {
9831      IRTemp arg32 = newTemp(Ity_I32);
9832      vassert(sz == 4);
9833
9834      modrm = getIByte(delta+3);
9835      if (epartIsReg(modrm)) {
9836         assign( arg32, getIReg(4, eregOfRM(modrm)) );
9837         delta += 3+1;
9838         DIP("cvtsi2sd %s,%s\n", nameIReg(4, eregOfRM(modrm)),
9839                                 nameXMMReg(gregOfRM(modrm)));
9840      } else {
9841         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9842         assign( arg32, loadLE(Ity_I32, mkexpr(addr)) );
9843         delta += 3+alen;
9844         DIP("cvtsi2sd %s,%s\n", dis_buf,
9845                                 nameXMMReg(gregOfRM(modrm)) );
9846      }
9847
9848      putXMMRegLane64F(
9849         gregOfRM(modrm), 0,
9850         unop(Iop_I32StoF64, mkexpr(arg32)) );
9851
9852      goto decode_success;
9853   }
9854
9855   /* F3 0F 5A = CVTSS2SD -- convert F32 in mem/low 1/4 xmm to F64 in
9856      low half xmm(G) */
9857   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5A) {
9858      IRTemp f32lo = newTemp(Ity_F32);
9859      vassert(sz == 4);
9860
9861      modrm = getIByte(delta+3);
9862      if (epartIsReg(modrm)) {
9863         delta += 3+1;
9864         assign(f32lo, getXMMRegLane32F(eregOfRM(modrm), 0));
9865         DIP("cvtss2sd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9866                                 nameXMMReg(gregOfRM(modrm)));
9867      } else {
9868         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9869         assign(f32lo, loadLE(Ity_F32, mkexpr(addr)));
9870         delta += 3+alen;
9871         DIP("cvtss2sd %s,%s\n", dis_buf,
9872                                 nameXMMReg(gregOfRM(modrm)));
9873      }
9874
9875      putXMMRegLane64F( gregOfRM(modrm), 0,
9876                        unop( Iop_F32toF64, mkexpr(f32lo) ) );
9877
9878      goto decode_success;
9879   }
9880
9881   /* 66 0F E6 = CVTTPD2DQ -- convert 2 x F64 in mem/xmm to 2 x I32 in
9882      lo half xmm(G), and zero upper half, rounding towards zero */
9883   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE6) {
9884      IRTemp argV  = newTemp(Ity_V128);
9885      IRTemp rmode = newTemp(Ity_I32);
9886
9887      modrm = getIByte(delta+2);
9888      if (epartIsReg(modrm)) {
9889         assign( argV, getXMMReg(eregOfRM(modrm)) );
9890         delta += 2+1;
9891         DIP("cvttpd2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9892                                  nameXMMReg(gregOfRM(modrm)));
9893      } else {
9894         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
9895         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9896         delta += 2+alen;
9897         DIP("cvttpd2dq %s,%s\n", dis_buf,
9898                                  nameXMMReg(gregOfRM(modrm)) );
9899      }
9900
9901      assign( rmode, mkU32((UInt)Irrm_ZERO) );
9902
9903      t0 = newTemp(Ity_F64);
9904      t1 = newTemp(Ity_F64);
9905      assign( t0, unop(Iop_ReinterpI64asF64,
9906                       unop(Iop_V128to64, mkexpr(argV))) );
9907      assign( t1, unop(Iop_ReinterpI64asF64,
9908                       unop(Iop_V128HIto64, mkexpr(argV))) );
9909
9910#     define CVT(_t)  binop( Iop_F64toI32S,                   \
9911                             mkexpr(rmode),                   \
9912                             mkexpr(_t) )
9913
9914      putXMMRegLane32( gregOfRM(modrm), 3, mkU32(0) );
9915      putXMMRegLane32( gregOfRM(modrm), 2, mkU32(0) );
9916      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9917      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9918
9919#     undef CVT
9920
9921      goto decode_success;
9922   }
9923
9924   /* F3 0F 5B = CVTTPS2DQ -- convert 4 x F32 in mem/xmm to 4 x I32 in
9925      xmm(G), rounding towards zero */
9926   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x5B) {
9927      IRTemp argV  = newTemp(Ity_V128);
9928      IRTemp rmode = newTemp(Ity_I32);
9929      vassert(sz == 4);
9930
9931      modrm = getIByte(delta+3);
9932      if (epartIsReg(modrm)) {
9933         assign( argV, getXMMReg(eregOfRM(modrm)) );
9934         delta += 3+1;
9935         DIP("cvttps2dq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
9936                                  nameXMMReg(gregOfRM(modrm)));
9937      } else {
9938         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
9939         assign( argV, loadLE(Ity_V128, mkexpr(addr)) );
9940         delta += 3+alen;
9941         DIP("cvttps2dq %s,%s\n", dis_buf,
9942                                  nameXMMReg(gregOfRM(modrm)) );
9943      }
9944
9945      assign( rmode, mkU32((UInt)Irrm_ZERO) );
9946      breakup128to32s( argV, &t3, &t2, &t1, &t0 );
9947
9948      /* This is less than ideal.  If it turns out to be a performance
9949         bottleneck it can be improved. */
9950#     define CVT(_t)                            \
9951        binop( Iop_F64toI32S,                   \
9952               mkexpr(rmode),                   \
9953               unop( Iop_F32toF64,              \
9954                     unop( Iop_ReinterpI32asF32, mkexpr(_t))) )
9955
9956      putXMMRegLane32( gregOfRM(modrm), 3, CVT(t3) );
9957      putXMMRegLane32( gregOfRM(modrm), 2, CVT(t2) );
9958      putXMMRegLane32( gregOfRM(modrm), 1, CVT(t1) );
9959      putXMMRegLane32( gregOfRM(modrm), 0, CVT(t0) );
9960
9961#     undef CVT
9962
9963      goto decode_success;
9964   }
9965
9966   /* 66 0F 5E = DIVPD -- div 64Fx2 from R/M to R */
9967   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5E) {
9968      delta = dis_SSE_E_to_G_all( sorb, delta+2, "divpd", Iop_Div64Fx2 );
9969      goto decode_success;
9970   }
9971
9972   /* F2 0F 5E = DIVSD -- div 64F0x2 from R/M to R */
9973   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5E) {
9974      vassert(sz == 4);
9975      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "divsd", Iop_Div64F0x2 );
9976      goto decode_success;
9977   }
9978
9979   /* 0F AE /5 = LFENCE -- order pending loads */
9980   /* 0F AE /6 = MFENCE -- order all pending memory operations */
9981   if (insn[0] == 0x0F && insn[1] == 0xAE
9982       && epartIsReg(insn[2])
9983       && (gregOfRM(insn[2]) == 5 || gregOfRM(insn[2]) == 6)) {
9984      vassert(sz == 4);
9985      delta += 3;
9986      /* Insert a memory fence.  It's sometimes important that these
9987         are carried through to the generated code. */
9988      stmt( IRStmt_MBE(Imbe_Fence) );
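      /* Both are rendered as the same Imbe_Fence used for SFENCE above;
         the IR does not distinguish load, store and full fences, so
         this is a conservative (full-barrier) treatment. */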
9989      DIP("%sfence\n", gregOfRM(insn[2])==5 ? "l" : "m");
9990      goto decode_success;
9991   }
9992
9993   /* 66 0F 5F = MAXPD -- max 64Fx2 from R/M to R */
9994   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5F) {
9995      delta = dis_SSE_E_to_G_all( sorb, delta+2, "maxpd", Iop_Max64Fx2 );
9996      goto decode_success;
9997   }
9998
9999   /* F2 0F 5F = MAXSD -- max 64F0x2 from R/M to R */
10000   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5F) {
10001      vassert(sz == 4);
10002      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "maxsd", Iop_Max64F0x2 );
10003      goto decode_success;
10004   }
10005
10006   /* 66 0F 5D = MINPD -- min 64Fx2 from R/M to R */
10007   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5D) {
10008      delta = dis_SSE_E_to_G_all( sorb, delta+2, "minpd", Iop_Min64Fx2 );
10009      goto decode_success;
10010   }
10011
10012   /* F2 0F 5D = MINSD -- min 64F0x2 from R/M to R */
10013   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5D) {
10014      vassert(sz == 4);
10015      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "minsd", Iop_Min64F0x2 );
10016      goto decode_success;
10017   }
10018
10019   /* 66 0F 28 = MOVAPD -- move from E (mem or xmm) to G (xmm). */
10020   /* 66 0F 10 = MOVUPD -- move from E (mem or xmm) to G (xmm). */
10021   /* 66 0F 6F = MOVDQA -- move from E (mem or xmm) to G (xmm). */
10022   if (sz == 2 && insn[0] == 0x0F
10023       && (insn[1] == 0x28 || insn[1] == 0x10 || insn[1] == 0x6F)) {
10024      HChar* wot = insn[1]==0x28 ? "apd" :
10025                   insn[1]==0x10 ? "upd" : "dqa";
10026      modrm = getIByte(delta+2);
10027      if (epartIsReg(modrm)) {
10028         putXMMReg( gregOfRM(modrm),
10029                    getXMMReg( eregOfRM(modrm) ));
10030         DIP("mov%s %s,%s\n", wot, nameXMMReg(eregOfRM(modrm)),
10031                                   nameXMMReg(gregOfRM(modrm)));
10032         delta += 2+1;
10033      } else {
10034         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10035         if (insn[1] == 0x28/*movapd*/ || insn[1] == 0x6F/*movdqa*/)
10036            gen_SEGV_if_not_16_aligned( addr );
10037         putXMMReg( gregOfRM(modrm),
10038                    loadLE(Ity_V128, mkexpr(addr)) );
10039         DIP("mov%s %s,%s\n", wot, dis_buf,
10040                                   nameXMMReg(gregOfRM(modrm)));
10041         delta += 2+alen;
10042      }
10043      goto decode_success;
10044   }
10045
10046   /* 66 0F 29 = MOVAPD -- move from G (xmm) to E (mem or xmm). */
10047   /* 66 0F 11 = MOVUPD -- move from G (xmm) to E (mem or xmm). */
10048   if (sz == 2 && insn[0] == 0x0F
10049       && (insn[1] == 0x29 || insn[1] == 0x11)) {
10050      HChar* wot = insn[1]==0x29 ? "apd" : "upd";
10051      modrm = getIByte(delta+2);
10052      if (epartIsReg(modrm)) {
10053         /* fall through; awaiting test case */
10054      } else {
10055         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10056         if (insn[1] == 0x29/*movapd*/)
10057            gen_SEGV_if_not_16_aligned( addr );
10058         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10059         DIP("mov%s %s,%s\n", wot, nameXMMReg(gregOfRM(modrm)),
10060                                   dis_buf );
10061         delta += 2+alen;
10062         goto decode_success;
10063      }
10064   }
10065
10066   /* 66 0F 6E = MOVD from r/m32 to xmm, zeroing high 3/4 of xmm. */
10067   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6E) {
10068      modrm = getIByte(delta+2);
10069      if (epartIsReg(modrm)) {
10070         delta += 2+1;
10071         putXMMReg(
10072            gregOfRM(modrm),
10073            unop( Iop_32UtoV128, getIReg(4, eregOfRM(modrm)) )
10074         );
10075         DIP("movd %s, %s\n",
10076             nameIReg(4,eregOfRM(modrm)), nameXMMReg(gregOfRM(modrm)));
10077      } else {
10078         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10079         delta += 2+alen;
10080         putXMMReg(
10081            gregOfRM(modrm),
10082            unop( Iop_32UtoV128,loadLE(Ity_I32, mkexpr(addr)) )
10083         );
10084         DIP("movd %s, %s\n", dis_buf, nameXMMReg(gregOfRM(modrm)));
10085      }
10086      goto decode_success;
10087   }
10088
10089   /* 66 0F 7E = MOVD from xmm low 1/4 to r/m32. */
10090   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7E) {
10091      modrm = getIByte(delta+2);
10092      if (epartIsReg(modrm)) {
10093         delta += 2+1;
10094         putIReg( 4, eregOfRM(modrm),
10095                  getXMMRegLane32(gregOfRM(modrm), 0) );
10096         DIP("movd %s, %s\n",
10097             nameXMMReg(gregOfRM(modrm)), nameIReg(4,eregOfRM(modrm)));
10098      } else {
10099         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10100         delta += 2+alen;
10101         storeLE( mkexpr(addr),
10102                  getXMMRegLane32(gregOfRM(modrm), 0) );
10103         DIP("movd %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10104      }
10105      goto decode_success;
10106   }
10107
10108   /* 66 0F 7F = MOVDQA -- move from G (xmm) to E (mem or xmm). */
10109   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x7F) {
10110      modrm = getIByte(delta+2);
10111      if (epartIsReg(modrm)) {
10112         delta += 2+1;
10113         putXMMReg( eregOfRM(modrm),
10114                    getXMMReg(gregOfRM(modrm)) );
10115         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10116                                nameXMMReg(eregOfRM(modrm)));
10117      } else {
10118         addr = disAMode( &alen, sorb, delta+2, dis_buf );
10119         delta += 2+alen;
10120         gen_SEGV_if_not_16_aligned( addr );
10121         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10122         DIP("movdqa %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10123      }
10124      goto decode_success;
10125   }
10126
10127   /* F3 0F 6F = MOVDQU -- move from E (mem or xmm) to G (xmm). */
10128   /* Unfortunately can't simply use the MOVDQA case since the
10129      prefix lengths are different (66 vs F3) */
10130   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x6F) {
10131      vassert(sz == 4);
10132      modrm = getIByte(delta+3);
10133      if (epartIsReg(modrm)) {
10134         putXMMReg( gregOfRM(modrm),
10135                    getXMMReg( eregOfRM(modrm) ));
10136         DIP("movdqu %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10137                               nameXMMReg(gregOfRM(modrm)));
10138         delta += 3+1;
10139      } else {
10140         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10141         putXMMReg( gregOfRM(modrm),
10142                    loadLE(Ity_V128, mkexpr(addr)) );
10143         DIP("movdqu %s,%s\n", dis_buf,
10144                               nameXMMReg(gregOfRM(modrm)));
10145         delta += 3+alen;
10146      }
10147      goto decode_success;
10148   }
10149
10150   /* F3 0F 7F = MOVDQU -- move from G (xmm) to E (mem or xmm). */
10151   /* Unfortunately can't simply use the MOVDQA case since the
10152      prefix lengths are different (66 vs F3) */
10153   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7F) {
10154      vassert(sz == 4);
10155      modrm = getIByte(delta+3);
10156      if (epartIsReg(modrm)) {
10157         delta += 3+1;
10158         putXMMReg( eregOfRM(modrm),
10159                    getXMMReg(gregOfRM(modrm)) );
10160         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)),
10161                                nameXMMReg(eregOfRM(modrm)));
10162      } else {
10163         addr = disAMode( &alen, sorb, delta+3, dis_buf );
10164         delta += 3+alen;
10165         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10166         DIP("movdqu %s, %s\n", nameXMMReg(gregOfRM(modrm)), dis_buf);
10167      }
10168      goto decode_success;
10169   }
10170
10171   /* F2 0F D6 = MOVDQ2Q -- move from E (lo half xmm, not mem) to G (mmx). */
10172   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD6) {
10173      vassert(sz == 4);
10174      modrm = getIByte(delta+3);
10175      if (epartIsReg(modrm)) {
10176         do_MMX_preamble();
10177         putMMXReg( gregOfRM(modrm),
10178                    getXMMRegLane64( eregOfRM(modrm), 0 ));
10179         DIP("movdq2q %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10180                                nameMMXReg(gregOfRM(modrm)));
10181         delta += 3+1;
10182         goto decode_success;
10183      } else {
10184         /* fall through, apparently no mem case for this insn */
10185      }
10186   }
10187
10188   /* 66 0F 16 = MOVHPD -- move from mem to high half of XMM. */
10189   /* This seems identical to MOVHPS.  This instruction encoding is
10190      completely crazy. */
10191   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x16) {
10192      modrm = getIByte(delta+2);
10193      if (epartIsReg(modrm)) {
10194         /* fall through; apparently reg-reg is not possible */
10195      } else {
10196         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10197         delta += 2+alen;
10198         putXMMRegLane64( gregOfRM(modrm), 1/*upper lane*/,
10199                          loadLE(Ity_I64, mkexpr(addr)) );
10200         DIP("movhpd %s,%s\n", dis_buf,
10201                               nameXMMReg( gregOfRM(modrm) ));
10202         goto decode_success;
10203      }
10204   }
10205
10206   /* 66 0F 17 = MOVHPD -- move from high half of XMM to mem. */
10207   /* Again, this seems identical to MOVHPS. */
10208   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x17) {
10209      if (!epartIsReg(insn[2])) {
10210         delta += 2;
10211         addr = disAMode ( &alen, sorb, delta, dis_buf );
10212         delta += alen;
10213         storeLE( mkexpr(addr),
10214                  getXMMRegLane64( gregOfRM(insn[2]),
10215                                   1/*upper lane*/ ) );
10216         DIP("movhpd %s,%s\n", nameXMMReg( gregOfRM(insn[2]) ),
10217                               dis_buf);
10218         goto decode_success;
10219      }
10220      /* else fall through */
10221   }
10222
10223   /* 66 0F 12 = MOVLPD -- move from mem to low half of XMM. */
10224   /* Identical to MOVLPS ? */
10225   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x12) {
10226      modrm = getIByte(delta+2);
10227      if (epartIsReg(modrm)) {
10228         /* fall through; apparently reg-reg is not possible */
10229      } else {
10230         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10231         delta += 2+alen;
10232         putXMMRegLane64( gregOfRM(modrm),  0/*lower lane*/,
10233                          loadLE(Ity_I64, mkexpr(addr)) );
10234         DIP("movlpd %s, %s\n",
10235             dis_buf, nameXMMReg( gregOfRM(modrm) ));
10236         goto decode_success;
10237      }
10238   }
10239
10240   /* 66 0F 13 = MOVLPD -- move from low half of XMM to mem. */
10241   /* Identical to MOVLPS ? */
10242   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x13) {
10243      if (!epartIsReg(insn[2])) {
10244         delta += 2;
10245         addr = disAMode ( &alen, sorb, delta, dis_buf );
10246         delta += alen;
10247         storeLE( mkexpr(addr),
10248                  getXMMRegLane64( gregOfRM(insn[2]),
10249                                   0/*lower lane*/ ) );
10250         DIP("movlpd %s, %s\n", nameXMMReg( gregOfRM(insn[2]) ),
10251                                dis_buf);
10252         goto decode_success;
10253      }
10254      /* else fall through */
10255   }
10256
10257   /* 66 0F 50 = MOVMSKPD -- move 2 sign bits from 2 x F64 in xmm(E) to
10258      2 lowest bits of ireg(G) */
10259   if (insn[0] == 0x0F && insn[1] == 0x50) {
10260      modrm = getIByte(delta+2);
10261      if (sz == 2 && epartIsReg(modrm)) {
10262         Int src;
10263         t0 = newTemp(Ity_I32);
10264         t1 = newTemp(Ity_I32);
10265         delta += 2+1;
10266         src = eregOfRM(modrm);
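         /* t0: sign of the low F64 (bit 31 of 32-bit lane 1) moved down
            to bit 0.  t1: sign of the high F64 (bit 31 of lane 3) moved
            down to bit 1 (shift right by 30, mask with 2).  OR-ing them
            gives the 2-bit mask in the low bits of the destination
            ireg. */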
10267         assign( t0, binop( Iop_And32,
10268                            binop(Iop_Shr32, getXMMRegLane32(src,1), mkU8(31)),
10269                            mkU32(1) ));
10270         assign( t1, binop( Iop_And32,
10271                            binop(Iop_Shr32, getXMMRegLane32(src,3), mkU8(30)),
10272                            mkU32(2) ));
10273         putIReg(4, gregOfRM(modrm),
10274                    binop(Iop_Or32, mkexpr(t0), mkexpr(t1))
10275                 );
10276         DIP("movmskpd %s,%s\n", nameXMMReg(src),
10277                                 nameIReg(4, gregOfRM(modrm)));
10278         goto decode_success;
10279      }
10280      /* else fall through */
10281   }
10282
10283   /* 66 0F F7 = MASKMOVDQU -- store selected bytes of double quadword */
10284   if (insn[0] == 0x0F && insn[1] == 0xF7) {
10285      modrm = getIByte(delta+2);
10286      if (sz == 2 && epartIsReg(modrm)) {
10287         IRTemp regD    = newTemp(Ity_V128);
10288         IRTemp mask    = newTemp(Ity_V128);
10289         IRTemp olddata = newTemp(Ity_V128);
10290         IRTemp newdata = newTemp(Ity_V128);
10291                addr    = newTemp(Ity_I32);
10292
10293         assign( addr, handleSegOverride( sorb, getIReg(4, R_EDI) ));
10294         assign( regD, getXMMReg( gregOfRM(modrm) ));
10295
10296         /* Unfortunately can't do the obvious thing with SarN8x16
10297            here since that can't be re-emitted as SSE2 code - no such
10298            insn. */
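         /* SarN8x8 by 7 smears each byte's top bit, turning every byte
            of E into 0x00 or 0xFF; the resulting 16-byte mask then
            selects, byte by byte, between the data in G and the
            existing memory at [EDI].  The store is thus emulated as a
            read-modify-write of all 16 bytes rather than a
            byte-granular store. */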
10299         assign(
10300            mask,
10301            binop(Iop_64HLtoV128,
10302                  binop(Iop_SarN8x8,
10303                        getXMMRegLane64( eregOfRM(modrm), 1 ),
10304                        mkU8(7) ),
10305                  binop(Iop_SarN8x8,
10306                        getXMMRegLane64( eregOfRM(modrm), 0 ),
10307                        mkU8(7) ) ));
10308         assign( olddata, loadLE( Ity_V128, mkexpr(addr) ));
10309         assign( newdata,
10310                 binop(Iop_OrV128,
10311                       binop(Iop_AndV128,
10312                             mkexpr(regD),
10313                             mkexpr(mask) ),
10314                       binop(Iop_AndV128,
10315                             mkexpr(olddata),
10316                             unop(Iop_NotV128, mkexpr(mask)))) );
10317         storeLE( mkexpr(addr), mkexpr(newdata) );
10318
10319         delta += 2+1;
10320         DIP("maskmovdqu %s,%s\n", nameXMMReg( eregOfRM(modrm) ),
10321                                   nameXMMReg( gregOfRM(modrm) ) );
10322         goto decode_success;
10323      }
10324      /* else fall through */
10325   }
10326
10327   /* 66 0F E7 = MOVNTDQ -- for us, just a plain SSE store. */
10328   if (insn[0] == 0x0F && insn[1] == 0xE7) {
10329      modrm = getIByte(delta+2);
10330      if (sz == 2 && !epartIsReg(modrm)) {
10331         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10332         gen_SEGV_if_not_16_aligned( addr );
10333         storeLE( mkexpr(addr), getXMMReg(gregOfRM(modrm)) );
10334         DIP("movntdq %s,%s\n", dis_buf,
10335                                nameXMMReg(gregOfRM(modrm)));
10336         delta += 2+alen;
10337         goto decode_success;
10338      }
10339      /* else fall through */
10340   }
10341
10342   /* 0F C3 = MOVNTI -- for us, just a plain ireg store. */
10343   if (insn[0] == 0x0F && insn[1] == 0xC3) {
10344      vassert(sz == 4);
10345      modrm = getIByte(delta+2);
10346      if (!epartIsReg(modrm)) {
10347         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10348         storeLE( mkexpr(addr), getIReg(4, gregOfRM(modrm)) );
10349         DIP("movnti %s,%s\n", dis_buf,
10350                               nameIReg(4, gregOfRM(modrm)));
10351         delta += 2+alen;
10352         goto decode_success;
10353      }
10354      /* else fall through */
10355   }
10356
10357   /* 66 0F D6 = MOVQ -- move 64 bits from G (lo half xmm) to E (mem
10358      or lo half xmm).  */
10359   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD6) {
10360      modrm = getIByte(delta+2);
10361      if (epartIsReg(modrm)) {
10362         /* fall through, awaiting test case */
10363         /* dst: lo half copied, hi half zeroed */
10364      } else {
10365         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10366         storeLE( mkexpr(addr),
10367                  getXMMRegLane64( gregOfRM(modrm), 0 ));
10368         DIP("movq %s,%s\n", nameXMMReg(gregOfRM(modrm)), dis_buf );
10369         delta += 2+alen;
10370         goto decode_success;
10371      }
10372   }
10373
10374   /* F3 0F D6 = MOVQ2DQ -- move from E (mmx) to G (lo half xmm, zero
10375      hi half). */
10376   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xD6) {
10377      vassert(sz == 4);
10378      modrm = getIByte(delta+3);
10379      if (epartIsReg(modrm)) {
10380         do_MMX_preamble();
10381         putXMMReg( gregOfRM(modrm),
10382                    unop(Iop_64UtoV128, getMMXReg( eregOfRM(modrm) )) );
10383         DIP("movq2dq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
10384                                nameXMMReg(gregOfRM(modrm)));
10385         delta += 3+1;
10386         goto decode_success;
10387      } else {
10388         /* fall through, apparently no mem case for this insn */
10389      }
10390   }
10391
10392   /* F3 0F 7E = MOVQ -- move 64 bits from E (mem or lo half xmm) to
10393      G (lo half xmm).  Upper half of G is zeroed out. */
10394   /* F2 0F 10 = MOVSD -- move 64 bits from E (mem or lo half xmm) to
10395      G (lo half xmm).  If E is mem, upper half of G is zeroed out.
10396      If E is reg, upper half of G is unchanged. */
10397   if ((insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x10)
10398       || (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x7E)) {
10399      vassert(sz == 4);
10400      modrm = getIByte(delta+3);
10401      if (epartIsReg(modrm)) {
10402         putXMMRegLane64( gregOfRM(modrm), 0,
10403                          getXMMRegLane64( eregOfRM(modrm), 0 ));
10404         if (insn[0] == 0xF3/*MOVQ*/) {
10405            /* zero bits 127:64 */
10406            putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
10407         }
10408         DIP("movsd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10409                              nameXMMReg(gregOfRM(modrm)));
10410         delta += 3+1;
10411      } else {
10412         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10413         /* zero bits 127:64 */
10414         putXMMRegLane64( gregOfRM(modrm), 1, mkU64(0) );
10415         /* write bits 63:0 */
10416         putXMMRegLane64( gregOfRM(modrm), 0,
10417                          loadLE(Ity_I64, mkexpr(addr)) );
10418         DIP("movsd %s,%s\n", dis_buf,
10419                              nameXMMReg(gregOfRM(modrm)));
10420         delta += 3+alen;
10421      }
10422      goto decode_success;
10423   }
10424
10425   /* F2 0F 11 = MOVSD -- move 64 bits from G (lo half xmm) to E (mem
10426      or lo half xmm). */
10427   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x11) {
10428      vassert(sz == 4);
10429      modrm = getIByte(delta+3);
10430      if (epartIsReg(modrm)) {
10431         putXMMRegLane64( eregOfRM(modrm), 0,
10432                          getXMMRegLane64( gregOfRM(modrm), 0 ));
10433         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10434                              nameXMMReg(eregOfRM(modrm)));
10435         delta += 3+1;
10436      } else {
10437         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
10438         storeLE( mkexpr(addr),
10439                  getXMMRegLane64(gregOfRM(modrm), 0) );
10440         DIP("movsd %s,%s\n", nameXMMReg(gregOfRM(modrm)),
10441                              dis_buf);
10442         delta += 3+alen;
10443      }
10444      goto decode_success;
10445   }
10446
10447   /* 66 0F 59 = MULPD -- mul 64Fx2 from R/M to R */
10448   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x59) {
10449      delta = dis_SSE_E_to_G_all( sorb, delta+2, "mulpd", Iop_Mul64Fx2 );
10450      goto decode_success;
10451   }
10452
10453   /* F2 0F 59 = MULSD -- mul 64F0x2 from R/M to R */
10454   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x59) {
10455      vassert(sz == 4);
10456      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "mulsd", Iop_Mul64F0x2 );
10457      goto decode_success;
10458   }
10459
10460   /* 66 0F 56 = ORPD -- G = G or E */
10461   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x56) {
10462      delta = dis_SSE_E_to_G_all( sorb, delta+2, "orpd", Iop_OrV128 );
10463      goto decode_success;
10464   }
10465
10466   /* 66 0F C6 /r ib = SHUFPD -- shuffle packed F64s */
10467   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC6) {
10468      Int    select;
10469      IRTemp sV = newTemp(Ity_V128);
10470      IRTemp dV = newTemp(Ity_V128);
10471      IRTemp s1 = newTemp(Ity_I64);
10472      IRTemp s0 = newTemp(Ity_I64);
10473      IRTemp d1 = newTemp(Ity_I64);
10474      IRTemp d0 = newTemp(Ity_I64);
10475
10476      modrm = insn[2];
10477      assign( dV, getXMMReg(gregOfRM(modrm)) );
10478
10479      if (epartIsReg(modrm)) {
10480         assign( sV, getXMMReg(eregOfRM(modrm)) );
10481         select = (Int)insn[3];
10482         delta += 2+2;
10483         DIP("shufpd $%d,%s,%s\n", select,
10484                                   nameXMMReg(eregOfRM(modrm)),
10485                                   nameXMMReg(gregOfRM(modrm)));
10486      } else {
10487         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10488         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10489         select = (Int)insn[2+alen];
10490         delta += 3+alen;
10491         DIP("shufpd $%d,%s,%s\n", select,
10492                                   dis_buf,
10493                                   nameXMMReg(gregOfRM(modrm)));
10494      }
10495
10496      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10497      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10498      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10499      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10500
10501#     define SELD(n) mkexpr((n)==0 ? d0 : d1)
10502#     define SELS(n) mkexpr((n)==0 ? s0 : s1)
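      /* imm[0] selects which half of G (dest) becomes the result's low
         64 bits; imm[1] selects which half of E (source) becomes the
         high 64 bits. */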
10503
10504      putXMMReg(
10505         gregOfRM(modrm),
10506         binop(Iop_64HLtoV128, SELS((select>>1)&1), SELD((select>>0)&1) )
10507      );
10508
10509#     undef SELD
10510#     undef SELS
10511
10512      goto decode_success;
10513   }
10514
10515   /* 66 0F 51 = SQRTPD -- sqrt 64Fx2 from R/M to R */
10516   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x51) {
10517      delta = dis_SSE_E_to_G_unary_all( sorb, delta+2,
10518                                        "sqrtpd", Iop_Sqrt64Fx2 );
10519      goto decode_success;
10520   }
10521
10522   /* F2 0F 51 = SQRTSD -- sqrt 64F0x2 from R/M to R */
10523   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x51) {
10524      vassert(sz == 4);
10525      delta = dis_SSE_E_to_G_unary_lo64( sorb, delta+3,
10526                                         "sqrtsd", Iop_Sqrt64F0x2 );
10527      goto decode_success;
10528   }
10529
10530   /* 66 0F 5C = SUBPD -- sub 64Fx2 from R/M to R */
10531   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x5C) {
10532      delta = dis_SSE_E_to_G_all( sorb, delta+2, "subpd", Iop_Sub64Fx2 );
10533      goto decode_success;
10534   }
10535
10536   /* F2 0F 5C = SUBSD -- sub 64F0x2 from R/M to R */
10537   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x5C) {
10538      vassert(sz == 4);
10539      delta = dis_SSE_E_to_G_lo64( sorb, delta+3, "subsd", Iop_Sub64F0x2 );
10540      goto decode_success;
10541   }
10542
10543   /* 66 0F 15 = UNPCKHPD -- unpack and interleave high part F64s */
10544   /* 66 0F 14 = UNPCKLPD -- unpack and interleave low part F64s */
10545   /* These just appear to be special cases of SHUFPD */
10546   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x15 || insn[1] == 0x14)) {
10547      IRTemp s1 = newTemp(Ity_I64);
10548      IRTemp s0 = newTemp(Ity_I64);
10549      IRTemp d1 = newTemp(Ity_I64);
10550      IRTemp d0 = newTemp(Ity_I64);
10551      IRTemp sV = newTemp(Ity_V128);
10552      IRTemp dV = newTemp(Ity_V128);
10553      Bool   hi = toBool(insn[1] == 0x15);
10554
10555      modrm = insn[2];
10556      assign( dV, getXMMReg(gregOfRM(modrm)) );
10557
10558      if (epartIsReg(modrm)) {
10559         assign( sV, getXMMReg(eregOfRM(modrm)) );
10560         delta += 2+1;
10561         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
10562                                  nameXMMReg(eregOfRM(modrm)),
10563                                  nameXMMReg(gregOfRM(modrm)));
10564      } else {
10565         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10566         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10567         delta += 2+alen;
10568         DIP("unpck%sps %s,%s\n", hi ? "h" : "l",
10569                                  dis_buf,
10570                                  nameXMMReg(gregOfRM(modrm)));
10571      }
10572
10573      assign( d1, unop(Iop_V128HIto64, mkexpr(dV)) );
10574      assign( d0, unop(Iop_V128to64,   mkexpr(dV)) );
10575      assign( s1, unop(Iop_V128HIto64, mkexpr(sV)) );
10576      assign( s0, unop(Iop_V128to64,   mkexpr(sV)) );
10577
10578      if (hi) {
10579         putXMMReg( gregOfRM(modrm),
10580                    binop(Iop_64HLtoV128, mkexpr(s1), mkexpr(d1)) );
10581      } else {
10582         putXMMReg( gregOfRM(modrm),
10583                    binop(Iop_64HLtoV128, mkexpr(s0), mkexpr(d0)) );
10584      }
10585
10586      goto decode_success;
10587   }
10588
   /* 66 0F 57 = XORPD -- G = G xor E */
10590   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x57) {
10591      delta = dis_SSE_E_to_G_all( sorb, delta+2, "xorpd", Iop_XorV128 );
10592      goto decode_success;
10593   }
10594
10595   /* 66 0F 6B = PACKSSDW */
10596   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6B) {
10597      delta = dis_SSEint_E_to_G( sorb, delta+2,
10598                                 "packssdw",
10599                                 Iop_QNarrowBin32Sto16Sx8, True );
10600      goto decode_success;
10601   }
10602
10603   /* 66 0F 63 = PACKSSWB */
10604   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x63) {
10605      delta = dis_SSEint_E_to_G( sorb, delta+2,
10606                                 "packsswb",
10607                                 Iop_QNarrowBin16Sto8Sx16, True );
10608      goto decode_success;
10609   }
10610
10611   /* 66 0F 67 = PACKUSWB */
10612   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x67) {
10613      delta = dis_SSEint_E_to_G( sorb, delta+2,
10614                                 "packuswb",
10615                                 Iop_QNarrowBin16Sto8Ux16, True );
10616      goto decode_success;
10617   }
10618
10619   /* 66 0F FC = PADDB */
10620   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFC) {
10621      delta = dis_SSEint_E_to_G( sorb, delta+2,
10622                                 "paddb", Iop_Add8x16, False );
10623      goto decode_success;
10624   }
10625
10626   /* 66 0F FE = PADDD */
10627   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFE) {
10628      delta = dis_SSEint_E_to_G( sorb, delta+2,
10629                                 "paddd", Iop_Add32x4, False );
10630      goto decode_success;
10631   }
10632
10633   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
10634   /* 0F D4 = PADDQ -- add 64x1 */
10635   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xD4) {
10636      do_MMX_preamble();
10637      delta = dis_MMXop_regmem_to_reg (
10638                sorb, delta+2, insn[1], "paddq", False );
10639      goto decode_success;
10640   }
10641
10642   /* 66 0F D4 = PADDQ */
10643   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD4) {
10644      delta = dis_SSEint_E_to_G( sorb, delta+2,
10645                                 "paddq", Iop_Add64x2, False );
10646      goto decode_success;
10647   }
10648
10649   /* 66 0F FD = PADDW */
10650   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFD) {
10651      delta = dis_SSEint_E_to_G( sorb, delta+2,
10652                                 "paddw", Iop_Add16x8, False );
10653      goto decode_success;
10654   }
10655
10656   /* 66 0F EC = PADDSB */
10657   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEC) {
10658      delta = dis_SSEint_E_to_G( sorb, delta+2,
10659                                 "paddsb", Iop_QAdd8Sx16, False );
10660      goto decode_success;
10661   }
10662
10663   /* 66 0F ED = PADDSW */
10664   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xED) {
10665      delta = dis_SSEint_E_to_G( sorb, delta+2,
10666                                 "paddsw", Iop_QAdd16Sx8, False );
10667      goto decode_success;
10668   }
10669
10670   /* 66 0F DC = PADDUSB */
10671   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDC) {
10672      delta = dis_SSEint_E_to_G( sorb, delta+2,
10673                                 "paddusb", Iop_QAdd8Ux16, False );
10674      goto decode_success;
10675   }
10676
10677   /* 66 0F DD = PADDUSW */
10678   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDD) {
10679      delta = dis_SSEint_E_to_G( sorb, delta+2,
10680                                 "paddusw", Iop_QAdd16Ux8, False );
10681      goto decode_success;
10682   }
10683
10684   /* 66 0F DB = PAND */
10685   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDB) {
10686      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pand", Iop_AndV128 );
10687      goto decode_success;
10688   }
10689
10690   /* 66 0F DF = PANDN */
10691   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDF) {
10692      delta = dis_SSE_E_to_G_all_invG( sorb, delta+2, "pandn", Iop_AndV128 );
10693      goto decode_success;
10694   }
10695
10696   /* 66 0F E0 = PAVGB */
10697   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE0) {
10698      delta = dis_SSEint_E_to_G( sorb, delta+2,
10699                                 "pavgb", Iop_Avg8Ux16, False );
10700      goto decode_success;
10701   }
10702
10703   /* 66 0F E3 = PAVGW */
10704   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE3) {
10705      delta = dis_SSEint_E_to_G( sorb, delta+2,
10706                                 "pavgw", Iop_Avg16Ux8, False );
10707      goto decode_success;
10708   }
10709
10710   /* 66 0F 74 = PCMPEQB */
10711   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x74) {
10712      delta = dis_SSEint_E_to_G( sorb, delta+2,
10713                                 "pcmpeqb", Iop_CmpEQ8x16, False );
10714      goto decode_success;
10715   }
10716
10717   /* 66 0F 76 = PCMPEQD */
10718   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x76) {
10719      delta = dis_SSEint_E_to_G( sorb, delta+2,
10720                                 "pcmpeqd", Iop_CmpEQ32x4, False );
10721      goto decode_success;
10722   }
10723
10724   /* 66 0F 75 = PCMPEQW */
10725   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x75) {
10726      delta = dis_SSEint_E_to_G( sorb, delta+2,
10727                                 "pcmpeqw", Iop_CmpEQ16x8, False );
10728      goto decode_success;
10729   }
10730
10731   /* 66 0F 64 = PCMPGTB */
10732   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x64) {
10733      delta = dis_SSEint_E_to_G( sorb, delta+2,
10734                                 "pcmpgtb", Iop_CmpGT8Sx16, False );
10735      goto decode_success;
10736   }
10737
10738   /* 66 0F 66 = PCMPGTD */
10739   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x66) {
10740      delta = dis_SSEint_E_to_G( sorb, delta+2,
10741                                 "pcmpgtd", Iop_CmpGT32Sx4, False );
10742      goto decode_success;
10743   }
10744
10745   /* 66 0F 65 = PCMPGTW */
10746   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x65) {
10747      delta = dis_SSEint_E_to_G( sorb, delta+2,
10748                                 "pcmpgtw", Iop_CmpGT16Sx8, False );
10749      goto decode_success;
10750   }
10751
10752   /* 66 0F C5 = PEXTRW -- extract 16-bit field from xmm(E) and put
10753      zero-extend of it in ireg(G). */
10754   if (insn[0] == 0x0F && insn[1] == 0xC5) {
10755      modrm = insn[2];
10756      if (sz == 2 && epartIsReg(modrm)) {
10757         t5 = newTemp(Ity_V128);
10758         t4 = newTemp(Ity_I16);
10759         assign(t5, getXMMReg(eregOfRM(modrm)));
10760         breakup128to32s( t5, &t3, &t2, &t1, &t0 );
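         /* Only the low 3 bits of the immediate matter: they select
            one of the eight 16-bit lanes.  t3..t0 hold the four 32-bit
            chunks of the source, t0 being the lowest. */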
10761         switch (insn[3] & 7) {
10762            case 0:  assign(t4, unop(Iop_32to16,   mkexpr(t0))); break;
10763            case 1:  assign(t4, unop(Iop_32HIto16, mkexpr(t0))); break;
10764            case 2:  assign(t4, unop(Iop_32to16,   mkexpr(t1))); break;
10765            case 3:  assign(t4, unop(Iop_32HIto16, mkexpr(t1))); break;
10766            case 4:  assign(t4, unop(Iop_32to16,   mkexpr(t2))); break;
10767            case 5:  assign(t4, unop(Iop_32HIto16, mkexpr(t2))); break;
10768            case 6:  assign(t4, unop(Iop_32to16,   mkexpr(t3))); break;
10769            case 7:  assign(t4, unop(Iop_32HIto16, mkexpr(t3))); break;
10770            default: vassert(0); /*NOTREACHED*/
10771         }
10772         putIReg(4, gregOfRM(modrm), unop(Iop_16Uto32, mkexpr(t4)));
10773         DIP("pextrw $%d,%s,%s\n",
10774             (Int)insn[3], nameXMMReg(eregOfRM(modrm)),
10775                           nameIReg(4,gregOfRM(modrm)));
10776         delta += 4;
10777         goto decode_success;
10778      }
10779      /* else fall through */
10780   }
10781
10782   /* 66 0F C4 = PINSRW -- get 16 bits from E(mem or low half ireg) and
10783      put it into the specified lane of xmm(G). */
10784   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xC4) {
10785      Int lane;
10786      t4 = newTemp(Ity_I16);
10787      modrm = insn[2];
10788
10789      if (epartIsReg(modrm)) {
10790         assign(t4, getIReg(2, eregOfRM(modrm)));
10791         delta += 3+1;
10792         lane = insn[3+1-1];
10793         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
10794                                   nameIReg(2,eregOfRM(modrm)),
10795                                   nameXMMReg(gregOfRM(modrm)));
10796      } else {
10797         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10798         delta += 3+alen;
10799         lane = insn[3+alen-1];
10800         assign(t4, loadLE(Ity_I16, mkexpr(addr)));
10801         DIP("pinsrw $%d,%s,%s\n", (Int)lane,
10802                                   dis_buf,
10803                                   nameXMMReg(gregOfRM(modrm)));
10804      }
10805
10806      putXMMRegLane16( gregOfRM(modrm), lane & 7, mkexpr(t4) );
10807      goto decode_success;
10808   }
10809
10810   /* 66 0F F5 = PMADDWD -- Multiply and add packed integers from
10811      E(xmm or mem) to G(xmm) */
10812   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF5) {
10813      IRTemp s1V  = newTemp(Ity_V128);
10814      IRTemp s2V  = newTemp(Ity_V128);
10815      IRTemp dV   = newTemp(Ity_V128);
10816      IRTemp s1Hi = newTemp(Ity_I64);
10817      IRTemp s1Lo = newTemp(Ity_I64);
10818      IRTemp s2Hi = newTemp(Ity_I64);
10819      IRTemp s2Lo = newTemp(Ity_I64);
10820      IRTemp dHi  = newTemp(Ity_I64);
10821      IRTemp dLo  = newTemp(Ity_I64);
10822      modrm = insn[2];
10823      if (epartIsReg(modrm)) {
10824         assign( s1V, getXMMReg(eregOfRM(modrm)) );
10825         delta += 2+1;
10826         DIP("pmaddwd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10827                                nameXMMReg(gregOfRM(modrm)));
10828      } else {
10829         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10830         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
10831         delta += 2+alen;
10832         DIP("pmaddwd %s,%s\n", dis_buf,
10833                                nameXMMReg(gregOfRM(modrm)));
10834      }
10835      assign( s2V, getXMMReg(gregOfRM(modrm)) );
10836      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
10837      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
10838      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
10839      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
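      /* Do the 128-bit operation as two 64-bit halves, reusing the
         MMX pmaddwd helper on each half and reassembling. */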
10840      assign( dHi, mkIRExprCCall(
10841                      Ity_I64, 0/*regparms*/,
10842                      "x86g_calculate_mmx_pmaddwd",
10843                      &x86g_calculate_mmx_pmaddwd,
10844                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
10845                   ));
10846      assign( dLo, mkIRExprCCall(
10847                      Ity_I64, 0/*regparms*/,
10848                      "x86g_calculate_mmx_pmaddwd",
10849                      &x86g_calculate_mmx_pmaddwd,
10850                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
10851                   ));
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo)) );
10853      putXMMReg(gregOfRM(modrm), mkexpr(dV));
10854      goto decode_success;
10855   }
10856
10857   /* 66 0F EE = PMAXSW -- 16x8 signed max */
10858   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEE) {
10859      delta = dis_SSEint_E_to_G( sorb, delta+2,
10860                                 "pmaxsw", Iop_Max16Sx8, False );
10861      goto decode_success;
10862   }
10863
10864   /* 66 0F DE = PMAXUB -- 8x16 unsigned max */
10865   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDE) {
10866      delta = dis_SSEint_E_to_G( sorb, delta+2,
10867                                 "pmaxub", Iop_Max8Ux16, False );
10868      goto decode_success;
10869   }
10870
10871   /* 66 0F EA = PMINSW -- 16x8 signed min */
10872   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEA) {
10873      delta = dis_SSEint_E_to_G( sorb, delta+2,
10874                                 "pminsw", Iop_Min16Sx8, False );
10875      goto decode_success;
10876   }
10877
10878   /* 66 0F DA = PMINUB -- 8x16 unsigned min */
10879   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xDA) {
10880      delta = dis_SSEint_E_to_G( sorb, delta+2,
10881                                 "pminub", Iop_Min8Ux16, False );
10882      goto decode_success;
10883   }
10884
10885   /* 66 0F D7 = PMOVMSKB -- extract sign bits from each of 16 lanes in
      xmm(E), turn them into a 16-bit value, and put zero-extend of it in
10887      ireg(G).  Doing this directly is just too cumbersome; give up
10888      therefore and call a helper. */
10889   /* UInt x86g_calculate_sse_pmovmskb ( ULong w64hi, ULong w64lo ); */
10890   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD7) {
10891      modrm = insn[2];
10892      if (epartIsReg(modrm)) {
10893         t0 = newTemp(Ity_I64);
10894         t1 = newTemp(Ity_I64);
10895         assign(t0, getXMMRegLane64(eregOfRM(modrm), 0));
10896         assign(t1, getXMMRegLane64(eregOfRM(modrm), 1));
10897         t5 = newTemp(Ity_I32);
10898         assign(t5, mkIRExprCCall(
10899                       Ity_I32, 0/*regparms*/,
10900                       "x86g_calculate_sse_pmovmskb",
10901                       &x86g_calculate_sse_pmovmskb,
10902                       mkIRExprVec_2( mkexpr(t1), mkexpr(t0) )));
10903         putIReg(4, gregOfRM(modrm), mkexpr(t5));
10904         DIP("pmovmskb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10905                                 nameIReg(4,gregOfRM(modrm)));
10906         delta += 3;
10907         goto decode_success;
10908      }
10909      /* else fall through */
10910   }
10911
10912   /* 66 0F E4 = PMULHUW -- 16x8 hi-half of unsigned widening multiply */
10913   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE4) {
10914      delta = dis_SSEint_E_to_G( sorb, delta+2,
10915                                 "pmulhuw", Iop_MulHi16Ux8, False );
10916      goto decode_success;
10917   }
10918
10919   /* 66 0F E5 = PMULHW -- 16x8 hi-half of signed widening multiply */
10920   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE5) {
10921      delta = dis_SSEint_E_to_G( sorb, delta+2,
10922                                 "pmulhw", Iop_MulHi16Sx8, False );
10923      goto decode_success;
10924   }
10925
   /* 66 0F D5 = PMULLW -- 16x8 multiply (low halves of results) */
10927   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD5) {
10928      delta = dis_SSEint_E_to_G( sorb, delta+2,
10929                                 "pmullw", Iop_Mul16x8, False );
10930      goto decode_success;
10931   }
10932
10933   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
10934   /* 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
10935      0 to form 64-bit result */
10936   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xF4) {
10937      IRTemp sV = newTemp(Ity_I64);
10938      IRTemp dV = newTemp(Ity_I64);
10939      t1 = newTemp(Ity_I32);
10940      t0 = newTemp(Ity_I32);
10941      modrm = insn[2];
10942
10943      do_MMX_preamble();
10944      assign( dV, getMMXReg(gregOfRM(modrm)) );
10945
10946      if (epartIsReg(modrm)) {
10947         assign( sV, getMMXReg(eregOfRM(modrm)) );
10948         delta += 2+1;
10949         DIP("pmuludq %s,%s\n", nameMMXReg(eregOfRM(modrm)),
10950                                nameMMXReg(gregOfRM(modrm)));
10951      } else {
10952         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10953         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
10954         delta += 2+alen;
10955         DIP("pmuludq %s,%s\n", dis_buf,
10956                                nameMMXReg(gregOfRM(modrm)));
10957      }
10958
10959      assign( t0, unop(Iop_64to32, mkexpr(dV)) );
10960      assign( t1, unop(Iop_64to32, mkexpr(sV)) );
10961      putMMXReg( gregOfRM(modrm),
10962                 binop( Iop_MullU32, mkexpr(t0), mkexpr(t1) ) );
10963      goto decode_success;
10964   }
10965
10966   /* 66 0F F4 = PMULUDQ -- unsigned widening multiply of 32-lanes 0 x
10967      0 to form lower 64-bit half and lanes 2 x 2 to form upper 64-bit
10968      half */
10969   /* This is a really poor translation -- could be improved if
10970      performance critical */
10971   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF4) {
10972      IRTemp sV, dV;
10973      IRTemp s3, s2, s1, s0, d3, d2, d1, d0;
10974      sV = newTemp(Ity_V128);
10975      dV = newTemp(Ity_V128);
10976      s3 = s2 = s1 = s0 = d3 = d2 = d1 = d0 = IRTemp_INVALID;
10977      t1 = newTemp(Ity_I64);
10978      t0 = newTemp(Ity_I64);
10979      modrm = insn[2];
10980      assign( dV, getXMMReg(gregOfRM(modrm)) );
10981
10982      if (epartIsReg(modrm)) {
10983         assign( sV, getXMMReg(eregOfRM(modrm)) );
10984         delta += 2+1;
10985         DIP("pmuludq %s,%s\n", nameXMMReg(eregOfRM(modrm)),
10986                                nameXMMReg(gregOfRM(modrm)));
10987      } else {
10988         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
10989         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
10990         delta += 2+alen;
10991         DIP("pmuludq %s,%s\n", dis_buf,
10992                                nameXMMReg(gregOfRM(modrm)));
10993      }
10994
10995      breakup128to32s( dV, &d3, &d2, &d1, &d0 );
10996      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
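      /* Lanes 0 and 2 of each operand feed the low and high 64-bit
         products respectively; lanes 1 and 3 are ignored. */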
10997
10998      assign( t0, binop( Iop_MullU32, mkexpr(d0), mkexpr(s0)) );
10999      putXMMRegLane64( gregOfRM(modrm), 0, mkexpr(t0) );
11000      assign( t1, binop( Iop_MullU32, mkexpr(d2), mkexpr(s2)) );
11001      putXMMRegLane64( gregOfRM(modrm), 1, mkexpr(t1) );
11002      goto decode_success;
11003   }
11004
11005   /* 66 0F EB = POR */
11006   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEB) {
11007      delta = dis_SSE_E_to_G_all( sorb, delta+2, "por", Iop_OrV128 );
11008      goto decode_success;
11009   }
11010
11011   /* 66 0F F6 = PSADBW -- 2 x (8x8 -> 48 zeroes ++ u16) Sum Abs Diffs
11012      from E(xmm or mem) to G(xmm) */
11013   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF6) {
11014      IRTemp s1V  = newTemp(Ity_V128);
11015      IRTemp s2V  = newTemp(Ity_V128);
11016      IRTemp dV   = newTemp(Ity_V128);
11017      IRTemp s1Hi = newTemp(Ity_I64);
11018      IRTemp s1Lo = newTemp(Ity_I64);
11019      IRTemp s2Hi = newTemp(Ity_I64);
11020      IRTemp s2Lo = newTemp(Ity_I64);
11021      IRTemp dHi  = newTemp(Ity_I64);
11022      IRTemp dLo  = newTemp(Ity_I64);
11023      modrm = insn[2];
11024      if (epartIsReg(modrm)) {
11025         assign( s1V, getXMMReg(eregOfRM(modrm)) );
11026         delta += 2+1;
11027         DIP("psadbw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11028                               nameXMMReg(gregOfRM(modrm)));
11029      } else {
11030         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11031         assign( s1V, loadLE(Ity_V128, mkexpr(addr)) );
11032         delta += 2+alen;
11033         DIP("psadbw %s,%s\n", dis_buf,
11034                               nameXMMReg(gregOfRM(modrm)));
11035      }
11036      assign( s2V, getXMMReg(gregOfRM(modrm)) );
11037      assign( s1Hi, unop(Iop_V128HIto64, mkexpr(s1V)) );
11038      assign( s1Lo, unop(Iop_V128to64,   mkexpr(s1V)) );
11039      assign( s2Hi, unop(Iop_V128HIto64, mkexpr(s2V)) );
11040      assign( s2Lo, unop(Iop_V128to64,   mkexpr(s2V)) );
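      /* As with PMADDWD above, do the 128-bit op as two 64-bit halves
         via the MMX psadbw helper and reassemble. */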
11041      assign( dHi, mkIRExprCCall(
11042                      Ity_I64, 0/*regparms*/,
11043                      "x86g_calculate_mmx_psadbw",
11044                      &x86g_calculate_mmx_psadbw,
11045                      mkIRExprVec_2( mkexpr(s1Hi), mkexpr(s2Hi))
11046                   ));
11047      assign( dLo, mkIRExprCCall(
11048                      Ity_I64, 0/*regparms*/,
11049                      "x86g_calculate_mmx_psadbw",
11050                      &x86g_calculate_mmx_psadbw,
11051                      mkIRExprVec_2( mkexpr(s1Lo), mkexpr(s2Lo))
11052                   ));
      assign( dV, binop(Iop_64HLtoV128, mkexpr(dHi), mkexpr(dLo)) );
11054      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11055      goto decode_success;
11056   }
11057
11058   /* 66 0F 70 = PSHUFD -- rearrange 4x32 from E(xmm or mem) to G(xmm) */
11059   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x70) {
11060      Int order;
11061      IRTemp sV, dV, s3, s2, s1, s0;
11062      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11063      sV = newTemp(Ity_V128);
11064      dV = newTemp(Ity_V128);
11065      modrm = insn[2];
11066      if (epartIsReg(modrm)) {
11067         assign( sV, getXMMReg(eregOfRM(modrm)) );
11068         order = (Int)insn[3];
11069         delta += 2+2;
11070         DIP("pshufd $%d,%s,%s\n", order,
11071                                   nameXMMReg(eregOfRM(modrm)),
11072                                   nameXMMReg(gregOfRM(modrm)));
11073      } else {
11074         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11075         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[2+alen];
11077         delta += 3+alen;
11078         DIP("pshufd $%d,%s,%s\n", order,
11079                                   dis_buf,
11080                                   nameXMMReg(gregOfRM(modrm)));
11081      }
11082      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
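      /* Each 2-bit field of the immediate selects one of the four
         source lanes: bits 1:0 choose result lane 0, up to bits 7:6
         for lane 3. */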
11083
11084#     define SEL(n) \
11085                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11086      assign(dV,
             mk128from32s( SEL((order>>6)&3), SEL((order>>4)&3),
11088                           SEL((order>>2)&3), SEL((order>>0)&3) )
11089      );
11090      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11091#     undef SEL
11092      goto decode_success;
11093   }
11094
11095   /* F3 0F 70 = PSHUFHW -- rearrange upper half 4x16 from E(xmm or
11096      mem) to G(xmm), and copy lower half */
11097   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0x70) {
11098      Int order;
11099      IRTemp sVhi, dVhi, sV, dV, s3, s2, s1, s0;
11100      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11101      sV   = newTemp(Ity_V128);
11102      dV   = newTemp(Ity_V128);
11103      sVhi = newTemp(Ity_I64);
11104      dVhi = newTemp(Ity_I64);
11105      modrm = insn[3];
11106      if (epartIsReg(modrm)) {
11107         assign( sV, getXMMReg(eregOfRM(modrm)) );
11108         order = (Int)insn[4];
11109         delta += 4+1;
11110         DIP("pshufhw $%d,%s,%s\n", order,
11111                                    nameXMMReg(eregOfRM(modrm)),
11112                                    nameXMMReg(gregOfRM(modrm)));
11113      } else {
11114         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11115         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[3+alen];
11117         delta += 4+alen;
11118         DIP("pshufhw $%d,%s,%s\n", order,
11119                                    dis_buf,
11120                                    nameXMMReg(gregOfRM(modrm)));
11121      }
11122      assign( sVhi, unop(Iop_V128HIto64, mkexpr(sV)) );
11123      breakup64to16s( sVhi, &s3, &s2, &s1, &s0 );
11124
11125#     define SEL(n) \
11126                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11127      assign(dVhi,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
11129                          SEL((order>>2)&3), SEL((order>>0)&3) )
11130      );
11131      assign(dV, binop( Iop_64HLtoV128,
11132                        mkexpr(dVhi),
11133                        unop(Iop_V128to64, mkexpr(sV))) );
11134      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11135#     undef SEL
11136      goto decode_success;
11137   }
11138
11139   /* F2 0F 70 = PSHUFLW -- rearrange lower half 4x16 from E(xmm or
11140      mem) to G(xmm), and copy upper half */
11141   if (insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x70) {
11142      Int order;
11143      IRTemp sVlo, dVlo, sV, dV, s3, s2, s1, s0;
11144      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11145      sV   = newTemp(Ity_V128);
11146      dV   = newTemp(Ity_V128);
11147      sVlo = newTemp(Ity_I64);
11148      dVlo = newTemp(Ity_I64);
11149      modrm = insn[3];
11150      if (epartIsReg(modrm)) {
11151         assign( sV, getXMMReg(eregOfRM(modrm)) );
11152         order = (Int)insn[4];
11153         delta += 4+1;
11154         DIP("pshuflw $%d,%s,%s\n", order,
11155                                    nameXMMReg(eregOfRM(modrm)),
11156                                    nameXMMReg(gregOfRM(modrm)));
11157      } else {
11158         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11159         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
         order = (Int)insn[3+alen];
11161         delta += 4+alen;
11162         DIP("pshuflw $%d,%s,%s\n", order,
11163                                    dis_buf,
11164                                    nameXMMReg(gregOfRM(modrm)));
11165      }
11166      assign( sVlo, unop(Iop_V128to64, mkexpr(sV)) );
11167      breakup64to16s( sVlo, &s3, &s2, &s1, &s0 );
11168
11169#     define SEL(n) \
11170                ((n)==0 ? s0 : ((n)==1 ? s1 : ((n)==2 ? s2 : s3)))
11171      assign(dVlo,
             mk64from16s( SEL((order>>6)&3), SEL((order>>4)&3),
11173                          SEL((order>>2)&3), SEL((order>>0)&3) )
11174      );
11175      assign(dV, binop( Iop_64HLtoV128,
11176                        unop(Iop_V128HIto64, mkexpr(sV)),
11177                        mkexpr(dVlo) ) );
11178      putXMMReg(gregOfRM(modrm), mkexpr(dV));
11179#     undef SEL
11180      goto decode_success;
11181   }
11182
11183   /* 66 0F 72 /6 ib = PSLLD by immediate */
11184   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11185       && epartIsReg(insn[2])
11186       && gregOfRM(insn[2]) == 6) {
11187      delta = dis_SSE_shiftE_imm( delta+2, "pslld", Iop_ShlN32x4 );
11188      goto decode_success;
11189   }
11190
11191   /* 66 0F F2 = PSLLD by E */
11192   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF2) {
11193      delta = dis_SSE_shiftG_byE( sorb, delta+2, "pslld", Iop_ShlN32x4 );
11194      goto decode_success;
11195   }
11196
11197   /* 66 0F 73 /7 ib = PSLLDQ by immediate */
11198   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11199       && epartIsReg(insn[2])
11200       && gregOfRM(insn[2]) == 7) {
11201      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11202      Int    imm = (Int)insn[3];
11203      Int    reg = eregOfRM(insn[2]);
11204      DIP("pslldq $%d,%s\n", imm, nameXMMReg(reg));
11205      vassert(imm >= 0 && imm <= 255);
11206      delta += 4;
11207
11208      sV    = newTemp(Ity_V128);
11209      dV    = newTemp(Ity_V128);
11210      hi64  = newTemp(Ity_I64);
11211      lo64  = newTemp(Ity_I64);
11212      hi64r = newTemp(Ity_I64);
11213      lo64r = newTemp(Ity_I64);
11214
11215      if (imm >= 16) {
11216         putXMMReg(reg, mkV128(0x0000));
11217         goto decode_success;
11218      }
11219
11220      assign( sV, getXMMReg(reg) );
11221      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11222      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
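      /* Byte-shift the whole register left by imm bytes, using the
         two 64-bit halves: for imm < 8 the bytes shifted out of the
         low half are OR-ed into the high half; for imm >= 8 the low
         half of the result is zero and the (shifted) old low half
         becomes the new high half. */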
11223
11224      if (imm == 0) {
11225         assign( lo64r, mkexpr(lo64) );
11226         assign( hi64r, mkexpr(hi64) );
11227      }
11228      else
11229      if (imm == 8) {
11230         assign( lo64r, mkU64(0) );
11231         assign( hi64r, mkexpr(lo64) );
11232      }
11233      else
11234      if (imm > 8) {
11235         assign( lo64r, mkU64(0) );
11236         assign( hi64r, binop( Iop_Shl64,
11237                               mkexpr(lo64),
11238                               mkU8( 8*(imm-8) ) ));
11239      } else {
11240         assign( lo64r, binop( Iop_Shl64,
11241                               mkexpr(lo64),
11242                               mkU8(8 * imm) ));
11243         assign( hi64r,
11244                 binop( Iop_Or64,
11245                        binop(Iop_Shl64, mkexpr(hi64),
11246                                         mkU8(8 * imm)),
11247                        binop(Iop_Shr64, mkexpr(lo64),
11248                                         mkU8(8 * (8 - imm)) )
11249                      )
11250               );
11251      }
11252      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11253      putXMMReg(reg, mkexpr(dV));
11254      goto decode_success;
11255   }
11256
11257   /* 66 0F 73 /6 ib = PSLLQ by immediate */
11258   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11259       && epartIsReg(insn[2])
11260       && gregOfRM(insn[2]) == 6) {
11261      delta = dis_SSE_shiftE_imm( delta+2, "psllq", Iop_ShlN64x2 );
11262      goto decode_success;
11263   }
11264
11265   /* 66 0F F3 = PSLLQ by E */
11266   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF3) {
11267      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllq", Iop_ShlN64x2 );
11268      goto decode_success;
11269   }
11270
11271   /* 66 0F 71 /6 ib = PSLLW by immediate */
11272   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11273       && epartIsReg(insn[2])
11274       && gregOfRM(insn[2]) == 6) {
11275      delta = dis_SSE_shiftE_imm( delta+2, "psllw", Iop_ShlN16x8 );
11276      goto decode_success;
11277   }
11278
11279   /* 66 0F F1 = PSLLW by E */
11280   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF1) {
11281      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psllw", Iop_ShlN16x8 );
11282      goto decode_success;
11283   }
11284
11285   /* 66 0F 72 /4 ib = PSRAD by immediate */
11286   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11287       && epartIsReg(insn[2])
11288       && gregOfRM(insn[2]) == 4) {
11289      delta = dis_SSE_shiftE_imm( delta+2, "psrad", Iop_SarN32x4 );
11290      goto decode_success;
11291   }
11292
11293   /* 66 0F E2 = PSRAD by E */
11294   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE2) {
11295      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrad", Iop_SarN32x4 );
11296      goto decode_success;
11297   }
11298
11299   /* 66 0F 71 /4 ib = PSRAW by immediate */
11300   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11301       && epartIsReg(insn[2])
11302       && gregOfRM(insn[2]) == 4) {
11303      delta = dis_SSE_shiftE_imm( delta+2, "psraw", Iop_SarN16x8 );
11304      goto decode_success;
11305   }
11306
11307   /* 66 0F E1 = PSRAW by E */
11308   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE1) {
11309      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psraw", Iop_SarN16x8 );
11310      goto decode_success;
11311   }
11312
11313   /* 66 0F 72 /2 ib = PSRLD by immediate */
11314   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x72
11315       && epartIsReg(insn[2])
11316       && gregOfRM(insn[2]) == 2) {
11317      delta = dis_SSE_shiftE_imm( delta+2, "psrld", Iop_ShrN32x4 );
11318      goto decode_success;
11319   }
11320
11321   /* 66 0F D2 = PSRLD by E */
11322   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD2) {
11323      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrld", Iop_ShrN32x4 );
11324      goto decode_success;
11325   }
11326
11327   /* 66 0F 73 /3 ib = PSRLDQ by immediate */
11328   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11329       && epartIsReg(insn[2])
11330       && gregOfRM(insn[2]) == 3) {
11331      IRTemp sV, dV, hi64, lo64, hi64r, lo64r;
11332      Int    imm = (Int)insn[3];
11333      Int    reg = eregOfRM(insn[2]);
11334      DIP("psrldq $%d,%s\n", imm, nameXMMReg(reg));
11335      vassert(imm >= 0 && imm <= 255);
11336      delta += 4;
11337
11338      sV    = newTemp(Ity_V128);
11339      dV    = newTemp(Ity_V128);
11340      hi64  = newTemp(Ity_I64);
11341      lo64  = newTemp(Ity_I64);
11342      hi64r = newTemp(Ity_I64);
11343      lo64r = newTemp(Ity_I64);
11344
11345      if (imm >= 16) {
11346         putXMMReg(reg, mkV128(0x0000));
11347         goto decode_success;
11348      }
11349
11350      assign( sV, getXMMReg(reg) );
11351      assign( hi64, unop(Iop_V128HIto64, mkexpr(sV)) );
11352      assign( lo64, unop(Iop_V128to64, mkexpr(sV)) );
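      /* Mirror image of PSLLDQ above: byte-shift right by imm bytes,
         with bytes shifted out of the high half OR-ed into the low
         half for imm < 8, and the (shifted) old high half becoming
         the new low half for imm >= 8. */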
11353
11354      if (imm == 0) {
11355         assign( lo64r, mkexpr(lo64) );
11356         assign( hi64r, mkexpr(hi64) );
11357      }
11358      else
11359      if (imm == 8) {
11360         assign( hi64r, mkU64(0) );
11361         assign( lo64r, mkexpr(hi64) );
11362      }
11363      else
11364      if (imm > 8) {
11365         assign( hi64r, mkU64(0) );
11366         assign( lo64r, binop( Iop_Shr64,
11367                               mkexpr(hi64),
11368                               mkU8( 8*(imm-8) ) ));
11369      } else {
11370         assign( hi64r, binop( Iop_Shr64,
11371                               mkexpr(hi64),
11372                               mkU8(8 * imm) ));
11373         assign( lo64r,
11374                 binop( Iop_Or64,
11375                        binop(Iop_Shr64, mkexpr(lo64),
11376                                         mkU8(8 * imm)),
11377                        binop(Iop_Shl64, mkexpr(hi64),
11378                                         mkU8(8 * (8 - imm)) )
11379                      )
11380               );
11381      }
11382
11383      assign( dV, binop(Iop_64HLtoV128, mkexpr(hi64r), mkexpr(lo64r)) );
11384      putXMMReg(reg, mkexpr(dV));
11385      goto decode_success;
11386   }
11387
11388   /* 66 0F 73 /2 ib = PSRLQ by immediate */
11389   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x73
11390       && epartIsReg(insn[2])
11391       && gregOfRM(insn[2]) == 2) {
11392      delta = dis_SSE_shiftE_imm( delta+2, "psrlq", Iop_ShrN64x2 );
11393      goto decode_success;
11394   }
11395
11396   /* 66 0F D3 = PSRLQ by E */
11397   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD3) {
11398      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlq", Iop_ShrN64x2 );
11399      goto decode_success;
11400   }
11401
11402   /* 66 0F 71 /2 ib = PSRLW by immediate */
11403   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x71
11404       && epartIsReg(insn[2])
11405       && gregOfRM(insn[2]) == 2) {
11406      delta = dis_SSE_shiftE_imm( delta+2, "psrlw", Iop_ShrN16x8 );
11407      goto decode_success;
11408   }
11409
11410   /* 66 0F D1 = PSRLW by E */
11411   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD1) {
11412      delta = dis_SSE_shiftG_byE( sorb, delta+2, "psrlw", Iop_ShrN16x8 );
11413      goto decode_success;
11414   }
11415
11416   /* 66 0F F8 = PSUBB */
11417   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF8) {
11418      delta = dis_SSEint_E_to_G( sorb, delta+2,
11419                                 "psubb", Iop_Sub8x16, False );
11420      goto decode_success;
11421   }
11422
11423   /* 66 0F FA = PSUBD */
11424   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFA) {
11425      delta = dis_SSEint_E_to_G( sorb, delta+2,
11426                                 "psubd", Iop_Sub32x4, False );
11427      goto decode_success;
11428   }
11429
11430   /* ***--- this is an MMX class insn introduced in SSE2 ---*** */
11431   /* 0F FB = PSUBQ -- sub 64x1 */
11432   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xFB) {
11433      do_MMX_preamble();
11434      delta = dis_MMXop_regmem_to_reg (
11435                sorb, delta+2, insn[1], "psubq", False );
11436      goto decode_success;
11437   }
11438
11439   /* 66 0F FB = PSUBQ */
11440   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xFB) {
11441      delta = dis_SSEint_E_to_G( sorb, delta+2,
11442                                 "psubq", Iop_Sub64x2, False );
11443      goto decode_success;
11444   }
11445
11446   /* 66 0F F9 = PSUBW */
11447   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xF9) {
11448      delta = dis_SSEint_E_to_G( sorb, delta+2,
11449                                 "psubw", Iop_Sub16x8, False );
11450      goto decode_success;
11451   }
11452
11453   /* 66 0F E8 = PSUBSB */
11454   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE8) {
11455      delta = dis_SSEint_E_to_G( sorb, delta+2,
11456                                 "psubsb", Iop_QSub8Sx16, False );
11457      goto decode_success;
11458   }
11459
11460   /* 66 0F E9 = PSUBSW */
11461   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xE9) {
11462      delta = dis_SSEint_E_to_G( sorb, delta+2,
11463                                 "psubsw", Iop_QSub16Sx8, False );
11464      goto decode_success;
11465   }
11466
   /* 66 0F D8 = PSUBUSB */
11468   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD8) {
11469      delta = dis_SSEint_E_to_G( sorb, delta+2,
11470                                 "psubusb", Iop_QSub8Ux16, False );
11471      goto decode_success;
11472   }
11473
   /* 66 0F D9 = PSUBUSW */
11475   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD9) {
11476      delta = dis_SSEint_E_to_G( sorb, delta+2,
11477                                 "psubusw", Iop_QSub16Ux8, False );
11478      goto decode_success;
11479   }
11480
11481   /* 66 0F 68 = PUNPCKHBW */
11482   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x68) {
11483      delta = dis_SSEint_E_to_G( sorb, delta+2,
11484                                 "punpckhbw",
11485                                 Iop_InterleaveHI8x16, True );
11486      goto decode_success;
11487   }
11488
11489   /* 66 0F 6A = PUNPCKHDQ */
11490   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6A) {
11491      delta = dis_SSEint_E_to_G( sorb, delta+2,
11492                                 "punpckhdq",
11493                                 Iop_InterleaveHI32x4, True );
11494      goto decode_success;
11495   }
11496
11497   /* 66 0F 6D = PUNPCKHQDQ */
11498   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6D) {
11499      delta = dis_SSEint_E_to_G( sorb, delta+2,
11500                                 "punpckhqdq",
11501                                 Iop_InterleaveHI64x2, True );
11502      goto decode_success;
11503   }
11504
11505   /* 66 0F 69 = PUNPCKHWD */
11506   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x69) {
11507      delta = dis_SSEint_E_to_G( sorb, delta+2,
11508                                 "punpckhwd",
11509                                 Iop_InterleaveHI16x8, True );
11510      goto decode_success;
11511   }
11512
11513   /* 66 0F 60 = PUNPCKLBW */
11514   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x60) {
11515      delta = dis_SSEint_E_to_G( sorb, delta+2,
11516                                 "punpcklbw",
11517                                 Iop_InterleaveLO8x16, True );
11518      goto decode_success;
11519   }
11520
11521   /* 66 0F 62 = PUNPCKLDQ */
11522   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x62) {
11523      delta = dis_SSEint_E_to_G( sorb, delta+2,
11524                                 "punpckldq",
11525                                 Iop_InterleaveLO32x4, True );
11526      goto decode_success;
11527   }
11528
11529   /* 66 0F 6C = PUNPCKLQDQ */
11530   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x6C) {
11531      delta = dis_SSEint_E_to_G( sorb, delta+2,
11532                                 "punpcklqdq",
11533                                 Iop_InterleaveLO64x2, True );
11534      goto decode_success;
11535   }
11536
11537   /* 66 0F 61 = PUNPCKLWD */
11538   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0x61) {
11539      delta = dis_SSEint_E_to_G( sorb, delta+2,
11540                                 "punpcklwd",
11541                                 Iop_InterleaveLO16x8, True );
11542      goto decode_success;
11543   }
11544
11545   /* 66 0F EF = PXOR */
11546   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xEF) {
11547      delta = dis_SSE_E_to_G_all( sorb, delta+2, "pxor", Iop_XorV128 );
11548      goto decode_success;
11549   }
11550
11551//--    /* FXSAVE/FXRSTOR m32 -- load/store the FPU/MMX/SSE state. */
11552//--    if (insn[0] == 0x0F && insn[1] == 0xAE
11553//--        && (!epartIsReg(insn[2]))
11554//--        && (gregOfRM(insn[2]) == 1 || gregOfRM(insn[2]) == 0) ) {
11555//--       Bool store = gregOfRM(insn[2]) == 0;
11556//--       vg_assert(sz == 4);
11557//--       pair = disAMode ( cb, sorb, eip+2, dis_buf );
11558//--       t1   = LOW24(pair);
11559//--       eip += 2+HI8(pair);
11560//--       uInstr3(cb, store ? SSE2a_MemWr : SSE2a_MemRd, 512,
11561//--                   Lit16, (((UShort)insn[0]) << 8) | (UShort)insn[1],
11562//--                   Lit16, (UShort)insn[2],
11563//--                   TempReg, t1 );
11564//--       DIP("fx%s %s\n", store ? "save" : "rstor", dis_buf );
11565//--       goto decode_success;
11566//--    }
11567
11568   /* 0F AE /7 = CLFLUSH -- flush cache line */
11569   if (sz == 4 && insn[0] == 0x0F && insn[1] == 0xAE
11570       && !epartIsReg(insn[2]) && gregOfRM(insn[2]) == 7) {
11571
11572      /* This is something of a hack.  We need to know the size of the
11573         cache line containing addr.  Since we don't (easily), assume
11574         256 on the basis that no real cache would have a line that
11575         big.  It's safe to invalidate more stuff than we need, just
11576         inefficient. */
11577      UInt lineszB = 256;
11578
11579      addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11580      delta += 2+alen;
11581
11582      /* Round addr down to the start of the containing block. */
11583      stmt( IRStmt_Put(
11584               OFFB_TISTART,
11585               binop( Iop_And32,
11586                      mkexpr(addr),
11587                      mkU32( ~(lineszB-1) ))) );
11588
11589      stmt( IRStmt_Put(OFFB_TILEN, mkU32(lineszB) ) );
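      /* Exiting with Ijk_TInval asks the dispatcher to discard any
         translations overlapping [TISTART, TISTART+TILEN) and then
         resume at the next instruction. */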
11590
11591      jmp_lit(&dres, Ijk_TInval, (Addr32)(guest_EIP_bbstart+delta));
11592
11593      DIP("clflush %s\n", dis_buf);
11594      goto decode_success;
11595   }
11596
11597   /* ---------------------------------------------------- */
11598   /* --- end of the SSE2 decoder.                     --- */
11599   /* ---------------------------------------------------- */
11600
11601   /* ---------------------------------------------------- */
11602   /* --- start of the SSE3 decoder.                   --- */
11603   /* ---------------------------------------------------- */
11604
11605   /* Skip parts of the decoder which don't apply given the stated
11606      guest subarchitecture. */
11607   /* if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE3)) */
11608   /* In fact this is highly bogus; we accept SSE3 insns even on a
11609      SSE2-only guest since they turn into IR which can be re-emitted
11610      successfully on an SSE2 host. */
11611   if (0 == (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2))
11612      goto after_sse_decoders; /* no SSE3 capabilities */
11613
11614   insn = (UChar*)&guest_code[delta];
11615
11616   /* F3 0F 12 = MOVSLDUP -- move from E (mem or xmm) to G (xmm),
11617      duplicating some lanes (2:2:0:0). */
11618   /* F3 0F 16 = MOVSHDUP -- move from E (mem or xmm) to G (xmm),
11619      duplicating some lanes (3:3:1:1). */
11620   if (sz == 4 && insn[0] == 0xF3 && insn[1] == 0x0F
11621       && (insn[2] == 0x12 || insn[2] == 0x16)) {
11622      IRTemp s3, s2, s1, s0;
11623      IRTemp sV  = newTemp(Ity_V128);
11624      Bool   isH = insn[2] == 0x16;
11625      s3 = s2 = s1 = s0 = IRTemp_INVALID;
11626
11627      modrm = insn[3];
11628      if (epartIsReg(modrm)) {
11629         assign( sV, getXMMReg( eregOfRM(modrm)) );
11630         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
11631                                  nameXMMReg(eregOfRM(modrm)),
11632                                  nameXMMReg(gregOfRM(modrm)));
11633         delta += 3+1;
11634      } else {
11635         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11636         gen_SEGV_if_not_16_aligned( addr );
11637         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11638         DIP("movs%cdup %s,%s\n", isH ? 'h' : 'l',
             dis_buf,
11640             nameXMMReg(gregOfRM(modrm)));
11641         delta += 3+alen;
11642      }
11643
11644      breakup128to32s( sV, &s3, &s2, &s1, &s0 );
11645      putXMMReg( gregOfRM(modrm),
11646                 isH ? mk128from32s( s3, s3, s1, s1 )
11647                     : mk128from32s( s2, s2, s0, s0 ) );
11648      goto decode_success;
11649   }
11650
11651   /* F2 0F 12 = MOVDDUP -- move from E (mem or xmm) to G (xmm),
11652      duplicating some lanes (0:1:0:1). */
11653   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0x12) {
11654      IRTemp sV = newTemp(Ity_V128);
11655      IRTemp d0 = newTemp(Ity_I64);
11656
11657      modrm = insn[3];
11658      if (epartIsReg(modrm)) {
11659         assign( sV, getXMMReg( eregOfRM(modrm)) );
11660         DIP("movddup %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11661                                nameXMMReg(gregOfRM(modrm)));
11662         delta += 3+1;
11663         assign ( d0, unop(Iop_V128to64, mkexpr(sV)) );
11664      } else {
11665         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11666         assign( d0, loadLE(Ity_I64, mkexpr(addr)) );
11667         DIP("movddup %s,%s\n", dis_buf,
11668                                nameXMMReg(gregOfRM(modrm)));
11669         delta += 3+alen;
11670      }
11671
11672      putXMMReg( gregOfRM(modrm), binop(Iop_64HLtoV128,mkexpr(d0),mkexpr(d0)) );
11673      goto decode_success;
11674   }
11675
11676   /* F2 0F D0 = ADDSUBPS -- 32x4 +/-/+/- from E (mem or xmm) to G (xmm). */
11677   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xD0) {
11678      IRTemp a3, a2, a1, a0, s3, s2, s1, s0;
11679      IRTemp eV   = newTemp(Ity_V128);
11680      IRTemp gV   = newTemp(Ity_V128);
11681      IRTemp addV = newTemp(Ity_V128);
11682      IRTemp subV = newTemp(Ity_V128);
11683      a3 = a2 = a1 = a0 = s3 = s2 = s1 = s0 = IRTemp_INVALID;
11684
11685      modrm = insn[3];
11686      if (epartIsReg(modrm)) {
11687         assign( eV, getXMMReg( eregOfRM(modrm)) );
11688         DIP("addsubps %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11689                                 nameXMMReg(gregOfRM(modrm)));
11690         delta += 3+1;
11691      } else {
11692         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11693         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11694         DIP("addsubps %s,%s\n", dis_buf,
11695                                 nameXMMReg(gregOfRM(modrm)));
11696         delta += 3+alen;
11697      }
11698
11699      assign( gV, getXMMReg(gregOfRM(modrm)) );
11700
11701      assign( addV, binop(Iop_Add32Fx4, mkexpr(gV), mkexpr(eV)) );
11702      assign( subV, binop(Iop_Sub32Fx4, mkexpr(gV), mkexpr(eV)) );
11703
11704      breakup128to32s( addV, &a3, &a2, &a1, &a0 );
11705      breakup128to32s( subV, &s3, &s2, &s1, &s0 );
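      /* The result takes the add in the odd lanes (3 and 1) and the
         subtract in the even lanes (2 and 0). */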
11706
11707      putXMMReg( gregOfRM(modrm), mk128from32s( a3, s2, a1, s0 ));
11708      goto decode_success;
11709   }
11710
   /* 66 0F D0 = ADDSUBPD -- 64x2 +/- from E (mem or xmm) to G (xmm). */
11712   if (sz == 2 && insn[0] == 0x0F && insn[1] == 0xD0) {
11713      IRTemp eV   = newTemp(Ity_V128);
11714      IRTemp gV   = newTemp(Ity_V128);
11715      IRTemp addV = newTemp(Ity_V128);
11716      IRTemp subV = newTemp(Ity_V128);
11717      IRTemp a1     = newTemp(Ity_I64);
11718      IRTemp s0     = newTemp(Ity_I64);
11719
11720      modrm = insn[2];
11721      if (epartIsReg(modrm)) {
11722         assign( eV, getXMMReg( eregOfRM(modrm)) );
11723         DIP("addsubpd %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11724                                 nameXMMReg(gregOfRM(modrm)));
11725         delta += 2+1;
11726      } else {
11727         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11728         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11729         DIP("addsubpd %s,%s\n", dis_buf,
11730                                 nameXMMReg(gregOfRM(modrm)));
11731         delta += 2+alen;
11732      }
11733
11734      assign( gV, getXMMReg(gregOfRM(modrm)) );
11735
11736      assign( addV, binop(Iop_Add64Fx2, mkexpr(gV), mkexpr(eV)) );
11737      assign( subV, binop(Iop_Sub64Fx2, mkexpr(gV), mkexpr(eV)) );
11738
11739      assign( a1, unop(Iop_V128HIto64, mkexpr(addV) ));
11740      assign( s0, unop(Iop_V128to64,   mkexpr(subV) ));
11741
11742      putXMMReg( gregOfRM(modrm),
11743                 binop(Iop_64HLtoV128, mkexpr(a1), mkexpr(s0)) );
11744      goto decode_success;
11745   }
11746
11747   /* F2 0F 7D = HSUBPS -- 32x4 sub across from E (mem or xmm) to G (xmm). */
11748   /* F2 0F 7C = HADDPS -- 32x4 add across from E (mem or xmm) to G (xmm). */
11749   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F
11750       && (insn[2] == 0x7C || insn[2] == 0x7D)) {
11751      IRTemp e3, e2, e1, e0, g3, g2, g1, g0;
11752      IRTemp eV     = newTemp(Ity_V128);
11753      IRTemp gV     = newTemp(Ity_V128);
11754      IRTemp leftV  = newTemp(Ity_V128);
11755      IRTemp rightV = newTemp(Ity_V128);
11756      Bool   isAdd  = insn[2] == 0x7C;
11757      HChar* str    = isAdd ? "add" : "sub";
11758      e3 = e2 = e1 = e0 = g3 = g2 = g1 = g0 = IRTemp_INVALID;
11759
11760      modrm = insn[3];
11761      if (epartIsReg(modrm)) {
11762         assign( eV, getXMMReg( eregOfRM(modrm)) );
11763         DIP("h%sps %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
11764                                   nameXMMReg(gregOfRM(modrm)));
11765         delta += 3+1;
11766      } else {
11767         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11768         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11769         DIP("h%sps %s,%s\n", str, dis_buf,
11770                                   nameXMMReg(gregOfRM(modrm)));
11771         delta += 3+alen;
11772      }
11773
11774      assign( gV, getXMMReg(gregOfRM(modrm)) );
11775
11776      breakup128to32s( eV, &e3, &e2, &e1, &e0 );
11777      breakup128to32s( gV, &g3, &g2, &g1, &g0 );
11778
11779      assign( leftV,  mk128from32s( e2, e0, g2, g0 ) );
11780      assign( rightV, mk128from32s( e3, e1, g3, g1 ) );
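      /* leftV gathers the even-numbered lanes and rightV the odd
         ones, with E's pairs in the upper half and G's in the lower,
         so a single lanewise add/sub yields the horizontal result. */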
11781
11782      putXMMReg( gregOfRM(modrm),
11783                 binop(isAdd ? Iop_Add32Fx4 : Iop_Sub32Fx4,
11784                       mkexpr(leftV), mkexpr(rightV) ) );
11785      goto decode_success;
11786   }
11787
11788   /* 66 0F 7D = HSUBPD -- 64x2 sub across from E (mem or xmm) to G (xmm). */
11789   /* 66 0F 7C = HADDPD -- 64x2 add across from E (mem or xmm) to G (xmm). */
11790   if (sz == 2 && insn[0] == 0x0F && (insn[1] == 0x7C || insn[1] == 0x7D)) {
11791      IRTemp e1     = newTemp(Ity_I64);
11792      IRTemp e0     = newTemp(Ity_I64);
11793      IRTemp g1     = newTemp(Ity_I64);
11794      IRTemp g0     = newTemp(Ity_I64);
11795      IRTemp eV     = newTemp(Ity_V128);
11796      IRTemp gV     = newTemp(Ity_V128);
11797      IRTemp leftV  = newTemp(Ity_V128);
11798      IRTemp rightV = newTemp(Ity_V128);
11799      Bool   isAdd  = insn[1] == 0x7C;
11800      HChar* str    = isAdd ? "add" : "sub";
11801
11802      modrm = insn[2];
11803      if (epartIsReg(modrm)) {
11804         assign( eV, getXMMReg( eregOfRM(modrm)) );
11805         DIP("h%spd %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
11806                                   nameXMMReg(gregOfRM(modrm)));
11807         delta += 2+1;
11808      } else {
11809         addr = disAMode ( &alen, sorb, delta+2, dis_buf );
11810         assign( eV, loadLE(Ity_V128, mkexpr(addr)) );
11811         DIP("h%spd %s,%s\n", str, dis_buf,
11812                              nameXMMReg(gregOfRM(modrm)));
11813         delta += 2+alen;
11814      }
11815
11816      assign( gV, getXMMReg(gregOfRM(modrm)) );
11817
11818      assign( e1, unop(Iop_V128HIto64, mkexpr(eV) ));
11819      assign( e0, unop(Iop_V128to64, mkexpr(eV) ));
11820      assign( g1, unop(Iop_V128HIto64, mkexpr(gV) ));
11821      assign( g0, unop(Iop_V128to64, mkexpr(gV) ));
11822
11823      assign( leftV,  binop(Iop_64HLtoV128, mkexpr(e0),mkexpr(g0)) );
11824      assign( rightV, binop(Iop_64HLtoV128, mkexpr(e1),mkexpr(g1)) );
11825
11826      putXMMReg( gregOfRM(modrm),
11827                 binop(isAdd ? Iop_Add64Fx2 : Iop_Sub64Fx2,
11828                       mkexpr(leftV), mkexpr(rightV) ) );
11829      goto decode_success;
11830   }
11831
11832   /* F2 0F F0 = LDDQU -- move from E (mem or xmm) to G (xmm). */
11833   if (sz == 4 && insn[0] == 0xF2 && insn[1] == 0x0F && insn[2] == 0xF0) {
11834      modrm = getIByte(delta+3);
11835      if (epartIsReg(modrm)) {
11836         goto decode_failure;
11837      } else {
11838         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11839         putXMMReg( gregOfRM(modrm),
11840                    loadLE(Ity_V128, mkexpr(addr)) );
11841         DIP("lddqu %s,%s\n", dis_buf,
11842                              nameXMMReg(gregOfRM(modrm)));
11843         delta += 3+alen;
11844      }
11845      goto decode_success;
11846   }
11847
11848   /* ---------------------------------------------------- */
11849   /* --- end of the SSE3 decoder.                     --- */
11850   /* ---------------------------------------------------- */
11851
11852   /* ---------------------------------------------------- */
11853   /* --- start of the SSSE3 decoder.                  --- */
11854   /* ---------------------------------------------------- */
11855
11856   /* 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
11857      Unsigned Bytes (MMX) */
11858   if (sz == 4
11859       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
11860      IRTemp sV        = newTemp(Ity_I64);
11861      IRTemp dV        = newTemp(Ity_I64);
11862      IRTemp sVoddsSX  = newTemp(Ity_I64);
11863      IRTemp sVevensSX = newTemp(Ity_I64);
11864      IRTemp dVoddsZX  = newTemp(Ity_I64);
11865      IRTemp dVevensZX = newTemp(Ity_I64);
11866
11867      modrm = insn[3];
11868      do_MMX_preamble();
11869      assign( dV, getMMXReg(gregOfRM(modrm)) );
11870
11871      if (epartIsReg(modrm)) {
11872         assign( sV, getMMXReg(eregOfRM(modrm)) );
11873         delta += 3+1;
11874         DIP("pmaddubsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
11875                                  nameMMXReg(gregOfRM(modrm)));
11876      } else {
11877         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11878         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
11879         delta += 3+alen;
11880         DIP("pmaddubsw %s,%s\n", dis_buf,
11881                                  nameMMXReg(gregOfRM(modrm)));
11882      }
11883
11884      /* compute dV unsigned x sV signed */
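      /* Odd byte lanes of sV are sign-extended to 16 bits with an
         arithmetic right shift, even lanes via shift-left-then-
         arithmetic-right; dV's byte lanes are zero-extended the same
         way using logical shifts.  The 16-bit products are then added
         pairwise with signed saturation (QAdd16Sx4). */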
11885      assign( sVoddsSX,
11886              binop(Iop_SarN16x4, mkexpr(sV), mkU8(8)) );
11887      assign( sVevensSX,
11888              binop(Iop_SarN16x4,
11889                    binop(Iop_ShlN16x4, mkexpr(sV), mkU8(8)),
11890                    mkU8(8)) );
11891      assign( dVoddsZX,
11892              binop(Iop_ShrN16x4, mkexpr(dV), mkU8(8)) );
11893      assign( dVevensZX,
11894              binop(Iop_ShrN16x4,
11895                    binop(Iop_ShlN16x4, mkexpr(dV), mkU8(8)),
11896                    mkU8(8)) );
11897
11898      putMMXReg(
11899         gregOfRM(modrm),
11900         binop(Iop_QAdd16Sx4,
11901               binop(Iop_Mul16x4, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
11902               binop(Iop_Mul16x4, mkexpr(sVevensSX), mkexpr(dVevensZX))
11903         )
11904      );
11905      goto decode_success;
11906   }
11907
11908   /* 66 0F 38 04 = PMADDUBSW -- Multiply and Add Packed Signed and
11909      Unsigned Bytes (XMM) */
11910   if (sz == 2
11911       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x04) {
11912      IRTemp sV        = newTemp(Ity_V128);
11913      IRTemp dV        = newTemp(Ity_V128);
11914      IRTemp sVoddsSX  = newTemp(Ity_V128);
11915      IRTemp sVevensSX = newTemp(Ity_V128);
11916      IRTemp dVoddsZX  = newTemp(Ity_V128);
11917      IRTemp dVevensZX = newTemp(Ity_V128);
11918
11919      modrm = insn[3];
11920      assign( dV, getXMMReg(gregOfRM(modrm)) );
11921
11922      if (epartIsReg(modrm)) {
11923         assign( sV, getXMMReg(eregOfRM(modrm)) );
11924         delta += 3+1;
11925         DIP("pmaddubsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
11926                                  nameXMMReg(gregOfRM(modrm)));
11927      } else {
11928         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
11929         gen_SEGV_if_not_16_aligned( addr );
11930         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
11931         delta += 3+alen;
11932         DIP("pmaddubsw %s,%s\n", dis_buf,
11933                                  nameXMMReg(gregOfRM(modrm)));
11934      }
11935
11936      /* compute dV unsigned x sV signed */
11937      assign( sVoddsSX,
11938              binop(Iop_SarN16x8, mkexpr(sV), mkU8(8)) );
11939      assign( sVevensSX,
11940              binop(Iop_SarN16x8,
11941                    binop(Iop_ShlN16x8, mkexpr(sV), mkU8(8)),
11942                    mkU8(8)) );
11943      assign( dVoddsZX,
11944              binop(Iop_ShrN16x8, mkexpr(dV), mkU8(8)) );
11945      assign( dVevensZX,
11946              binop(Iop_ShrN16x8,
11947                    binop(Iop_ShlN16x8, mkexpr(dV), mkU8(8)),
11948                    mkU8(8)) );
11949
11950      putXMMReg(
11951         gregOfRM(modrm),
11952         binop(Iop_QAdd16Sx8,
11953               binop(Iop_Mul16x8, mkexpr(sVoddsSX), mkexpr(dVoddsZX)),
11954               binop(Iop_Mul16x8, mkexpr(sVevensSX), mkexpr(dVevensZX))
11955         )
11956      );
11957      goto decode_success;
11958   }
11959
11960   /* ***--- these are MMX class insns introduced in SSSE3 ---*** */
11961   /* 0F 38 03 = PHADDSW -- 16x4 signed qadd across from E (mem or
11962      mmx) and G to G (mmx). */
11963   /* 0F 38 07 = PHSUBSW -- 16x4 signed qsub across from E (mem or
11964      mmx) and G to G (mmx). */
11965   /* 0F 38 01 = PHADDW -- 16x4 add across from E (mem or mmx) and G
11966      to G (mmx). */
11967   /* 0F 38 05 = PHSUBW -- 16x4 sub across from E (mem or mmx) and G
11968      to G (mmx). */
11969   /* 0F 38 02 = PHADDD -- 32x2 add across from E (mem or mmx) and G
11970      to G (mmx). */
11971   /* 0F 38 06 = PHSUBD -- 32x2 sub across from E (mem or mmx) and G
11972      to G (mmx). */
11973
11974   if (sz == 4
11975       && insn[0] == 0x0F && insn[1] == 0x38
11976       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
11977           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
11978      HChar* str    = "???";
11979      IROp   opV64  = Iop_INVALID;
11980      IROp   opCatO = Iop_CatOddLanes16x4;
11981      IROp   opCatE = Iop_CatEvenLanes16x4;
11982      IRTemp sV     = newTemp(Ity_I64);
11983      IRTemp dV     = newTemp(Ity_I64);
11984
11985      modrm = insn[3];
11986
11987      switch (insn[2]) {
11988         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
11989         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
11990         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
11991         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
11992         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
11993         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
11994         default: vassert(0);
11995      }
11996      if (insn[2] == 0x02 || insn[2] == 0x06) {
11997         opCatO = Iop_InterleaveHI32x2;
11998         opCatE = Iop_InterleaveLO32x2;
11999      }
12000
12001      do_MMX_preamble();
12002      assign( dV, getMMXReg(gregOfRM(modrm)) );
12003
12004      if (epartIsReg(modrm)) {
12005         assign( sV, getMMXReg(eregOfRM(modrm)) );
12006         delta += 3+1;
12007         DIP("ph%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12008                                  nameMMXReg(gregOfRM(modrm)));
12009      } else {
12010         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12011         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12012         delta += 3+alen;
12013         DIP("ph%s %s,%s\n", str, dis_buf,
12014                                  nameMMXReg(gregOfRM(modrm)));
12015      }
12016
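      /* The CatEven/CatOdd pairings (or, for the 32-bit variants, the
         32x2 interleaves selected above) line up adjacent lanes of the
         two operands so that a single lane-wise opV64 yields the
         horizontal result: the pair sums/differences of the E operand
         land in the high half of the destination and those of the G
         operand in the low half.  E.g. for phaddw the result, per the
         Intel definition and read from high lane to low, is
         [ s3+s2, s1+s0, d3+d2, d1+d0 ]. */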
12017      putMMXReg(
12018         gregOfRM(modrm),
12019         binop(opV64,
12020               binop(opCatE,mkexpr(sV),mkexpr(dV)),
12021               binop(opCatO,mkexpr(sV),mkexpr(dV))
12022         )
12023      );
12024      goto decode_success;
12025   }
12026
12027   /* 66 0F 38 03 = PHADDSW -- 16x8 signed qadd across from E (mem or
12028      xmm) and G to G (xmm). */
12029   /* 66 0F 38 07 = PHSUBSW -- 16x8 signed qsub across from E (mem or
12030      xmm) and G to G (xmm). */
12031   /* 66 0F 38 01 = PHADDW -- 16x8 add across from E (mem or xmm) and
12032      G to G (xmm). */
12033   /* 66 0F 38 05 = PHSUBW -- 16x8 sub across from E (mem or xmm) and
12034      G to G (xmm). */
12035   /* 66 0F 38 02 = PHADDD -- 32x4 add across from E (mem or xmm) and
12036      G to G (xmm). */
12037   /* 66 0F 38 06 = PHSUBD -- 32x4 sub across from E (mem or xmm) and
12038      G to G (xmm). */
12039
12040   if (sz == 2
12041       && insn[0] == 0x0F && insn[1] == 0x38
12042       && (insn[2] == 0x03 || insn[2] == 0x07 || insn[2] == 0x01
12043           || insn[2] == 0x05 || insn[2] == 0x02 || insn[2] == 0x06)) {
12044      HChar* str    = "???";
12045      IROp   opV64  = Iop_INVALID;
12046      IROp   opCatO = Iop_CatOddLanes16x4;
12047      IROp   opCatE = Iop_CatEvenLanes16x4;
12048      IRTemp sV     = newTemp(Ity_V128);
12049      IRTemp dV     = newTemp(Ity_V128);
12050      IRTemp sHi    = newTemp(Ity_I64);
12051      IRTemp sLo    = newTemp(Ity_I64);
12052      IRTemp dHi    = newTemp(Ity_I64);
12053      IRTemp dLo    = newTemp(Ity_I64);
12054
12055      modrm = insn[3];
12056
12057      switch (insn[2]) {
12058         case 0x03: opV64 = Iop_QAdd16Sx4; str = "addsw"; break;
12059         case 0x07: opV64 = Iop_QSub16Sx4; str = "subsw"; break;
12060         case 0x01: opV64 = Iop_Add16x4;   str = "addw";  break;
12061         case 0x05: opV64 = Iop_Sub16x4;   str = "subw";  break;
12062         case 0x02: opV64 = Iop_Add32x2;   str = "addd";  break;
12063         case 0x06: opV64 = Iop_Sub32x2;   str = "subd";  break;
12064         default: vassert(0);
12065      }
12066      if (insn[2] == 0x02 || insn[2] == 0x06) {
12067         opCatO = Iop_InterleaveHI32x2;
12068         opCatE = Iop_InterleaveLO32x2;
12069      }
12070
12071      assign( dV, getXMMReg(gregOfRM(modrm)) );
12072
12073      if (epartIsReg(modrm)) {
12074         assign( sV, getXMMReg( eregOfRM(modrm)) );
12075         DIP("ph%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12076                                  nameXMMReg(gregOfRM(modrm)));
12077         delta += 3+1;
12078      } else {
12079         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12080         gen_SEGV_if_not_16_aligned( addr );
12081         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12082         DIP("ph%s %s,%s\n", str, dis_buf,
12083                             nameXMMReg(gregOfRM(modrm)));
12084         delta += 3+alen;
12085      }
12086
12087      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12088      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12089      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12090      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12091
12092      /* This isn't a particularly efficient way to compute the
12093         result, but at least it avoids a proliferation of IROps,
12094         hence avoids complicating all the backends. */
12095      putXMMReg(
12096         gregOfRM(modrm),
12097         binop(Iop_64HLtoV128,
12098               binop(opV64,
12099                     binop(opCatE,mkexpr(sHi),mkexpr(sLo)),
12100                     binop(opCatO,mkexpr(sHi),mkexpr(sLo))
12101               ),
12102               binop(opV64,
12103                     binop(opCatE,mkexpr(dHi),mkexpr(dLo)),
12104                     binop(opCatO,mkexpr(dHi),mkexpr(dLo))
12105               )
12106         )
12107      );
12108      goto decode_success;
12109   }
12110
12111   /* 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and Scale
12112      (MMX) */
12113   if (sz == 4
12114       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12115      IRTemp sV = newTemp(Ity_I64);
12116      IRTemp dV = newTemp(Ity_I64);
12117
12118      modrm = insn[3];
12119      do_MMX_preamble();
12120      assign( dV, getMMXReg(gregOfRM(modrm)) );
12121
12122      if (epartIsReg(modrm)) {
12123         assign( sV, getMMXReg(eregOfRM(modrm)) );
12124         delta += 3+1;
12125         DIP("pmulhrsw %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12126                                 nameMMXReg(gregOfRM(modrm)));
12127      } else {
12128         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12129         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12130         delta += 3+alen;
12131         DIP("pmulhrsw %s,%s\n", dis_buf,
12132                                 nameMMXReg(gregOfRM(modrm)));
12133      }
12134
12135      putMMXReg(
12136         gregOfRM(modrm),
12137         dis_PMULHRSW_helper( mkexpr(sV), mkexpr(dV) )
12138      );
12139      goto decode_success;
12140   }
12141
12142   /* 66 0F 38 0B = PMULHRSW -- Packed Multiply High with Round and
12143      Scale (XMM) */
12144   if (sz == 2
12145       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x0B) {
12146      IRTemp sV  = newTemp(Ity_V128);
12147      IRTemp dV  = newTemp(Ity_V128);
12148      IRTemp sHi = newTemp(Ity_I64);
12149      IRTemp sLo = newTemp(Ity_I64);
12150      IRTemp dHi = newTemp(Ity_I64);
12151      IRTemp dLo = newTemp(Ity_I64);
12152
12153      modrm = insn[3];
12154      assign( dV, getXMMReg(gregOfRM(modrm)) );
12155
12156      if (epartIsReg(modrm)) {
12157         assign( sV, getXMMReg(eregOfRM(modrm)) );
12158         delta += 3+1;
12159         DIP("pmulhrsw %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12160                                 nameXMMReg(gregOfRM(modrm)));
12161      } else {
12162         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12163         gen_SEGV_if_not_16_aligned( addr );
12164         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12165         delta += 3+alen;
12166         DIP("pmulhrsw %s,%s\n", dis_buf,
12167                                 nameXMMReg(gregOfRM(modrm)));
12168      }
12169
12170      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12171      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12172      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12173      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12174
12175      putXMMReg(
12176         gregOfRM(modrm),
12177         binop(Iop_64HLtoV128,
12178               dis_PMULHRSW_helper( mkexpr(sHi), mkexpr(dHi) ),
12179               dis_PMULHRSW_helper( mkexpr(sLo), mkexpr(dLo) )
12180         )
12181      );
12182      goto decode_success;
12183   }
12184
12185   /* 0F 38 08 = PSIGNB -- Packed Sign 8x8  (MMX) */
12186   /* 0F 38 09 = PSIGNW -- Packed Sign 16x4 (MMX) */
12187   /* 0F 38 0A = PSIGND -- Packed Sign 32x2 (MMX) */
12188   if (sz == 4
12189       && insn[0] == 0x0F && insn[1] == 0x38
12190       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12191      IRTemp sV      = newTemp(Ity_I64);
12192      IRTemp dV      = newTemp(Ity_I64);
12193      HChar* str     = "???";
12194      Int    laneszB = 0;
12195
12196      switch (insn[2]) {
12197         case 0x08: laneszB = 1; str = "b"; break;
12198         case 0x09: laneszB = 2; str = "w"; break;
12199         case 0x0A: laneszB = 4; str = "d"; break;
12200         default: vassert(0);
12201      }
12202
12203      modrm = insn[3];
12204      do_MMX_preamble();
12205      assign( dV, getMMXReg(gregOfRM(modrm)) );
12206
12207      if (epartIsReg(modrm)) {
12208         assign( sV, getMMXReg(eregOfRM(modrm)) );
12209         delta += 3+1;
12210         DIP("psign%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12211                                     nameMMXReg(gregOfRM(modrm)));
12212      } else {
12213         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12214         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12215         delta += 3+alen;
12216         DIP("psign%s %s,%s\n", str, dis_buf,
12217                                     nameMMXReg(gregOfRM(modrm)));
12218      }
12219
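      /* dis_PSIGN_helper (defined earlier in this file) implements the
         per-lane psign rule: a lane of dV is negated if the matching
         lane of sV is negative, zeroed if it is zero, and passed
         through unchanged if it is positive. */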
12220      putMMXReg(
12221         gregOfRM(modrm),
12222         dis_PSIGN_helper( mkexpr(sV), mkexpr(dV), laneszB )
12223      );
12224      goto decode_success;
12225   }
12226
12227   /* 66 0F 38 08 = PSIGNB -- Packed Sign 8x16 (XMM) */
12228   /* 66 0F 38 09 = PSIGNW -- Packed Sign 16x8 (XMM) */
12229   /* 66 0F 38 0A = PSIGND -- Packed Sign 32x4 (XMM) */
12230   if (sz == 2
12231       && insn[0] == 0x0F && insn[1] == 0x38
12232       && (insn[2] == 0x08 || insn[2] == 0x09 || insn[2] == 0x0A)) {
12233      IRTemp sV      = newTemp(Ity_V128);
12234      IRTemp dV      = newTemp(Ity_V128);
12235      IRTemp sHi     = newTemp(Ity_I64);
12236      IRTemp sLo     = newTemp(Ity_I64);
12237      IRTemp dHi     = newTemp(Ity_I64);
12238      IRTemp dLo     = newTemp(Ity_I64);
12239      HChar* str     = "???";
12240      Int    laneszB = 0;
12241
12242      switch (insn[2]) {
12243         case 0x08: laneszB = 1; str = "b"; break;
12244         case 0x09: laneszB = 2; str = "w"; break;
12245         case 0x0A: laneszB = 4; str = "d"; break;
12246         default: vassert(0);
12247      }
12248
12249      modrm = insn[3];
12250      assign( dV, getXMMReg(gregOfRM(modrm)) );
12251
12252      if (epartIsReg(modrm)) {
12253         assign( sV, getXMMReg(eregOfRM(modrm)) );
12254         delta += 3+1;
12255         DIP("psign%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12256                                     nameXMMReg(gregOfRM(modrm)));
12257      } else {
12258         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12259         gen_SEGV_if_not_16_aligned( addr );
12260         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12261         delta += 3+alen;
12262         DIP("psign%s %s,%s\n", str, dis_buf,
12263                                     nameXMMReg(gregOfRM(modrm)));
12264      }
12265
12266      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12267      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12268      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12269      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12270
12271      putXMMReg(
12272         gregOfRM(modrm),
12273         binop(Iop_64HLtoV128,
12274               dis_PSIGN_helper( mkexpr(sHi), mkexpr(dHi), laneszB ),
12275               dis_PSIGN_helper( mkexpr(sLo), mkexpr(dLo), laneszB )
12276         )
12277      );
12278      goto decode_success;
12279   }
12280
12281   /* 0F 38 1C = PABSB -- Packed Absolute Value 8x8  (MMX) */
12282   /* 0F 38 1D = PABSW -- Packed Absolute Value 16x4 (MMX) */
12283   /* 0F 38 1E = PABSD -- Packed Absolute Value 32x2 (MMX) */
12284   if (sz == 4
12285       && insn[0] == 0x0F && insn[1] == 0x38
12286       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12287      IRTemp sV      = newTemp(Ity_I64);
12288      HChar* str     = "???";
12289      Int    laneszB = 0;
12290
12291      switch (insn[2]) {
12292         case 0x1C: laneszB = 1; str = "b"; break;
12293         case 0x1D: laneszB = 2; str = "w"; break;
12294         case 0x1E: laneszB = 4; str = "d"; break;
12295         default: vassert(0);
12296      }
12297
12298      modrm = insn[3];
12299      do_MMX_preamble();
12300
12301      if (epartIsReg(modrm)) {
12302         assign( sV, getMMXReg(eregOfRM(modrm)) );
12303         delta += 3+1;
12304         DIP("pabs%s %s,%s\n", str, nameMMXReg(eregOfRM(modrm)),
12305                                    nameMMXReg(gregOfRM(modrm)));
12306      } else {
12307         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12308         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12309         delta += 3+alen;
12310         DIP("pabs%s %s,%s\n", str, dis_buf,
12311                                    nameMMXReg(gregOfRM(modrm)));
12312      }
12313
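      /* dis_PABS_helper gives the lane-wise absolute value of sV for
         the given lane size.  As with the real instruction, the most
         negative lane value (e.g. 0x80 for pabsb) has no positive
         counterpart and comes back with the same bit pattern. */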
12314      putMMXReg(
12315         gregOfRM(modrm),
12316         dis_PABS_helper( mkexpr(sV), laneszB )
12317      );
12318      goto decode_success;
12319   }
12320
12321   /* 66 0F 38 1C = PABSB -- Packed Absolute Value 8x16 (XMM) */
12322   /* 66 0F 38 1D = PABSW -- Packed Absolute Value 16x8 (XMM) */
12323   /* 66 0F 38 1E = PABSD -- Packed Absolute Value 32x4 (XMM) */
12324   if (sz == 2
12325       && insn[0] == 0x0F && insn[1] == 0x38
12326       && (insn[2] == 0x1C || insn[2] == 0x1D || insn[2] == 0x1E)) {
12327      IRTemp sV      = newTemp(Ity_V128);
12328      IRTemp sHi     = newTemp(Ity_I64);
12329      IRTemp sLo     = newTemp(Ity_I64);
12330      HChar* str     = "???";
12331      Int    laneszB = 0;
12332
12333      switch (insn[2]) {
12334         case 0x1C: laneszB = 1; str = "b"; break;
12335         case 0x1D: laneszB = 2; str = "w"; break;
12336         case 0x1E: laneszB = 4; str = "d"; break;
12337         default: vassert(0);
12338      }
12339
12340      modrm = insn[3];
12341
12342      if (epartIsReg(modrm)) {
12343         assign( sV, getXMMReg(eregOfRM(modrm)) );
12344         delta += 3+1;
12345         DIP("pabs%s %s,%s\n", str, nameXMMReg(eregOfRM(modrm)),
12346                                    nameXMMReg(gregOfRM(modrm)));
12347      } else {
12348         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12349         gen_SEGV_if_not_16_aligned( addr );
12350         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12351         delta += 3+alen;
12352         DIP("pabs%s %s,%s\n", str, dis_buf,
12353                                    nameXMMReg(gregOfRM(modrm)));
12354      }
12355
12356      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12357      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12358
12359      putXMMReg(
12360         gregOfRM(modrm),
12361         binop(Iop_64HLtoV128,
12362               dis_PABS_helper( mkexpr(sHi), laneszB ),
12363               dis_PABS_helper( mkexpr(sLo), laneszB )
12364         )
12365      );
12366      goto decode_success;
12367   }
12368
12369   /* 0F 3A 0F = PALIGNR -- Packed Align Right (MMX) */
12370   if (sz == 4
12371       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12372      IRTemp sV  = newTemp(Ity_I64);
12373      IRTemp dV  = newTemp(Ity_I64);
12374      IRTemp res = newTemp(Ity_I64);
12375
12376      modrm = insn[3];
12377      do_MMX_preamble();
12378      assign( dV, getMMXReg(gregOfRM(modrm)) );
12379
12380      if (epartIsReg(modrm)) {
12381         assign( sV, getMMXReg(eregOfRM(modrm)) );
12382         d32 = (UInt)insn[3+1];
12383         delta += 3+1+1;
12384         DIP("palignr $%d,%s,%s\n",  (Int)d32,
12385                                     nameMMXReg(eregOfRM(modrm)),
12386                                     nameMMXReg(gregOfRM(modrm)));
12387      } else {
12388         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12389         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12390         d32 = (UInt)insn[3+alen];
12391         delta += 3+alen+1;
12392         DIP("palignr $%d,%s,%s\n", (Int)d32,
12393                                   dis_buf,
12394                                   nameMMXReg(gregOfRM(modrm)));
12395      }
12396
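      /* Conceptually palignr forms the 16-byte value dV:sV and takes
         the 8 bytes starting at byte offset d32.  The cases below do
         that directly with 64-bit shifts and ORs; offsets of 16 or
         more shift everything out and give zero. */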
12397      if (d32 == 0) {
12398         assign( res, mkexpr(sV) );
12399      }
12400      else if (d32 >= 1 && d32 <= 7) {
12401         assign(res,
12402                binop(Iop_Or64,
12403                      binop(Iop_Shr64, mkexpr(sV), mkU8(8*d32)),
12404                      binop(Iop_Shl64, mkexpr(dV), mkU8(8*(8-d32))
12405                     )));
12406      }
12407      else if (d32 == 8) {
12408        assign( res, mkexpr(dV) );
12409      }
12410      else if (d32 >= 9 && d32 <= 15) {
12411         assign( res, binop(Iop_Shr64, mkexpr(dV), mkU8(8*(d32-8))) );
12412      }
12413      else if (d32 >= 16 && d32 <= 255) {
12414         assign( res, mkU64(0) );
12415      }
12416      else
12417         vassert(0);
12418
12419      putMMXReg( gregOfRM(modrm), mkexpr(res) );
12420      goto decode_success;
12421   }
12422
12423   /* 66 0F 3A 0F = PALIGNR -- Packed Align Right (XMM) */
12424   if (sz == 2
12425       && insn[0] == 0x0F && insn[1] == 0x3A && insn[2] == 0x0F) {
12426      IRTemp sV  = newTemp(Ity_V128);
12427      IRTemp dV  = newTemp(Ity_V128);
12428      IRTemp sHi = newTemp(Ity_I64);
12429      IRTemp sLo = newTemp(Ity_I64);
12430      IRTemp dHi = newTemp(Ity_I64);
12431      IRTemp dLo = newTemp(Ity_I64);
12432      IRTemp rHi = newTemp(Ity_I64);
12433      IRTemp rLo = newTemp(Ity_I64);
12434
12435      modrm = insn[3];
12436      assign( dV, getXMMReg(gregOfRM(modrm)) );
12437
12438      if (epartIsReg(modrm)) {
12439         assign( sV, getXMMReg(eregOfRM(modrm)) );
12440         d32 = (UInt)insn[3+1];
12441         delta += 3+1+1;
12442         DIP("palignr $%d,%s,%s\n", (Int)d32,
12443                                    nameXMMReg(eregOfRM(modrm)),
12444                                    nameXMMReg(gregOfRM(modrm)));
12445      } else {
12446         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12447         gen_SEGV_if_not_16_aligned( addr );
12448         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12449         d32 = (UInt)insn[3+alen];
12450         delta += 3+alen+1;
12451         DIP("palignr $%d,%s,%s\n", (Int)d32,
12452                                    dis_buf,
12453                                    nameXMMReg(gregOfRM(modrm)));
12454      }
12455
12456      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12457      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12458      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12459      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12460
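      /* Same idea as the MMX case, but on the 32-byte value dV:sV,
         keeping the low 16 bytes of the byte-shifted result.  The work
         is done on 64-bit halves, with dis_PALIGNR_XMM_helper doing
         the shift-and-combine of two adjacent 64-bit pieces. */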
12461      if (d32 == 0) {
12462         assign( rHi, mkexpr(sHi) );
12463         assign( rLo, mkexpr(sLo) );
12464      }
12465      else if (d32 >= 1 && d32 <= 7) {
12466         assign( rHi, dis_PALIGNR_XMM_helper(dLo, sHi, d32) );
12467         assign( rLo, dis_PALIGNR_XMM_helper(sHi, sLo, d32) );
12468      }
12469      else if (d32 == 8) {
12470         assign( rHi, mkexpr(dLo) );
12471         assign( rLo, mkexpr(sHi) );
12472      }
12473      else if (d32 >= 9 && d32 <= 15) {
12474         assign( rHi, dis_PALIGNR_XMM_helper(dHi, dLo, d32-8) );
12475         assign( rLo, dis_PALIGNR_XMM_helper(dLo, sHi, d32-8) );
12476      }
12477      else if (d32 == 16) {
12478         assign( rHi, mkexpr(dHi) );
12479         assign( rLo, mkexpr(dLo) );
12480      }
12481      else if (d32 >= 17 && d32 <= 23) {
12482         assign( rHi, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-16))) );
12483         assign( rLo, dis_PALIGNR_XMM_helper(dHi, dLo, d32-16) );
12484      }
12485      else if (d32 == 24) {
12486         assign( rHi, mkU64(0) );
12487         assign( rLo, mkexpr(dHi) );
12488      }
12489      else if (d32 >= 25 && d32 <= 31) {
12490         assign( rHi, mkU64(0) );
12491         assign( rLo, binop(Iop_Shr64, mkexpr(dHi), mkU8(8*(d32-24))) );
12492      }
12493      else if (d32 >= 32 && d32 <= 255) {
12494         assign( rHi, mkU64(0) );
12495         assign( rLo, mkU64(0) );
12496      }
12497      else
12498         vassert(0);
12499
12500      putXMMReg(
12501         gregOfRM(modrm),
12502         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12503      );
12504      goto decode_success;
12505   }
12506
12507   /* 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x8 (MMX) */
12508   if (sz == 4
12509       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12510      IRTemp sV      = newTemp(Ity_I64);
12511      IRTemp dV      = newTemp(Ity_I64);
12512
12513      modrm = insn[3];
12514      do_MMX_preamble();
12515      assign( dV, getMMXReg(gregOfRM(modrm)) );
12516
12517      if (epartIsReg(modrm)) {
12518         assign( sV, getMMXReg(eregOfRM(modrm)) );
12519         delta += 3+1;
12520         DIP("pshufb %s,%s\n", nameMMXReg(eregOfRM(modrm)),
12521                               nameMMXReg(gregOfRM(modrm)));
12522      } else {
12523         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12524         assign( sV, loadLE(Ity_I64, mkexpr(addr)) );
12525         delta += 3+alen;
12526         DIP("pshufb %s,%s\n", dis_buf,
12527                               nameMMXReg(gregOfRM(modrm)));
12528      }
12529
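      /* Per-byte semantics: result[i] is zero if bit 7 of the index
         byte sV[i] is set, and dV[ sV[i] & 7 ] otherwise -- hence the
         And with 0x07..07 to form the permutation indices and the
         Not(SarN8x8(sV,7)) mask to clear the flagged lanes. */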
12530      putMMXReg(
12531         gregOfRM(modrm),
12532         binop(
12533            Iop_And64,
12534            /* permute the lanes */
12535            binop(
12536               Iop_Perm8x8,
12537               mkexpr(dV),
12538               binop(Iop_And64, mkexpr(sV), mkU64(0x0707070707070707ULL))
12539            ),
12540            /* mask off lanes which have (index & 0x80) == 0x80 */
12541            unop(Iop_Not64, binop(Iop_SarN8x8, mkexpr(sV), mkU8(7)))
12542         )
12543      );
12544      goto decode_success;
12545   }
12546
12547   /* 66 0F 38 00 = PSHUFB -- Packed Shuffle Bytes 8x16 (XMM) */
12548   if (sz == 2
12549       && insn[0] == 0x0F && insn[1] == 0x38 && insn[2] == 0x00) {
12550      IRTemp sV         = newTemp(Ity_V128);
12551      IRTemp dV         = newTemp(Ity_V128);
12552      IRTemp sHi        = newTemp(Ity_I64);
12553      IRTemp sLo        = newTemp(Ity_I64);
12554      IRTemp dHi        = newTemp(Ity_I64);
12555      IRTemp dLo        = newTemp(Ity_I64);
12556      IRTemp rHi        = newTemp(Ity_I64);
12557      IRTemp rLo        = newTemp(Ity_I64);
12558      IRTemp sevens     = newTemp(Ity_I64);
12559      IRTemp mask0x80hi = newTemp(Ity_I64);
12560      IRTemp mask0x80lo = newTemp(Ity_I64);
12561      IRTemp maskBit3hi = newTemp(Ity_I64);
12562      IRTemp maskBit3lo = newTemp(Ity_I64);
12563      IRTemp sAnd7hi    = newTemp(Ity_I64);
12564      IRTemp sAnd7lo    = newTemp(Ity_I64);
12565      IRTemp permdHi    = newTemp(Ity_I64);
12566      IRTemp permdLo    = newTemp(Ity_I64);
12567
12568      modrm = insn[3];
12569      assign( dV, getXMMReg(gregOfRM(modrm)) );
12570
12571      if (epartIsReg(modrm)) {
12572         assign( sV, getXMMReg(eregOfRM(modrm)) );
12573         delta += 3+1;
12574         DIP("pshufb %s,%s\n", nameXMMReg(eregOfRM(modrm)),
12575                               nameXMMReg(gregOfRM(modrm)));
12576      } else {
12577         addr = disAMode ( &alen, sorb, delta+3, dis_buf );
12578         gen_SEGV_if_not_16_aligned( addr );
12579         assign( sV, loadLE(Ity_V128, mkexpr(addr)) );
12580         delta += 3+alen;
12581         DIP("pshufb %s,%s\n", dis_buf,
12582                               nameXMMReg(gregOfRM(modrm)));
12583      }
12584
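      /* The XMM form indexes over 16 bytes, so bit 3 of each index
         byte selects which 64-bit half of dV to read from.  Each half
         of the result is therefore built by permuting both halves of
         dV with the low 3 index bits and blending on bit 3 (maskBit3),
         then zeroing lanes whose index has bit 7 set (mask0x80), as
         spelled out in the pseudocode below. */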
12585      assign( dHi, unop(Iop_V128HIto64, mkexpr(dV)) );
12586      assign( dLo, unop(Iop_V128to64,   mkexpr(dV)) );
12587      assign( sHi, unop(Iop_V128HIto64, mkexpr(sV)) );
12588      assign( sLo, unop(Iop_V128to64,   mkexpr(sV)) );
12589
12590      assign( sevens, mkU64(0x0707070707070707ULL) );
12591
12592      /*
12593      mask0x80hi = Not(SarN8x8(sHi,7))
12594      maskBit3hi = SarN8x8(ShlN8x8(sHi,4),7)
12595      sAnd7hi    = And(sHi,sevens)
12596      permdHi    = Or( And(Perm8x8(dHi,sAnd7hi),maskBit3hi),
12597                       And(Perm8x8(dLo,sAnd7hi),Not(maskBit3hi)) )
12598      rHi        = And(permdHi,mask0x80hi)
12599      */
12600      assign(
12601         mask0x80hi,
12602         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sHi),mkU8(7))));
12603
12604      assign(
12605         maskBit3hi,
12606         binop(Iop_SarN8x8,
12607               binop(Iop_ShlN8x8,mkexpr(sHi),mkU8(4)),
12608               mkU8(7)));
12609
12610      assign(sAnd7hi, binop(Iop_And64,mkexpr(sHi),mkexpr(sevens)));
12611
12612      assign(
12613         permdHi,
12614         binop(
12615            Iop_Or64,
12616            binop(Iop_And64,
12617                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7hi)),
12618                  mkexpr(maskBit3hi)),
12619            binop(Iop_And64,
12620                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7hi)),
12621                  unop(Iop_Not64,mkexpr(maskBit3hi))) ));
12622
12623      assign(rHi, binop(Iop_And64,mkexpr(permdHi),mkexpr(mask0x80hi)) );
12624
12625      /* And the same for the lower half of the result.  What fun. */
12626
12627      assign(
12628         mask0x80lo,
12629         unop(Iop_Not64, binop(Iop_SarN8x8,mkexpr(sLo),mkU8(7))));
12630
12631      assign(
12632         maskBit3lo,
12633         binop(Iop_SarN8x8,
12634               binop(Iop_ShlN8x8,mkexpr(sLo),mkU8(4)),
12635               mkU8(7)));
12636
12637      assign(sAnd7lo, binop(Iop_And64,mkexpr(sLo),mkexpr(sevens)));
12638
12639      assign(
12640         permdLo,
12641         binop(
12642            Iop_Or64,
12643            binop(Iop_And64,
12644                  binop(Iop_Perm8x8,mkexpr(dHi),mkexpr(sAnd7lo)),
12645                  mkexpr(maskBit3lo)),
12646            binop(Iop_And64,
12647                  binop(Iop_Perm8x8,mkexpr(dLo),mkexpr(sAnd7lo)),
12648                  unop(Iop_Not64,mkexpr(maskBit3lo))) ));
12649
12650      assign(rLo, binop(Iop_And64,mkexpr(permdLo),mkexpr(mask0x80lo)) );
12651
12652      putXMMReg(
12653         gregOfRM(modrm),
12654         binop(Iop_64HLtoV128, mkexpr(rHi), mkexpr(rLo))
12655      );
12656      goto decode_success;
12657   }
12658
12659   /* 0F 38 F0 = MOVBE m16/32(E), r16/32(G) */
12660   /* 0F 38 F1 = MOVBE r16/32(G), m16/32(E) */
12661   if ((sz == 2 || sz == 4)
12662       && insn[0] == 0x0F && insn[1] == 0x38
12663       && (insn[2] == 0xF0 || insn[2] == 0xF1)
12664       && !epartIsReg(insn[3])) {
12665
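      /* movbe is a load or store combined with a byte-order swap; it
         has no register-register form, hence the !epartIsReg test in
         the guard above.  math_BSWAP does the byte reversal for the
         given operand size. */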
12666      modrm = insn[3];
12667      addr = disAMode(&alen, sorb, delta + 3, dis_buf);
12668      delta += 3 + alen;
12669      ty = szToITy(sz);
12670      IRTemp src = newTemp(ty);
12671
12672      if (insn[2] == 0xF0) { /* LOAD */
12673         assign(src, loadLE(ty, mkexpr(addr)));
12674         IRTemp dst = math_BSWAP(src, ty);
12675         putIReg(sz, gregOfRM(modrm), mkexpr(dst));
12676         DIP("movbe %s,%s\n", dis_buf, nameIReg(sz, gregOfRM(modrm)));
12677      } else { /* STORE */
12678         assign(src, getIReg(sz, gregOfRM(modrm)));
12679         IRTemp dst = math_BSWAP(src, ty);
12680         storeLE(mkexpr(addr), mkexpr(dst));
12681         DIP("movbe %s,%s\n", nameIReg(sz, gregOfRM(modrm)), dis_buf);
12682      }
12683      goto decode_success;
12684   }
12685
12686   /* ---------------------------------------------------- */
12687   /* --- end of the SSSE3 decoder.                    --- */
12688   /* ---------------------------------------------------- */
12689
12690   /* ---------------------------------------------------- */
12691   /* --- start of the SSE4 decoder                    --- */
12692   /* ---------------------------------------------------- */
12693
12694   /* 66 0F 3A 0B /r ib = ROUNDSD imm8, xmm2/m64, xmm1
12695      (Partial implementation only -- only deal with cases where
12696      the rounding mode is specified directly by the immediate byte.)
12697      66 0F 3A 0A /r ib = ROUNDSS imm8, xmm2/m32, xmm1
12698      (Limitations ditto)
12699   */
12700   if (sz == 2
12701       && insn[0] == 0x0F && insn[1] == 0x3A
12702       && (/*insn[2] == 0x0B || */insn[2] == 0x0A)) {
12703
12704      Bool   isD = insn[2] == 0x0B;
12705      IRTemp src = newTemp(isD ? Ity_F64 : Ity_F32);
12706      IRTemp res = newTemp(isD ? Ity_F64 : Ity_F32);
12707      Int    imm = 0;
12708
12709      modrm = insn[3];
12710
12711      if (epartIsReg(modrm)) {
12712         assign( src,
12713                 isD ? getXMMRegLane64F( eregOfRM(modrm), 0 )
12714                     : getXMMRegLane32F( eregOfRM(modrm), 0 ) );
12715         imm = insn[3+1];
12716         if (imm & ~3) goto decode_failure;
12717         delta += 3+1+1;
12718         DIP( "rounds%c $%d,%s,%s\n",
12719              isD ? 'd' : 's',
12720              imm, nameXMMReg( eregOfRM(modrm) ),
12721                   nameXMMReg( gregOfRM(modrm) ) );
12722      } else {
12723         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12724         assign( src, loadLE( isD ? Ity_F64 : Ity_F32, mkexpr(addr) ));
12725         imm = insn[3+alen];
12726         if (imm & ~3) goto decode_failure;
12727         delta += 3+alen+1;
12728         DIP( "rounds%c $%d,%s,%s\n", isD ? 'd' : 's',
12729              imm, dis_buf, nameXMMReg( gregOfRM(modrm) ) );
12730      }
12731
12732      /* (imm & 3) contains an Intel-encoded rounding mode.  Because
12733         that encoding is the same as the encoding for IRRoundingMode,
12734         we can use that value directly in the IR as a rounding
12735         mode. */
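      /* For reference, that common encoding is: 0 = round to nearest
         (even), 1 = round towards -infinity, 2 = round towards
         +infinity, 3 = round towards zero. */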
12736      assign(res, binop(isD ? Iop_RoundF64toInt : Iop_RoundF32toInt,
12737                  mkU32(imm & 3), mkexpr(src)) );
12738
12739      if (isD)
12740         putXMMRegLane64F( gregOfRM(modrm), 0, mkexpr(res) );
12741      else
12742         putXMMRegLane32F( gregOfRM(modrm), 0, mkexpr(res) );
12743
12744      goto decode_success;
12745   }
12746
12747   /* F3 0F BD -- LZCNT (count leading zeroes).  An AMD extension,
12748      which we can only decode if we're sure this is an AMD cpu that
12749      supports LZCNT, since otherwise it's BSR, which behaves
12750      differently. */
12751   if (insn[0] == 0xF3 && insn[1] == 0x0F && insn[2] == 0xBD
12752       && 0 != (archinfo->hwcaps & VEX_HWCAPS_X86_LZCNT)) {
12753      vassert(sz == 2 || sz == 4);
12754      /*IRType*/ ty  = szToITy(sz);
12755      IRTemp     src = newTemp(ty);
12756      modrm = insn[3];
12757      if (epartIsReg(modrm)) {
12758         assign(src, getIReg(sz, eregOfRM(modrm)));
12759         delta += 3+1;
12760         DIP("lzcnt%c %s, %s\n", nameISize(sz),
12761             nameIReg(sz, eregOfRM(modrm)),
12762             nameIReg(sz, gregOfRM(modrm)));
12763      } else {
12764         addr = disAMode( &alen, sorb, delta+3, dis_buf );
12765         assign(src, loadLE(ty, mkexpr(addr)));
12766         delta += 3+alen;
12767         DIP("lzcnt%c %s, %s\n", nameISize(sz), dis_buf,
12768             nameIReg(sz, gregOfRM(modrm)));
12769      }
12770
12771      IRTemp res = gen_LZCNT(ty, src);
12772      putIReg(sz, gregOfRM(modrm), mkexpr(res));
12773
12774      // Update flags.  This is pretty lame .. perhaps can do better
12775      // if this turns out to be performance critical.
12776      // O S A P are cleared.  Z is set if RESULT == 0.
12777      // C is set if SRC is zero.
12778      IRTemp src32 = newTemp(Ity_I32);
12779      IRTemp res32 = newTemp(Ity_I32);
12780      assign(src32, widenUto32(mkexpr(src)));
12781      assign(res32, widenUto32(mkexpr(res)));
12782
12783      IRTemp oszacp = newTemp(Ity_I32);
12784      assign(
12785         oszacp,
12786         binop(Iop_Or32,
12787               binop(Iop_Shl32,
12788                     unop(Iop_1Uto32,
12789                          binop(Iop_CmpEQ32, mkexpr(res32), mkU32(0))),
12790                     mkU8(X86G_CC_SHIFT_Z)),
12791               binop(Iop_Shl32,
12792                     unop(Iop_1Uto32,
12793                          binop(Iop_CmpEQ32, mkexpr(src32), mkU32(0))),
12794                     mkU8(X86G_CC_SHIFT_C))
12795         )
12796      );
12797
12798      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12799      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12800      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12801      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(oszacp) ));
12802
12803      goto decode_success;
12804   }
12805
12806   /* ---------------------------------------------------- */
12807   /* --- end of the SSE4 decoder                      --- */
12808   /* ---------------------------------------------------- */
12809
12810   after_sse_decoders:
12811
12812   /* ---------------------------------------------------- */
12813   /* --- deal with misc 0x67 pfxs (addr size override) -- */
12814   /* ---------------------------------------------------- */
12815
12816   /* 67 E3 = JCXZ (for JECXZ see below) */
12817   if (insn[0] == 0x67 && insn[1] == 0xE3 && sz == 4) {
12818      delta += 2;
12819      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
12820      delta ++;
12821      stmt( IRStmt_Exit(
12822               binop(Iop_CmpEQ16, getIReg(2,R_ECX), mkU16(0)),
12823               Ijk_Boring,
12824               IRConst_U32(d32),
12825               OFFB_EIP
12826            ));
12827      DIP("jcxz 0x%x\n", d32);
12828      goto decode_success;
12829   }
12830
12831   /* ---------------------------------------------------- */
12832   /* --- start of the baseline insn decoder            -- */
12833   /* ---------------------------------------------------- */
12834
12835   /* Get the primary opcode. */
12836   opc = getIByte(delta); delta++;
12837
12838   /* We get here if the current insn isn't SSE, or this CPU doesn't
12839      support SSE. */
12840
12841   switch (opc) {
12842
12843   /* ------------------------ Control flow --------------- */
12844
12845   case 0xC2: /* RET imm16 */
12846      d32 = getUDisp16(delta);
12847      delta += 2;
12848      dis_ret(&dres, d32);
12849      DIP("ret %d\n", (Int)d32);
12850      break;
12851   case 0xC3: /* RET */
12852      dis_ret(&dres, 0);
12853      DIP("ret\n");
12854      break;
12855
12856   case 0xCF: /* IRET */
12857      /* Note, this is an extremely kludgey and limited implementation
12858         of iret.  All it really does is:
12859            popl %EIP; popl %CS; popl %EFLAGS.
12860         %CS is set but ignored (as it is in (eg) popw %cs). */
12861      t1 = newTemp(Ity_I32); /* ESP */
12862      t2 = newTemp(Ity_I32); /* new EIP */
12863      t3 = newTemp(Ity_I32); /* new CS */
12864      t4 = newTemp(Ity_I32); /* new EFLAGS */
12865      assign(t1, getIReg(4,R_ESP));
12866      assign(t2, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(0) )));
12867      assign(t3, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(4) )));
12868      assign(t4, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t1),mkU32(8) )));
12869      /* Get stuff off stack */
12870      putIReg(4, R_ESP,binop(Iop_Add32, mkexpr(t1), mkU32(12)));
12871      /* set %CS (which is ignored anyway) */
12872      putSReg( R_CS, unop(Iop_32to16, mkexpr(t3)) );
12873      /* set %EFLAGS */
12874      set_EFLAGS_from_value( t4, False/*!emit_AC_emwarn*/, 0/*unused*/ );
12875      /* goto new EIP value */
12876      jmp_treg(&dres, Ijk_Ret, t2);
12877      vassert(dres.whatNext == Dis_StopHere);
12878      DIP("iret (very kludgey)\n");
12879      break;
12880
12881   case 0xE8: /* CALL J4 */
12882      d32 = getUDisp32(delta); delta += 4;
12883      d32 += (guest_EIP_bbstart+delta);
12884      /* (guest_EIP_bbstart+delta) == return-to addr, d32 == call-to addr */
12885      if (d32 == guest_EIP_bbstart+delta && getIByte(delta) >= 0x58
12886                                         && getIByte(delta) <= 0x5F) {
12887         /* Specially treat the position-independent-code idiom
12888                 call X
12889              X: popl %reg
12890            as
12891                 movl %eip, %reg.
12892            since this generates better code, but for no other reason. */
12893         Int archReg = getIByte(delta) - 0x58;
12894         /* vex_printf("-- fPIC thingy\n"); */
12895         putIReg(4, archReg, mkU32(guest_EIP_bbstart+delta));
12896         delta++; /* Step over the POP */
12897         DIP("call 0x%x ; popl %s\n",d32,nameIReg(4,archReg));
12898      } else {
12899         /* The normal sequence for a call. */
12900         t1 = newTemp(Ity_I32);
12901         assign(t1, binop(Iop_Sub32, getIReg(4,R_ESP), mkU32(4)));
12902         putIReg(4, R_ESP, mkexpr(t1));
12903         storeLE( mkexpr(t1), mkU32(guest_EIP_bbstart+delta));
12904         if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32 )) {
12905            /* follow into the call target. */
12906            dres.whatNext   = Dis_ResteerU;
12907            dres.continueAt = (Addr64)(Addr32)d32;
12908         } else {
12909            jmp_lit(&dres, Ijk_Call, d32);
12910            vassert(dres.whatNext == Dis_StopHere);
12911         }
12912         DIP("call 0x%x\n",d32);
12913      }
12914      break;
12915
12916//--    case 0xC8: /* ENTER */
12917//--       d32 = getUDisp16(eip); eip += 2;
12918//--       abyte = getIByte(delta); delta++;
12919//--
12920//--       vg_assert(sz == 4);
12921//--       vg_assert(abyte == 0);
12922//--
12923//--       t1 = newTemp(cb); t2 = newTemp(cb);
12924//--       uInstr2(cb, GET,   sz, ArchReg, R_EBP, TempReg, t1);
12925//--       uInstr2(cb, GET,    4, ArchReg, R_ESP, TempReg, t2);
12926//--       uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
12927//--       uLiteral(cb, sz);
12928//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
12929//--       uInstr2(cb, STORE,  4, TempReg, t1,    TempReg, t2);
12930//--       uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_EBP);
12931//--       if (d32) {
12932//--          uInstr2(cb, SUB,    4, Literal, 0,     TempReg, t2);
12933//--          uLiteral(cb, d32);
12934//--          uInstr2(cb, PUT,    4, TempReg, t2,    ArchReg, R_ESP);
12935//--       }
12936//--       DIP("enter 0x%x, 0x%x", d32, abyte);
12937//--       break;
12938
12939   case 0xC9: /* LEAVE */
12940      vassert(sz == 4);
12941      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
12942      assign(t1, getIReg(4,R_EBP));
12943      /* First PUT ESP looks redundant, but need it because ESP must
12944         always be up-to-date for Memcheck to work... */
12945      putIReg(4, R_ESP, mkexpr(t1));
12946      assign(t2, loadLE(Ity_I32,mkexpr(t1)));
12947      putIReg(4, R_EBP, mkexpr(t2));
12948      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(4)) );
12949      DIP("leave\n");
12950      break;
12951
12952   /* ---------------- Misc weird-ass insns --------------- */
12953
12954   case 0x27: /* DAA */
12955   case 0x2F: /* DAS */
12956   case 0x37: /* AAA */
12957   case 0x3F: /* AAS */
12958      /* An ugly implementation for some ugly instructions.  Oh
12959         well. */
12960      if (sz != 4) goto decode_failure;
12961      t1 = newTemp(Ity_I32);
12962      t2 = newTemp(Ity_I32);
12963      /* Make up a 32-bit value (t1), with the old value of AX in the
12964         bottom 16 bits, and the old OSZACP bitmask in the upper 16
12965         bits. */
12966      assign(t1,
12967             binop(Iop_16HLto32,
12968                   unop(Iop_32to16,
12969                        mk_x86g_calculate_eflags_all()),
12970                   getIReg(2, R_EAX)
12971            ));
12972      /* Call the helper fn, to get a new AX and OSZACP value, and
12973         poke both back into the guest state.  Also pass the helper
12974         the actual opcode so it knows which of the 4 instructions it
12975         is doing the computation for. */
12976      vassert(opc == 0x27 || opc == 0x2F || opc == 0x37 || opc == 0x3F);
12977      assign(t2,
12978              mkIRExprCCall(
12979                 Ity_I32, 0/*regparm*/, "x86g_calculate_daa_das_aaa_aas",
12980                 &x86g_calculate_daa_das_aaa_aas,
12981                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
12982            ));
12983     putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
12984
12985     stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
12986     stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
12987     stmt( IRStmt_Put( OFFB_CC_DEP1,
12988                       binop(Iop_And32,
12989                             binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
12990                             mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
12991                                    | X86G_CC_MASK_A | X86G_CC_MASK_Z
12992                                    | X86G_CC_MASK_S| X86G_CC_MASK_O )
12993                            )
12994                      )
12995         );
12996     /* Set NDEP even though it isn't used.  This makes redundant-PUT
12997        elimination of previous stores to this field work better. */
12998     stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
12999     switch (opc) {
13000        case 0x27: DIP("daa\n"); break;
13001        case 0x2F: DIP("das\n"); break;
13002        case 0x37: DIP("aaa\n"); break;
13003        case 0x3F: DIP("aas\n"); break;
13004        default: vassert(0);
13005     }
13006     break;
13007
13008   case 0xD4: /* AAM */
13009   case 0xD5: /* AAD */
13010      d32 = getIByte(delta); delta++;
13011      if (sz != 4 || d32 != 10) goto decode_failure;
13012      t1 = newTemp(Ity_I32);
13013      t2 = newTemp(Ity_I32);
13014      /* Make up a 32-bit value (t1), with the old value of AX in the
13015         bottom 16 bits, and the old OSZACP bitmask in the upper 16
13016         bits. */
13017      assign(t1,
13018             binop(Iop_16HLto32,
13019                   unop(Iop_32to16,
13020                        mk_x86g_calculate_eflags_all()),
13021                   getIReg(2, R_EAX)
13022            ));
13023      /* Call the helper fn, to get a new AX and OSZACP value, and
13024         poke both back into the guest state.  Also pass the helper
13025         the actual opcode so it knows which of the 2 instructions it
13026         is doing the computation for. */
13027      assign(t2,
13028              mkIRExprCCall(
13029                 Ity_I32, 0/*regparm*/, "x86g_calculate_aad_aam",
13030                 &x86g_calculate_aad_aam,
13031                 mkIRExprVec_2( mkexpr(t1), mkU32( opc & 0xFF) )
13032            ));
13033      putIReg(2, R_EAX, unop(Iop_32to16, mkexpr(t2) ));
13034
13035      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
13036      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
13037      stmt( IRStmt_Put( OFFB_CC_DEP1,
13038                        binop(Iop_And32,
13039                              binop(Iop_Shr32, mkexpr(t2), mkU8(16)),
13040                              mkU32( X86G_CC_MASK_C | X86G_CC_MASK_P
13041                                     | X86G_CC_MASK_A | X86G_CC_MASK_Z
13042                                     | X86G_CC_MASK_S| X86G_CC_MASK_O )
13043                             )
13044                       )
13045          );
13046      /* Set NDEP even though it isn't used.  This makes
13047         redundant-PUT elimination of previous stores to this field
13048         work better. */
13049      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
13050
13051      DIP(opc == 0xD4 ? "aam\n" : "aad\n");
13052      break;
13053
13054   /* ------------------------ CWD/CDQ -------------------- */
13055
13056   case 0x98: /* CBW */
13057      if (sz == 4) {
13058         putIReg(4, R_EAX, unop(Iop_16Sto32, getIReg(2, R_EAX)));
13059         DIP("cwde\n");
13060      } else {
13061         vassert(sz == 2);
13062         putIReg(2, R_EAX, unop(Iop_8Sto16, getIReg(1, R_EAX)));
13063         DIP("cbw\n");
13064      }
13065      break;
13066
13067   case 0x99: /* CWD/CDQ */
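      /* Sign-extend AX into DX:AX (cwd) or EAX into EDX:EAX (cdq) by
         filling the high register with copies of the accumulator's
         sign bit, i.e. an arithmetic right shift by 15 or 31. */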
13068      ty = szToITy(sz);
13069      putIReg(sz, R_EDX,
13070                  binop(mkSizedOp(ty,Iop_Sar8),
13071                        getIReg(sz, R_EAX),
13072                        mkU8(sz == 2 ? 15 : 31)) );
13073      DIP(sz == 2 ? "cwd\n" : "cdq\n");
13074      break;
13075
13076   /* ------------------------ FPU ops -------------------- */
13077
13078   case 0x9E: /* SAHF */
13079      codegen_SAHF();
13080      DIP("sahf\n");
13081      break;
13082
13083   case 0x9F: /* LAHF */
13084      codegen_LAHF();
13085      DIP("lahf\n");
13086      break;
13087
13088   case 0x9B: /* FWAIT */
13089      /* ignore? */
13090      DIP("fwait\n");
13091      break;
13092
13093   case 0xD8:
13094   case 0xD9:
13095   case 0xDA:
13096   case 0xDB:
13097   case 0xDC:
13098   case 0xDD:
13099   case 0xDE:
13100   case 0xDF: {
13101      Int  delta0    = delta;
13102      Bool decode_OK = False;
13103      delta = dis_FPU ( &decode_OK, sorb, delta );
13104      if (!decode_OK) {
13105         delta = delta0;
13106         goto decode_failure;
13107      }
13108      break;
13109   }
13110
13111   /* ------------------------ INC & DEC ------------------ */
13112
13113   case 0x40: /* INC eAX */
13114   case 0x41: /* INC eCX */
13115   case 0x42: /* INC eDX */
13116   case 0x43: /* INC eBX */
13117   case 0x44: /* INC eSP */
13118   case 0x45: /* INC eBP */
13119   case 0x46: /* INC eSI */
13120   case 0x47: /* INC eDI */
13121      vassert(sz == 2 || sz == 4);
13122      ty = szToITy(sz);
13123      t1 = newTemp(ty);
13124      assign( t1, binop(mkSizedOp(ty,Iop_Add8),
13125                        getIReg(sz, (UInt)(opc - 0x40)),
13126                        mkU(ty,1)) );
13127      setFlags_INC_DEC( True, t1, ty );
13128      putIReg(sz, (UInt)(opc - 0x40), mkexpr(t1));
13129      DIP("inc%c %s\n", nameISize(sz), nameIReg(sz,opc-0x40));
13130      break;
13131
13132   case 0x48: /* DEC eAX */
13133   case 0x49: /* DEC eCX */
13134   case 0x4A: /* DEC eDX */
13135   case 0x4B: /* DEC eBX */
13136   case 0x4C: /* DEC eSP */
13137   case 0x4D: /* DEC eBP */
13138   case 0x4E: /* DEC eSI */
13139   case 0x4F: /* DEC eDI */
13140      vassert(sz == 2 || sz == 4);
13141      ty = szToITy(sz);
13142      t1 = newTemp(ty);
13143      assign( t1, binop(mkSizedOp(ty,Iop_Sub8),
13144                        getIReg(sz, (UInt)(opc - 0x48)),
13145                        mkU(ty,1)) );
13146      setFlags_INC_DEC( False, t1, ty );
13147      putIReg(sz, (UInt)(opc - 0x48), mkexpr(t1));
13148      DIP("dec%c %s\n", nameISize(sz), nameIReg(sz,opc-0x48));
13149      break;
13150
13151   /* ------------------------ INT ------------------------ */
13152
13153   case 0xCC: /* INT 3 */
13154      jmp_lit(&dres, Ijk_SigTRAP, ((Addr32)guest_EIP_bbstart)+delta);
13155      vassert(dres.whatNext == Dis_StopHere);
13156      DIP("int $0x3\n");
13157      break;
13158
13159   case 0xCD: /* INT imm8 */
13160      d32 = getIByte(delta); delta++;
13161
13162      /* For any of the cases where we emit a jump (that is, for all
13163         currently handled cases), it's important that all ArchRegs
13164         carry their up-to-date value at this point.  So we declare an
13165         end-of-block here, which forces any TempRegs caching ArchRegs
13166         to be flushed. */
13167
13168      /* Handle int $0x3F .. $0x4F by synthesising a segfault and a
13169         restart of this instruction (hence the "-2" two lines below,
13170         to get the restart EIP to be this instruction).  This is
13171         probably Linux-specific and it would be more correct to only
13172         do this if the VexAbiInfo says that is what we should do.
13173         This used to handle just 0x40-0x43; Jikes RVM uses a larger
13174         range (0x3F-0x49), and this allows some slack as well. */
13175      if (d32 >= 0x3F && d32 <= 0x4F) {
13176         jmp_lit(&dres, Ijk_SigSEGV, ((Addr32)guest_EIP_bbstart)+delta-2);
13177         vassert(dres.whatNext == Dis_StopHere);
13178         DIP("int $0x%x\n", (Int)d32);
13179         break;
13180      }
13181
13182      /* Handle int $0x80 (linux syscalls), int $0x81 and $0x82
13183         (darwin syscalls).  As part of this, note where we are, so we
13184         can back up the guest to this point if the syscall needs to
13185         be restarted. */
13186      if (d32 == 0x80) {
13187         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13188                           mkU32(guest_EIP_curr_instr) ) );
13189         jmp_lit(&dres, Ijk_Sys_int128, ((Addr32)guest_EIP_bbstart)+delta);
13190         vassert(dres.whatNext == Dis_StopHere);
13191         DIP("int $0x80\n");
13192         break;
13193      }
13194      if (d32 == 0x81) {
13195         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13196                           mkU32(guest_EIP_curr_instr) ) );
13197         jmp_lit(&dres, Ijk_Sys_int129, ((Addr32)guest_EIP_bbstart)+delta);
13198         vassert(dres.whatNext == Dis_StopHere);
13199         DIP("int $0x81\n");
13200         break;
13201      }
13202      if (d32 == 0x82) {
13203         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
13204                           mkU32(guest_EIP_curr_instr) ) );
13205         jmp_lit(&dres, Ijk_Sys_int130, ((Addr32)guest_EIP_bbstart)+delta);
13206         vassert(dres.whatNext == Dis_StopHere);
13207         DIP("int $0x82\n");
13208         break;
13209      }
13210
13211      /* none of the above */
13212      goto decode_failure;
13213
13214   /* ------------------------ Jcond, byte offset --------- */
13215
13216   case 0xEB: /* Jb (jump, byte offset) */
13217      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13218      delta++;
13219      if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13220         dres.whatNext   = Dis_ResteerU;
13221         dres.continueAt = (Addr64)(Addr32)d32;
13222      } else {
13223         jmp_lit(&dres, Ijk_Boring, d32);
13224         vassert(dres.whatNext == Dis_StopHere);
13225      }
13226      DIP("jmp-8 0x%x\n", d32);
13227      break;
13228
13229   case 0xE9: /* Jv (jump, 16/32 offset) */
13230      vassert(sz == 4); /* JRS added 2004 July 11 */
13231      d32 = (((Addr32)guest_EIP_bbstart)+delta+sz) + getSDisp(sz,delta);
13232      delta += sz;
13233      if (resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13234         dres.whatNext   = Dis_ResteerU;
13235         dres.continueAt = (Addr64)(Addr32)d32;
13236      } else {
13237         jmp_lit(&dres, Ijk_Boring, d32);
13238         vassert(dres.whatNext == Dis_StopHere);
13239      }
13240      DIP("jmp 0x%x\n", d32);
13241      break;
13242
13243   case 0x70:
13244   case 0x71:
13245   case 0x72: /* JBb/JNAEb (jump below) */
13246   case 0x73: /* JNBb/JAEb (jump not below) */
13247   case 0x74: /* JZb/JEb (jump zero) */
13248   case 0x75: /* JNZb/JNEb (jump not zero) */
13249   case 0x76: /* JBEb/JNAb (jump below or equal) */
13250   case 0x77: /* JNBEb/JAb (jump not below or equal) */
13251   case 0x78: /* JSb (jump negative) */
13252   case 0x79: /* JNSb (jump not negative) */
13253   case 0x7A: /* JP (jump parity even) */
13254   case 0x7B: /* JNP/JPO (jump parity odd) */
13255   case 0x7C: /* JLb/JNGEb (jump less) */
13256   case 0x7D: /* JGEb/JNLb (jump greater or equal) */
13257   case 0x7E: /* JLEb/JNGb (jump less or equal) */
13258   case 0x7F: /* JGb/JNLEb (jump greater) */
13259    { Int    jmpDelta;
13260      HChar* comment  = "";
13261      jmpDelta = (Int)getSDisp8(delta);
13262      vassert(-128 <= jmpDelta && jmpDelta < 128);
13263      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + jmpDelta;
13264      delta++;
13265      if (resteerCisOk
13266          && vex_control.guest_chase_cond
13267          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13268          && jmpDelta < 0
13269          && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
13270         /* Speculation: assume this backward branch is taken.  So we
13271            need to emit a side-exit to the insn following this one,
13272            on the negation of the condition, and continue at the
13273            branch target address (d32).  If we wind up back at the
13274            first instruction of the trace, just stop; it's better to
13275            let the IR loop unroller handle that case. */
13276         stmt( IRStmt_Exit(
13277                  mk_x86g_calculate_condition((X86Condcode)(1 ^ (opc - 0x70))),
13278                  Ijk_Boring,
13279                  IRConst_U32(guest_EIP_bbstart+delta),
13280                  OFFB_EIP ) );
13281         dres.whatNext   = Dis_ResteerC;
13282         dres.continueAt = (Addr64)(Addr32)d32;
13283         comment = "(assumed taken)";
13284      }
13285      else
13286      if (resteerCisOk
13287          && vex_control.guest_chase_cond
13288          && (Addr32)d32 != (Addr32)guest_EIP_bbstart
13289          && jmpDelta >= 0
13290          && resteerOkFn( callback_opaque,
13291                          (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
13292         /* Speculation: assume this forward branch is not taken.  So
13293            we need to emit a side-exit to d32 (the dest) and continue
13294            disassembling at the insn immediately following this
13295            one. */
13296         stmt( IRStmt_Exit(
13297                  mk_x86g_calculate_condition((X86Condcode)(opc - 0x70)),
13298                  Ijk_Boring,
13299                  IRConst_U32(d32),
13300                  OFFB_EIP ) );
13301         dres.whatNext   = Dis_ResteerC;
13302         dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
13303         comment = "(assumed not taken)";
13304      }
13305      else {
13306         /* Conservative default translation - end the block at this
13307            point. */
13308         jcc_01( &dres, (X86Condcode)(opc - 0x70),
13309                 (Addr32)(guest_EIP_bbstart+delta), d32);
13310         vassert(dres.whatNext == Dis_StopHere);
13311      }
13312      DIP("j%s-8 0x%x %s\n", name_X86Condcode(opc - 0x70), d32, comment);
13313      break;
13314    }
13315
13316   case 0xE3: /* JECXZ (for JCXZ see above) */
13317      if (sz != 4) goto decode_failure;
13318      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13319      delta ++;
13320      stmt( IRStmt_Exit(
13321               binop(Iop_CmpEQ32, getIReg(4,R_ECX), mkU32(0)),
13322               Ijk_Boring,
13323               IRConst_U32(d32),
13324               OFFB_EIP
13325            ));
13326      DIP("jecxz 0x%x\n", d32);
13327      break;
13328
13329   case 0xE0: /* LOOPNE disp8: decrement count, jump if count != 0 && ZF==0 */
13330   case 0xE1: /* LOOPE  disp8: decrement count, jump if count != 0 && ZF==1 */
13331   case 0xE2: /* LOOP   disp8: decrement count, jump if count != 0 */
13332    { /* Again, the docs say this uses ECX/CX as a count depending on
13333         the address size override, not the operand one.  Since we
13334         don't handle address size overrides, I guess that means
13335         ECX. */
13336      IRExpr* zbit  = NULL;
13337      IRExpr* count = NULL;
13338      IRExpr* cond  = NULL;
13339      HChar*  xtra  = NULL;
13340
13341      if (sz != 4) goto decode_failure;
13342      d32 = (((Addr32)guest_EIP_bbstart)+delta+1) + getSDisp8(delta);
13343      delta++;
13344      putIReg(4, R_ECX, binop(Iop_Sub32, getIReg(4,R_ECX), mkU32(1)));
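           /* Note the decrement does not update the condition-code
              thunk; this matches hardware, where LOOP/LOOPE/LOOPNE
              leave EFLAGS unchanged. */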
13345
13346      count = getIReg(4,R_ECX);
13347      cond = binop(Iop_CmpNE32, count, mkU32(0));
13348      switch (opc) {
13349         case 0xE2:
13350            xtra = "";
13351            break;
13352         case 0xE1:
13353            xtra = "e";
13354            zbit = mk_x86g_calculate_condition( X86CondZ );
13355            cond = mkAnd1(cond, zbit);
13356            break;
13357         case 0xE0:
13358            xtra = "ne";
13359            zbit = mk_x86g_calculate_condition( X86CondNZ );
13360            cond = mkAnd1(cond, zbit);
13361            break;
13362         default:
13363            vassert(0);
13364      }
13365      stmt( IRStmt_Exit(cond, Ijk_Boring, IRConst_U32(d32), OFFB_EIP) );
13366
13367      DIP("loop%s 0x%x\n", xtra, d32);
13368      break;
13369    }
13370
13371   /* ------------------------ IMUL ----------------------- */
13372
13373   case 0x69: /* IMUL Iv, Ev, Gv */
13374      delta = dis_imul_I_E_G ( sorb, sz, delta, sz );
13375      break;
13376   case 0x6B: /* IMUL Ib, Ev, Gv */
13377      delta = dis_imul_I_E_G ( sorb, sz, delta, 1 );
13378      break;
13379
13380   /* ------------------------ MOV ------------------------ */
13381
13382   case 0x88: /* MOV Gb,Eb */
13383      delta = dis_mov_G_E(sorb, 1, delta);
13384      break;
13385
13386   case 0x89: /* MOV Gv,Ev */
13387      delta = dis_mov_G_E(sorb, sz, delta);
13388      break;
13389
13390   case 0x8A: /* MOV Eb,Gb */
13391      delta = dis_mov_E_G(sorb, 1, delta);
13392      break;
13393
13394   case 0x8B: /* MOV Ev,Gv */
13395      delta = dis_mov_E_G(sorb, sz, delta);
13396      break;
13397
13398   case 0x8D: /* LEA M,Gv */
13399      if (sz != 4)
13400         goto decode_failure;
13401      modrm = getIByte(delta);
13402      if (epartIsReg(modrm))
13403         goto decode_failure;
13404      /* NOTE!  this is the one place where a segment override prefix
13405         has no effect on the address calculation.  Therefore we pass
13406         zero instead of sorb here. */
13407      addr = disAMode ( &alen, /*sorb*/ 0, delta, dis_buf );
13408      delta += alen;
13409      putIReg(sz, gregOfRM(modrm), mkexpr(addr));
13410      DIP("lea%c %s, %s\n", nameISize(sz), dis_buf,
13411                            nameIReg(sz,gregOfRM(modrm)));
13412      break;
13413
13414   case 0x8C: /* MOV Sw,Ew -- MOV from a SEGMENT REGISTER */
13415      delta = dis_mov_Sw_Ew(sorb, sz, delta);
13416      break;
13417
13418   case 0x8E: /* MOV Ew,Sw -- MOV to a SEGMENT REGISTER */
13419      delta = dis_mov_Ew_Sw(sorb, delta);
13420      break;
13421
13422   case 0xA0: /* MOV Ob,AL */
13423      sz = 1;
13424      /* Fall through ... */
13425   case 0xA1: /* MOV Ov,eAX */
13426      d32 = getUDisp32(delta); delta += 4;
13427      ty = szToITy(sz);
13428      addr = newTemp(Ity_I32);
13429      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
13430      putIReg(sz, R_EAX, loadLE(ty, mkexpr(addr)));
13431      DIP("mov%c %s0x%x, %s\n", nameISize(sz), sorbTxt(sorb),
13432                                d32, nameIReg(sz,R_EAX));
13433      break;
13434
13435   case 0xA2: /* MOV AL,Ob */
13436      sz = 1;
13437      /* Fall through ... */
13438   case 0xA3: /* MOV eAX,Ov */
13439      d32 = getUDisp32(delta); delta += 4;
13440      ty = szToITy(sz);
13441      addr = newTemp(Ity_I32);
13442      assign( addr, handleSegOverride(sorb, mkU32(d32)) );
13443      storeLE( mkexpr(addr), getIReg(sz,R_EAX) );
13444      DIP("mov%c %s, %s0x%x\n", nameISize(sz), nameIReg(sz,R_EAX),
13445                                sorbTxt(sorb), d32);
13446      break;
13447
13448   case 0xB0: /* MOV imm,AL */
13449   case 0xB1: /* MOV imm,CL */
13450   case 0xB2: /* MOV imm,DL */
13451   case 0xB3: /* MOV imm,BL */
13452   case 0xB4: /* MOV imm,AH */
13453   case 0xB5: /* MOV imm,CH */
13454   case 0xB6: /* MOV imm,DH */
13455   case 0xB7: /* MOV imm,BH */
13456      d32 = getIByte(delta); delta += 1;
13457      putIReg(1, opc-0xB0, mkU8(d32));
13458      DIP("movb $0x%x,%s\n", d32, nameIReg(1,opc-0xB0));
13459      break;
13460
13461   case 0xB8: /* MOV imm,eAX */
13462   case 0xB9: /* MOV imm,eCX */
13463   case 0xBA: /* MOV imm,eDX */
13464   case 0xBB: /* MOV imm,eBX */
13465   case 0xBC: /* MOV imm,eSP */
13466   case 0xBD: /* MOV imm,eBP */
13467   case 0xBE: /* MOV imm,eSI */
13468   case 0xBF: /* MOV imm,eDI */
13469      d32 = getUDisp(sz,delta); delta += sz;
13470      putIReg(sz, opc-0xB8, mkU(szToITy(sz), d32));
13471      DIP("mov%c $0x%x,%s\n", nameISize(sz), d32, nameIReg(sz,opc-0xB8));
13472      break;
13473
13474   case 0xC6: /* MOV Ib,Eb */
13475      sz = 1;
13476      goto do_Mov_I_E;
13477   case 0xC7: /* MOV Iv,Ev */
13478      goto do_Mov_I_E;
13479
13480   do_Mov_I_E:
13481      modrm = getIByte(delta);
13482      if (epartIsReg(modrm)) {
13483         delta++; /* mod/rm byte */
13484         d32 = getUDisp(sz,delta); delta += sz;
13485         putIReg(sz, eregOfRM(modrm), mkU(szToITy(sz), d32));
13486         DIP("mov%c $0x%x, %s\n", nameISize(sz), d32,
13487                                  nameIReg(sz,eregOfRM(modrm)));
13488      } else {
13489         addr = disAMode ( &alen, sorb, delta, dis_buf );
13490         delta += alen;
13491         d32 = getUDisp(sz,delta); delta += sz;
13492         storeLE(mkexpr(addr), mkU(szToITy(sz), d32));
13493         DIP("mov%c $0x%x, %s\n", nameISize(sz), d32, dis_buf);
13494      }
13495      break;
13496
13497   /* ------------------------ opl imm, A ----------------- */
13498
13499   case 0x04: /* ADD Ib, AL */
13500      delta = dis_op_imm_A(  1, False, Iop_Add8, True, delta, "add" );
13501      break;
13502   case 0x05: /* ADD Iv, eAX */
13503      delta = dis_op_imm_A( sz, False, Iop_Add8, True, delta, "add" );
13504      break;
13505
13506   case 0x0C: /* OR Ib, AL */
13507      delta = dis_op_imm_A(  1, False, Iop_Or8, True, delta, "or" );
13508      break;
13509   case 0x0D: /* OR Iv, eAX */
13510      delta = dis_op_imm_A( sz, False, Iop_Or8, True, delta, "or" );
13511      break;
13512
13513   case 0x14: /* ADC Ib, AL */
13514      delta = dis_op_imm_A(  1, True, Iop_Add8, True, delta, "adc" );
13515      break;
13516   case 0x15: /* ADC Iv, eAX */
13517      delta = dis_op_imm_A( sz, True, Iop_Add8, True, delta, "adc" );
13518      break;
13519
13520   case 0x1C: /* SBB Ib, AL */
13521      delta = dis_op_imm_A( 1, True, Iop_Sub8, True, delta, "sbb" );
13522      break;
13523   case 0x1D: /* SBB Iv, eAX */
13524      delta = dis_op_imm_A( sz, True, Iop_Sub8, True, delta, "sbb" );
13525      break;
13526
13527   case 0x24: /* AND Ib, AL */
13528      delta = dis_op_imm_A(  1, False, Iop_And8, True, delta, "and" );
13529      break;
13530   case 0x25: /* AND Iv, eAX */
13531      delta = dis_op_imm_A( sz, False, Iop_And8, True, delta, "and" );
13532      break;
13533
13534   case 0x2C: /* SUB Ib, AL */
13535      delta = dis_op_imm_A(  1, False, Iop_Sub8, True, delta, "sub" );
13536      break;
13537   case 0x2D: /* SUB Iv, eAX */
13538      delta = dis_op_imm_A( sz, False, Iop_Sub8, True, delta, "sub" );
13539      break;
13540
13541   case 0x34: /* XOR Ib, AL */
13542      delta = dis_op_imm_A(  1, False, Iop_Xor8, True, delta, "xor" );
13543      break;
13544   case 0x35: /* XOR Iv, eAX */
13545      delta = dis_op_imm_A( sz, False, Iop_Xor8, True, delta, "xor" );
13546      break;
13547
13548   case 0x3C: /* CMP Ib, AL */
13549      delta = dis_op_imm_A(  1, False, Iop_Sub8, False, delta, "cmp" );
13550      break;
13551   case 0x3D: /* CMP Iv, eAX */
13552      delta = dis_op_imm_A( sz, False, Iop_Sub8, False, delta, "cmp" );
13553      break;
13554
13555   case 0xA8: /* TEST Ib, AL */
13556      delta = dis_op_imm_A(  1, False, Iop_And8, False, delta, "test" );
13557      break;
13558   case 0xA9: /* TEST Iv, eAX */
13559      delta = dis_op_imm_A( sz, False, Iop_And8, False, delta, "test" );
13560      break;
13561
13562   /* ------------------------ opl Ev, Gv ----------------- */
13563
13564   case 0x02: /* ADD Eb,Gb */
13565      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, 1, delta, "add" );
13566      break;
13567   case 0x03: /* ADD Ev,Gv */
13568      delta = dis_op2_E_G ( sorb, False, Iop_Add8, True, sz, delta, "add" );
13569      break;
13570
13571   case 0x0A: /* OR Eb,Gb */
13572      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, 1, delta, "or" );
13573      break;
13574   case 0x0B: /* OR Ev,Gv */
13575      delta = dis_op2_E_G ( sorb, False, Iop_Or8, True, sz, delta, "or" );
13576      break;
13577
13578   case 0x12: /* ADC Eb,Gb */
13579      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, 1, delta, "adc" );
13580      break;
13581   case 0x13: /* ADC Ev,Gv */
13582      delta = dis_op2_E_G ( sorb, True, Iop_Add8, True, sz, delta, "adc" );
13583      break;
13584
13585   case 0x1A: /* SBB Eb,Gb */
13586      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, 1, delta, "sbb" );
13587      break;
13588   case 0x1B: /* SBB Ev,Gv */
13589      delta = dis_op2_E_G ( sorb, True, Iop_Sub8, True, sz, delta, "sbb" );
13590      break;
13591
13592   case 0x22: /* AND Eb,Gb */
13593      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, 1, delta, "and" );
13594      break;
13595   case 0x23: /* AND Ev,Gv */
13596      delta = dis_op2_E_G ( sorb, False, Iop_And8, True, sz, delta, "and" );
13597      break;
13598
13599   case 0x2A: /* SUB Eb,Gb */
13600      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, 1, delta, "sub" );
13601      break;
13602   case 0x2B: /* SUB Ev,Gv */
13603      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, True, sz, delta, "sub" );
13604      break;
13605
13606   case 0x32: /* XOR Eb,Gb */
13607      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, 1, delta, "xor" );
13608      break;
13609   case 0x33: /* XOR Ev,Gv */
13610      delta = dis_op2_E_G ( sorb, False, Iop_Xor8, True, sz, delta, "xor" );
13611      break;
13612
13613   case 0x3A: /* CMP Eb,Gb */
13614      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, 1, delta, "cmp" );
13615      break;
13616   case 0x3B: /* CMP Ev,Gv */
13617      delta = dis_op2_E_G ( sorb, False, Iop_Sub8, False, sz, delta, "cmp" );
13618      break;
13619
13620   case 0x84: /* TEST Eb,Gb */
13621      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, 1, delta, "test" );
13622      break;
13623   case 0x85: /* TEST Ev,Gv */
13624      delta = dis_op2_E_G ( sorb, False, Iop_And8, False, sz, delta, "test" );
13625      break;
13626
13627   /* ------------------------ opl Gv, Ev ----------------- */
13628
13629   case 0x00: /* ADD Gb,Eb */
13630      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13631                            Iop_Add8, True, 1, delta, "add" );
13632      break;
13633   case 0x01: /* ADD Gv,Ev */
13634      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13635                            Iop_Add8, True, sz, delta, "add" );
13636      break;
13637
13638   case 0x08: /* OR Gb,Eb */
13639      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13640                            Iop_Or8, True, 1, delta, "or" );
13641      break;
13642   case 0x09: /* OR Gv,Ev */
13643      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13644                            Iop_Or8, True, sz, delta, "or" );
13645      break;
13646
13647   case 0x10: /* ADC Gb,Eb */
13648      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13649                            Iop_Add8, True, 1, delta, "adc" );
13650      break;
13651   case 0x11: /* ADC Gv,Ev */
13652      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13653                            Iop_Add8, True, sz, delta, "adc" );
13654      break;
13655
13656   case 0x18: /* SBB Gb,Eb */
13657      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13658                            Iop_Sub8, True, 1, delta, "sbb" );
13659      break;
13660   case 0x19: /* SBB Gv,Ev */
13661      delta = dis_op2_G_E ( sorb, pfx_lock, True,
13662                            Iop_Sub8, True, sz, delta, "sbb" );
13663      break;
13664
13665   case 0x20: /* AND Gb,Eb */
13666      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13667                            Iop_And8, True, 1, delta, "and" );
13668      break;
13669   case 0x21: /* AND Gv,Ev */
13670      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13671                            Iop_And8, True, sz, delta, "and" );
13672      break;
13673
13674   case 0x28: /* SUB Gb,Eb */
13675      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13676                            Iop_Sub8, True, 1, delta, "sub" );
13677      break;
13678   case 0x29: /* SUB Gv,Ev */
13679      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13680                            Iop_Sub8, True, sz, delta, "sub" );
13681      break;
13682
13683   case 0x30: /* XOR Gb,Eb */
13684      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13685                            Iop_Xor8, True, 1, delta, "xor" );
13686      break;
13687   case 0x31: /* XOR Gv,Ev */
13688      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13689                            Iop_Xor8, True, sz, delta, "xor" );
13690      break;
13691
13692   case 0x38: /* CMP Gb,Eb */
13693      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13694                            Iop_Sub8, False, 1, delta, "cmp" );
13695      break;
13696   case 0x39: /* CMP Gv,Ev */
13697      delta = dis_op2_G_E ( sorb, pfx_lock, False,
13698                            Iop_Sub8, False, sz, delta, "cmp" );
13699      break;
13700
13701   /* ------------------------ POP ------------------------ */
13702
13703   case 0x58: /* POP eAX */
13704   case 0x59: /* POP eCX */
13705   case 0x5A: /* POP eDX */
13706   case 0x5B: /* POP eBX */
13707   case 0x5D: /* POP eBP */
13708   case 0x5E: /* POP eSI */
13709   case 0x5F: /* POP eDI */
13710   case 0x5C: /* POP eSP */
13711      vassert(sz == 2 || sz == 4);
13712      t1 = newTemp(szToITy(sz)); t2 = newTemp(Ity_I32);
13713      assign(t2, getIReg(4, R_ESP));
13714      assign(t1, loadLE(szToITy(sz),mkexpr(t2)));
13715      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
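           /* The destination register is written only after %ESP has
              been bumped, so that "pop %esp" (opc 0x5C) leaves %ESP
              holding the value loaded from the old top of stack, as
              the hardware does. */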
13716      putIReg(sz, opc-0x58, mkexpr(t1));
13717      DIP("pop%c %s\n", nameISize(sz), nameIReg(sz,opc-0x58));
13718      break;
13719
13720   case 0x9D: /* POPF */
13721      vassert(sz == 2 || sz == 4);
13722      t1 = newTemp(Ity_I32); t2 = newTemp(Ity_I32);
13723      assign(t2, getIReg(4, R_ESP));
13724      assign(t1, widenUto32(loadLE(szToITy(sz),mkexpr(t2))));
13725      putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t2), mkU32(sz)));
13726
13727      /* Generate IR to set %EFLAGS{O,S,Z,A,C,P,D,ID,AC} from the
13728         value in t1. */
13729      set_EFLAGS_from_value( t1, True/*emit_AC_emwarn*/,
13730                                 ((Addr32)guest_EIP_bbstart)+delta );
13731
13732      DIP("popf%c\n", nameISize(sz));
13733      break;
13734
13735   case 0x61: /* POPA */
13736      /* This is almost certainly wrong for sz==2.  So ... */
13737      if (sz != 4) goto decode_failure;
13738
13739      /* t5 is the old %ESP value. */
13740      t5 = newTemp(Ity_I32);
13741      assign( t5, getIReg(4, R_ESP) );
13742
13743      /* Reload all the registers, except %esp. */
13744      putIReg(4,R_EAX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(28)) ));
13745      putIReg(4,R_ECX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(24)) ));
13746      putIReg(4,R_EDX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(20)) ));
13747      putIReg(4,R_EBX, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32(16)) ));
13748      /* ignore saved %ESP */
13749      putIReg(4,R_EBP, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 8)) ));
13750      putIReg(4,R_ESI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 4)) ));
13751      putIReg(4,R_EDI, loadLE(Ity_I32, binop(Iop_Add32,mkexpr(t5),mkU32( 0)) ));
13752
13753      /* and move %ESP back up */
13754      putIReg( 4, R_ESP, binop(Iop_Add32, mkexpr(t5), mkU32(8*4)) );
13755
13756      DIP("popa%c\n", nameISize(sz));
13757      break;
13758
13759   case 0x8F: /* POPL/POPW m32 */
13760     { Int    len;
13761       UChar  rm = getIByte(delta);
13762
13763       /* make sure this instruction is correct POP */
13764       if (epartIsReg(rm) || gregOfRM(rm) != 0)
13765          goto decode_failure;
13766       /* and has correct size */
13767       if (sz != 4 && sz != 2)
13768          goto decode_failure;
13769       ty = szToITy(sz);
13770
13771       t1 = newTemp(Ity_I32); /* stack address */
13772       t3 = newTemp(ty); /* data */
13773       /* set t1 to ESP: t1 = ESP */
13774       assign( t1, getIReg(4, R_ESP) );
13775       /* load M[ESP] to virtual register t3: t3 = M[t1] */
13776       assign( t3, loadLE(ty, mkexpr(t1)) );
13777
13778       /* increase ESP; must be done before the STORE.  Intel manual says:
13779            If the ESP register is used as a base register for addressing
13780            a destination operand in memory, the POP instruction computes
13781            the effective address of the operand after it increments the
13782            ESP register.
13783       */
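            /* For example, with ESP == e beforehand, "popl 4(%esp)"
               loads its data from [e], bumps ESP to e+4, and only then
               computes the destination address, (e+4)+4. */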
13784       putIReg(4, R_ESP, binop(Iop_Add32, mkexpr(t1), mkU32(sz)) );
13785
13786       /* resolve MODR/M */
13787       addr = disAMode ( &len, sorb, delta, dis_buf);
13788       storeLE( mkexpr(addr), mkexpr(t3) );
13789
13790       DIP("pop%c %s\n", sz==2 ? 'w' : 'l', dis_buf);
13791
13792       delta += len;
13793       break;
13794     }
13795
13796   case 0x1F: /* POP %DS */
13797      dis_pop_segreg( R_DS, sz ); break;
13798   case 0x07: /* POP %ES */
13799      dis_pop_segreg( R_ES, sz ); break;
13800   case 0x17: /* POP %SS */
13801      dis_pop_segreg( R_SS, sz ); break;
13802
13803   /* ------------------------ PUSH ----------------------- */
13804
13805   case 0x50: /* PUSH eAX */
13806   case 0x51: /* PUSH eCX */
13807   case 0x52: /* PUSH eDX */
13808   case 0x53: /* PUSH eBX */
13809   case 0x55: /* PUSH eBP */
13810   case 0x56: /* PUSH eSI */
13811   case 0x57: /* PUSH eDI */
13812   case 0x54: /* PUSH eSP */
13813      /* This is the Right Way, in that the value to be pushed is
13814         established before %esp is changed, so that pushl %esp
13815         correctly pushes the old value. */
13816      vassert(sz == 2 || sz == 4);
13817      ty = sz==2 ? Ity_I16 : Ity_I32;
13818      t1 = newTemp(ty); t2 = newTemp(Ity_I32);
13819      assign(t1, getIReg(sz, opc-0x50));
13820      assign(t2, binop(Iop_Sub32, getIReg(4, R_ESP), mkU32(sz)));
13821      putIReg(4, R_ESP, mkexpr(t2) );
13822      storeLE(mkexpr(t2),mkexpr(t1));
13823      DIP("push%c %s\n", nameISize(sz), nameIReg(sz,opc-0x50));
13824      break;
13825
13826
13827   case 0x68: /* PUSH Iv */
13828      d32 = getUDisp(sz,delta); delta += sz;
13829      goto do_push_I;
13830   case 0x6A: /* PUSH Ib, sign-extended to sz */
13831      d32 = getSDisp8(delta); delta += 1;
13832      goto do_push_I;
13833   do_push_I:
13834      ty = szToITy(sz);
13835      t1 = newTemp(Ity_I32); t2 = newTemp(ty);
13836      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
13837      putIReg(4, R_ESP, mkexpr(t1) );
13838      /* stop mkU16 asserting if d32 is a negative 16-bit number
13839         (bug #132813) */
13840      if (ty == Ity_I16)
13841         d32 &= 0xFFFF;
13842      storeLE( mkexpr(t1), mkU(ty,d32) );
13843      DIP("push%c $0x%x\n", nameISize(sz), d32);
13844      break;
13845
13846   case 0x9C: /* PUSHF */ {
13847      vassert(sz == 2 || sz == 4);
13848
13849      t1 = newTemp(Ity_I32);
13850      assign( t1, binop(Iop_Sub32,getIReg(4,R_ESP),mkU32(sz)) );
13851      putIReg(4, R_ESP, mkexpr(t1) );
13852
13853      /* Calculate OSZACP, and patch in fixed fields as per
13854         Intel docs.
13855         - bit 1 is always 1
13856         - bit 9 is Interrupt Enable (should always be 1 in user mode?)
13857      */
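           /* The value to push is assembled incrementally: t2 is OSZACP
              plus the fixed bits, t3 adds D (bit 10), t4 adds ID (bit 21)
              and t5 adds AC (bit 18). */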
13858      t2 = newTemp(Ity_I32);
13859      assign( t2, binop(Iop_Or32,
13860                        mk_x86g_calculate_eflags_all(),
13861                        mkU32( (1<<1)|(1<<9) ) ));
13862
13863      /* Patch in the D flag.  This can simply be a copy of bit 10 of
13864         baseBlock[OFFB_DFLAG]. */
13865      t3 = newTemp(Ity_I32);
13866      assign( t3, binop(Iop_Or32,
13867                        mkexpr(t2),
13868                        binop(Iop_And32,
13869                              IRExpr_Get(OFFB_DFLAG,Ity_I32),
13870                              mkU32(1<<10)))
13871            );
13872
13873      /* And patch in the ID flag. */
13874      t4 = newTemp(Ity_I32);
13875      assign( t4, binop(Iop_Or32,
13876                        mkexpr(t3),
13877                        binop(Iop_And32,
13878                              binop(Iop_Shl32, IRExpr_Get(OFFB_IDFLAG,Ity_I32),
13879                                               mkU8(21)),
13880                              mkU32(1<<21)))
13881            );
13882
13883      /* And patch in the AC flag. */
13884      t5 = newTemp(Ity_I32);
13885      assign( t5, binop(Iop_Or32,
13886                        mkexpr(t4),
13887                        binop(Iop_And32,
13888                              binop(Iop_Shl32, IRExpr_Get(OFFB_ACFLAG,Ity_I32),
13889                                               mkU8(18)),
13890                              mkU32(1<<18)))
13891            );
13892
13893      /* if sz==2, the stored value needs to be narrowed. */
13894      if (sz == 2)
13895        storeLE( mkexpr(t1), unop(Iop_32to16,mkexpr(t5)) );
13896      else
13897        storeLE( mkexpr(t1), mkexpr(t5) );
13898
13899      DIP("pushf%c\n", nameISize(sz));
13900      break;
13901   }
13902
13903   case 0x60: /* PUSHA */
13904      /* This is almost certainly wrong for sz==2.  So ... */
13905      if (sz != 4) goto decode_failure;
13906
13907      /* This is the Right Way, in that the value to be pushed is
13908         established before %esp is changed, so that pusha
13909         correctly pushes the old %esp value.  New value of %esp is
13910         pushed at start. */
13911      /* t0 is the %ESP value we're going to push. */
13912      t0 = newTemp(Ity_I32);
13913      assign( t0, getIReg(4, R_ESP) );
13914
13915      /* t5 will be the new %ESP value. */
13916      t5 = newTemp(Ity_I32);
13917      assign( t5, binop(Iop_Sub32, mkexpr(t0), mkU32(8*4)) );
13918
13919      /* Update guest state before prodding memory. */
13920      putIReg(4, R_ESP, mkexpr(t5));
13921
13922      /* Dump all the registers. */
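           /* Offsets t5+28 .. t5+0 hold EAX, ECX, EDX, EBX, the original
              ESP, EBP, ESI and EDI respectively -- the same layout that
              POPA reads back above. */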
13923      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(28)), getIReg(4,R_EAX) );
13924      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(24)), getIReg(4,R_ECX) );
13925      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(20)), getIReg(4,R_EDX) );
13926      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(16)), getIReg(4,R_EBX) );
13927      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32(12)), mkexpr(t0) /*esp*/);
13928      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 8)), getIReg(4,R_EBP) );
13929      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 4)), getIReg(4,R_ESI) );
13930      storeLE( binop(Iop_Add32,mkexpr(t5),mkU32( 0)), getIReg(4,R_EDI) );
13931
13932      DIP("pusha%c\n", nameISize(sz));
13933      break;
13934
13935   case 0x0E: /* PUSH %CS */
13936      dis_push_segreg( R_CS, sz ); break;
13937   case 0x1E: /* PUSH %DS */
13938      dis_push_segreg( R_DS, sz ); break;
13939   case 0x06: /* PUSH %ES */
13940      dis_push_segreg( R_ES, sz ); break;
13941   case 0x16: /* PUSH %SS */
13942      dis_push_segreg( R_SS, sz ); break;
13943
13944   /* ------------------------ SCAS et al ----------------- */
13945
13946   case 0xA4: /* MOVS, no REP prefix */
13947   case 0xA5:
13948      if (sorb != 0)
13949         goto decode_failure; /* else dis_string_op asserts */
13950      dis_string_op( dis_MOVS, ( opc == 0xA4 ? 1 : sz ), "movs", sorb );
13951      break;
13952
13953   case 0xA6: /* CMPSb, no REP prefix */
13954   case 0xA7:
13955      if (sorb != 0)
13956         goto decode_failure; /* else dis_string_op asserts */
13957      dis_string_op( dis_CMPS, ( opc == 0xA6 ? 1 : sz ), "cmps", sorb );
13958      break;
13959
13960   case 0xAA: /* STOS, no REP prefix */
13961   case 0xAB:
13962      if (sorb != 0)
13963         goto decode_failure; /* else dis_string_op asserts */
13964      dis_string_op( dis_STOS, ( opc == 0xAA ? 1 : sz ), "stos", sorb );
13965      break;
13966
13967   case 0xAC: /* LODS, no REP prefix */
13968   case 0xAD:
13969      if (sorb != 0)
13970         goto decode_failure; /* else dis_string_op asserts */
13971      dis_string_op( dis_LODS, ( opc == 0xAC ? 1 : sz ), "lods", sorb );
13972      break;
13973
13974   case 0xAE: /* SCAS, no REP prefix */
13975   case 0xAF:
13976      if (sorb != 0)
13977         goto decode_failure; /* else dis_string_op asserts */
13978      dis_string_op( dis_SCAS, ( opc == 0xAE ? 1 : sz ), "scas", sorb );
13979      break;
13980
13981
13982   case 0xFC: /* CLD */
13983      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(1)) );
13984      DIP("cld\n");
13985      break;
13986
13987   case 0xFD: /* STD */
13988      stmt( IRStmt_Put( OFFB_DFLAG, mkU32(0xFFFFFFFF)) );
13989      DIP("std\n");
13990      break;
13991
13992   case 0xF8: /* CLC */
13993   case 0xF9: /* STC */
13994   case 0xF5: /* CMC */
13995      t0 = newTemp(Ity_I32);
13996      t1 = newTemp(Ity_I32);
13997      assign( t0, mk_x86g_calculate_eflags_all() );
13998      switch (opc) {
13999         case 0xF8:
14000            assign( t1, binop(Iop_And32, mkexpr(t0),
14001                                         mkU32(~X86G_CC_MASK_C)));
14002            DIP("clc\n");
14003            break;
14004         case 0xF9:
14005            assign( t1, binop(Iop_Or32, mkexpr(t0),
14006                                        mkU32(X86G_CC_MASK_C)));
14007            DIP("stc\n");
14008            break;
14009         case 0xF5:
14010            assign( t1, binop(Iop_Xor32, mkexpr(t0),
14011                                         mkU32(X86G_CC_MASK_C)));
14012            DIP("cmc\n");
14013            break;
14014         default:
14015            vpanic("disInstr(x86)(clc/stc/cmc)");
14016      }
14017      stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
14018      stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
14019      stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(t1) ));
14020      /* Set NDEP even though it isn't used.  This makes redundant-PUT
14021         elimination of previous stores to this field work better. */
14022      stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
14023      break;
14024
14025   case 0xD6: /* SALC */
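           /* SALC sets AL to 0xFF if CF is set and to 0x00 otherwise,
              without affecting any flags.  The shl/sar pair below smears
              the carry bit across a 32-bit word whose low byte is then
              written to AL. */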
14026      t0 = newTemp(Ity_I32);
14027      t1 = newTemp(Ity_I32);
14028      assign( t0,  binop(Iop_And32,
14029                         mk_x86g_calculate_eflags_c(),
14030                         mkU32(1)) );
14031      assign( t1, binop(Iop_Sar32,
14032                        binop(Iop_Shl32, mkexpr(t0), mkU8(31)),
14033                        mkU8(31)) );
14034      putIReg(1, R_EAX, unop(Iop_32to8, mkexpr(t1)) );
14035      DIP("salc\n");
14036      break;
14037
14038   /* REPNE prefix insn */
14039   case 0xF2: {
14040      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
14041      if (sorb != 0) goto decode_failure;
14042      abyte = getIByte(delta); delta++;
14043
14044      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
14045
14046      switch (abyte) {
14047      /* According to the Intel manual, "repne movs" should never occur, but
14048       * in practice it has happened, so allow for it here... */
14049      case 0xA4: sz = 1;   /* REPNE MOVS<sz> */
14050      case 0xA5:
14051         dis_REP_op ( &dres, X86CondNZ, dis_MOVS, sz, eip_orig,
14052                             guest_EIP_bbstart+delta, "repne movs" );
14053         break;
14054
14055      case 0xA6: sz = 1;   /* REPNE CMP<sz> */
14056      case 0xA7:
14057         dis_REP_op ( &dres, X86CondNZ, dis_CMPS, sz, eip_orig,
14058                             guest_EIP_bbstart+delta, "repne cmps" );
14059         break;
14060
14061      case 0xAA: sz = 1;   /* REPNE STOS<sz> */
14062      case 0xAB:
14063         dis_REP_op ( &dres, X86CondNZ, dis_STOS, sz, eip_orig,
14064                             guest_EIP_bbstart+delta, "repne stos" );
14065         break;
14066
14067      case 0xAE: sz = 1;   /* REPNE SCAS<sz> */
14068      case 0xAF:
14069         dis_REP_op ( &dres, X86CondNZ, dis_SCAS, sz, eip_orig,
14070                             guest_EIP_bbstart+delta, "repne scas" );
14071         break;
14072
14073      default:
14074         goto decode_failure;
14075      }
14076      break;
14077   }
14078
14079   /* REP/REPE prefix insn (for SCAS and CMPS, 0xF3 means REPE,
14080      for the rest, it means REP) */
14081   case 0xF3: {
14082      Addr32 eip_orig = guest_EIP_bbstart + delta_start;
14083      abyte = getIByte(delta); delta++;
14084
14085      if (abyte == 0x66) { sz = 2; abyte = getIByte(delta); delta++; }
14086
14087      if (sorb != 0 && abyte != 0x0F) goto decode_failure;
14088
14089      switch (abyte) {
14090      case 0x0F:
14091         switch (getIByte(delta)) {
14092         /* On older CPUs, TZCNT behaves the same as BSF.  */
14093         case 0xBC: /* REP BSF Gv,Ev */
14094            delta = dis_bs_E_G ( sorb, sz, delta + 1, True );
14095            break;
14096         /* On older CPUs, LZCNT behaves the same as BSR.  */
14097         case 0xBD: /* REP BSR Gv,Ev */
14098            delta = dis_bs_E_G ( sorb, sz, delta + 1, False );
14099            break;
14100         default:
14101            goto decode_failure;
14102         }
14103         break;
14104
14105      case 0xA4: sz = 1;   /* REP MOVS<sz> */
14106      case 0xA5:
14107         dis_REP_op ( &dres, X86CondAlways, dis_MOVS, sz, eip_orig,
14108                             guest_EIP_bbstart+delta, "rep movs" );
14109         break;
14110
14111      case 0xA6: sz = 1;   /* REPE CMP<sz> */
14112      case 0xA7:
14113         dis_REP_op ( &dres, X86CondZ, dis_CMPS, sz, eip_orig,
14114                             guest_EIP_bbstart+delta, "repe cmps" );
14115         break;
14116
14117      case 0xAA: sz = 1;   /* REP STOS<sz> */
14118      case 0xAB:
14119         dis_REP_op ( &dres, X86CondAlways, dis_STOS, sz, eip_orig,
14120                             guest_EIP_bbstart+delta, "rep stos" );
14121         break;
14122
14123      case 0xAC: sz = 1;   /* REP LODS<sz> */
14124      case 0xAD:
14125         dis_REP_op ( &dres, X86CondAlways, dis_LODS, sz, eip_orig,
14126                             guest_EIP_bbstart+delta, "rep lods" );
14127         break;
14128
14129      case 0xAE: sz = 1;   /* REPE SCAS<sz> */
14130      case 0xAF:
14131         dis_REP_op ( &dres, X86CondZ, dis_SCAS, sz, eip_orig,
14132                             guest_EIP_bbstart+delta, "repe scas" );
14133         break;
14134
14135      case 0x90:           /* REP NOP (PAUSE) */
14136         /* a hint to the P4 re spin-wait loop */
14137         DIP("rep nop (P4 pause)\n");
14138         /* "observe" the hint.  The Vex client needs to be careful not
14139            to cause very long delays as a result, though. */
14140         jmp_lit(&dres, Ijk_Yield, ((Addr32)guest_EIP_bbstart)+delta);
14141         vassert(dres.whatNext == Dis_StopHere);
14142         break;
14143
14144      case 0xC3:           /* REP RET -- same as normal ret? */
14145         dis_ret(&dres, 0);
14146         DIP("rep ret\n");
14147         break;
14148
14149      default:
14150         goto decode_failure;
14151      }
14152      break;
14153   }
14154
14155   /* ------------------------ XCHG ----------------------- */
14156
14157   /* XCHG reg,mem automatically asserts LOCK# even without a LOCK
14158      prefix; hence it must be translated with an IRCAS (at least, the
14159      memory variant). */
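        /* The register-register form below needs no CAS; only the memory
           form sets *expect_CAS and goes through casLE. */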
14160   case 0x86: /* XCHG Gb,Eb */
14161      sz = 1;
14162      /* Fall through ... */
14163   case 0x87: /* XCHG Gv,Ev */
14164      modrm = getIByte(delta);
14165      ty = szToITy(sz);
14166      t1 = newTemp(ty); t2 = newTemp(ty);
14167      if (epartIsReg(modrm)) {
14168         assign(t1, getIReg(sz, eregOfRM(modrm)));
14169         assign(t2, getIReg(sz, gregOfRM(modrm)));
14170         putIReg(sz, gregOfRM(modrm), mkexpr(t1));
14171         putIReg(sz, eregOfRM(modrm), mkexpr(t2));
14172         delta++;
14173         DIP("xchg%c %s, %s\n",
14174             nameISize(sz), nameIReg(sz,gregOfRM(modrm)),
14175                            nameIReg(sz,eregOfRM(modrm)));
14176      } else {
14177         *expect_CAS = True;
14178         addr = disAMode ( &alen, sorb, delta, dis_buf );
14179         assign( t1, loadLE(ty,mkexpr(addr)) );
14180         assign( t2, getIReg(sz,gregOfRM(modrm)) );
14181         casLE( mkexpr(addr),
14182                mkexpr(t1), mkexpr(t2), guest_EIP_curr_instr );
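              /* If the CAS fails -- memory changed between the load above
                 and the CAS -- casLE side-exits back to the start of this
                 insn so it is re-executed, keeping the exchange atomic. */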
14183         putIReg( sz, gregOfRM(modrm), mkexpr(t1) );
14184         delta += alen;
14185         DIP("xchg%c %s, %s\n", nameISize(sz),
14186                                nameIReg(sz,gregOfRM(modrm)), dis_buf);
14187      }
14188      break;
14189
14190   case 0x90: /* XCHG eAX,eAX */
14191      DIP("nop\n");
14192      break;
14193   case 0x91: /* XCHG eAX,eCX */
14194   case 0x92: /* XCHG eAX,eDX */
14195   case 0x93: /* XCHG eAX,eBX */
14196   case 0x94: /* XCHG eAX,eSP */
14197   case 0x95: /* XCHG eAX,eBP */
14198   case 0x96: /* XCHG eAX,eSI */
14199   case 0x97: /* XCHG eAX,eDI */
14200      codegen_xchg_eAX_Reg ( sz, opc - 0x90 );
14201      break;
14202
14203   /* ------------------------ XLAT ----------------------- */
14204
14205   case 0xD7: /* XLAT */
14206      if (sz != 4) goto decode_failure; /* sz == 2 (0x66) is legal but unhandled */
14207      putIReg(
14208         1,
14209         R_EAX/*AL*/,
14210         loadLE(Ity_I8,
14211                handleSegOverride(
14212                   sorb,
14213                   binop(Iop_Add32,
14214                         getIReg(4, R_EBX),
14215                         unop(Iop_8Uto32, getIReg(1, R_EAX/*AL*/))))));
14216
14217      DIP("xlat%c [ebx]\n", nameISize(sz));
14218      break;
14219
14220   /* ------------------------ IN / OUT ----------------------- */
14221
14222   case 0xE4: /* IN imm8, AL */
14223      sz = 1;
14224      t1 = newTemp(Ity_I32);
14225      abyte = getIByte(delta); delta++;
14226      assign(t1, mkU32( abyte & 0xFF ));
14227      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
14228      goto do_IN;
14229   case 0xE5: /* IN imm8, eAX */
14230      vassert(sz == 2 || sz == 4);
14231      t1 = newTemp(Ity_I32);
14232      abyte = getIByte(delta); delta++;
14233      assign(t1, mkU32( abyte & 0xFF ));
14234      DIP("in%c $%d,%s\n", nameISize(sz), (Int)abyte, nameIReg(sz,R_EAX));
14235      goto do_IN;
14236   case 0xEC: /* IN %DX, AL */
14237      sz = 1;
14238      t1 = newTemp(Ity_I32);
14239      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14240      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14241                                         nameIReg(sz,R_EAX));
14242      goto do_IN;
14243   case 0xED: /* IN %DX, eAX */
14244      vassert(sz == 2 || sz == 4);
14245      t1 = newTemp(Ity_I32);
14246      assign(t1, unop(Iop_16Uto32, getIReg(2, R_EDX)));
14247      DIP("in%c %s,%s\n", nameISize(sz), nameIReg(2,R_EDX),
14248                                         nameIReg(sz,R_EAX));
14249      goto do_IN;
14250   do_IN: {
14251      /* At this point, sz indicates the width, and t1 is a 32-bit
14252         value giving port number. */
14253      IRDirty* d;
14254      vassert(sz == 1 || sz == 2 || sz == 4);
14255      ty = szToITy(sz);
14256      t2 = newTemp(Ity_I32);
14257      d = unsafeIRDirty_1_N(
14258             t2,
14259             0/*regparms*/,
14260             "x86g_dirtyhelper_IN",
14261             &x86g_dirtyhelper_IN,
14262             mkIRExprVec_2( mkexpr(t1), mkU32(sz) )
14263          );
14264      /* do the call, dumping the result in t2. */
14265      stmt( IRStmt_Dirty(d) );
14266      putIReg(sz, R_EAX, narrowTo( ty, mkexpr(t2) ) );
14267      break;
14268   }
14269
14270   case 0xE6: /* OUT AL, imm8 */
14271      sz = 1;
14272      t1 = newTemp(Ity_I32);
14273      abyte = getIByte(delta); delta++;
14274      assign( t1, mkU32( abyte & 0xFF ) );
14275      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
14276      goto do_OUT;
14277   case 0xE7: /* OUT eAX, imm8 */
14278      vassert(sz == 2 || sz == 4);
14279      t1 = newTemp(Ity_I32);
14280      abyte = getIByte(delta); delta++;
14281      assign( t1, mkU32( abyte & 0xFF ) );
14282      DIP("out%c %s,$%d\n", nameISize(sz), nameIReg(sz,R_EAX), (Int)abyte);
14283      goto do_OUT;
14284   case 0xEE: /* OUT AL, %DX */
14285      sz = 1;
14286      t1 = newTemp(Ity_I32);
14287      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
14288      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
14289                                          nameIReg(2,R_EDX));
14290      goto do_OUT;
14291   case 0xEF: /* OUT eAX, %DX */
14292      vassert(sz == 2 || sz == 4);
14293      t1 = newTemp(Ity_I32);
14294      assign( t1, unop(Iop_16Uto32, getIReg(2, R_EDX)) );
14295      DIP("out%c %s,%s\n", nameISize(sz), nameIReg(sz,R_EAX),
14296                                          nameIReg(2,R_EDX));
14297      goto do_OUT;
14298   do_OUT: {
14299      /* At this point, sz indicates the width, and t1 is a 32-bit
14300         value giving port number. */
14301      IRDirty* d;
14302      vassert(sz == 1 || sz == 2 || sz == 4);
14303      ty = szToITy(sz);
14304      d = unsafeIRDirty_0_N(
14305             0/*regparms*/,
14306             "x86g_dirtyhelper_OUT",
14307             &x86g_dirtyhelper_OUT,
14308             mkIRExprVec_3( mkexpr(t1),
14309                            widenUto32( getIReg(sz, R_EAX) ),
14310                            mkU32(sz) )
14311          );
14312      stmt( IRStmt_Dirty(d) );
14313      break;
14314   }
14315
14316   /* ------------------------ (Grp1 extensions) ---------- */
14317
14318   case 0x82: /* Grp1 Ib,Eb too.  Apparently this is the same as
14319                 case 0x80, but only in 32-bit mode. */
14320      /* fallthru */
14321   case 0x80: /* Grp1 Ib,Eb */
14322      modrm = getIByte(delta);
14323      am_sz = lengthAMode(delta);
14324      sz    = 1;
14325      d_sz  = 1;
14326      d32   = getUChar(delta + am_sz);
14327      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14328      break;
14329
14330   case 0x81: /* Grp1 Iv,Ev */
14331      modrm = getIByte(delta);
14332      am_sz = lengthAMode(delta);
14333      d_sz  = sz;
14334      d32   = getUDisp(d_sz, delta + am_sz);
14335      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14336      break;
14337
14338   case 0x83: /* Grp1 Ib,Ev */
14339      modrm = getIByte(delta);
14340      am_sz = lengthAMode(delta);
14341      d_sz  = 1;
14342      d32   = getSDisp8(delta + am_sz);
14343      delta = dis_Grp1 ( sorb, pfx_lock, delta, modrm, am_sz, d_sz, sz, d32 );
14344      break;
14345
14346   /* ------------------------ (Grp2 extensions) ---------- */
14347
14348   case 0xC0: { /* Grp2 Ib,Eb */
14349      Bool decode_OK = True;
14350      modrm = getIByte(delta);
14351      am_sz = lengthAMode(delta);
14352      d_sz  = 1;
14353      d32   = getUChar(delta + am_sz);
14354      sz    = 1;
14355      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14356                         mkU8(d32 & 0xFF), NULL, &decode_OK );
14357      if (!decode_OK)
14358         goto decode_failure;
14359      break;
14360   }
14361   case 0xC1: { /* Grp2 Ib,Ev */
14362      Bool decode_OK = True;
14363      modrm = getIByte(delta);
14364      am_sz = lengthAMode(delta);
14365      d_sz  = 1;
14366      d32   = getUChar(delta + am_sz);
14367      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14368                         mkU8(d32 & 0xFF), NULL, &decode_OK );
14369      if (!decode_OK)
14370         goto decode_failure;
14371      break;
14372   }
14373   case 0xD0: { /* Grp2 1,Eb */
14374      Bool decode_OK = True;
14375      modrm = getIByte(delta);
14376      am_sz = lengthAMode(delta);
14377      d_sz  = 0;
14378      d32   = 1;
14379      sz    = 1;
14380      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14381                         mkU8(d32), NULL, &decode_OK );
14382      if (!decode_OK)
14383         goto decode_failure;
14384      break;
14385   }
14386   case 0xD1: { /* Grp2 1,Ev */
14387      Bool decode_OK = True;
14388      modrm = getUChar(delta);
14389      am_sz = lengthAMode(delta);
14390      d_sz  = 0;
14391      d32   = 1;
14392      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14393                         mkU8(d32), NULL, &decode_OK );
14394      if (!decode_OK)
14395         goto decode_failure;
14396      break;
14397   }
14398   case 0xD2: { /* Grp2 CL,Eb */
14399      Bool decode_OK = True;
14400      modrm = getUChar(delta);
14401      am_sz = lengthAMode(delta);
14402      d_sz  = 0;
14403      sz    = 1;
14404      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14405                         getIReg(1,R_ECX), "%cl", &decode_OK );
14406      if (!decode_OK)
14407         goto decode_failure;
14408      break;
14409   }
14410   case 0xD3: { /* Grp2 CL,Ev */
14411      Bool decode_OK = True;
14412      modrm = getIByte(delta);
14413      am_sz = lengthAMode(delta);
14414      d_sz  = 0;
14415      delta = dis_Grp2 ( sorb, delta, modrm, am_sz, d_sz, sz,
14416                         getIReg(1,R_ECX), "%cl", &decode_OK );
14417      if (!decode_OK)
14418         goto decode_failure;
14419      break;
14420   }
14421
14422   /* ------------------------ (Grp3 extensions) ---------- */
14423
14424   case 0xF6: { /* Grp3 Eb */
14425      Bool decode_OK = True;
14426      delta = dis_Grp3 ( sorb, pfx_lock, 1, delta, &decode_OK );
14427      if (!decode_OK)
14428         goto decode_failure;
14429      break;
14430   }
14431   case 0xF7: { /* Grp3 Ev */
14432      Bool decode_OK = True;
14433      delta = dis_Grp3 ( sorb, pfx_lock, sz, delta, &decode_OK );
14434      if (!decode_OK)
14435         goto decode_failure;
14436      break;
14437   }
14438
14439   /* ------------------------ (Grp4 extensions) ---------- */
14440
14441   case 0xFE: { /* Grp4 Eb */
14442      Bool decode_OK = True;
14443      delta = dis_Grp4 ( sorb, pfx_lock, delta, &decode_OK );
14444      if (!decode_OK)
14445         goto decode_failure;
14446      break;
14447   }
14448
14449   /* ------------------------ (Grp5 extensions) ---------- */
14450
14451   case 0xFF: { /* Grp5 Ev */
14452      Bool decode_OK = True;
14453      delta = dis_Grp5 ( sorb, pfx_lock, sz, delta, &dres, &decode_OK );
14454      if (!decode_OK)
14455         goto decode_failure;
14456      break;
14457   }
14458
14459   /* ------------------------ Escapes to 2-byte opcodes -- */
14460
14461   case 0x0F: {
14462      opc = getIByte(delta); delta++;
14463      switch (opc) {
14464
14465      /* =-=-=-=-=-=-=-=-=- Grp8 =-=-=-=-=-=-=-=-=-=-=-= */
14466
14467      case 0xBA: { /* Grp8 Ib,Ev */
14468         Bool decode_OK = False;
14469         modrm = getUChar(delta);
14470         am_sz = lengthAMode(delta);
14471         d32   = getSDisp8(delta + am_sz);
14472         delta = dis_Grp8_Imm ( sorb, pfx_lock, delta, modrm,
14473                                am_sz, sz, d32, &decode_OK );
14474         if (!decode_OK)
14475            goto decode_failure;
14476         break;
14477      }
14478
14479      /* =-=-=-=-=-=-=-=-=- BSF/BSR -=-=-=-=-=-=-=-=-=-= */
14480
14481      case 0xBC: /* BSF Gv,Ev */
14482         delta = dis_bs_E_G ( sorb, sz, delta, True );
14483         break;
14484      case 0xBD: /* BSR Gv,Ev */
14485         delta = dis_bs_E_G ( sorb, sz, delta, False );
14486         break;
14487
14488      /* =-=-=-=-=-=-=-=-=- BSWAP -=-=-=-=-=-=-=-=-=-=-= */
14489
14490      case 0xC8: /* BSWAP %eax */
14491      case 0xC9:
14492      case 0xCA:
14493      case 0xCB:
14494      case 0xCC:
14495      case 0xCD:
14496      case 0xCE:
14497      case 0xCF: /* BSWAP %edi */
14498         /* AFAICS from the Intel docs, this only exists at size 4. */
14499         if (sz != 4) goto decode_failure;
14500
14501         t1 = newTemp(Ity_I32);
14502         assign( t1, getIReg(4, opc-0xC8) );
14503         t2 = math_BSWAP(t1, Ity_I32);
14504
14505         putIReg(4, opc-0xC8, mkexpr(t2));
14506         DIP("bswapl %s\n", nameIReg(4, opc-0xC8));
14507         break;
14508
14509      /* =-=-=-=-=-=-=-=-=- BT/BTS/BTR/BTC =-=-=-=-=-=-= */
14510
14511      case 0xA3: /* BT Gv,Ev */
14512         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpNone );
14513         break;
14514      case 0xB3: /* BTR Gv,Ev */
14515         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpReset );
14516         break;
14517      case 0xAB: /* BTS Gv,Ev */
14518         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpSet );
14519         break;
14520      case 0xBB: /* BTC Gv,Ev */
14521         delta = dis_bt_G_E ( vbi, sorb, pfx_lock, sz, delta, BtOpComp );
14522         break;
14523
14524      /* =-=-=-=-=-=-=-=-=- CMOV =-=-=-=-=-=-=-=-=-=-=-= */
14525
14526      case 0x40:
14527      case 0x41:
14528      case 0x42: /* CMOVBb/CMOVNAEb (cmov below) */
14529      case 0x43: /* CMOVNBb/CMOVAEb (cmov not below) */
14530      case 0x44: /* CMOVZb/CMOVEb (cmov zero) */
14531      case 0x45: /* CMOVNZb/CMOVNEb (cmov not zero) */
14532      case 0x46: /* CMOVBEb/CMOVNAb (cmov below or equal) */
14533      case 0x47: /* CMOVNBEb/CMOVAb (cmov not below or equal) */
14534      case 0x48: /* CMOVSb (cmov negative) */
14535      case 0x49: /* CMOVNSb (cmov not negative) */
14536      case 0x4A: /* CMOVP (cmov parity even) */
14537      case 0x4B: /* CMOVNP (cmov parity odd) */
14538      case 0x4C: /* CMOVLb/CMOVNGEb (cmov less) */
14539      case 0x4D: /* CMOVGEb/CMOVNLb (cmov greater or equal) */
14540      case 0x4E: /* CMOVLEb/CMOVNGb (cmov less or equal) */
14541      case 0x4F: /* CMOVGb/CMOVNLEb (cmov greater) */
14542         delta = dis_cmov_E_G(sorb, sz, (X86Condcode)(opc - 0x40), delta);
14543         break;
14544
14545      /* =-=-=-=-=-=-=-=-=- CMPXCHG -=-=-=-=-=-=-=-=-=-= */
14546
14547      case 0xB0: /* CMPXCHG Gb,Eb */
14548         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, 1, delta );
14549         break;
14550      case 0xB1: /* CMPXCHG Gv,Ev */
14551         delta = dis_cmpxchg_G_E ( sorb, pfx_lock, sz, delta );
14552         break;
14553
14554      case 0xC7: { /* CMPXCHG8B m64 (0F C7 /1) */
14555         IRTemp expdHi    = newTemp(Ity_I32);
14556         IRTemp expdLo    = newTemp(Ity_I32);
14557         IRTemp dataHi    = newTemp(Ity_I32);
14558         IRTemp dataLo    = newTemp(Ity_I32);
14559         IRTemp oldHi     = newTemp(Ity_I32);
14560         IRTemp oldLo     = newTemp(Ity_I32);
14561         IRTemp flags_old = newTemp(Ity_I32);
14562         IRTemp flags_new = newTemp(Ity_I32);
14563         IRTemp success   = newTemp(Ity_I1);
14564
14565         /* Translate this using a DCAS, even if there is no LOCK
14566            prefix.  Life is too short to bother with generating two
14567            different translations for the with/without-LOCK-prefix
14568            cases. */
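              /* CMPXCHG8B: compare EDX:EAX with the 64-bit memory operand.
                 If equal, ZF is set and ECX:EBX is stored to memory;
                 otherwise ZF is cleared and the memory value is loaded
                 into EDX:EAX. */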
14569         *expect_CAS = True;
14570
14571         /* Decode, and generate address. */
14572         if (sz != 4) goto decode_failure;
14573         modrm = getIByte(delta);
14574         if (epartIsReg(modrm)) goto decode_failure;
14575         if (gregOfRM(modrm) != 1) goto decode_failure;
14576         addr = disAMode ( &alen, sorb, delta, dis_buf );
14577         delta += alen;
14578
14579         /* Get the expected and new values. */
14580         assign( expdHi, getIReg(4,R_EDX) );
14581         assign( expdLo, getIReg(4,R_EAX) );
14582         assign( dataHi, getIReg(4,R_ECX) );
14583         assign( dataLo, getIReg(4,R_EBX) );
14584
14585         /* Do the DCAS */
14586         stmt( IRStmt_CAS(
14587                  mkIRCAS( oldHi, oldLo,
14588                           Iend_LE, mkexpr(addr),
14589                           mkexpr(expdHi), mkexpr(expdLo),
14590                           mkexpr(dataHi), mkexpr(dataLo)
14591               )));
14592
14593         /* success when oldHi:oldLo == expdHi:expdLo */
14594         assign( success,
14595                 binop(Iop_CasCmpEQ32,
14596                       binop(Iop_Or32,
14597                             binop(Iop_Xor32, mkexpr(oldHi), mkexpr(expdHi)),
14598                             binop(Iop_Xor32, mkexpr(oldLo), mkexpr(expdLo))
14599                       ),
14600                       mkU32(0)
14601                 ));
14602
14603         /* If the DCAS is successful, that is to say oldHi:oldLo ==
14604            expdHi:expdLo, then put expdHi:expdLo back in EDX:EAX,
14605            which is where they came from originally.  Both the actual
14606            contents of these two regs, and any shadow values, are
14607            unchanged.  If the DCAS fails then we're putting into
14608            EDX:EAX the value seen in memory. */
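              /* (Mux0X selects its second operand when the condition byte
                 is zero and its third when it is nonzero; hence a failed
                 CAS -- success == 0 -- yields oldHi/oldLo here.) */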
14609         putIReg(4, R_EDX,
14610                    IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
14611                                  mkexpr(oldHi),
14612                                  mkexpr(expdHi)
14613                ));
14614         putIReg(4, R_EAX,
14615                    IRExpr_Mux0X( unop(Iop_1Uto8, mkexpr(success)),
14616                                  mkexpr(oldLo),
14617                                  mkexpr(expdLo)
14618                ));
14619
14620         /* Copy the success bit into the Z flag and leave the others
14621            unchanged */
14622         assign( flags_old, widenUto32(mk_x86g_calculate_eflags_all()));
14623         assign(
14624            flags_new,
14625            binop(Iop_Or32,
14626                  binop(Iop_And32, mkexpr(flags_old),
14627                                   mkU32(~X86G_CC_MASK_Z)),
14628                  binop(Iop_Shl32,
14629                        binop(Iop_And32,
14630                              unop(Iop_1Uto32, mkexpr(success)), mkU32(1)),
14631                        mkU8(X86G_CC_SHIFT_Z)) ));
14632
14633         stmt( IRStmt_Put( OFFB_CC_OP,   mkU32(X86G_CC_OP_COPY) ));
14634         stmt( IRStmt_Put( OFFB_CC_DEP1, mkexpr(flags_new) ));
14635         stmt( IRStmt_Put( OFFB_CC_DEP2, mkU32(0) ));
14636         /* Set NDEP even though it isn't used.  This makes
14637            redundant-PUT elimination of previous stores to this field
14638            work better. */
14639         stmt( IRStmt_Put( OFFB_CC_NDEP, mkU32(0) ));
14640
14641         /* Sheesh.  Aren't you glad it was me and not you that had to
14642            write and validate all this grunge? */
14643
14644         DIP("cmpxchg8b %s\n", dis_buf);
14645         break;
14646      }
14647
14648      /* =-=-=-=-=-=-=-=-=- CPUID -=-=-=-=-=-=-=-=-=-=-= */
14649
14650      case 0xA2: { /* CPUID */
14651         /* Uses dirty helper:
14652               void x86g_dirtyhelper_CPUID_sse[012] ( VexGuestX86State* )
14653            declared to mod eax and ecx, wr ebx and edx
14654         */
14655         IRDirty* d     = NULL;
14656         HChar*   fName = NULL;
14657         void*    fAddr = NULL;
14658         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE2) {
14659            fName = "x86g_dirtyhelper_CPUID_sse2";
14660            fAddr = &x86g_dirtyhelper_CPUID_sse2;
14661         }
14662         else
14663         if (archinfo->hwcaps & VEX_HWCAPS_X86_SSE1) {
14664            fName = "x86g_dirtyhelper_CPUID_sse1";
14665            fAddr = &x86g_dirtyhelper_CPUID_sse1;
14666         }
14667         else
14668         if (archinfo->hwcaps == 0/*no SSE*/) {
14669            fName = "x86g_dirtyhelper_CPUID_sse0";
14670            fAddr = &x86g_dirtyhelper_CPUID_sse0;
14671         } else
14672            vpanic("disInstr(x86)(cpuid)");
14673
14674         vassert(fName); vassert(fAddr);
14675         d = unsafeIRDirty_0_N ( 0/*regparms*/,
14676                                 fName, fAddr, mkIRExprVec_0() );
14677         /* declare guest state effects */
14678         d->needsBBP = True;
14679         d->nFxState = 4;
14680         vex_bzero(&d->fxState, sizeof(d->fxState));
14681         d->fxState[0].fx     = Ifx_Modify;
14682         d->fxState[0].offset = OFFB_EAX;
14683         d->fxState[0].size   = 4;
14684         d->fxState[1].fx     = Ifx_Write;
14685         d->fxState[1].offset = OFFB_EBX;
14686         d->fxState[1].size   = 4;
14687         d->fxState[2].fx     = Ifx_Modify;
14688         d->fxState[2].offset = OFFB_ECX;
14689         d->fxState[2].size   = 4;
14690         d->fxState[3].fx     = Ifx_Write;
14691         d->fxState[3].offset = OFFB_EDX;
14692         d->fxState[3].size   = 4;
14693         /* execute the dirty call, side-effecting guest state */
14694         stmt( IRStmt_Dirty(d) );
14695         /* CPUID is a serialising insn.  So, just in case someone is
14696            using it as a memory fence ... */
14697         stmt( IRStmt_MBE(Imbe_Fence) );
14698         DIP("cpuid\n");
14699         break;
14700      }
14701
14702//--          if (!VG_(cpu_has_feature)(VG_X86_FEAT_CPUID))
14703//--             goto decode_failure;
14704//--
14705//--          t1 = newTemp(cb);
14706//--          t2 = newTemp(cb);
14707//--          t3 = newTemp(cb);
14708//--          t4 = newTemp(cb);
14709//--          uInstr0(cb, CALLM_S, 0);
14710//--
14711//--          uInstr2(cb, GET,   4, ArchReg, R_EAX, TempReg, t1);
14712//--          uInstr1(cb, PUSH,  4, TempReg, t1);
14713//--
14714//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t2);
14715//--          uLiteral(cb, 0);
14716//--          uInstr1(cb, PUSH,  4, TempReg, t2);
14717//--
14718//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t3);
14719//--          uLiteral(cb, 0);
14720//--          uInstr1(cb, PUSH,  4, TempReg, t3);
14721//--
14722//--          uInstr2(cb, MOV,   4, Literal, 0, TempReg, t4);
14723//--          uLiteral(cb, 0);
14724//--          uInstr1(cb, PUSH,  4, TempReg, t4);
14725//--
14726//--          uInstr1(cb, CALLM, 0, Lit16,   VGOFF_(helper_CPUID));
14727//--          uFlagsRWU(cb, FlagsEmpty, FlagsEmpty, FlagsEmpty);
14728//--
14729//--          uInstr1(cb, POP,   4, TempReg, t4);
14730//--          uInstr2(cb, PUT,   4, TempReg, t4, ArchReg, R_EDX);
14731//--
14732//--          uInstr1(cb, POP,   4, TempReg, t3);
14733//--          uInstr2(cb, PUT,   4, TempReg, t3, ArchReg, R_ECX);
14734//--
14735//--          uInstr1(cb, POP,   4, TempReg, t2);
14736//--          uInstr2(cb, PUT,   4, TempReg, t2, ArchReg, R_EBX);
14737//--
14738//--          uInstr1(cb, POP,   4, TempReg, t1);
14739//--          uInstr2(cb, PUT,   4, TempReg, t1, ArchReg, R_EAX);
14740//--
14741//--          uInstr0(cb, CALLM_E, 0);
14742//--          DIP("cpuid\n");
14743//--          break;
14744//--
14745      /* =-=-=-=-=-=-=-=-=- MOVZX, MOVSX =-=-=-=-=-=-=-= */
14746
14747      case 0xB6: /* MOVZXb Eb,Gv */
14748         if (sz != 2 && sz != 4)
14749            goto decode_failure;
14750         delta = dis_movx_E_G ( sorb, delta, 1, sz, False );
14751         break;
14752
14753      case 0xB7: /* MOVZXw Ew,Gv */
14754         if (sz != 4)
14755            goto decode_failure;
14756         delta = dis_movx_E_G ( sorb, delta, 2, 4, False );
14757         break;
14758
14759      case 0xBE: /* MOVSXb Eb,Gv */
14760         if (sz != 2 && sz != 4)
14761            goto decode_failure;
14762         delta = dis_movx_E_G ( sorb, delta, 1, sz, True );
14763         break;
14764
14765      case 0xBF: /* MOVSXw Ew,Gv */
14766         if (sz != 4 && /* accept movsww, sigh, see #250799 */sz != 2)
14767            goto decode_failure;
14768         delta = dis_movx_E_G ( sorb, delta, 2, sz, True );
14769         break;
14770
14771//--       /* =-=-=-=-=-=-=-=-=-=-= MOVNTI -=-=-=-=-=-=-=-=-= */
14772//--
14773//--       case 0xC3: /* MOVNTI Gv,Ev */
14774//--          vg_assert(sz == 4);
14775//--          modrm = getUChar(eip);
14776//--          vg_assert(!epartIsReg(modrm));
14777//--          t1 = newTemp(cb);
14778//--          uInstr2(cb, GET, 4, ArchReg, gregOfRM(modrm), TempReg, t1);
14779//--          pair = disAMode ( cb, sorb, eip, dis_buf );
14780//--          t2 = LOW24(pair);
14781//--          eip += HI8(pair);
14782//--          uInstr2(cb, STORE, 4, TempReg, t1, TempReg, t2);
14783//--          DIP("movnti %s,%s\n", nameIReg(4,gregOfRM(modrm)), dis_buf);
14784//--          break;
14785
14786      /* =-=-=-=-=-=-=-=-=- MUL/IMUL =-=-=-=-=-=-=-=-=-= */
14787
14788      case 0xAF: /* IMUL Ev, Gv */
14789         delta = dis_mul_E_G ( sorb, sz, delta );
14790         break;
14791
14792      /* =-=-=-=-=-=-=-=-=- NOPs =-=-=-=-=-=-=-=-=-=-=-= */
14793
14794      case 0x1F:
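         /* 0F 1F is the recommended multi-byte NOP.  Decode (and then
            discard) the memory operand purely to establish how long
            the insn is. */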
14795         modrm = getUChar(delta);
14796         if (epartIsReg(modrm)) goto decode_failure;
14797         addr = disAMode ( &alen, sorb, delta, dis_buf );
14798         delta += alen;
14799         DIP("nop%c %s\n", nameISize(sz), dis_buf);
14800         break;
14801
14802      /* =-=-=-=-=-=-=-=-=- Jcond d32 -=-=-=-=-=-=-=-=-= */
      case 0x80: /* JOb (jump overflow) */
      case 0x81: /* JNOb (jump no overflow) */
14805      case 0x82: /* JBb/JNAEb (jump below) */
14806      case 0x83: /* JNBb/JAEb (jump not below) */
14807      case 0x84: /* JZb/JEb (jump zero) */
14808      case 0x85: /* JNZb/JNEb (jump not zero) */
14809      case 0x86: /* JBEb/JNAb (jump below or equal) */
14810      case 0x87: /* JNBEb/JAb (jump not below or equal) */
14811      case 0x88: /* JSb (jump negative) */
      case 0x89: /* JNSb (jump not negative) */
14813      case 0x8A: /* JP (jump parity even) */
14814      case 0x8B: /* JNP/JPO (jump parity odd) */
14815      case 0x8C: /* JLb/JNGEb (jump less) */
14816      case 0x8D: /* JGEb/JNLb (jump greater or equal) */
14817      case 0x8E: /* JLEb/JNGb (jump less or equal) */
14818      case 0x8F: /* JGb/JNLEb (jump greater) */
14819       { Int    jmpDelta;
14820         HChar* comment  = "";
14821         jmpDelta = (Int)getUDisp32(delta);
14822         d32 = (((Addr32)guest_EIP_bbstart)+delta+4) + jmpDelta;
14823         delta += 4;
14824         if (resteerCisOk
14825             && vex_control.guest_chase_cond
14826             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
14827             && jmpDelta < 0
14828             && resteerOkFn( callback_opaque, (Addr64)(Addr32)d32) ) {
14829            /* Speculation: assume this backward branch is taken.  So
14830               we need to emit a side-exit to the insn following this
14831               one, on the negation of the condition, and continue at
14832               the branch target address (d32).  If we wind up back at
14833               the first instruction of the trace, just stop; it's
               better to let the IR loop unroller handle that case. */
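            /* x86 condition codes come in complementary pairs that
               differ only in the bottom bit, so flipping bit 0 of
               (opc - 0x80) gives the negated condition for the
               side-exit. */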
14835            stmt( IRStmt_Exit(
14836                     mk_x86g_calculate_condition((X86Condcode)
14837                                                 (1 ^ (opc - 0x80))),
14838                     Ijk_Boring,
14839                     IRConst_U32(guest_EIP_bbstart+delta),
14840                     OFFB_EIP ) );
14841            dres.whatNext   = Dis_ResteerC;
14842            dres.continueAt = (Addr64)(Addr32)d32;
14843            comment = "(assumed taken)";
14844         }
14845         else
14846         if (resteerCisOk
14847             && vex_control.guest_chase_cond
14848             && (Addr32)d32 != (Addr32)guest_EIP_bbstart
14849             && jmpDelta >= 0
14850             && resteerOkFn( callback_opaque,
14851                             (Addr64)(Addr32)(guest_EIP_bbstart+delta)) ) {
14852            /* Speculation: assume this forward branch is not taken.
14853               So we need to emit a side-exit to d32 (the dest) and
14854               continue disassembling at the insn immediately
14855               following this one. */
14856            stmt( IRStmt_Exit(
14857                     mk_x86g_calculate_condition((X86Condcode)(opc - 0x80)),
14858                     Ijk_Boring,
14859                     IRConst_U32(d32),
14860                     OFFB_EIP ) );
14861            dres.whatNext   = Dis_ResteerC;
14862            dres.continueAt = (Addr64)(Addr32)(guest_EIP_bbstart+delta);
14863            comment = "(assumed not taken)";
14864         }
14865         else {
14866            /* Conservative default translation - end the block at
14867               this point. */
14868            jcc_01( &dres, (X86Condcode)(opc - 0x80),
14869                    (Addr32)(guest_EIP_bbstart+delta), d32);
14870            vassert(dres.whatNext == Dis_StopHere);
14871         }
14872         DIP("j%s-32 0x%x %s\n", name_X86Condcode(opc - 0x80), d32, comment);
14873         break;
14874       }
14875
14876      /* =-=-=-=-=-=-=-=-=- RDTSC -=-=-=-=-=-=-=-=-=-=-= */
14877      case 0x31: { /* RDTSC */
14878         IRTemp   val  = newTemp(Ity_I64);
14879         IRExpr** args = mkIRExprVec_0();
14880         IRDirty* d    = unsafeIRDirty_1_N (
14881                            val,
14882                            0/*regparms*/,
14883                            "x86g_dirtyhelper_RDTSC",
14884                            &x86g_dirtyhelper_RDTSC,
14885                            args
14886                         );
14887         /* execute the dirty call, dumping the result in val. */
14888         stmt( IRStmt_Dirty(d) );
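         /* Architecturally RDTSC returns the 64-bit counter in
            EDX:EAX, so split the helper's result accordingly. */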
14889         putIReg(4, R_EDX, unop(Iop_64HIto32, mkexpr(val)));
14890         putIReg(4, R_EAX, unop(Iop_64to32, mkexpr(val)));
14891         DIP("rdtsc\n");
14892         break;
14893      }
14894
14895      /* =-=-=-=-=-=-=-=-=- PUSH/POP Sreg =-=-=-=-=-=-=-=-=-= */
14896
14897      case 0xA1: /* POP %FS */
14898         dis_pop_segreg( R_FS, sz ); break;
14899      case 0xA9: /* POP %GS */
14900         dis_pop_segreg( R_GS, sz ); break;
14901
14902      case 0xA0: /* PUSH %FS */
14903         dis_push_segreg( R_FS, sz ); break;
14904      case 0xA8: /* PUSH %GS */
14905         dis_push_segreg( R_GS, sz ); break;
14906
14907      /* =-=-=-=-=-=-=-=-=- SETcc Eb =-=-=-=-=-=-=-=-=-= */
      case 0x90: /* set-Ob (set overflow) */
      case 0x91: /* set-NOb (set no overflow) */
      case 0x92: /* set-Bb/set-NAEb (set below) */
      case 0x93: /* set-NBb/set-AEb (set not below) */
      case 0x94: /* set-Zb/set-Eb (set zero) */
      case 0x95: /* set-NZb/set-NEb (set not zero) */
      case 0x96: /* set-BEb/set-NAb (set below or equal) */
      case 0x97: /* set-NBEb/set-Ab (set not below or equal) */
      case 0x98: /* set-Sb (set negative) */
      case 0x99: /* set-NSb (set not negative) */
      case 0x9A: /* set-P (set parity even) */
      case 0x9B: /* set-NP (set parity odd) */
      case 0x9C: /* set-Lb/set-NGEb (set less) */
      case 0x9D: /* set-GEb/set-NLb (set greater or equal) */
      case 0x9E: /* set-LEb/set-NGb (set less or equal) */
      case 0x9F: /* set-Gb/set-NLEb (set greater) */
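         /* Evaluate the condition to an Ity_I1, widen it to a byte
            (hence 0 or 1), and write that byte to the r/m8 target. */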
14924         t1 = newTemp(Ity_I8);
14925         assign( t1, unop(Iop_1Uto8,mk_x86g_calculate_condition(opc-0x90)) );
14926         modrm = getIByte(delta);
14927         if (epartIsReg(modrm)) {
14928            delta++;
14929            putIReg(1, eregOfRM(modrm), mkexpr(t1));
14930            DIP("set%s %s\n", name_X86Condcode(opc-0x90),
14931                              nameIReg(1,eregOfRM(modrm)));
14932         } else {
14933           addr = disAMode ( &alen, sorb, delta, dis_buf );
14934           delta += alen;
14935           storeLE( mkexpr(addr), mkexpr(t1) );
14936           DIP("set%s %s\n", name_X86Condcode(opc-0x90), dis_buf);
14937         }
14938         break;
14939
14940      /* =-=-=-=-=-=-=-=-=- SHLD/SHRD -=-=-=-=-=-=-=-=-= */
14941
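      /* For the imm8 forms, the shift-amount byte follows the
         ModRM/SIB/displacement bytes, so its offset is computed with
         lengthAMode before dis_SHLRD_Gv_Ev consumes the amode. */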
14942      case 0xA4: /* SHLDv imm8,Gv,Ev */
14943         modrm = getIByte(delta);
14944         d32   = delta + lengthAMode(delta);
14945         vex_sprintf(dis_buf, "$%d", getIByte(d32));
14946         delta = dis_SHLRD_Gv_Ev (
14947                  sorb, delta, modrm, sz,
14948                  mkU8(getIByte(d32)), True, /* literal */
14949                  dis_buf, True );
14950         break;
14951      case 0xA5: /* SHLDv %cl,Gv,Ev */
14952         modrm = getIByte(delta);
14953         delta = dis_SHLRD_Gv_Ev (
14954                    sorb, delta, modrm, sz,
14955                    getIReg(1,R_ECX), False, /* not literal */
14956                    "%cl", True );
14957         break;
14958
14959      case 0xAC: /* SHRDv imm8,Gv,Ev */
14960         modrm = getIByte(delta);
14961         d32   = delta + lengthAMode(delta);
14962         vex_sprintf(dis_buf, "$%d", getIByte(d32));
14963         delta = dis_SHLRD_Gv_Ev (
14964                    sorb, delta, modrm, sz,
14965                    mkU8(getIByte(d32)), True, /* literal */
14966                    dis_buf, False );
14967         break;
14968      case 0xAD: /* SHRDv %cl,Gv,Ev */
14969         modrm = getIByte(delta);
14970         delta = dis_SHLRD_Gv_Ev (
14971                    sorb, delta, modrm, sz,
14972                    getIReg(1,R_ECX), False, /* not literal */
14973                    "%cl", False );
14974         break;
14975
14976      /* =-=-=-=-=-=-=-=-=- SYSENTER -=-=-=-=-=-=-=-=-=-= */
14977
14978      case 0x34:
         /* Simple implementation needing a long explanation.
14980
14981            sysenter is a kind of syscall entry.  The key thing here
14982            is that the return address is not known -- that is
14983            something that is beyond Vex's knowledge.  So this IR
14984            forces a return to the scheduler, which can do what it
            likes to simulate the sysenter, but it MUST set this
14986            thread's guest_EIP field with the continuation address
14987            before resuming execution.  If that doesn't happen, the
14988            thread will jump to address zero, which is probably
14989            fatal.
14990         */
14991
14992         /* Note where we are, so we can back up the guest to this
14993            point if the syscall needs to be restarted. */
14994         stmt( IRStmt_Put( OFFB_IP_AT_SYSCALL,
14995                           mkU32(guest_EIP_curr_instr) ) );
14996         jmp_lit(&dres, Ijk_Sys_sysenter, 0/*bogus next EIP value*/);
14997         vassert(dres.whatNext == Dis_StopHere);
14998         DIP("sysenter");
14999         break;
15000
15001      /* =-=-=-=-=-=-=-=-=- XADD -=-=-=-=-=-=-=-=-=-= */
15002
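      /* Both forms are handled by dis_xadd_G_E, which is also given
         the LOCK prefix flag (pfx_lock) so that it can generate
         suitably atomic IR for the memory-destination case. */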
15003      case 0xC0: { /* XADD Gb,Eb */
15004         Bool decodeOK;
15005         delta = dis_xadd_G_E ( sorb, pfx_lock, 1, delta, &decodeOK );
15006         if (!decodeOK) goto decode_failure;
15007         break;
15008      }
15009      case 0xC1: { /* XADD Gv,Ev */
15010         Bool decodeOK;
15011         delta = dis_xadd_G_E ( sorb, pfx_lock, sz, delta, &decodeOK );
15012         if (!decodeOK) goto decode_failure;
15013         break;
15014      }
15015
15016      /* =-=-=-=-=-=-=-=-=- MMXery =-=-=-=-=-=-=-=-=-=-= */
15017
15018      case 0x71:
15019      case 0x72:
15020      case 0x73: /* PSLLgg/PSRAgg/PSRLgg mmxreg by imm8 */
15021
15022      case 0x6E: /* MOVD (src)ireg-or-mem, (dst)mmxreg */
15023      case 0x7E: /* MOVD (src)mmxreg, (dst)ireg-or-mem */
15024      case 0x7F: /* MOVQ (src)mmxreg, (dst)mmxreg-or-mem */
15025      case 0x6F: /* MOVQ (src)mmxreg-or-mem, (dst)mmxreg */
15026
15027      case 0xFC:
15028      case 0xFD:
15029      case 0xFE: /* PADDgg (src)mmxreg-or-mem, (dst)mmxreg */
15030
15031      case 0xEC:
15032      case 0xED: /* PADDSgg (src)mmxreg-or-mem, (dst)mmxreg */
15033
15034      case 0xDC:
15035      case 0xDD: /* PADDUSgg (src)mmxreg-or-mem, (dst)mmxreg */
15036
15037      case 0xF8:
15038      case 0xF9:
15039      case 0xFA: /* PSUBgg (src)mmxreg-or-mem, (dst)mmxreg */
15040
15041      case 0xE8:
15042      case 0xE9: /* PSUBSgg (src)mmxreg-or-mem, (dst)mmxreg */
15043
15044      case 0xD8:
15045      case 0xD9: /* PSUBUSgg (src)mmxreg-or-mem, (dst)mmxreg */
15046
15047      case 0xE5: /* PMULHW (src)mmxreg-or-mem, (dst)mmxreg */
15048      case 0xD5: /* PMULLW (src)mmxreg-or-mem, (dst)mmxreg */
15049
15050      case 0xF5: /* PMADDWD (src)mmxreg-or-mem, (dst)mmxreg */
15051
15052      case 0x74:
15053      case 0x75:
15054      case 0x76: /* PCMPEQgg (src)mmxreg-or-mem, (dst)mmxreg */
15055
15056      case 0x64:
15057      case 0x65:
15058      case 0x66: /* PCMPGTgg (src)mmxreg-or-mem, (dst)mmxreg */
15059
15060      case 0x6B: /* PACKSSDW (src)mmxreg-or-mem, (dst)mmxreg */
15061      case 0x63: /* PACKSSWB (src)mmxreg-or-mem, (dst)mmxreg */
15062      case 0x67: /* PACKUSWB (src)mmxreg-or-mem, (dst)mmxreg */
15063
15064      case 0x68:
15065      case 0x69:
15066      case 0x6A: /* PUNPCKHgg (src)mmxreg-or-mem, (dst)mmxreg */
15067
15068      case 0x60:
15069      case 0x61:
15070      case 0x62: /* PUNPCKLgg (src)mmxreg-or-mem, (dst)mmxreg */
15071
15072      case 0xDB: /* PAND (src)mmxreg-or-mem, (dst)mmxreg */
15073      case 0xDF: /* PANDN (src)mmxreg-or-mem, (dst)mmxreg */
15074      case 0xEB: /* POR (src)mmxreg-or-mem, (dst)mmxreg */
15075      case 0xEF: /* PXOR (src)mmxreg-or-mem, (dst)mmxreg */
15076
15077      case 0xF1: /* PSLLgg (src)mmxreg-or-mem, (dst)mmxreg */
15078      case 0xF2:
15079      case 0xF3:
15080
15081      case 0xD1: /* PSRLgg (src)mmxreg-or-mem, (dst)mmxreg */
15082      case 0xD2:
15083      case 0xD3:
15084
15085      case 0xE1: /* PSRAgg (src)mmxreg-or-mem, (dst)mmxreg */
15086      case 0xE2:
15087      {
15088         Int  delta0    = delta-1;
15089         Bool decode_OK = False;
15090
         /* If sz==2 this is SSE, and we assume the SSE decoder has
            already spotted those cases by now. */
15093         if (sz != 4)
15094            goto decode_failure;
15095
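         /* Hand dis_MMX the offset of the opcode byte itself
            (delta-1) so it can re-examine it; if it can't decode the
            insn, back up delta and take the failure path. */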
15096         delta = dis_MMX ( &decode_OK, sorb, sz, delta-1 );
15097         if (!decode_OK) {
15098            delta = delta0;
15099            goto decode_failure;
15100         }
15101         break;
15102      }
15103
15104      case 0x0E: /* FEMMS */
15105      case 0x77: /* EMMS */
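         /* do_EMMS_preamble marks all x87/MMX register tags as empty,
            which is the architected effect of (F)EMMS. */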
15106         if (sz != 4)
15107            goto decode_failure;
15108         do_EMMS_preamble();
15109         DIP("{f}emms\n");
15110         break;
15111
15112      /* =-=-=-=-=-=-=-=-=- SGDT and SIDT =-=-=-=-=-=-=-=-=-=-= */
15113      case 0x01: /* 0F 01 /0 -- SGDT */
15114                 /* 0F 01 /1 -- SIDT */
15115      {
15116          /* This is really revolting, but ... since each processor
15117             (core) only has one IDT and one GDT, just let the guest
15118             see it (pass-through semantics).  I can't see any way to
15119             construct a faked-up value, so don't bother to try. */
15120         modrm = getUChar(delta);
15121         addr = disAMode ( &alen, sorb, delta, dis_buf );
15122         delta += alen;
15123         if (epartIsReg(modrm)) goto decode_failure;
15124         if (gregOfRM(modrm) != 0 && gregOfRM(modrm) != 1)
15125            goto decode_failure;
15126         switch (gregOfRM(modrm)) {
15127            case 0: DIP("sgdt %s\n", dis_buf); break;
15128            case 1: DIP("sidt %s\n", dis_buf); break;
15129            default: vassert(0); /*NOTREACHED*/
15130         }
15131
15132         IRDirty* d = unsafeIRDirty_0_N (
15133                          0/*regparms*/,
15134                          "x86g_dirtyhelper_SxDT",
15135                          &x86g_dirtyhelper_SxDT,
15136                          mkIRExprVec_2( mkexpr(addr),
15137                                         mkU32(gregOfRM(modrm)) )
15138                      );
15139         /* declare we're writing memory */
15140         d->mFx   = Ifx_Write;
15141         d->mAddr = mkexpr(addr);
15142         d->mSize = 6;
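         /* 6 bytes: the 16-bit table limit followed by the 32-bit
            linear base address. */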
15143         stmt( IRStmt_Dirty(d) );
15144         break;
15145      }
15146
15147      /* =-=-=-=-=-=-=-=-=- unimp2 =-=-=-=-=-=-=-=-=-=-= */
15148
15149      default:
15150         goto decode_failure;
15151   } /* switch (opc) for the 2-byte opcodes */
15152   goto decode_success;
15153   } /* case 0x0F: of primary opcode */
15154
15155   /* ------------------------ ??? ------------------------ */
15156
15157  default:
15158  decode_failure:
15159   /* All decode failures end up here. */
15160   vex_printf("vex x86->IR: unhandled instruction bytes: "
15161              "0x%x 0x%x 0x%x 0x%x\n",
15162              (Int)getIByte(delta_start+0),
15163              (Int)getIByte(delta_start+1),
15164              (Int)getIByte(delta_start+2),
15165              (Int)getIByte(delta_start+3) );
15166
15167   /* Tell the dispatcher that this insn cannot be decoded, and so has
15168      not been executed, and (is currently) the next to be executed.
      EIP should be up-to-date since it was made so at the start of each
15170      insn, but nevertheless be paranoid and update it again right
15171      now. */
15172   stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_curr_instr) ) );
15173   jmp_lit(&dres, Ijk_NoDecode, guest_EIP_curr_instr);
15174   vassert(dres.whatNext == Dis_StopHere);
15175   dres.len = 0;
15176   /* We also need to say that a CAS is not expected now, regardless
15177      of what it might have been set to at the start of the function,
      since the IR that we've emitted just above (to synthesise a
15179      SIGILL) does not involve any CAS, and presumably no other IR has
15180      been emitted for this (non-decoded) insn. */
15181   *expect_CAS = False;
15182   return dres;
15183
15184   } /* switch (opc) for the main (primary) opcode switch. */
15185
15186  decode_success:
15187   /* All decode successes end up here. */
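   /* Advance the guest EIP in the IR according to how this insn's
      translation ended: to the next insn for a normal continue, to
      the resteer target for a resteer, and not at all if the
      translation has already ended the block. */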
15188   switch (dres.whatNext) {
15189      case Dis_Continue:
15190         stmt( IRStmt_Put( OFFB_EIP, mkU32(guest_EIP_bbstart + delta) ) );
15191         break;
15192      case Dis_ResteerU:
15193      case Dis_ResteerC:
15194         stmt( IRStmt_Put( OFFB_EIP, mkU32(dres.continueAt) ) );
15195         break;
15196      case Dis_StopHere:
15197         break;
15198      default:
15199         vassert(0);
15200   }
15201
15202   DIP("\n");
15203   dres.len = delta - delta_start;
15204   return dres;
15205}
15206
15207#undef DIP
15208#undef DIS
15209
15210
15211/*------------------------------------------------------------*/
15212/*--- Top-level fn                                         ---*/
15213/*------------------------------------------------------------*/
15214
15215/* Disassemble a single instruction into IR.  The instruction
15216   is located in host memory at &guest_code[delta]. */
15217
15218DisResult disInstr_X86 ( IRSB*        irsb_IN,
15219                         Bool         (*resteerOkFn) ( void*, Addr64 ),
15220                         Bool         resteerCisOk,
15221                         void*        callback_opaque,
15222                         UChar*       guest_code_IN,
15223                         Long         delta,
15224                         Addr64       guest_IP,
15225                         VexArch      guest_arch,
15226                         VexArchInfo* archinfo,
15227                         VexAbiInfo*  abiinfo,
15228                         Bool         host_bigendian_IN )
15229{
15230   Int       i, x1, x2;
15231   Bool      expect_CAS, has_CAS;
15232   DisResult dres;
15233
15234   /* Set globals (see top of this file) */
15235   vassert(guest_arch == VexArchX86);
15236   guest_code           = guest_code_IN;
15237   irsb                 = irsb_IN;
15238   host_is_bigendian    = host_bigendian_IN;
15239   guest_EIP_curr_instr = (Addr32)guest_IP;
15240   guest_EIP_bbstart    = (Addr32)toUInt(guest_IP - delta);
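   /* guest_IP is the address of this instruction; subtracting delta
      (its offset within the block being translated) gives the address
      at which the superblock started. */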
15241
15242   x1 = irsb_IN->stmts_used;
15243   expect_CAS = False;
15244   dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
15245                             resteerCisOk,
15246                             callback_opaque,
15247                             delta, archinfo, abiinfo );
15248   x2 = irsb_IN->stmts_used;
15249   vassert(x2 >= x1);
15250
15251   /* See comment at the top of disInstr_X86_WRK for meaning of
15252      expect_CAS.  Here, we (sanity-)check for the presence/absence of
15253      IRCAS as directed by the returned expect_CAS value. */
15254   has_CAS = False;
15255   for (i = x1; i < x2; i++) {
15256      if (irsb_IN->stmts[i]->tag == Ist_CAS)
15257         has_CAS = True;
15258   }
15259
15260   if (expect_CAS != has_CAS) {
15261      /* inconsistency detected.  re-disassemble the instruction so as
15262         to generate a useful error message; then assert. */
15263      vex_traceflags |= VEX_TRACE_FE;
15264      dres = disInstr_X86_WRK ( &expect_CAS, resteerOkFn,
15265                                resteerCisOk,
15266                                callback_opaque,
15267                                delta, archinfo, abiinfo );
15268      for (i = x1; i < x2; i++) {
15269         vex_printf("\t\t");
15270         ppIRStmt(irsb_IN->stmts[i]);
15271         vex_printf("\n");
15272      }
15273      /* Failure of this assertion is serious and denotes a bug in
15274         disInstr. */
15275      vpanic("disInstr_X86: inconsistency in LOCK prefix handling");
15276   }
15277
15278   return dres;
15279}
15280
15281
15282/*--------------------------------------------------------------------*/
15283/*--- end                                         guest_x86_toIR.c ---*/
15284/*--------------------------------------------------------------------*/
15285